python-filmaffinity 0.0.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ """Exceptions."""
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
class FilmAffinityException(Exception):
    """Base class for every exception raised by this package.

    It adds nothing to ``Exception``; it only gives the package's more
    specific exceptions a common parent.
    """
8
+
9
+
10
class FilmAffinityInvalidLanguage(FilmAffinityException):
    """Invalid language given on FilmAffinity.

    Attributes:
        value: The language that was requested.
        supported: Iterable of language names that are accepted; it is
            joined with ``', '`` when the message is rendered.
    """

    def __init__(self, value, supported):
        """Store the rejected language and the supported ones.

        Args:
            value: The language that was requested.
            supported: Iterable of supported language names (strings).
        """
        # Forward the arguments to Exception so ``args`` is populated.
        # With empty ``args`` the instance cannot be pickled or copied
        # (re-creation would call the class with no arguments) and its
        # repr() carries no information.
        super(FilmAffinityInvalidLanguage, self).__init__(value, supported)
        self.value = value
        self.supported = supported

    def __str__(self):
        """Return the message, wrapped by ``repr`` (quotes included)."""
        # repr() is kept on purpose: the rendered text stays exactly what
        # existing callers already see.
        return repr('Invalid Language selected: {} '
                    '[Available languages are: {}]'.format(
                        self.value, ', '.join(self.supported)))
20
+
21
+
22
class FilmAffinityInvalidBackend(FilmAffinityException):
    """Invalid Backend for requests-cache.

    Attributes:
        value: The backend that was rejected.
    """

    def __init__(self, value):
        """Store the rejected backend.

        Args:
            value: The backend that was rejected.
        """
        # Forward the argument to Exception so ``args`` is populated.
        # With empty ``args`` the instance cannot be pickled or copied
        # (re-creation would call the class with no arguments).
        super(FilmAffinityInvalidBackend, self).__init__(value)
        self.value = value

    def __str__(self):
        """Return the message, wrapped by ``repr`` (quotes included)."""
        # repr() is kept on purpose so the rendered text does not change.
        return repr('Invalid Backend for requests-cache: {}'.format(
            self.value))
30
+
31
+
32
class FilmAffinityConnectionError(FilmAffinityException):
    """Invalid connection to FilmAffinity.

    Attributes:
        value: Detail about the failure; it is interpolated into the
            message with ``str.format``.
    """

    def __init__(self, value):
        """Store the failure detail.

        Args:
            value: Detail about the connection failure.
        """
        # Forward the argument to Exception so ``args`` is populated.
        # With empty ``args`` the instance cannot be pickled or copied
        # (re-creation would call the class with no arguments).
        super(FilmAffinityConnectionError, self).__init__(value)
        self.value = value

    def __str__(self):
        """Return the message, wrapped by ``repr`` (quotes included)."""
        # repr() is kept on purpose so the rendered text does not change.
        return repr('Error connecting to FilmAffinity: {}'.format(self.value))
@@ -0,0 +1,5 @@
1
+ from .detail import DetailPage
2
+ from .search import SearchPage
3
+ from .top import TopPage
4
+ from .top_service import TopServicePage
5
+ from .images import ImagesPage
@@ -0,0 +1,202 @@
1
+ """Page type: detail."""
2
+ import re
3
+
4
+ from .page import Page
5
+
6
+
7
class DetailPage(Page):
    """Page type: detail.

    Every getter reads from ``self.soup`` and returns an "empty" value
    (``None``, ``''`` or ``[]``, depending on the getter) when the markup
    it looks for is not present, instead of raising.
    """

    @staticmethod
    def _linked_names(cell):
        """Return the link text of every ``span.nb`` found inside *cell*.

        Spans that contain no ``<a>`` are skipped.
        """
        names = []
        for nb in cell.find_all('span', {'class': 'nb'}):
            link = nb.find('a')
            if link is not None:
                names.append(link.get_text())
        return names

    def _credits_for(self, label):
        """Return the names listed in the ``<dd>`` following a ``<dt>``.

        Args:
            label: Text of the ``<dt>`` to look for (compared after
                stripping surrounding whitespace).

        Returns:
            list: Link texts collected from every matching entry.
        """
        names = []
        for dt in self.soup.find_all('dt'):
            if dt.get_text().strip() != label:
                continue
            # Look the <dd> up by tag name rather than by counting
            # siblings, so the whitespace between tags does not matter.
            dd = dt.find_next_sibling('dd')
            if dd is not None:
                names.extend(self._linked_names(dd))
        return names

    def get_id(self):
        """Get the movie id as an int, or None when it cannot be read."""
        id_cell = self.soup.find("div", {"class": "rate-movie-box"})
        if not id_cell:
            return None
        try:
            return int(id_cell.get("data-movie-id"))
        except (TypeError, ValueError):
            # Attribute missing (None) or not a number.
            return None

    def get_title(self):
        """Get the title."""
        name_cell = self.soup.find("span", {"itemprop": 'name'})
        return name_cell.get_text().strip() if name_cell else None

    def get_original_title(self):
        """Get the original title.

        Note: the first ``<span>`` nested in the title cell is removed
        from the parsed tree (``decompose``) before the text is read, so
        this call modifies ``self.soup``.
        """
        info_cell = self.soup.find("dl", {"class": 'movie-info'})
        if not info_cell:
            return None
        original_title_cell = info_cell.find('dd')
        if not original_title_cell:
            return None
        inner_span_tag = original_title_cell.span
        if inner_span_tag:
            inner_span_tag.decompose()
        return original_title_cell.text.strip()

    def get_year(self):
        """Get the year (text of the ``datePublished`` cell) or None."""
        year_cell = self.soup.find("dd", {"itemprop": 'datePublished'})
        return year_cell.get_text() if year_cell else None

    def get_description(self):
        """Get the description or None."""
        description_cell = self.soup.find("dd", {"itemprop": 'description'})
        return description_cell.get_text() if description_cell else None

    def get_rating(self):
        """Get the rating.

        Returns:
            float or None: None when the rating element is absent; 0.0
            when it is present but its ``content`` attribute is missing
            or not numeric.
        """
        rating_cell = self.soup.find("div", {"id": 'movie-rat-avg'})
        if not rating_cell:
            return None
        try:
            raw = str(rating_cell['content'])
            # Keep digits and separators only; accept ',' as decimal mark.
            return float(re.sub(r"[^\d,.]", "", raw).replace(',', '.'))
        except (KeyError, ValueError):
            return float(0)

    def get_number_of_votes(self):
        """Get the number of votes as an int, or None if unavailable."""
        votes_cell = self.soup.find("span", {"itemprop": 'ratingCount'})
        if not votes_cell:
            return None
        try:
            # Thousands separators ('.', ',' or ' ') are dropped.
            return int(re.sub(r"[., ]", "", votes_cell['content']))
        except (KeyError, TypeError, ValueError):
            return None

    def get_actors(self):
        """Get the actors.

        Entries without a name cell are skipped instead of discarding
        the whole list.
        """
        actors = []
        for actor_cell in self.soup.find_all("li", {"itemprop": 'actor'}):
            name_cell = actor_cell.find("div", {"itemprop": 'name'})
            if name_cell is not None:
                actors.append(name_cell.get_text())
        return actors

    def get_poster(self):
        """Get the poster URL, or None when there is no image or no src."""
        image = self.soup.find("img", {"itemprop": 'image'})
        if not image:
            return None
        src = image.get('src')
        return str(src) if src is not None else None

    def get_directors(self):
        """Get the directors (entries without a name cell are skipped)."""
        directors = []
        for director_cell in self.soup.find_all(
                "span", {"itemprop": 'director'}):
            name_cell = director_cell.find("span", {"itemprop": 'name'})
            if name_cell is not None:
                directors.append(name_cell.get_text())
        return directors

    def get_duration(self):
        """Get Duration ('' when the cell is absent)."""
        dc = self.soup.find("dd", {"itemprop": 'duration'})
        return dc.get_text() if dc else ''

    def get_country(self):
        """Get the country from the flag image's ``alt`` ('' if absent)."""
        dc = self.soup.find("span", {"id": 'country-img'})
        if dc and dc.img is not None:
            return dc.img.get('alt', '')
        return ''

    def get_writers(self):
        """Get writers (names listed under the 'Guion' entry)."""
        return self._credits_for('Guion')

    def get_music(self):
        """Get music (names listed under the 'Música' entry)."""
        return self._credits_for('Música')

    def get_cinematography(self):
        """Get cinematography (names listed under 'Fotografía')."""
        return self._credits_for('Fotografía')

    def get_producers(self):
        """Get producers."""
        dd = self.soup.find('dd', {'class': 'card-producer'})
        return self._linked_names(dd) if dd else []

    def get_genre(self):
        """Get the genre.

        Returns:
            list: One entry per genre span: the link text when the span
            holds a link, otherwise the span's own stripped text.
        """
        genres = []
        for i in self.soup.find_all("span", {"itemprop": 'genre'}):
            link = i.find('a')
            genres.append(link.get_text() if link else i.get_text().strip())
        return genres

    def get_awards(self):
        """Get the awards.

        Returns:
            TYPE: List of dicts (dict keys: year, award)
        """
        awards = []
        ac = self.soup.find("dd", {"class": 'award'})
        if not ac:
            return awards
        for i in ac.find_all("a"):
            # The award text is whatever follows the year link.
            award = ''.join(str(s) for s in i.next_siblings).strip()
            award = award[2:] if award.startswith('- ') else award
            awards.append({'year': i.get_text(),
                           'award': award})
        return awards

    def get_reviews(self):
        """Get the critics reviews.

        Returns:
            TYPE: List of dicts (dict keys: author, review, url). A key
            is None when the matching element is missing.
        """
        reviews = []
        for i in self.soup.find_all("div", {"class": 'pro-review'}):
            author = i.find("div", {"itemprop": 'author'})
            body = i.find("div", {"itemprop": 'reviewBody'})
            reviews.append(
                {'author': author.get_text() if author is not None else None,
                 'review': body.get_text() if body is not None else None,
                 'url': i.a.get('href') if i.a else None})
        return reviews
@@ -0,0 +1,64 @@
1
+ """Page type: images."""
2
+ import re
3
+ from .page import Page
4
# First parenthesised chunk of an inline style value, e.g. the address
# inside ``url(...)``.
pattern_thumbnail = re.compile(r'\((.*?)\)', re.I)

# Country as it appears in a tooltip: "País: </strong>NAME</div>"
# (with or without the accent).
pattern_country = re.compile(r'>Pa[ií]s: </strong>(.*?)</div>')

# Fallback: content of the first "<div>...</div>" in the tooltip.
pattern_country_div = re.compile(r'<div>(.*?)</div>')
7
+
8
+
9
class ImagesPage(Page):
    """Page type: images."""

    def _get_list_of_images(self, imgs_soup):
        """Collect the images found inside one gallery section.

        :param imgs_soup: section container, or a falsy value when the
            section is absent
        :returns: List of dicts containing three keys for each dict:
            - image (full resolution image, the link's href)
            - thumbnail (taken from the inner div's style, or None)
            - country (parsed from the link's title, or None)
        """
        if not imgs_soup:
            return []
        found = []
        for box in imgs_soup.find_all("div", {"class": 'colorbox-image'}):
            anchor = box.find('a')
            if not anchor:
                continue
            caption = (anchor.get('title')
                       or anchor.get('data-bs-title')
                       or '')
            # Try the labelled form first, then the plain <div> form.
            country_match = (pattern_country.search(caption)
                             or pattern_country_div.search(caption))
            thumbnail = None
            inner = anchor.find('div')
            if inner and inner.get('style'):
                style_match = pattern_thumbnail.search(inner['style'])
                if style_match:
                    thumbnail = style_match.group(1)
            found.append({
                'image': anchor.get('href'),
                'thumbnail': thumbnail,
                'country': country_match.group(1) if country_match else None,
            })
        return found

    def _images_in(self, section_id):
        """Return the images of the ``div`` whose id is *section_id*."""
        section = self.soup.find("div", {"id": section_id})
        return self._get_list_of_images(section)

    def get_posters(self):
        """Get images of type posters."""
        return self._images_in('type_imgs_2')

    def get_stills(self):
        """Get images of type stills."""
        return self._images_in('type_imgs_9')

    def get_promos(self):
        """Get images of type promos."""
        return self._images_in('type_imgs_8')

    def get_events(self):
        """Get images of type events/red carpet."""
        return self._images_in('type_imgs_11')

    def get_shootings(self):
        """Get images of type shootings/making of."""
        return self._images_in('type_imgs_13')
@@ -0,0 +1,209 @@
1
+ import re
2
+
3
# Splits "Title (1999)"-like strings: group 1 is the leading text,
# group 2 a four digit year starting with 19 or 20.
pattern_title_year = re.compile(
    r'(.+[^0-9|^(|^\[])'
    r'[([|\[| |.|_]*'
    r'(19\d\d|20\d\d)'
    r'[)|\]]?',
    re.I,
)

# Classification labels that can show up among people names and must be
# filtered out of those lists.
classifications = [
    'Documentary', 'Animation', 'Tv Series', 'TV Miniseries',
]
13
+
14
+
15
class Page(object):
    """Base scraper exposing one getter per movie field.

    The getters here look for the ``mc-*`` style markup (``mc-title``,
    ``mc-poster``, ``mc-data``...). Some of them return nothing,
    depending on whether the scraped node contains the corresponding
    data. Fields this class cannot read always return an empty value
    (``None`` or ``[]``) and are meant to be overridden by subclasses
    (Ex: get_awards and get_reviews).

    Attributes:
        soup (TYPE): Page analyzed by BeautifulSoup.
    """

    def __init__(self, soup):
        """Init the class.

        Args:
            soup (TYPE): Page analyzed by BeautifulSoup
        """
        self.soup = soup

    def _movie_card(self):
        """Return the node carrying ``data-movie-id``.

        That is ``self.soup`` itself when it has the attribute, otherwise
        the first ``.movie-card[data-movie-id]`` below it (or None).
        """
        if self.soup.get('data-movie-id'):
            return self.soup
        return self.soup.select_one(".movie-card[data-movie-id]")

    def _text(self, selector):
        """Return the stripped text of the first match of *selector*."""
        cell = self.soup.select_one(selector)
        return cell.get_text().strip() if cell else None

    def _src_from_image(self, image):
        """Return the best image URL available on an ``<img>`` node.

        Preference order: ``data-src``; the last candidate of
        ``data-srcset``/``srcset``; ``src`` (the ``/images/empty.gif``
        placeholder counts as no image).
        """
        if not image:
            return None
        if image.get('data-src'):
            return image['data-src']
        srcset = image.get('data-srcset') or image.get('srcset')
        if srcset:
            # Blank parts (e.g. after a trailing comma) are ignored so
            # that split()[0] cannot fail.
            candidates = [
                part.split()[0] for part in srcset.split(',') if part.strip()
            ]
            return candidates[-1] if candidates else None
        src = image.get('src')
        return None if src == '/images/empty.gif' else src

    def _parse_float(self, value):
        """Parse a number that may use ',' as decimal mark.

        Returns None for None and for text holding no valid number.
        """
        if value is None:
            return None
        try:
            value = re.sub(r"[^\d,.]", "", value).replace(',', '.')
            return float(value)
        except ValueError:
            return None

    @staticmethod
    def _titles_from_spans(spans):
        """Return the link ``title`` of each span, minus classifications.

        Spans without a link or without a title are skipped.
        """
        names = []
        for span in spans:
            link = span.a
            title = link.get('title') if link is not None else None
            # Sometimes the FilmAffinity classification appears inside
            # a people tag, so we filter it.
            if title and title not in classifications:
                names.append(title)
        return names

    def get_id(self):
        """Get the id (the raw ``data-movie-id`` value) or None."""
        cell = self._movie_card()
        if not cell:
            return None
        return cell.get('data-movie-id', None)

    def get_title(self):
        """Get title."""
        title = self.soup.find('div', {'class': 'mc-title'})
        if not title:
            return None
        link = title.find('a')
        return link.get_text().strip() if link else title.get_text().strip()

    def get_original_title(self):
        """Get original title (not available here: always None)."""
        return None

    def get_rating(self):
        """Get rating as a float, or None when absent or not numeric."""
        cell = self.soup.select_one(".avg, .avg-rating, .avgrat-box")
        return self._parse_float(cell.get_text() if cell else None)

    def get_directors(self):
        """Get directors.

        Returns:
            list: Director names; empty when there is no director block
            or it lists nobody.
        """
        director_cell = self.soup.find(
            'div', {'class': ['director', 'mc-director']})
        if not director_cell:
            return []
        return self._titles_from_spans(
            director_cell.find_all('span', {'class': 'nb'}))

    def get_writers(self):
        """Get writers (not available here: always None)."""
        return None

    def get_music(self):
        """Get music (not available here: always None)."""
        return None

    def get_cinematography(self):
        """Get cinematography (not available here: always None)."""
        return None

    def get_producers(self):
        """Get producers (not available here: always None)."""
        return None

    def get_actors(self):
        """Get the actors.

        Returns:
            list or None: Actor names, or None when there is no cast
            block or it contains no name spans.
        """
        actors_cell = self.soup.find(
            'div', {'class': ['cast', 'mc-cast']})
        if not actors_cell:
            return None
        cell = actors_cell.find_all("span", {'class': 'nb'})
        if not cell:
            return None
        return self._titles_from_spans(cell)

    def get_poster(self):
        """Get poster."""
        poster = self.soup.find('div', {'class': 'mc-poster'})
        if poster:
            return self._src_from_image(poster.find('img'))
        return None

    def get_duration(self):
        """Get Duration."""
        cell = self.soup.find('div', {'class': 'duration'})
        return cell.get_text().strip() if cell else None

    def get_year(self):
        """Get the year.

        Tries ``.mc-year``, then ``div.ye-w``, then the first ``div``
        inside ``div.mc-data``, and finally guesses from the title.
        """
        year = self._text(".mc-year")
        if year:
            return year
        cell_r = self.soup.find('div', {'class': 'ye-w'})
        if cell_r:
            return cell_r.text.strip()
        cell = self.soup.find('div', {'class': 'mc-data'})
        if cell:
            first_div = cell.find('div')
            if first_div is not None:
                return first_div.get_text()
        return self._get_year_from_title()

    def _get_year_from_title(self):
        """Get the year from title."""
        # Sometimes we cannot find the year inside specific tag,
        # so...we try to guess from the title
        t = self.get_title()
        re_match = pattern_title_year.match(t if t else '')
        return re_match.group(2) if re_match else None

    def get_country(self):
        """Get the country (``alt`` text of the flag image) or None."""
        flag = self.soup.select_one("img.nflag")
        if flag and flag.get('alt'):
            return flag['alt']
        cell = self.soup.find('div', {'class': ['mc-data', 'mc-title']})
        if not cell or cell.img is None:
            return None
        return cell.img.get('alt')

    def get_genre(self):
        """Get the genre (text of the first ``a.genre``) or None."""
        # One lookup over the whole node is enough: a second search
        # restricted to .mc-data can only find a subset of this one.
        genre = self.soup.select_one("a.genre")
        return genre.get_text().strip() if genre else None

    def get_description(self):
        """Get the description (text of ``a.synop-text``) or None."""
        # Same reasoning as get_genre: no narrower fallback is needed.
        synop = self.soup.select_one("a.synop-text")
        return synop.get_text().strip() if synop else None

    def get_number_of_votes(self):
        """Get the number of votes (stripped text, not parsed) or None."""
        cell = self.soup.select_one(".count, .rat-count, .ratcount-box")
        return cell.get_text().strip() if cell else None

    def get_awards(self):
        """Get the awards."""
        # Implemented into subclass: see DetailPage for more details
        return []

    def get_reviews(self):
        """Get the reviews."""
        # Implemented into subclass: see DetailPage for more details
        return []
@@ -0,0 +1,10 @@
1
+ """Page type: search."""
2
+ from .page import Page
3
+
4
+
5
class SearchPage(Page):
    """Page type: search.

    Defines nothing of its own: every getter, ``get_id`` included, is
    inherited from ``Page``.
    """
@@ -0,0 +1,6 @@
1
+ """Page type: top."""
2
+ from .page import Page
3
+
4
+
5
class TopPage(Page):
    """Page type: top.

    Defines nothing of its own: every getter is inherited from ``Page``.
    """
@@ -0,0 +1,15 @@
1
+ """Page type: top_service."""
2
+ from .page import Page
3
+
4
+
5
class TopServicePage(Page):
    """Page type: top_service."""

    def get_title(self):
        """Get the title.

        Reads the ``h3`` inside ``div.mc-right``; when either is missing
        the lookup is delegated to ``Page.get_title``.
        """
        container = self.soup.find('div', {'class': 'mc-right'})
        heading = container.find('h3') if container else None
        if heading:
            return heading.get_text().strip()
        return super(TopServicePage, self).get_title()
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import requests
5
+ from bs4 import BeautifulSoup
6
+ import os
7
+
8
+
9
def get_random_proxy():
    """Return a ``requests``-style proxies dict, or ``{}`` for no proxy.

    Nothing is fetched unless the ``FILMAFFINITY_USE_PROXIES`` environment
    variable is exactly ``"1"``. When enabled, the table published at
    https://www.sslproxies.org/ is scraped and the first usable row whose
    text contains "yes" is used (despite the function name, no random
    choice is made).

    Returns:
        dict: ``{'http': 'https://<host>:<port>'}`` on success, ``{}``
        when proxies are disabled, the list cannot be fetched, or no
        usable row is found. This is best-effort: it never raises for
        network problems.
    """
    if os.getenv("FILMAFFINITY_USE_PROXIES") != "1":
        return {}
    try:
        # A timeout keeps a stalled server from blocking the caller
        # forever. Certificate verification is left enabled: the answer
        # decides where traffic is routed, so it must not be spoofable.
        response = requests.get("https://www.sslproxies.org/", timeout=10)
    except requests.RequestException:
        return {}
    soup = BeautifulSoup(response.text, "html.parser")
    for row in soup.select("table.table tr"):
        if "yes" not in row.text:
            continue
        cells = row.select("td")
        if len(cells) < 2:
            # Not a data row (no host/port cells): try the next one
            # instead of giving up on the whole list.
            continue
        host = cells[0].text.strip()
        port = cells[1].text.strip()
        if host and port:
            return {'http': 'https://{}:{}'.format(host, port)}
    return {}