PyPI - python-filmaffinity - Versions diffs - 0.0.21__py3-none-any.whl - Mend

python-filmaffinity 0.0.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

python_filmaffinity/__init__.py +254 -0
python_filmaffinity/__meta__.py +19 -0
python_filmaffinity/client.py +379 -0
python_filmaffinity/config.py +11 -0
python_filmaffinity/exceptions.py +38 -0
python_filmaffinity/pages/__init__.py +5 -0
python_filmaffinity/pages/detail.py +202 -0
python_filmaffinity/pages/images.py +64 -0
python_filmaffinity/pages/page.py +209 -0
python_filmaffinity/pages/search.py +10 -0
python_filmaffinity/pages/top.py +6 -0
python_filmaffinity/pages/top_service.py +15 -0
python_filmaffinity/proxies.py +28 -0
python_filmaffinity-0.0.21.dist-info/METADATA +427 -0
python_filmaffinity-0.0.21.dist-info/RECORD +19 -0
python_filmaffinity-0.0.21.dist-info/WHEEL +5 -0
python_filmaffinity-0.0.21.dist-info/licenses/AUTHORS.rst +15 -0
python_filmaffinity-0.0.21.dist-info/licenses/LICENSE.rst +22 -0
python_filmaffinity-0.0.21.dist-info/top_level.txt +1 -0

python_filmaffinity/exceptions.py ADDED Viewed

@@ -0,0 +1,38 @@
+"""Exceptions."""
+# -*- coding: utf-8 -*-
+class FilmAffinityException(Exception):
+    """Common base class of the FilmAffinity-specific errors."""
+class FilmAffinityInvalidLanguage(FilmAffinityException):
+    """Error describing a language that is not one of the supported ones.
+    Attributes:
+        value: The language value that was rejected.
+        supported: Iterable of strings naming the supported languages.
+    """
+    def __init__(self, value, supported):
+        """Keep the rejected language and the supported ones for the message."""
+        self.supported = supported
+        self.value = value
+    def __str__(self):
+        """Return the ``repr`` (i.e. quoted) form of the error message."""
+        available = ', '.join(self.supported)
+        template = ('Invalid Language selected: {}    '
+                    '[Available languages are: {}]')
+        return repr(template.format(self.value, available))
+class FilmAffinityInvalidBackend(FilmAffinityException):
+    """Error describing an invalid requests-cache backend.
+    Attributes:
+        value: The backend value that was rejected.
+    """
+    def __init__(self, value):
+        """Keep the rejected backend value for the message."""
+        self.value = value
+    def __str__(self):
+        """Return the ``repr`` (i.e. quoted) form of the error message."""
+        message = 'Invalid Backend for requests-cache: {}'.format(self.value)
+        return repr(message)
+class FilmAffinityConnectionError(FilmAffinityException):
+    """Error describing a failed connection to FilmAffinity.
+    Attributes:
+        value: Detail about the failure; it is interpolated into the message.
+    """
+    def __init__(self, value):
+        """Keep the failure detail for the message."""
+        self.value = value
+    def __str__(self):
+        """Return the ``repr`` (i.e. quoted) form of the error message."""
+        message = 'Error connecting to FilmAffinity: {}'.format(self.value)
+        return repr(message)

python_filmaffinity/pages/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .detail import DetailPage
+from .search import SearchPage
+from .top import TopPage
+from .top_service import TopServicePage
+from .images import ImagesPage

python_filmaffinity/pages/detail.py ADDED Viewed

@@ -0,0 +1,202 @@
+"""Page type: detail."""
+import re
+from .page import Page
+class DetailPage(Page):
+    """Page type: detail.
+    Getters read from ``self.soup`` and are written to hand back an empty
+    value (``None``, ``''`` or ``[]``) rather than raise when the element or
+    attribute they look for is missing from the page.
+    """
+    def _names_in(self, cell):
+        """Collect one name per ``span.nb`` found inside *cell*.
+        The text of the span's link is used when there is a link; otherwise
+        the stripped text of the span itself.
+        """
+        names = []
+        for nb in cell.find_all('span', {'class': 'nb'}):
+            link = nb.find('a')
+            names.append(link.get_text() if link else nb.get_text().strip())
+        return names
+    def _credits_for_label(self, label):
+        """Collect the names listed under the ``dt`` whose text is *label*.
+        Args:
+            label (str): Exact ``dt`` text to look for (surrounding
+                whitespace of the ``dt`` text is ignored).
+        Returns:
+            list: Names found in the ``dd`` that follows each matching
+            ``dt``; empty when there is no match.
+        """
+        names = []
+        for dt in self.soup.find_all('dt'):
+            if dt.get_text().strip() != label:
+                continue
+            # Look for the next <dd> sibling explicitly instead of hopping
+            # over two siblings, which only works when a whitespace node
+            # separates <dt> and <dd>.
+            dd = dt.find_next_sibling('dd')
+            if dd:
+                names.extend(self._names_in(dd))
+        return names
+    def get_id(self):
+        """Get the id.
+        Returns:
+            int or None: ``data-movie-id`` of the ``rate-movie-box`` div;
+            None when the div is missing or the attribute is absent or not
+            numeric.
+        """
+        id_cell = self.soup.find("div", {"class": "rate-movie-box"})
+        if not id_cell:
+            return None
+        try:
+            return int(id_cell.get("data-movie-id"))
+        except (TypeError, ValueError):
+            # TypeError: attribute absent (None); ValueError: not a number.
+            return None
+    def get_title(self):
+        """Get the title (stripped text of the ``itemprop=name`` span)."""
+        name_cell = self.soup.find("span", {"itemprop": 'name'})
+        if not name_cell:
+            return None
+        return name_cell.get_text().strip()
+    def get_original_title(self):
+        """Get the original title.
+        Reads the first ``dd`` of the ``movie-info`` list. A ``span`` nested
+        in that ``dd`` is removed from the parsed tree before the text is
+        read, so this call modifies ``self.soup``.
+        """
+        info_cell = self.soup.find("dl", {"class": 'movie-info'})
+        if not info_cell:
+            return None
+        original_title_cell = info_cell.find('dd')
+        if not original_title_cell:
+            return None
+        inner_span_tag = original_title_cell.span
+        if inner_span_tag:
+            inner_span_tag.decompose()
+        return original_title_cell.text.strip()
+    def get_year(self):
+        """Get the year (text of the ``itemprop=datePublished`` dd)."""
+        year_cell = self.soup.find("dd", {"itemprop": 'datePublished'})
+        return year_cell.get_text() if year_cell else None
+    def get_description(self):
+        """Get the description (text of the ``itemprop=description`` dd)."""
+        description_cell = self.soup.find("dd", {"itemprop": 'description'})
+        return description_cell.get_text() if description_cell else None
+    def get_rating(self):
+        """Get the rating.
+        Returns:
+            float or None: Value parsed from the ``content`` attribute of
+            the ``movie-rat-avg`` div (a decimal comma is accepted); ``0.0``
+            when the attribute is missing or cannot be parsed; None when the
+            div is not on the page.
+        """
+        rating_cell = self.soup.find("div", {"id": 'movie-rat-avg'})
+        if not rating_cell:
+            return None
+        # A missing attribute becomes '' and takes the same path as any
+        # other unparsable value.
+        raw = str(rating_cell.get('content', ''))
+        try:
+            return float(re.sub(r"[^\d,.]", "", raw).replace(',', '.'))
+        except ValueError:
+            return float(0)
+    def get_number_of_votes(self):
+        """Get the number of votes.
+        Returns:
+            int or None: ``content`` attribute of the ``ratingCount`` span
+            with dots, commas and spaces removed; None when the span or the
+            attribute is missing or the value is not an integer.
+        """
+        votes_cell = self.soup.find("span", {"itemprop": 'ratingCount'})
+        if not votes_cell:
+            return None
+        raw = votes_cell.get('content')
+        if raw is None:
+            return None
+        try:
+            return int(re.sub(r"[., ]", "", raw))
+        except ValueError:
+            return None
+    def get_actors(self):
+        """Get the actors.
+        Returns:
+            list: Text of the ``itemprop=name`` div of every
+            ``itemprop=actor`` list item; items without that div are
+            skipped.
+        """
+        actors = []
+        for actor_cell in self.soup.find_all("li", {"itemprop": 'actor'}):
+            name_cell = actor_cell.find("div", {"itemprop": 'name'})
+            if name_cell:
+                actors.append(name_cell.get_text())
+        return actors
+    def get_poster(self):
+        """Get the poster.
+        Returns:
+            str or None: ``src`` of the ``itemprop=image`` img, or None when
+            the img or its ``src`` is missing.
+        """
+        image = self.soup.find("img", {"itemprop": 'image'})
+        if not image:
+            return None
+        src = image.get('src')
+        return str(src) if src is not None else None
+    def get_directors(self):
+        """Get the directors.
+        Returns:
+            list: Text of the ``itemprop=name`` span of every
+            ``itemprop=director`` span; entries without a name are skipped.
+        """
+        directors = []
+        for director_cell in self.soup.find_all(
+                "span", {"itemprop": 'director'}):
+            name_cell = director_cell.find("span", {"itemprop": 'name'})
+            if name_cell:
+                directors.append(name_cell.get_text())
+        return directors
+    def get_duration(self):
+        """Get Duration (text of the ``itemprop=duration`` dd, or '')."""
+        dc = self.soup.find("dd", {"itemprop": 'duration'})
+        return dc.get_text() if dc else ''
+    def get_country(self):
+        """Get the country.
+        Returns:
+            str: ``alt`` of the img inside the ``country-img`` span, or ''
+            when the span, the img or the attribute is missing.
+        """
+        dc = self.soup.find("span", {"id": 'country-img'})
+        if dc and dc.img:
+            return dc.img.get('alt', '')
+        return ''
+    def get_writers(self):
+        """Get writers (names listed under the 'Guion' label)."""
+        # NOTE(review): the label is matched literally, so this presumably
+        # only works on the Spanish version of the page - confirm.
+        return self._credits_for_label('Guion')
+    def get_music(self):
+        """Get music (names listed under the 'Música' label)."""
+        return self._credits_for_label('Música')
+    def get_cinematography(self):
+        """Get cinematography (names listed under the 'Fotografía' label)."""
+        return self._credits_for_label('Fotografía')
+    def get_producers(self):
+        """Get producers (names inside the ``card-producer`` dd)."""
+        dd = self.soup.find('dd', {'class': 'card-producer'})
+        return self._names_in(dd) if dd else []
+    def get_genre(self):
+        """Get the genre.
+        Returns:
+            list: For every ``itemprop=genre`` span, the text of its link,
+            or the stripped text of the span when it has no link.
+        """
+        genres = []
+        for i in self.soup.find_all("span", {"itemprop": 'genre'}):
+            link = i.find('a')
+            genres.append(link.get_text() if link else i.get_text().strip())
+        return genres
+    def get_awards(self):
+        """Get the awards.
+        Returns:
+            TYPE: List of dicts (dict keys: year, award)
+        """
+        awards = []
+        ac = self.soup.find("dd", {"class": 'award'})
+        if not ac:
+            return awards
+        for i in ac.find_all("a"):
+            # The link text is the year; whatever follows the link inside
+            # its parent is the award text, minus a leading "- ".
+            award = ''.join(str(s) for s in i.next_siblings).strip()
+            award = award[2:] if award.startswith('- ') else award
+            awards.append({'year': i.get_text(),
+                           'award': award})
+        return awards
+    def get_reviews(self):
+        """Get the critics reviews.
+        Returns:
+            TYPE: List of dicts (dict keys: author, review, url). A value is
+            None when the matching element or attribute is missing.
+        """
+        reviews = []
+        for i in self.soup.find_all("div", {"class": 'pro-review'}):
+            author = i.find("div", {"itemprop": 'author'})
+            body = i.find("div", {"itemprop": 'reviewBody'})
+            link = i.a
+            reviews.append(
+                {'author': author.get_text() if author else None,
+                 'review': body.get_text() if body else None,
+                 'url': link.get('href') if link else None})
+        return reviews

python_filmaffinity/pages/images.py ADDED Viewed

@@ -0,0 +1,64 @@
+"""Page type: images."""
+import re
+from .page import Page
+# Captures the text between the first "(" and the next ")". ImagesPage
+# applies it to the inline style of a link's inner div to get the thumbnail.
+pattern_thumbnail = re.compile(r'\((.*?)\)', re.IGNORECASE)
+# Captures what sits between ">País: </strong>" (or ">Pais: </strong>") and
+# the following "</div>". ImagesPage searches the link title with it.
+pattern_country = re.compile(r'>Pa[ií]s: </strong>(.*?)</div>')
+# Fallback for titles where pattern_country finds nothing: captures the
+# content of the first "<div>...</div>" pair.
+pattern_country_div = re.compile(r'<div>(.*?)</div>')
+class ImagesPage(Page):
+    """Page type: images."""
+    def _get_list_of_images(self, imgs_soup):
+        """Describe every image found in one gallery container.
+        Args:
+            imgs_soup: Element holding ``div.colorbox-image`` entries; a
+                falsy value produces an empty list.
+        Returns:
+            list: One dict per entry that contains a link, with the keys
+            ``image`` (the link ``href``), ``thumbnail`` (text captured from
+            the inner div's ``style``, or None) and ``country`` (text
+            captured from the link title, or None).
+        """
+        images = []
+        if not imgs_soup:
+            return images
+        for entry in imgs_soup.find_all("div", {"class": 'colorbox-image'}):
+            anchor = entry.find('a')
+            if not anchor:
+                continue
+            href = anchor.get('href')
+            caption = anchor.get('title') or anchor.get('data-bs-title') or ''
+            # The plain <div> pattern is only tried when the labelled
+            # country pattern does not match.
+            country_match = (pattern_country.search(caption)
+                             or pattern_country_div.search(caption))
+            thumbnail = None
+            inner = anchor.find('div')
+            if inner and inner.get('style'):
+                thumb_match = pattern_thumbnail.search(inner['style'])
+                if thumb_match:
+                    thumbnail = thumb_match.group(1)
+            images.append({
+                'image': href,
+                'thumbnail': thumbnail,
+                'country': country_match.group(1) if country_match else None,
+            })
+        return images
+    def _images_in_container(self, container_id):
+        """List the images of the div whose ``id`` is *container_id*."""
+        container = self.soup.find("div", {"id": container_id})
+        return self._get_list_of_images(container)
+    def get_posters(self):
+        """Get images of type posters."""
+        return self._images_in_container('type_imgs_2')
+    def get_stills(self):
+        """Get images of type stills."""
+        return self._images_in_container('type_imgs_9')
+    def get_promos(self):
+        """Get images of type promos."""
+        return self._images_in_container('type_imgs_8')
+    def get_events(self):
+        """Get images of type events/red carpet."""
+        return self._images_in_container('type_imgs_11')
+    def get_shootings(self):
+        """Get images of type shootings/making of."""
+        return self._images_in_container('type_imgs_13')

python_filmaffinity/pages/page.py ADDED Viewed

@@ -0,0 +1,209 @@
+import re
+# Matches "<title> <year>"-like strings from the start of the text:
+#   group 1 - the title part; its last character may not be a digit, "|",
+#             "^", "(" or "[" (inside a character class "|" and a
+#             non-leading "^" are literal characters, not operators);
+#   then any run of "(", "[", "|", space, "." or "_";
+#   group 2 - a four-digit year starting with 19 or 20;
+#   then an optional ")", "|" or "]".
+# Page._get_year_from_title returns group 2. The pattern has no letters, so
+# IGNORECASE has no visible effect on what it matches.
+pattern_title_year = re.compile(
+    r'(.+[^0-9|^(|^\[])[([|\[| |.|_]*(19\d\d|20\d\d)[)|\]]?',
+    re.IGNORECASE)
+# Titles that Page.get_directors and Page.get_actors leave out of their
+# results, because the site sometimes lists a classification among the names.
+classifications = [
+    'Documentary',
+    'Animation',
+    'Tv Series',
+    'TV Miniseries',
+]
+class Page(object):
+    """This is a simple python scraping object that represents
+    all possible data that we can scrape from a FilmAffinity page.
+    Most of the scraping for this class is based on tags from the
+    top searches (the most used in this module).
+    Some of the getters return nothing, depending on whether the scraped
+    page contains the corresponding data. Some specific data is implemented
+    in its own subclass (Ex: get_awards and get_reviews are implemented
+    in DetailPage because those fields only exist in the detail page).
+    Attributes:
+        soup (TYPE): Page analyzed by BeautifulSoup.
+    """
+    def __init__(self, soup):
+        """Init the class.
+        Args:
+            soup (TYPE): Page analyzed by BeautifulSoup
+        """
+        self.soup = soup
+    def _movie_card(self):
+        """Return the element that carries ``data-movie-id``, if any.
+        That is ``self.soup`` itself when it has the attribute, otherwise
+        the first ``.movie-card[data-movie-id]`` below it (or None).
+        """
+        if self.soup.get('data-movie-id'):
+            return self.soup
+        return self.soup.select_one(".movie-card[data-movie-id]")
+    def _text(self, selector):
+        """Return the stripped text of the first match of *selector*."""
+        cell = self.soup.select_one(selector)
+        return cell.get_text().strip() if cell else None
+    def _src_from_image(self, image):
+        """Pick the image URL out of an ``img`` element.
+        Preference order: ``data-src``, then the last candidate of
+        ``data-srcset``/``srcset``, then ``src``.
+        Args:
+            image: Element exposing ``get`` and item access for its
+                attributes, or a falsy value.
+        Returns:
+            str or None: The URL, or None when there is no image, no usable
+            attribute, or ``src`` is the ``/images/empty.gif`` placeholder.
+        """
+        if not image:
+            return None
+        if image.get('data-src'):
+            return image['data-src']
+        srcset = image.get('data-srcset') or image.get('srcset')
+        if srcset:
+            # Blank segments (e.g. after a trailing comma) hold no URL and
+            # are skipped instead of being indexed.
+            candidates = [
+                part.split()[0] for part in srcset.split(',') if part.strip()
+            ]
+            if candidates:
+                return candidates[-1]
+        src = image.get('src')
+        return None if src == '/images/empty.gif' else src
+    def _parse_float(self, value):
+        """Parse *value* as a float, accepting a decimal comma.
+        Returns None when *value* is None or holds no parsable number.
+        """
+        if value is None:
+            return None
+        try:
+            value = re.sub(r"[^\d,.]", "", value).replace(',', '.')
+            return float(value)
+        except ValueError:
+            return None
+    def _names_from_spans(self, spans):
+        """Return the link titles of *spans*, leaving classifications out.
+        Spans without a link, or whose link has no ``title``, are skipped.
+        """
+        names = []
+        for span in spans:
+            title = span.a.get('title') if span.a else None
+            if title is not None and title not in classifications:
+                names.append(title)
+        return names
+    def get_id(self):
+        """Get the id (``data-movie-id`` of the movie card, or None)."""
+        cell = self._movie_card()
+        if not cell:
+            return None
+        return cell.get('data-movie-id', None)
+    def get_title(self):
+        """Get title.
+        Returns the stripped text of the link inside the ``mc-title`` div,
+        or of the div itself when it has no link; None without the div.
+        """
+        title = self.soup.find('div', {'class': 'mc-title'})
+        if not title:
+            return None
+        link = title.find('a')
+        return link.get_text().strip() if link else title.get_text().strip()
+    def get_original_title(self):
+        """ Get original title."""
+        return None
+    def get_rating(self):
+        """Get rating (float parsed from the rating cell, or None)."""
+        cell = self.soup.select_one(".avg, .avg-rating, .avgrat-box")
+        return self._parse_float(cell.get_text() if cell else None)
+    def get_directors(self):
+        """Get directors.
+        Returns:
+            list: Link titles of the ``span.nb`` entries of the director
+            div; empty when the div or its entries are missing.
+        """
+        director_cell = self.soup.find(
+            'div', {'class': ['director', 'mc-director']})
+        if not director_cell:
+            return []
+        # Sometimes the FilmAffinity classification
+        # appears inside a directors tag, so we filter it
+        return self._names_from_spans(
+            director_cell.find_all('span', {'class': 'nb'}))
+    def get_writers(self):
+        """Get writers."""
+        return None
+    def get_music(self):
+        """Get music."""
+        return None
+    def get_cinematography(self):
+        """Get cinematography."""
+        return None
+    def get_producers(self):
+        """Get producers."""
+        return None
+    def get_actors(self):
+        """Get the actors.
+        Returns:
+            list or None: Link titles of the ``span.nb`` entries of the
+            cast div; None when the div or its entries are missing.
+        """
+        actors_cell = self.soup.find(
+            'div', {'class': ['cast', 'mc-cast']})
+        if not actors_cell:
+            return None
+        cell = actors_cell.find_all("span", {'class': 'nb'})
+        if not cell:
+            return None
+        # Sometimes the  FilmAffinity classification
+        # appears inside a actors tag, so we filter it
+        return self._names_from_spans(cell)
+    def get_poster(self):
+        """Get poster (URL of the img inside the ``mc-poster`` div)."""
+        poster = self.soup.find('div', {'class': 'mc-poster'})
+        if poster:
+            return self._src_from_image(poster.find('img'))
+        return None
+    def get_duration(self):
+        """Get Duration (stripped text of the ``duration`` div, or None)."""
+        cell = self.soup.find('div', {'class': 'duration'})
+        return cell.get_text().strip() if cell else None
+    def get_year(self):
+        """Get the year.
+        Tries, in order: the ``.mc-year`` element, the ``ye-w`` div, the
+        first div inside ``mc-data``, and finally a guess from the title.
+        """
+        year = self._text(".mc-year")
+        if year:
+            return year
+        cell_r = self.soup.find('div', {'class': 'ye-w'})
+        if cell_r:
+            return cell_r.text.strip()
+        cell = self.soup.find('div', {'class': 'mc-data'})
+        first_div = cell.find('div') if cell else None
+        if first_div:
+            return first_div.get_text()
+        return self._get_year_from_title()
+    def _get_year_from_title(self):
+        """Get the year from title."""
+        # Sometimes we cannot find the year inside specific tag,
+        # so...we try to guess from the title
+        t = self.get_title()
+        re_match = pattern_title_year.match(t if t else '')
+        return re_match.group(2) if re_match else None
+    def get_country(self):
+        """Get the country.
+        Returns the ``alt`` of the ``img.nflag`` flag when it has one;
+        otherwise the ``alt`` of the first img inside the
+        ``mc-data``/``mc-title`` div; None when neither is available.
+        """
+        flag = self.soup.select_one("img.nflag")
+        if flag and flag.get('alt'):
+            return flag['alt']
+        cell = self.soup.find('div', {'class': ['mc-data', 'mc-title']})
+        if not cell or not cell.img:
+            return None
+        return cell.img.get('alt')
+    def get_genre(self):
+        """Get the genre (stripped text of the first ``a.genre``, or None)."""
+        # The link is searched for on the whole page, so a second lookup
+        # limited to the mc-data div could not find anything new.
+        genre = self.soup.select_one("a.genre")
+        return genre.get_text().strip() if genre else None
+    def get_description(self):
+        """Get the description (text of the first ``a.synop-text``)."""
+        # Same reasoning as get_genre: one page-wide lookup is enough.
+        synop = self.soup.select_one("a.synop-text")
+        return synop.get_text().strip() if synop else None
+    def get_number_of_votes(self):
+        """Get the number of votes (stripped text of the count cell)."""
+        cell = self.soup.select_one(".count, .rat-count, .ratcount-box")
+        return cell.get_text().strip() if cell else None
+    def get_awards(self):
+        """Get the awards."""
+        # Implemented into subclass: see DetailPage for more details
+        return []
+    def get_reviews(self):
+        """Get the reviews."""
+        # Implemented into subclass: see DetailPage for more details
+        return []

python_filmaffinity/pages/search.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""Page type: search."""
+from .page import Page
+class SearchPage(Page):
+    """Page type: search.
+    Defines nothing of its own; every getter, ``get_id`` included, is the
+    one inherited from ``Page``.
+    """

python_filmaffinity/pages/top.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Page type: top."""
+from .page import Page
+class TopPage(Page):
+    """Page type: top.
+    Defines nothing of its own; every getter is inherited from ``Page``.
+    """

python_filmaffinity/pages/top_service.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""Page type: top_service."""
+from .page import Page
+class TopServicePage(Page):
+    """Page type: top_service."""
+    def get_title(self):
+        """Get the title.
+        Uses the stripped text of the ``h3`` inside the ``mc-right`` div
+        when both exist; otherwise defers to ``Page.get_title``.
+        """
+        container = self.soup.find('div', {'class': 'mc-right'})
+        heading = container.find('h3') if container else None
+        if heading:
+            return heading.get_text().strip()
+        return super(TopServicePage, self).get_title()

python_filmaffinity/proxies.py ADDED Viewed

@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import requests
+from bs4 import BeautifulSoup
+import os
+def get_random_proxy(timeout=10):
+    """Build a ``requests``-style proxies mapping from sslproxies.org.
+    The lookup is best effort: any failure while downloading or parsing the
+    list results in an empty mapping instead of an exception.
+    Args:
+        timeout: Seconds to wait for sslproxies.org before giving up
+            (passed to ``requests.get``).
+    Returns:
+        dict: ``{'http': 'https://<host>:<port>'}`` for the first listed
+        row that contains "yes" and has host and port cells; ``{}`` when the
+        ``FILMAFFINITY_USE_PROXIES`` environment variable is not "1" or no
+        proxy could be obtained.
+    """
+    proxy = {}
+    if os.getenv("FILMAFFINITY_USE_PROXIES") != "1":
+        return proxy
+    try:
+        # NOTE(review): verify=False turns off TLS certificate checks for
+        # this download, so the proxy list itself is not authenticated.
+        # Left as it was; confirm whether it is still required.
+        response = requests.get(
+            "https://www.sslproxies.org/", verify=False, timeout=timeout)
+        soup = BeautifulSoup(response.text, "html.parser")
+        # NOTE(review): "yes" is looked for in the whole row text, which
+        # presumably targets the HTTPS column but matches any cell - confirm
+        # against the table layout. Despite the function name, the first
+        # matching row is always the one used.
+        for row in soup.select("table.table tr"):
+            if "yes" not in row.text:
+                continue
+            host_cell = row.select_one("td")
+            port_cell = row.select_one("td:nth-of-type(2)")
+            if host_cell and port_cell:
+                proxy = {'http': 'https://{}:{}'.format(
+                    host_cell.text,
+                    port_cell.text,
+                )}
+                break
+    except Exception:
+        # Deliberately broad (network, parsing, ...), but unlike a bare
+        # ``except`` it lets KeyboardInterrupt and SystemExit through.
+        proxy = {}
+    return proxy