instapaper_scraper-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
+ import importlib.metadata
+
+ try:
+     __version__ = importlib.metadata.version("instapaper-scraper")
+ except importlib.metadata.PackageNotFoundError:
+     # package is not installed
+     __version__ = "unknown"
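
The importlib.metadata lookup above resolves __version__ from the installed distribution's metadata, falling back to "unknown" when the code runs from a source checkout that was never pip-installed. A quick interactive check (hypothetical session; the import name instapaper_scraper is an assumption, since the diff does not show the package directory):

>>> import instapaper_scraper
>>> instapaper_scraper.__version__  # '1.0.0' if installed, 'unknown' otherwise
'1.0.0'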
@@ -0,0 +1,303 @@
+ import os
+ import logging
+ import time
+ from typing import List, Dict, Tuple, Optional
+
+ import requests
+ from bs4 import BeautifulSoup
+
+ from .exceptions import ScraperStructureChanged
+
+
+ class InstapaperClient:
+     """
+     A client for interacting with the Instapaper website to fetch articles.
+     """
+
+     BASE_URL = "https://www.instapaper.com"
+
+     # Environment variable names
+     ENV_MAX_RETRIES = "MAX_RETRIES"
+     ENV_BACKOFF_FACTOR = "BACKOFF_FACTOR"
+
+     # Default values
+     DEFAULT_MAX_RETRIES = 3
+     DEFAULT_BACKOFF_FACTOR = 1.0
+     DEFAULT_REQUEST_TIMEOUT = 30
+     DEFAULT_PAGE_START = 1
+
+     # HTML parsing constants
+     HTML_PARSER = "html.parser"
+     ARTICLE_LIST_ID = "article_list"
+     ARTICLE_TAG = "article"
+     ARTICLE_ID_PREFIX = "article_"
+     PAGINATE_OLDER_CLASS = "paginate_older"
+     ARTICLE_TITLE_CLASS = "article_title"
+     TITLE_META_CLASS = "title_meta"
+
+     # URL paths
+     URL_PATH_USER = "/u/"
+     URL_PATH_FOLDER = "/u/folder/"
+
+     # Dictionary keys for article data
+     KEY_ID = "id"
+     KEY_TITLE = "title"
+     KEY_URL = "url"
+
+     # HTTP status codes
+     HTTP_TOO_MANY_REQUESTS = 429
+     HTTP_SERVER_ERROR_START = 500
+     HTTP_SERVER_ERROR_END = 600
+
+     # Logging and error messages
+     MSG_ARTICLE_LIST_NOT_FOUND = "Could not find article list ('#article_list')."
+     MSG_SCRAPING_PAGE = "Scraping page {page}..."
+     MSG_ARTICLE_ELEMENT_NOT_FOUND = "Article element '{article_id_full}' not found."
+     MSG_TITLE_ELEMENT_NOT_FOUND = "Title element not found"
+     MSG_LINK_ELEMENT_NOT_FOUND = "Link element or href not found"
+     MSG_PARSE_ARTICLE_WARNING = (
+         "Could not parse article with id {article_id} on page {page}. Details: {e}"
+     )
+     MSG_RATE_LIMITED_RETRY = (
+         "Rate limited ({status_code}). Retrying after {wait_time} seconds."
+     )
+     MSG_RATE_LIMITED_REASON = "Rate limited ({status_code})"
+     MSG_REQUEST_FAILED_STATUS_REASON = "Request failed with status {status_code}"
+     MSG_REQUEST_FAILED_UNRECOVERABLE = (
+         "Request failed with unrecoverable status code {status_code}."
+     )
+     MSG_NETWORK_ERROR_REASON = "Network error ({error_type})"
+     MSG_SCRAPING_FAILED_STRUCTURE_CHANGE = (
+         "Scraping failed due to HTML structure change: {e}"
+     )
+     MSG_ALL_RETRIES_FAILED = "All {max_retries} retries failed."
+     MSG_SCRAPING_FAILED_UNKNOWN = (
+         "Scraping failed after multiple retries for an unknown reason."
+     )
+     MSG_RETRY_ATTEMPT = "{reason} (attempt {attempt_num}/{max_retries}). Retrying in {sleep_time:.2f} seconds."
+
+     def __init__(self, session: requests.Session):
+         """
+         Initializes the client with a requests Session.
+         Args:
+             session: A requests.Session object, presumably authenticated.
+         """
+         self.session = session
+         try:
+             self.max_retries = int(
+                 os.getenv(self.ENV_MAX_RETRIES, str(self.DEFAULT_MAX_RETRIES))
+             )
+         except ValueError:
+             logging.warning(
+                 f"Invalid value for {self.ENV_MAX_RETRIES}, using default {self.DEFAULT_MAX_RETRIES}"
+             )
+             self.max_retries = self.DEFAULT_MAX_RETRIES
+
+         try:
+             self.backoff_factor = float(
+                 os.getenv(self.ENV_BACKOFF_FACTOR, str(self.DEFAULT_BACKOFF_FACTOR))
+             )
+         except ValueError:
+             logging.warning(
+                 f"Invalid value for {self.ENV_BACKOFF_FACTOR}, using default {self.DEFAULT_BACKOFF_FACTOR}"
+             )
+             self.backoff_factor = self.DEFAULT_BACKOFF_FACTOR
+
+     def get_articles(
+         self,
+         page: int = DEFAULT_PAGE_START,
+         folder_info: Optional[Dict[str, str]] = None,
+     ) -> Tuple[List[Dict[str, str]], bool]:
+         """
+         Fetches a single page of articles and determines if there are more pages.
+         Args:
+             page: The page number to fetch.
+             folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
+         Returns:
+             A tuple containing:
+                 - A list of article data (dictionaries with id, title, url).
+                 - A boolean indicating if there is a next page.
+         """
+         url = self._get_page_url(page, folder_info)
+         last_exception: Optional[Exception] = None
+
+         for attempt in range(self.max_retries):
+             try:
+                 response = self.session.get(url, timeout=self.DEFAULT_REQUEST_TIMEOUT)
+                 response.raise_for_status()
+
+                 soup = BeautifulSoup(response.text, self.HTML_PARSER)
+
+                 article_list = soup.find(id=self.ARTICLE_LIST_ID)
+                 if not article_list:
+                     raise ScraperStructureChanged(self.MSG_ARTICLE_LIST_NOT_FOUND)
+
+                 articles = article_list.find_all(self.ARTICLE_TAG)
+                 article_ids = [
+                     article[self.KEY_ID].replace(self.ARTICLE_ID_PREFIX, "")
+                     for article in articles
+                 ]
+
+                 data = self._parse_article_data(soup, article_ids, page)
+                 has_more = soup.find(class_=self.PAGINATE_OLDER_CLASS) is not None
+
+                 return data, has_more
+
+             except requests.exceptions.HTTPError as e:
+                 last_exception = e
+                 if self._handle_http_error(e, attempt):
+                     continue  # Retry if the handler decided to wait
+                 else:
+                     raise e  # Re-raise if the error is unrecoverable
+
+             except (
+                 requests.exceptions.ConnectionError,
+                 requests.exceptions.Timeout,
+             ) as e:
+                 last_exception = e
+                 self._wait_for_retry(
+                     attempt,
+                     self.MSG_NETWORK_ERROR_REASON.format(error_type=type(e).__name__),
+                 )
+
+             except ScraperStructureChanged as e:
+                 logging.error(self.MSG_SCRAPING_FAILED_STRUCTURE_CHANGE.format(e=e))
+                 raise e
+             except Exception as e:
+                 last_exception = e
+                 self._wait_for_retry(
+                     attempt,
+                     self.MSG_SCRAPING_FAILED_UNKNOWN,
+                 )
+
+         logging.error(self.MSG_ALL_RETRIES_FAILED.format(max_retries=self.max_retries))
+         if last_exception:
+             raise last_exception
+         raise Exception(self.MSG_SCRAPING_FAILED_UNKNOWN)
+
+     def get_all_articles(
+         self, limit: Optional[int] = None, folder_info: Optional[Dict[str, str]] = None
+     ) -> List[Dict[str, str]]:
+         """
+         Iterates through pages and fetches articles up to a specified limit.
+         Args:
+             limit: The maximum number of pages to scrape. If None, scrapes all pages.
+             folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
+         """
+         all_articles = []
+         page = self.DEFAULT_PAGE_START
+         has_more = True
+         while has_more:
+             if limit is not None and page > limit:
+                 logging.info(f"Reached page limit of {limit}.")
+                 break
+
+             logging.info(self.MSG_SCRAPING_PAGE.format(page=page))
+             data, has_more = self.get_articles(page=page, folder_info=folder_info)
+             if data:
+                 all_articles.extend(data)
+             page += 1
+         return all_articles
+
+     def _get_page_url(
+         self, page: int, folder_info: Optional[Dict[str, str]] = None
+     ) -> str:
+         """Constructs the URL for the given page, considering folder mode."""
+         if folder_info and folder_info.get("id") and folder_info.get("slug"):
+             return f"{self.BASE_URL}{self.URL_PATH_FOLDER}{folder_info['id']}/{folder_info['slug']}/{page}"
+         return f"{self.BASE_URL}{self.URL_PATH_USER}{page}"
+
+     def _parse_article_data(
+         self, soup: BeautifulSoup, article_ids: List[str], page: int
+     ) -> List[Dict[str, str]]:
+         """Parses the raw HTML to extract structured data for each article."""
+         data = []
+         for article_id in article_ids:
+             article_id_full = f"{self.ARTICLE_ID_PREFIX}{article_id}"
+             article_element = soup.find(id=article_id_full)
+             try:
+                 if not article_element:
+                     raise AttributeError(
+                         self.MSG_ARTICLE_ELEMENT_NOT_FOUND.format(
+                             article_id_full=article_id_full
+                         )
+                     )
+
+                 title_element = article_element.find(class_=self.ARTICLE_TITLE_CLASS)
+                 if not title_element:
+                     raise AttributeError(self.MSG_TITLE_ELEMENT_NOT_FOUND)
+                 title = title_element.get_text().strip()
+
+                 link_element = article_element.find(class_=self.TITLE_META_CLASS).find(
+                     "a"
+                 )
+                 if not link_element or "href" not in link_element.attrs:
+                     raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
+                 link = link_element["href"]
+
+                 data.append(
+                     {self.KEY_ID: article_id, self.KEY_TITLE: title, self.KEY_URL: link}
+                 )
+             except AttributeError as e:
+                 logging.warning(
+                     self.MSG_PARSE_ARTICLE_WARNING.format(
+                         article_id=article_id, page=page, e=e
+                     )
+                 )
+                 continue
+         return data
+
+     def _handle_http_error(
+         self, e: requests.exceptions.HTTPError, attempt: int
+     ) -> bool:
+         """Handles HTTP errors, returns True if a retry should be attempted."""
+         status_code = e.response.status_code
+         if status_code == self.HTTP_TOO_MANY_REQUESTS:  # Too Many Requests
+             wait_time_str = e.response.headers.get("Retry-After")
+             try:
+                 wait_time = int(wait_time_str) if wait_time_str else 0
+                 if wait_time > 0:
+                     logging.warning(
+                         self.MSG_RATE_LIMITED_RETRY.format(
+                             status_code=status_code, wait_time=wait_time
+                         )
+                     )
+                     time.sleep(wait_time)
+                     return True
+             except (ValueError, TypeError):
+                 pass  # Fall back to exponential backoff
+             self._wait_for_retry(
+                 attempt, self.MSG_RATE_LIMITED_REASON.format(status_code=status_code)
+             )
+             return True
+         elif (
+             self.HTTP_SERVER_ERROR_START <= status_code < self.HTTP_SERVER_ERROR_END
+         ):  # Server-side errors
+             self._wait_for_retry(
+                 attempt,
+                 self.MSG_REQUEST_FAILED_STATUS_REASON.format(status_code=status_code),
+             )
+             return True
+         elif status_code == 404:
+             logging.error(
+                 f"Error 404: Not Found. This might indicate an invalid folder ID or slug. URL: {e.response.url}"
+             )
+             return False  # Do not retry, unrecoverable
+         else:  # Other client-side errors (4xx) are not worth retrying
+             logging.error(
+                 self.MSG_REQUEST_FAILED_UNRECOVERABLE.format(status_code=status_code)
+             )
+             return False
+
+     def _wait_for_retry(self, attempt: int, reason: str):
+         """Calculates and waits for an exponential backoff period."""
+         sleep_time = self.backoff_factor * (2**attempt)
+         logging.warning(
+             self.MSG_RETRY_ATTEMPT.format(
+                 reason=reason,
+                 attempt_num=attempt + 1,
+                 max_retries=self.max_retries,
+                 sleep_time=sleep_time,
+             )
+         )
+         time.sleep(sleep_time)
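
The retry machinery above combines three strategies: honoring a Retry-After header on 429 responses, exponential backoff (sleep = backoff_factor * 2**attempt, i.e. 1s, 2s, 4s at the defaults) for server errors and network failures, and an immediate raise for unrecoverable 4xx statuses. A minimal usage sketch, assuming an already-authenticated requests.Session; the module path and the folder values are assumptions, not confirmed by this diff:

import requests
from instapaper_scraper.client import InstapaperClient  # module path is an assumption

session = requests.Session()  # must already carry Instapaper login cookies
client = InstapaperClient(session)

# Scrape at most 5 pages of the default (unread) list.
articles = client.get_all_articles(limit=5)

# Or target a folder; the id and slug here are purely illustrative.
articles = client.get_all_articles(folder_info={"id": "1234567", "slug": "reading"})

for article in articles:
    print(article["id"], article["title"], article["url"])

Note that MAX_RETRIES and BACKOFF_FACTOR are read from the environment in __init__, so they must be set before the client is constructed.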
@@ -0,0 +1,211 @@
+ import os
+ import getpass
+ import logging
+ import stat
+ from pathlib import Path
+ from typing import Optional, Union
+
+ from cryptography.fernet import Fernet
+ import requests
+
+
+ # --- Constants ---
+ class InstapaperConstants:
+     # URLs
+     INSTAPAPER_BASE_URL = "https://www.instapaper.com"
+     INSTAPAPER_VERIFY_URL = f"{INSTAPAPER_BASE_URL}/u"
+     INSTAPAPER_LOGIN_URL = f"{INSTAPAPER_BASE_URL}/user/login"
+
+     # Session/Cookie related
+     COOKIE_PART_COUNT = 3
+     REQUIRED_COOKIES = {"pfu", "pfp", "pfh"}
+     LOGIN_FORM_IDENTIFIER = "login_form"
+     LOGIN_SUCCESS_PATH = "/u"
+
+     # Request related
+     REQUEST_TIMEOUT = 10
+
+     # App config
+     APP_NAME = "instapaper-scraper"
+     CONFIG_DIR = Path.home() / ".config" / APP_NAME
+
+     # Prompts
+     PROMPT_USERNAME = "Enter your Instapaper username: "
+     PROMPT_PASSWORD = "Enter your Instapaper password: "
+
+     # Log messages
+     LOG_NO_VALID_SESSION = "No valid session found. Please log in."
+     LOG_LOGIN_SUCCESS = "Login successful."
+     LOG_LOGIN_FAILED = "Login failed. Please check your credentials."
+     LOG_SESSION_LOAD_SUCCESS = "Successfully logged in using the loaded session data."
+     LOG_SESSION_LOAD_FAILED = "Session loaded but verification failed."
+     LOG_SESSION_LOAD_ERROR = "Could not load session from {session_file}: {e}. A new session will be created."
+     LOG_SESSION_VERIFY_FAILED = "Session verification request failed: {e}"
+     LOG_NO_KNOWN_COOKIE_TO_SAVE = "Could not find a known session cookie to save."
+     LOG_SAVED_SESSION = "Saved encrypted session to {session_file}."
+
+
+ # --- Encryption Helper ---
+ def get_encryption_key(key_file: Union[str, Path]) -> bytes:
+     """
+     Loads the encryption key from a file or generates a new one.
+     Sets strict file permissions for the key file.
+     """
+     key_path = Path(key_file)
+     key_path.parent.mkdir(parents=True, exist_ok=True)
+
+     if key_path.exists():
+         with open(key_path, "rb") as f:
+             key = f.read()
+     else:
+         key = Fernet.generate_key()
+         with open(key_path, "wb") as f:
+             f.write(key)
+         # Set file permissions to 0600 (owner read/write only)
+         os.chmod(key_path, stat.S_IRUSR | stat.S_IWUSR)
+         logging.info(f"Generated new encryption key at {key_path}.")
+     return key
+
+
+ class InstapaperAuthenticator:
+     def __init__(
+         self,
+         session: requests.Session,
+         session_file: Union[str, Path],
+         key_file: Union[str, Path],
+         username: Optional[str] = None,
+         password: Optional[str] = None,
+     ):
+         self.session = session
+         self.session_file = Path(session_file)
+         self.key = get_encryption_key(key_file)
+         self.fernet = Fernet(self.key)
+         self.username = username
+         self.password = password
+
+     def login(self) -> bool:
+         """
+         Handles the complete login process:
+         1. Tries to load an existing session.
+         2. If that fails, prompts for credentials and logs in.
+         3. Saves the new session.
+         """
+         if self._load_session():
+             return True
+
+         if self._login_with_credentials():
+             self._save_session()
+             return True
+
+         return False
+
+     def _load_session(self) -> bool:
+         """Tries to load and verify a session from the session file."""
+         if not self.session_file.exists():
+             return False
+
+         logging.info(f"Loading encrypted session from {self.session_file}...")
+         try:
+             with open(self.session_file, "rb") as f:
+                 encrypted_data = f.read()
+
+             decrypted_data = self.fernet.decrypt(encrypted_data).decode("utf-8")
+
+             for line in decrypted_data.splitlines():
+                 line = line.strip()
+                 if not line:
+                     continue
+                 parts = line.split(":", 2)
+                 if len(parts) == InstapaperConstants.COOKIE_PART_COUNT:
+                     name, value, domain = parts
+                     self.session.cookies.set(name, value, domain=domain)
+
+             if self.session.cookies and self._verify_session():
+                 logging.info(InstapaperConstants.LOG_SESSION_LOAD_SUCCESS)
+                 return True
+             else:
+                 logging.warning(InstapaperConstants.LOG_SESSION_LOAD_FAILED)
+                 # Clear cookies if verification fails
+                 self.session.cookies.clear()
+                 return False
+
+         except Exception as e:
+             logging.warning(
+                 InstapaperConstants.LOG_SESSION_LOAD_ERROR.format(
+                     session_file=self.session_file, e=e
+                 )
+             )
+             self.session_file.unlink(missing_ok=True)
+             return False
+
+     def _verify_session(self) -> bool:
+         """Checks if the current session is valid by making a request."""
+         try:
+             verify_response = self.session.get(
+                 InstapaperConstants.INSTAPAPER_VERIFY_URL,
+                 timeout=InstapaperConstants.REQUEST_TIMEOUT,
+             )
+             verify_response.raise_for_status()
+             return InstapaperConstants.LOGIN_FORM_IDENTIFIER not in verify_response.text
+         except requests.RequestException as e:
+             logging.error(InstapaperConstants.LOG_SESSION_VERIFY_FAILED.format(e=e))
+             return False
+
+     def _login_with_credentials(self) -> bool:
+         """Logs in using username/password from arguments or user prompt."""
+         logging.info(InstapaperConstants.LOG_NO_VALID_SESSION)
+         username = self.username
+         password = self.password
+
+         if not username or not password:
+             username = input(InstapaperConstants.PROMPT_USERNAME)
+             password = getpass.getpass(InstapaperConstants.PROMPT_PASSWORD)
+         elif self.username:
+             logging.info(
+                 f"Using username '{self.username}' from command-line arguments."
+             )
+
+         login_response = self.session.post(
+             InstapaperConstants.INSTAPAPER_LOGIN_URL,
+             data={"username": username, "password": password, "keep_logged_in": "yes"},
+             timeout=InstapaperConstants.REQUEST_TIMEOUT,
+         )
+
+         required_cookies = InstapaperConstants.REQUIRED_COOKIES
+         found_cookies = {c.name for c in self.session.cookies}
+
+         if (
+             InstapaperConstants.LOGIN_SUCCESS_PATH in login_response.url
+             and required_cookies.issubset(found_cookies)
+         ):
+             logging.info(InstapaperConstants.LOG_LOGIN_SUCCESS)
+             return True
+         else:
+             logging.error(InstapaperConstants.LOG_LOGIN_FAILED)
+             return False
+
+     def _save_session(self):
+         """Saves the current session cookies to an encrypted file."""
+         required_cookies = InstapaperConstants.REQUIRED_COOKIES
+         cookies_to_save = [
+             c for c in self.session.cookies if c.name in required_cookies
+         ]
+
+         if not cookies_to_save:
+             logging.warning(InstapaperConstants.LOG_NO_KNOWN_COOKIE_TO_SAVE)
+             return
+
+         cookie_data = ""
+         for cookie in cookies_to_save:
+             cookie_data += f"{cookie.name}:{cookie.value}:{cookie.domain}\n"
+
+         encrypted_data = self.fernet.encrypt(cookie_data.encode("utf-8"))
+
+         self.session_file.parent.mkdir(parents=True, exist_ok=True)
+         with open(self.session_file, "wb") as f:
+             f.write(encrypted_data)
+
+         os.chmod(self.session_file, stat.S_IRUSR | stat.S_IWUSR)
+         logging.info(
+             InstapaperConstants.LOG_SAVED_SESSION.format(session_file=self.session_file)
+         )
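
End to end, the authenticator produces the logged-in session that InstapaperClient consumes. A minimal wiring sketch, assuming module paths instapaper_scraper.auth and instapaper_scraper.client and illustrative file names under CONFIG_DIR (none of these names are confirmed by the diff):

import requests
from instapaper_scraper.auth import InstapaperAuthenticator, InstapaperConstants
from instapaper_scraper.client import InstapaperClient

session = requests.Session()
auth = InstapaperAuthenticator(
    session,
    session_file=InstapaperConstants.CONFIG_DIR / "session.enc",  # illustrative name
    key_file=InstapaperConstants.CONFIG_DIR / "key.bin",  # illustrative name
)

if auth.login():  # reuses a saved encrypted session, else prompts for credentials
    client = InstapaperClient(session)
    articles = client.get_all_articles(limit=1)
    print(f"Fetched {len(articles)} articles from the first page.")

Because _save_session persists only the pfu/pfp/pfh cookies and chmods both the key and session files to 0600, the saved state is per-user and survives across runs.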