PyPI - instapaper-scraper - Versions diffs - 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

instapaper-scraper 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

instapaper_scraper/api.py +38 -26
instapaper_scraper/auth.py +51 -61
instapaper_scraper/cli.py +40 -16
instapaper_scraper/constants.py +17 -0
instapaper_scraper/output.py +108 -35
instapaper_scraper-1.1.0.dist-info/METADATA +352 -0
instapaper_scraper-1.1.0.dist-info/RECORD +13 -0
instapaper_scraper-1.0.0.dist-info/METADATA +0 -280
instapaper_scraper-1.0.0.dist-info/RECORD +0 -12
{instapaper_scraper-1.0.0.dist-info → instapaper_scraper-1.1.0.dist-info}/WHEEL +0 -0
{instapaper_scraper-1.0.0.dist-info → instapaper_scraper-1.1.0.dist-info}/entry_points.txt +0 -0
{instapaper_scraper-1.0.0.dist-info → instapaper_scraper-1.1.0.dist-info}/licenses/LICENSE +0 -0
{instapaper_scraper-1.0.0.dist-info → instapaper_scraper-1.1.0.dist-info}/top_level.txt +0 -0

instapaper_scraper/api.py CHANGED Viewed

@@ -1,12 +1,14 @@
 import os
 import logging
 import time
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple, Optional, Any
 import requests
 from bs4 import BeautifulSoup
+from bs4.element import Tag
 from .exceptions import ScraperStructureChanged
+from .constants import INSTAPAPER_BASE_URL, KEY_ID, KEY_TITLE, KEY_URL
 class InstapaperClient:
@@ -14,8 +16,6 @@ class InstapaperClient:
     A client for interacting with the Instapaper website to fetch articles.
     """
-    BASE_URL = "https://www.instapaper.com"
     # Environment variable names
     ENV_MAX_RETRIES = "MAX_RETRIES"
     ENV_BACKOFF_FACTOR = "BACKOFF_FACTOR"
@@ -39,11 +39,6 @@ class InstapaperClient:
     URL_PATH_USER = "/u/"
     URL_PATH_FOLDER = "/u/folder/"
-    # Dictionary keys for article data
-    KEY_ID = "id"
-    KEY_TITLE = "title"
-    KEY_URL = "url"
     # HTTP status codes
     HTTP_TOO_MANY_REQUESTS = 429
     HTTP_SERVER_ERROR_START = 500
@@ -129,14 +124,28 @@ class InstapaperClient:
                 soup = BeautifulSoup(response.text, self.HTML_PARSER)
                 article_list = soup.find(id=self.ARTICLE_LIST_ID)
-                if not article_list:
+                if not isinstance(article_list, Tag):
                     raise ScraperStructureChanged(self.MSG_ARTICLE_LIST_NOT_FOUND)
                 articles = article_list.find_all(self.ARTICLE_TAG)
-                article_ids = [
-                    article[self.KEY_ID].replace(self.ARTICLE_ID_PREFIX, "")
-                    for article in articles
-                ]
+                article_ids = []
+                for article in articles:
+                    if not isinstance(article, Tag):
+                        continue
+                    article_id_val = article.get(KEY_ID)
+                    # Ensure article_id_val is a string before calling replace
+                    # If it's a list, take the first element. This is a pragmatic
+                    # approach since 'id' attributes should ideally be unique strings.
+                    if isinstance(article_id_val, list):
+                        article_id_val = article_id_val[0] if article_id_val else None
+                    if isinstance(article_id_val, str) and article_id_val.startswith(
+                        self.ARTICLE_ID_PREFIX
+                    ):
+                        article_ids.append(
+                            article_id_val.replace(self.ARTICLE_ID_PREFIX, "")
+                        )
                 data = self._parse_article_data(soup, article_ids, page)
                 has_more = soup.find(class_=self.PAGINATE_OLDER_CLASS) is not None
@@ -204,19 +213,19 @@ class InstapaperClient:
     ) -> str:
         """Constructs the URL for the given page, considering folder mode."""
         if folder_info and folder_info.get("id") and folder_info.get("slug"):
-            return f"{self.BASE_URL}{self.URL_PATH_FOLDER}{folder_info['id']}/{folder_info['slug']}/{page}"
-        return f"{self.BASE_URL}{self.URL_PATH_USER}{page}"
+            return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_FOLDER}{folder_info['id']}/{folder_info['slug']}/{page}"
+        return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_USER}{page}"
     def _parse_article_data(
         self, soup: BeautifulSoup, article_ids: List[str], page: int
-    ) -> List[Dict[str, str]]:
+    ) -> List[Dict[str, Any]]:
         """Parses the raw HTML to extract structured data for each article."""
         data = []
         for article_id in article_ids:
             article_id_full = f"{self.ARTICLE_ID_PREFIX}{article_id}"
             article_element = soup.find(id=article_id_full)
             try:
-                if not article_element:
+                if not isinstance(article_element, Tag):
                     raise AttributeError(
                         self.MSG_ARTICLE_ELEMENT_NOT_FOUND.format(
                             article_id_full=article_id_full
@@ -224,20 +233,23 @@ class InstapaperClient:
                     )
                 title_element = article_element.find(class_=self.ARTICLE_TITLE_CLASS)
-                if not title_element:
+                if not isinstance(title_element, Tag):
                     raise AttributeError(self.MSG_TITLE_ELEMENT_NOT_FOUND)
                 title = title_element.get_text().strip()
-                link_element = article_element.find(class_=self.TITLE_META_CLASS).find(
-                    "a"
-                )
-                if not link_element or "href" not in link_element.attrs:
+                meta_element = article_element.find(class_=self.TITLE_META_CLASS)
+                if not isinstance(meta_element, Tag):
+                    raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
+                link_element = meta_element.find("a")
+                if (
+                    not isinstance(link_element, Tag)
+                    or "href" not in link_element.attrs
+                ):
                     raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
                 link = link_element["href"]
-                data.append(
-                    {self.KEY_ID: article_id, self.KEY_TITLE: title, self.KEY_URL: link}
-                )
+                data.append({KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link})
             except AttributeError as e:
                 logging.warning(
                     self.MSG_PARSE_ARTICLE_WARNING.format(
@@ -289,7 +301,7 @@ class InstapaperClient:
             )
             return False
-    def _wait_for_retry(self, attempt: int, reason: str):
+    def _wait_for_retry(self, attempt: int, reason: str) -> None:
         """Calculates and waits for an exponential backoff period."""
         sleep_time = self.backoff_factor * (2**attempt)
         logging.warning(

instapaper_scraper/auth.py CHANGED Viewed

@@ -3,16 +3,38 @@ import getpass
 import logging
 import stat
 from pathlib import Path
-from typing import Union
+from typing import Union, Optional
 from cryptography.fernet import Fernet
 import requests
+from .constants import INSTAPAPER_BASE_URL
-# --- Constants ---
-class InstapaperConstants:
+# --- Encryption Helper ---
+def get_encryption_key(key_file: Union[str, Path]) -> bytes:
+    """
+    Loads the encryption key from a file or generates a new one.
+    Sets strict file permissions for the key file.
+    """
+    key_path = Path(key_file)
+    key_path.parent.mkdir(parents=True, exist_ok=True)
+    if key_path.exists():
+        with open(key_path, "rb") as f:
+            key = f.read()
+    else:
+        key = Fernet.generate_key()
+        with open(key_path, "wb") as f:
+            f.write(key)
+        # Set file permissions to 0600 (owner read/write only)
+        os.chmod(key_path, stat.S_IRUSR | stat.S_IWUSR)
+        logging.info(f"Generated new encryption key at {key_path}.")
+    return key
+class InstapaperAuthenticator:
     # URLs
-    INSTAPAPER_BASE_URL = "https://www.instapaper.com"
     INSTAPAPER_VERIFY_URL = f"{INSTAPAPER_BASE_URL}/u"
     INSTAPAPER_LOGIN_URL = f"{INSTAPAPER_BASE_URL}/user/login"
@@ -25,10 +47,6 @@ class InstapaperConstants:
     # Request related
     REQUEST_TIMEOUT = 10
-    # App config
-    APP_NAME = "instapaper-scraper"
-    CONFIG_DIR = Path.home() / ".config" / APP_NAME
     # Prompts
     PROMPT_USERNAME = "Enter your Instapaper username: "
     PROMPT_PASSWORD = "Enter your Instapaper password: "
@@ -44,40 +62,17 @@ class InstapaperConstants:
     LOG_NO_KNOWN_COOKIE_TO_SAVE = "Could not find a known session cookie to save."
     LOG_SAVED_SESSION = "Saved encrypted session to {session_file}."
-# --- Encryption Helper ---
-def get_encryption_key(key_file: Union[str, Path]) -> bytes:
-    """
-    Loads the encryption key from a file or generates a new one.
-    Sets strict file permissions for the key file.
-    """
-    key_path = Path(key_file)
-    key_path.parent.mkdir(parents=True, exist_ok=True)
-    if key_path.exists():
-        with open(key_path, "rb") as f:
-            key = f.read()
-    else:
-        key = Fernet.generate_key()
-        with open(key_path, "wb") as f:
-            f.write(key)
-        # Set file permissions to 0600 (owner read/write only)
-        os.chmod(key_path, stat.S_IRUSR | stat.S_IWUSR)
-        logging.info(f"Generated new encryption key at {key_path}.")
-    return key
-class InstapaperAuthenticator:
     def __init__(
         self,
         session: requests.Session,
         session_file: Union[str, Path],
         key_file: Union[str, Path],
-        username: str = None,
-        password: str = None,
+        username: Optional[str] = None,
+        password: Optional[str] = None,
     ):
         self.session = session
         self.session_file = Path(session_file)
+        self.key_file = Path(key_file)
         self.key = get_encryption_key(key_file)
         self.fernet = Fernet(self.key)
         self.username = username
@@ -116,24 +111,22 @@ class InstapaperAuthenticator:
                 if not line:
                     continue
                 parts = line.split(":", 2)
-                if len(parts) == InstapaperConstants.COOKIE_PART_COUNT:
+                if len(parts) == self.COOKIE_PART_COUNT:
                     name, value, domain = parts
                     self.session.cookies.set(name, value, domain=domain)
             if self.session.cookies and self._verify_session():
-                logging.info(InstapaperConstants.LOG_SESSION_LOAD_SUCCESS)
+                logging.info(self.LOG_SESSION_LOAD_SUCCESS)
                 return True
             else:
-                logging.warning(InstapaperConstants.LOG_SESSION_LOAD_FAILED)
+                logging.warning(self.LOG_SESSION_LOAD_FAILED)
                 # Clear cookies if verification fails
                 self.session.cookies.clear()
                 return False
         except Exception as e:
             logging.warning(
-                InstapaperConstants.LOG_SESSION_LOAD_ERROR.format(
-                    session_file=self.session_file, e=e
-                )
+                self.LOG_SESSION_LOAD_ERROR.format(session_file=self.session_file, e=e)
             )
             self.session_file.unlink(missing_ok=True)
             return False
@@ -142,57 +135,56 @@ class InstapaperAuthenticator:
         """Checks if the current session is valid by making a request."""
         try:
             verify_response = self.session.get(
-                InstapaperConstants.INSTAPAPER_VERIFY_URL,
-                timeout=InstapaperConstants.REQUEST_TIMEOUT,
+                self.INSTAPAPER_VERIFY_URL,
+                timeout=self.REQUEST_TIMEOUT,
             )
             verify_response.raise_for_status()
-            return InstapaperConstants.LOGIN_FORM_IDENTIFIER not in verify_response.text
+            return self.LOGIN_FORM_IDENTIFIER not in verify_response.text
         except requests.RequestException as e:
-            logging.error(InstapaperConstants.LOG_SESSION_VERIFY_FAILED.format(e=e))
+            logging.error(self.LOG_SESSION_VERIFY_FAILED.format(e=e))
             return False
     def _login_with_credentials(self) -> bool:
         """Logs in using username/password from arguments or user prompt."""
-        logging.info(InstapaperConstants.LOG_NO_VALID_SESSION)
+        logging.info(self.LOG_NO_VALID_SESSION)
         username = self.username
         password = self.password
         if not username or not password:
-            username = input(InstapaperConstants.PROMPT_USERNAME)
-            password = getpass.getpass(InstapaperConstants.PROMPT_PASSWORD)
+            username = input(self.PROMPT_USERNAME)
+            password = getpass.getpass(self.PROMPT_PASSWORD)
         elif self.username:
             logging.info(
                 f"Using username '{self.username}' from command-line arguments."
             )
         login_response = self.session.post(
-            InstapaperConstants.INSTAPAPER_LOGIN_URL,
+            self.INSTAPAPER_LOGIN_URL,
             data={"username": username, "password": password, "keep_logged_in": "yes"},
-            timeout=InstapaperConstants.REQUEST_TIMEOUT,
+            timeout=self.REQUEST_TIMEOUT,
         )
-        required_cookies = InstapaperConstants.REQUIRED_COOKIES
+        required_cookies = self.REQUIRED_COOKIES
         found_cookies = {c.name for c in self.session.cookies}
-        if (
-            InstapaperConstants.LOGIN_SUCCESS_PATH in login_response.url
-            and required_cookies.issubset(found_cookies)
+        if self.LOGIN_SUCCESS_PATH in login_response.url and required_cookies.issubset(
+            found_cookies
         ):
-            logging.info(InstapaperConstants.LOG_LOGIN_SUCCESS)
+            logging.info(self.LOG_LOGIN_SUCCESS)
             return True
         else:
-            logging.error(InstapaperConstants.LOG_LOGIN_FAILED)
+            logging.error(self.LOG_LOGIN_FAILED)
             return False
-    def _save_session(self):
+    def _save_session(self) -> None:
         """Saves the current session cookies to an encrypted file."""
-        required_cookies = InstapaperConstants.REQUIRED_COOKIES
+        required_cookies = self.REQUIRED_COOKIES
         cookies_to_save = [
             c for c in self.session.cookies if c.name in required_cookies
         ]
         if not cookies_to_save:
-            logging.warning(InstapaperConstants.LOG_NO_KNOWN_COOKIE_TO_SAVE)
+            logging.warning(self.LOG_NO_KNOWN_COOKIE_TO_SAVE)
             return
         cookie_data = ""
@@ -206,6 +198,4 @@ class InstapaperAuthenticator:
             f.write(encrypted_data)
         os.chmod(self.session_file, stat.S_IRUSR | stat.S_IWUSR)
-        logging.info(
-            InstapaperConstants.LOG_SAVED_SESSION.format(session_file=self.session_file)
-        )
+        logging.info(self.LOG_SAVED_SESSION.format(session_file=self.session_file))

instapaper_scraper/cli.py CHANGED Viewed

@@ -3,7 +3,7 @@ import logging
 import argparse
 import requests
 from pathlib import Path
-from typing import Union
+from typing import Union, List, Dict, Any, Optional, cast
 if sys.version_info >= (3, 11):
     import tomllib
@@ -15,6 +15,13 @@ from .auth import InstapaperAuthenticator
 from .api import InstapaperClient
 from .output import save_articles
 from .exceptions import ScraperStructureChanged
+from .constants import CONFIG_DIR
+# --- Constants ---
+CONFIG_FILENAME = "config.toml"
+DEFAULT_SESSION_FILENAME = ".instapaper_session"
+DEFAULT_KEY_FILENAME = ".session_key"
+DEFAULT_OUTPUT_FILENAME = "output/bookmarks.{ext}"
 def _resolve_path(
@@ -32,19 +39,18 @@ def _resolve_path(
     return user_dir_filename
-def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
+def load_config(config_path_str: Union[str, None] = None) -> Optional[Dict[str, Any]]:
     """
     Loads configuration from a TOML file.
     It checks the provided path, then config.toml in the project root,
     and finally ~/.config/instapaper-scraper/config.toml.
     """
-    app_name = "instapaper-scraper"
     default_paths = [
-        Path("config.toml"),
-        Path.home() / ".config" / app_name / "config.toml",
+        Path(CONFIG_FILENAME),
+        CONFIG_DIR / CONFIG_FILENAME,
     ]
-    paths_to_check = []
+    paths_to_check: List[Path] = []
     if config_path_str:
         paths_to_check.insert(0, Path(config_path_str).expanduser())
     paths_to_check.extend(default_paths)
@@ -54,7 +60,7 @@ def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
             try:
                 with open(path, "rb") as f:
                     logging.info(f"Loading configuration from {path}")
-                    return tomllib.load(f)
+                    return cast(Dict[str, Any], tomllib.load(f))
             except tomllib.TOMLDecodeError as e:
                 logging.error(f"Error decoding TOML file at {path}: {e}")
                 return None
@@ -62,7 +68,7 @@ def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
     return None
-def main():
+def main() -> None:
     """
     Main entry point for the Instapaper scraper CLI.
     """
@@ -95,6 +101,11 @@ def main():
     parser.add_argument("--key-file", help="Path to the session key file.")
     parser.add_argument("--username", help="Instapaper username.")
     parser.add_argument("--password", help="Instapaper password.")
+    parser.add_argument(
+        "--add-instapaper-url",
+        action="store_true",
+        help="Add an 'instapaper_url' column to the output with the full Instapaper read URL.",
+    )
     parser.add_argument(
         "--limit",
         type=int,
@@ -133,7 +144,7 @@ def main():
         print("  0: none (non-folder mode)")
         for i, folder in enumerate(folders):
             display_name = folder.get("key") or folder.get("slug") or folder.get("id")
-            print(f"  {i+1}: {display_name}")
+            print(f"  {i + 1}: {display_name}")
         try:
             choice = int(input("Select a folder (enter a number): "))
@@ -153,18 +164,21 @@ def main():
             output_filename = config["output_filename"]
         else:
             ext = "db" if args.format == "sqlite" else args.format
-            output_filename = f"output/bookmarks.{ext}"
+            output_filename = DEFAULT_OUTPUT_FILENAME.format(ext=ext)
     session = requests.Session()
     # Resolve session and key file paths
-    app_name = "instapaper-scraper"
-    user_config_dir = Path.home() / ".config" / app_name
     session_file = _resolve_path(
-        args.session_file, ".instapaper_session", user_config_dir / ".instapaper_session"
+        args.session_file,
+        DEFAULT_SESSION_FILENAME,
+        CONFIG_DIR / DEFAULT_SESSION_FILENAME,
+    )
+    key_file = _resolve_path(
+        args.key_file,
+        DEFAULT_KEY_FILENAME,
+        CONFIG_DIR / DEFAULT_KEY_FILENAME,
     )
-    key_file = _resolve_path(args.key_file, ".session_key", user_config_dir / ".session_key")
     # 1. Authenticate
     authenticator = InstapaperAuthenticator(
@@ -195,7 +209,17 @@ def main():
         sys.exit(1)
     # 3. Save Articles
-    save_articles(all_articles, args.format, output_filename)
+    try:
+        save_articles(
+            all_articles,
+            args.format,
+            output_filename,
+            add_instapaper_url=args.add_instapaper_url,
+        )
+        logging.info("Articles scraped and saved successfully.")
+    except Exception as e:
+        logging.error(f"An unexpected error occurred during saving: {e}")
+        sys.exit(1)
 if __name__ == "__main__":

instapaper_scraper/constants.py ADDED Viewed

@@ -0,0 +1,17 @@
+# Shared constants used across the instapaper-scraper project.
+from pathlib import Path
+# --- General ---
+APP_NAME = "instapaper-scraper"
+# --- URLS ---
+INSTAPAPER_BASE_URL = "https://www.instapaper.com"
+INSTAPAPER_READ_URL = f"{INSTAPAPER_BASE_URL}/read/"
+# --- Paths ---
+CONFIG_DIR = Path.home() / ".config" / APP_NAME
+# --- Article Data Keys ---
+KEY_ID = "id"
+KEY_TITLE = "title"
+KEY_URL = "url"

instapaper-scraper 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

instapaper-scraper 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl