PyPI - instapaper-scraper - Versions diffs - 1.0.0.post1__py3-none-any.whl → 1.1.0rc1__py3-none-any.whl - Mend

instapaper-scraper 1.0.0.post1__py3-none-any.whl → 1.1.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

instapaper_scraper/api.py CHANGED Viewed

@@ -7,6 +7,7 @@ import requests
 from bs4 import BeautifulSoup
 from .exceptions import ScraperStructureChanged
+from .constants import INSTAPAPER_BASE_URL, KEY_ID, KEY_TITLE, KEY_URL
 class InstapaperClient:
@@ -14,8 +15,6 @@ class InstapaperClient:
     A client for interacting with the Instapaper website to fetch articles.
     """
-    BASE_URL = "https://www.instapaper.com"
     # Environment variable names
     ENV_MAX_RETRIES = "MAX_RETRIES"
     ENV_BACKOFF_FACTOR = "BACKOFF_FACTOR"
@@ -39,11 +38,6 @@ class InstapaperClient:
     URL_PATH_USER = "/u/"
     URL_PATH_FOLDER = "/u/folder/"
-    # Dictionary keys for article data
-    KEY_ID = "id"
-    KEY_TITLE = "title"
-    KEY_URL = "url"
     # HTTP status codes
     HTTP_TOO_MANY_REQUESTS = 429
     HTTP_SERVER_ERROR_START = 500
@@ -134,7 +128,7 @@ class InstapaperClient:
                 articles = article_list.find_all(self.ARTICLE_TAG)
                 article_ids = [
-                    article[self.KEY_ID].replace(self.ARTICLE_ID_PREFIX, "")
+                    article[KEY_ID].replace(self.ARTICLE_ID_PREFIX, "")
                     for article in articles
                 ]
@@ -204,8 +198,8 @@ class InstapaperClient:
     ) -> str:
         """Constructs the URL for the given page, considering folder mode."""
         if folder_info and folder_info.get("id") and folder_info.get("slug"):
-            return f"{self.BASE_URL}{self.URL_PATH_FOLDER}{folder_info['id']}/{folder_info['slug']}/{page}"
-        return f"{self.BASE_URL}{self.URL_PATH_USER}{page}"
+            return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_FOLDER}{folder_info['id']}/{folder_info['slug']}/{page}"
+        return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_USER}{page}"
     def _parse_article_data(
         self, soup: BeautifulSoup, article_ids: List[str], page: int
@@ -235,9 +229,7 @@ class InstapaperClient:
                     raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
                 link = link_element["href"]
-                data.append(
-                    {self.KEY_ID: article_id, self.KEY_TITLE: title, self.KEY_URL: link}
-                )
+                data.append({KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link})
             except AttributeError as e:
                 logging.warning(
                     self.MSG_PARSE_ARTICLE_WARNING.format(

instapaper_scraper/auth.py CHANGED Viewed

@@ -8,11 +8,33 @@ from typing import Union
 from cryptography.fernet import Fernet
 import requests
+from .constants import INSTAPAPER_BASE_URL
-# --- Constants ---
-class InstapaperConstants:
+# --- Encryption Helper ---
+def get_encryption_key(key_file: Union[str, Path]) -> bytes:
+    """
+    Loads the encryption key from a file or generates a new one.
+    Sets strict file permissions for the key file.
+    """
+    key_path = Path(key_file)
+    key_path.parent.mkdir(parents=True, exist_ok=True)
+    if key_path.exists():
+        with open(key_path, "rb") as f:
+            key = f.read()
+    else:
+        key = Fernet.generate_key()
+        with open(key_path, "wb") as f:
+            f.write(key)
+        # Set file permissions to 0600 (owner read/write only)
+        os.chmod(key_path, stat.S_IRUSR | stat.S_IWUSR)
+        logging.info(f"Generated new encryption key at {key_path}.")
+    return key
+class InstapaperAuthenticator:
     # URLs
-    INSTAPAPER_BASE_URL = "https://www.instapaper.com"
     INSTAPAPER_VERIFY_URL = f"{INSTAPAPER_BASE_URL}/u"
     INSTAPAPER_LOGIN_URL = f"{INSTAPAPER_BASE_URL}/user/login"
@@ -25,10 +47,6 @@ class InstapaperConstants:
     # Request related
     REQUEST_TIMEOUT = 10
-    # App config
-    APP_NAME = "instapaper-scraper"
-    CONFIG_DIR = Path.home() / ".config" / APP_NAME
     # Prompts
     PROMPT_USERNAME = "Enter your Instapaper username: "
     PROMPT_PASSWORD = "Enter your Instapaper password: "
@@ -44,30 +62,6 @@ class InstapaperConstants:
     LOG_NO_KNOWN_COOKIE_TO_SAVE = "Could not find a known session cookie to save."
     LOG_SAVED_SESSION = "Saved encrypted session to {session_file}."
-# --- Encryption Helper ---
-def get_encryption_key(key_file: Union[str, Path]) -> bytes:
-    """
-    Loads the encryption key from a file or generates a new one.
-    Sets strict file permissions for the key file.
-    """
-    key_path = Path(key_file)
-    key_path.parent.mkdir(parents=True, exist_ok=True)
-    if key_path.exists():
-        with open(key_path, "rb") as f:
-            key = f.read()
-    else:
-        key = Fernet.generate_key()
-        with open(key_path, "wb") as f:
-            f.write(key)
-        # Set file permissions to 0600 (owner read/write only)
-        os.chmod(key_path, stat.S_IRUSR | stat.S_IWUSR)
-        logging.info(f"Generated new encryption key at {key_path}.")
-    return key
-class InstapaperAuthenticator:
     def __init__(
         self,
         session: requests.Session,
@@ -116,24 +110,22 @@ class InstapaperAuthenticator:
                 if not line:
                     continue
                 parts = line.split(":", 2)
-                if len(parts) == InstapaperConstants.COOKIE_PART_COUNT:
+                if len(parts) == self.COOKIE_PART_COUNT:
                     name, value, domain = parts
                     self.session.cookies.set(name, value, domain=domain)
             if self.session.cookies and self._verify_session():
-                logging.info(InstapaperConstants.LOG_SESSION_LOAD_SUCCESS)
+                logging.info(self.LOG_SESSION_LOAD_SUCCESS)
                 return True
             else:
-                logging.warning(InstapaperConstants.LOG_SESSION_LOAD_FAILED)
+                logging.warning(self.LOG_SESSION_LOAD_FAILED)
                 # Clear cookies if verification fails
                 self.session.cookies.clear()
                 return False
         except Exception as e:
             logging.warning(
-                InstapaperConstants.LOG_SESSION_LOAD_ERROR.format(
-                    session_file=self.session_file, e=e
-                )
+                self.LOG_SESSION_LOAD_ERROR.format(session_file=self.session_file, e=e)
             )
             self.session_file.unlink(missing_ok=True)
             return False
@@ -142,57 +134,56 @@ class InstapaperAuthenticator:
         """Checks if the current session is valid by making a request."""
         try:
             verify_response = self.session.get(
-                InstapaperConstants.INSTAPAPER_VERIFY_URL,
-                timeout=InstapaperConstants.REQUEST_TIMEOUT,
+                self.INSTAPAPER_VERIFY_URL,
+                timeout=self.REQUEST_TIMEOUT,
             )
             verify_response.raise_for_status()
-            return InstapaperConstants.LOGIN_FORM_IDENTIFIER not in verify_response.text
+            return self.LOGIN_FORM_IDENTIFIER not in verify_response.text
         except requests.RequestException as e:
-            logging.error(InstapaperConstants.LOG_SESSION_VERIFY_FAILED.format(e=e))
+            logging.error(self.LOG_SESSION_VERIFY_FAILED.format(e=e))
             return False
     def _login_with_credentials(self) -> bool:
         """Logs in using username/password from arguments or user prompt."""
-        logging.info(InstapaperConstants.LOG_NO_VALID_SESSION)
+        logging.info(self.LOG_NO_VALID_SESSION)
         username = self.username
         password = self.password
         if not username or not password:
-            username = input(InstapaperConstants.PROMPT_USERNAME)
-            password = getpass.getpass(InstapaperConstants.PROMPT_PASSWORD)
+            username = input(self.PROMPT_USERNAME)
+            password = getpass.getpass(self.PROMPT_PASSWORD)
         elif self.username:
             logging.info(
                 f"Using username '{self.username}' from command-line arguments."
             )
         login_response = self.session.post(
-            InstapaperConstants.INSTAPAPER_LOGIN_URL,
+            self.INSTAPAPER_LOGIN_URL,
             data={"username": username, "password": password, "keep_logged_in": "yes"},
-            timeout=InstapaperConstants.REQUEST_TIMEOUT,
+            timeout=self.REQUEST_TIMEOUT,
         )
-        required_cookies = InstapaperConstants.REQUIRED_COOKIES
+        required_cookies = self.REQUIRED_COOKIES
         found_cookies = {c.name for c in self.session.cookies}
-        if (
-            InstapaperConstants.LOGIN_SUCCESS_PATH in login_response.url
-            and required_cookies.issubset(found_cookies)
+        if self.LOGIN_SUCCESS_PATH in login_response.url and required_cookies.issubset(
+            found_cookies
         ):
-            logging.info(InstapaperConstants.LOG_LOGIN_SUCCESS)
+            logging.info(self.LOG_LOGIN_SUCCESS)
             return True
         else:
-            logging.error(InstapaperConstants.LOG_LOGIN_FAILED)
+            logging.error(self.LOG_LOGIN_FAILED)
             return False
     def _save_session(self):
         """Saves the current session cookies to an encrypted file."""
-        required_cookies = InstapaperConstants.REQUIRED_COOKIES
+        required_cookies = self.REQUIRED_COOKIES
         cookies_to_save = [
             c for c in self.session.cookies if c.name in required_cookies
         ]
         if not cookies_to_save:
-            logging.warning(InstapaperConstants.LOG_NO_KNOWN_COOKIE_TO_SAVE)
+            logging.warning(self.LOG_NO_KNOWN_COOKIE_TO_SAVE)
             return
         cookie_data = ""
@@ -206,6 +197,4 @@ class InstapaperAuthenticator:
             f.write(encrypted_data)
         os.chmod(self.session_file, stat.S_IRUSR | stat.S_IWUSR)
-        logging.info(
-            InstapaperConstants.LOG_SAVED_SESSION.format(session_file=self.session_file)
-        )
+        logging.info(self.LOG_SAVED_SESSION.format(session_file=self.session_file))

instapaper_scraper/cli.py CHANGED Viewed

@@ -15,6 +15,13 @@ from .auth import InstapaperAuthenticator
 from .api import InstapaperClient
 from .output import save_articles
 from .exceptions import ScraperStructureChanged
+from .constants import CONFIG_DIR
+# --- Constants ---
+CONFIG_FILENAME = "config.toml"
+DEFAULT_SESSION_FILENAME = ".instapaper_session"
+DEFAULT_KEY_FILENAME = ".session_key"
+DEFAULT_OUTPUT_FILENAME = "output/bookmarks.{ext}"
 def _resolve_path(
@@ -38,10 +45,9 @@ def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
     It checks the provided path, then config.toml in the project root,
     and finally ~/.config/instapaper-scraper/config.toml.
     """
-    app_name = "instapaper-scraper"
     default_paths = [
-        Path("config.toml"),
-        Path.home() / ".config" / app_name / "config.toml",
+        Path(CONFIG_FILENAME),
+        CONFIG_DIR / CONFIG_FILENAME,
     ]
     paths_to_check = []
@@ -95,6 +101,11 @@ def main():
     parser.add_argument("--key-file", help="Path to the session key file.")
     parser.add_argument("--username", help="Instapaper username.")
     parser.add_argument("--password", help="Instapaper password.")
+    parser.add_argument(
+        "--add-instapaper-url",
+        action="store_true",
+        help="Add an 'instapaper_url' column to the output with the full Instapaper read URL.",
+    )
     parser.add_argument(
         "--limit",
         type=int,
@@ -153,18 +164,21 @@ def main():
             output_filename = config["output_filename"]
         else:
             ext = "db" if args.format == "sqlite" else args.format
-            output_filename = f"output/bookmarks.{ext}"
+            output_filename = DEFAULT_OUTPUT_FILENAME.format(ext=ext)
     session = requests.Session()
     # Resolve session and key file paths
-    app_name = "instapaper-scraper"
-    user_config_dir = Path.home() / ".config" / app_name
     session_file = _resolve_path(
-        args.session_file, ".instapaper_session", user_config_dir / ".instapaper_session"
+        args.session_file,
+        DEFAULT_SESSION_FILENAME,
+        CONFIG_DIR / DEFAULT_SESSION_FILENAME,
+    )
+    key_file = _resolve_path(
+        args.key_file,
+        DEFAULT_KEY_FILENAME,
+        CONFIG_DIR / DEFAULT_KEY_FILENAME,
     )
-    key_file = _resolve_path(args.key_file, ".session_key", user_config_dir / ".session_key")
     # 1. Authenticate
     authenticator = InstapaperAuthenticator(
@@ -195,7 +209,17 @@ def main():
         sys.exit(1)
     # 3. Save Articles
-    save_articles(all_articles, args.format, output_filename)
+    try:
+        save_articles(
+            all_articles,
+            args.format,
+            output_filename,
+            add_instapaper_url=args.add_instapaper_url,
+        )
+        logging.info("Articles scraped and saved successfully.")
+    except Exception as e:
+        logging.error(f"An unexpected error occurred during saving: {e}")
+        sys.exit(1)
 if __name__ == "__main__":

instapaper_scraper/constants.py ADDED Viewed

@@ -0,0 +1,17 @@
+# Shared constants used across the instapaper-scraper project.
+from pathlib import Path
+# --- General ---
+APP_NAME = "instapaper-scraper"
+# --- URLS ---
+INSTAPAPER_BASE_URL = "https://www.instapaper.com"
+INSTAPAPER_READ_URL = f"{INSTAPAPER_BASE_URL}/read/"
+# --- Paths ---
+CONFIG_DIR = Path.home() / ".config" / APP_NAME
+# --- Article Data Keys ---
+KEY_ID = "id"
+KEY_TITLE = "title"
+KEY_URL = "url"

instapaper_scraper/output.py CHANGED Viewed

@@ -2,29 +2,17 @@ import os
 import json
 import sqlite3
 import logging
+import csv
 from typing import List, Dict, Any
+from .constants import INSTAPAPER_READ_URL, KEY_ID, KEY_TITLE, KEY_URL
 # Constants for file operations
 JSON_INDENT = 4
-# Constants for CSV output
-CSV_HEADER = "id,title,url\n"
-CSV_DELIMITER = ","
-CSV_ROW_FORMAT = "{id},{title},{url}\n"
 # Constants for SQLite output
 SQLITE_TABLE_NAME = "articles"
-SQLITE_ID_COL = "id"
-SQLITE_TITLE_COL = "title"
-SQLITE_URL_COL = "url"
-SQLITE_CREATE_TABLE_SQL = f"""
-        CREATE TABLE IF NOT EXISTS {SQLITE_TABLE_NAME} (
-            {SQLITE_ID_COL} TEXT PRIMARY KEY,
-            {SQLITE_TITLE_COL} TEXT NOT NULL,
-            {SQLITE_URL_COL} TEXT NOT NULL
-        )
-    """
-SQLITE_INSERT_SQL = f"INSERT OR REPLACE INTO {SQLITE_TABLE_NAME} ({SQLITE_ID_COL}, {SQLITE_TITLE_COL}, {SQLITE_URL_COL}) VALUES (:{SQLITE_ID_COL}, :{SQLITE_TITLE_COL}, :{SQLITE_URL_COL})"
+SQLITE_INSTAPAPER_URL_COL = "instapaper_url"
 # Constants for logging messages
 LOG_NO_ARTICLES = "No articles found to save."
@@ -32,21 +20,52 @@ LOG_SAVED_ARTICLES = "Saved {count} articles to {filename}"
 LOG_UNKNOWN_FORMAT = "Unknown output format: {format}"
-def save_to_csv(data: List[Dict[str, Any]], filename: str):
+def get_sqlite_create_table_sql(add_instapaper_url: bool = False) -> str:
+    """Returns the SQL statement to create the articles table."""
+    columns = [
+        f"{KEY_ID} TEXT PRIMARY KEY",
+        f"{KEY_TITLE} TEXT NOT NULL",
+        f"{KEY_URL} TEXT NOT NULL",
+    ]
+    if add_instapaper_url:
+        # The GENERATED ALWAYS AS syntax was added in SQLite 3.31.0
+        if sqlite3.sqlite_version_info >= (3, 31, 0):
+            columns.append(
+                f"{SQLITE_INSTAPAPER_URL_COL} TEXT GENERATED ALWAYS AS ('{INSTAPAPER_READ_URL}' || {KEY_ID}) VIRTUAL"
+            )
+        else:
+            columns.append(f"{SQLITE_INSTAPAPER_URL_COL} TEXT")
+    return f"CREATE TABLE IF NOT EXISTS {SQLITE_TABLE_NAME} ({', '.join(columns)})"
+def get_sqlite_insert_sql(add_instapaper_url_manually: bool = False) -> str:
+    """Returns the SQL statement to insert an article."""
+    cols = [KEY_ID, KEY_TITLE, KEY_URL]
+    placeholders = [f":{KEY_ID}", f":{KEY_TITLE}", f":{KEY_URL}"]
+    if add_instapaper_url_manually:
+        cols.append(SQLITE_INSTAPAPER_URL_COL)
+        placeholders.append(f":{SQLITE_INSTAPAPER_URL_COL}")
+    return f"INSERT OR REPLACE INTO {SQLITE_TABLE_NAME} ({', '.join(cols)}) VALUES ({', '.join(placeholders)})"
+def save_to_csv(
+    data: List[Dict[str, Any]], filename: str, add_instapaper_url: bool = False
+):
     """Saves a list of articles to a CSV file."""
     os.makedirs(os.path.dirname(filename), exist_ok=True)
     with open(filename, "w", newline="", encoding="utf-8") as f:
-        f.write(CSV_HEADER)
-        for article in data:
-            # Basic CSV quoting for titles with commas
-            title = article[SQLITE_TITLE_COL]
-            if CSV_DELIMITER in title:
-                title = f'"{title}"'
-            f.write(
-                CSV_ROW_FORMAT.format(
-                    id=article[SQLITE_ID_COL], title=title, url=article[SQLITE_URL_COL]
-                )
-            )
+        fieldnames = [KEY_ID, KEY_TITLE, KEY_URL]
+        if add_instapaper_url:
+            # Insert instapaper_url after the id column
+            fieldnames.insert(1, SQLITE_INSTAPAPER_URL_COL)
+        writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+        writer.writeheader()
+        writer.writerows(data)
     logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=filename))
@@ -58,19 +77,61 @@ def save_to_json(data: List[Dict[str, Any]], filename: str):
     logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=filename))
-def save_to_sqlite(data: List[Dict[str, Any]], db_name: str):
+def save_to_sqlite(
+    data: List[Dict[str, Any]], db_name: str, add_instapaper_url: bool = False
+):
     """Saves a list of articles to a SQLite database."""
     os.makedirs(os.path.dirname(db_name), exist_ok=True)
     conn = sqlite3.connect(db_name)
     cursor = conn.cursor()
-    cursor.execute(SQLITE_CREATE_TABLE_SQL)
-    cursor.executemany(SQLITE_INSERT_SQL, data)
+    cursor.execute(get_sqlite_create_table_sql(add_instapaper_url))
+    # For older SQLite versions, we need to manually add the URL
+    manual_insert_required = add_instapaper_url and sqlite3.sqlite_version_info < (
+        3,
+        31,
+        0,
+    )
+    if manual_insert_required:
+        data_to_insert = [
+            {
+                **article,
+                SQLITE_INSTAPAPER_URL_COL: f"{INSTAPAPER_READ_URL}{article[KEY_ID]}",
+            }
+            for article in data
+        ]
+    else:
+        data_to_insert = data
+    insert_sql = get_sqlite_insert_sql(
+        add_instapaper_url_manually=manual_insert_required
+    )
+    cursor.executemany(insert_sql, data_to_insert)
     conn.commit()
     conn.close()
     logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=db_name))
-def save_articles(data: List[Dict[str, Any]], format: str, filename: str):
+def _correct_ext(filename: str, format: str) -> str:
+    """Corrects the filename extension based on the specified format."""
+    extension_map = {
+        "csv": ".csv",
+        "json": ".json",
+        "sqlite": ".db",
+    }
+    if format in extension_map:
+        name, _ = os.path.splitext(filename)
+        return name + extension_map[format]
+    return filename
+def save_articles(
+    data: List[Dict[str, Any]],
+    format: str,
+    filename: str,
+    add_instapaper_url: bool = False,
+):
     """
     Dispatches to the correct save function based on the format.
     """
@@ -78,11 +139,23 @@ def save_articles(data: List[Dict[str, Any]], format: str, filename: str):
         logging.info(LOG_NO_ARTICLES)
         return
+    filename = _correct_ext(filename, format)
+    # Add the instapaper_url to the data for formats that don't auto-generate it
+    if add_instapaper_url and format in ("csv", "json"):
+        data = [
+            {
+                **article,
+                SQLITE_INSTAPAPER_URL_COL: f"{INSTAPAPER_READ_URL}{article[KEY_ID]}",
+            }
+            for article in data
+        ]
     if format == "csv":
-        save_to_csv(data, filename=filename)
+        save_to_csv(data, filename=filename, add_instapaper_url=add_instapaper_url)
     elif format == "json":
         save_to_json(data, filename=filename)
     elif format == "sqlite":
-        save_to_sqlite(data, db_name=filename)
+        save_to_sqlite(data, db_name=filename, add_instapaper_url=add_instapaper_url)
     else:
         logging.error(LOG_UNKNOWN_FORMAT.format(format=format))

{instapaper_scraper-1.0.0.post1.dist-info → instapaper_scraper-1.1.0rc1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: instapaper-scraper
-Version: 1.0.0.post1
+Version: 1.1.0rc1
 Summary: A tool to scrape articles from Instapaper.
 Project-URL: Homepage, https://github.com/chriskyfung/InstapaperScraper
 Project-URL: Source, https://github.com/chriskyfung/InstapaperScraper
@@ -30,7 +30,7 @@ Requires-Dist: python-dotenv~=1.2.1
 Requires-Dist: requests~=2.32.5
 Requires-Dist: soupsieve~=2.8
 Requires-Dist: typing_extensions~=4.15.0
-Requires-Dist: urllib3~=2.5.0
+Requires-Dist: urllib3<2.7,>=2.5
 Requires-Dist: tomli~=2.0.1; python_version < "3.11"
 Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
@@ -49,6 +49,7 @@ Dynamic: license-file
 ![Python Version from PEP 621 TOML](https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fchriskyfung%2FInstapaperScraper%2Frefs%2Fheads%2Fmaster%2Fpyproject.toml)
 [![CI](https://github.com/chriskyfung/InstapaperScraper/actions/workflows/ci.yml/badge.svg)](https://github.com/chriskyfung/InstapaperScraper/actions/workflows/ci.yml)
 [![PyPI version](https://img.shields.io/pypi/v/instapaper-scraper.svg)](https://pypi.org/project/instapaper-scraper/)
+[![PyPI Downloads](https://static.pepy.tech/personalized-badge/instapaper-scraper?period=total&left_text=downloads)](https://pepy.tech/projects/instapaper-scraper)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
 [![GitHub License](https://img.shields.io/github/license/chriskyfung/InstapaperScraper)
@@ -68,6 +69,7 @@ A Python tool to scrape all your saved Instapaper bookmarks and export them to v
 ## Getting Started
 ### 1. Requirements
 - Python 3.9+
 ### 2. Installation
@@ -152,14 +154,15 @@ When a `config.toml` file is present and no `--folder` argument is provided, the
 ### Command-line Arguments
-| Argument              | Description                                                              |
-| --------------------- | ------------------------------------------------------------------------ |
+| Argument | Description |
+| --- | --- |
 | `--config-path <path>`| Path to the configuration file. Searches `~/.config/instapaper-scraper/config.toml` and `config.toml` in the current directory by default. |
-| `--folder <value>`    | Specify a folder by key, ID, or slug from your `config.toml`. **Requires a configuration file to be loaded.** Use `none` to explicitly disable folder mode. If a configuration file is not found or fails to load, and this option is used (not set to `none`), the program will exit. |
-| `--format <format>`   | Output format (`csv`, `json`, `sqlite`). Default: `csv`.                 |
-| `--output <filename>` | Specify a custom output filename.                                        |
-| `--username <user>`   | Your Instapaper account username.                                        |
-| `--password <pass>`   | Your Instapaper account password.                                        |
+| `--folder <value>` | Specify a folder by key, ID, or slug from your `config.toml`. **Requires a configuration file to be loaded.** Use `none` to explicitly disable folder mode. If a configuration file is not found or fails to load, and this option is used (not set to `none`), the program will exit. |
+| `--format <format>` | Output format (`csv`, `json`, `sqlite`). Default: `csv`. |
+| `--output <filename>` | Specify a custom output filename. The file extension will be automatically corrected to match the selected format. |
+| `--username <user>` | Your Instapaper account username. |
+| `--password <pass>` | Your Instapaper account password. |
+| `--add-instapaper-url` | Adds an `instapaper_url` column to the output, containing a full, clickable URL for each article. |
 ### Output Formats
@@ -168,54 +171,64 @@ You can control the output format using the `--format` argument. The supported f
 - `csv` (default): Exports data to `output/bookmarks.csv`.
 - `json`: Exports data to `output/bookmarks.json`.
 - `sqlite`: Exports data to an `articles` table in `output/bookmarks.db`.
-- `--output <filename>`: Specify a custom output filename.
 If the `--format` flag is omitted, the script will default to `csv`.
+When using `--output <filename>`, the file extension is automatically corrected to match the chosen format. For example, `instapaper-scraper --format json --output my_articles.txt` will create `my_articles.json`.
 #### Opening Articles in Instapaper
-The output data includes a unique `id` for each article. To open an article directly in Instapaper's reader view, append this ID to the base URL:
-`https://www.instapaper.com/read/<article_id>`
+The output data includes a unique `id` for each article. You can use this ID to construct a URL to the article's reader view: `https://www.instapaper.com/read/<article_id>`.
+For convenience, you can use the `--add-instapaper-url` flag to have the script include a full, clickable URL in the output.
+```sh
+instapaper-scraper --add-instapaper-url
+```
+This adds an `instapaper_url` field to each article in the JSON output and an `instapaper_url` column in the CSV and SQLite outputs. The original `id` field is preserved.
 ## How It Works
 The tool is designed with a modular architecture for reliability and maintainability.
 1. **Authentication**: The `InstapaperAuthenticator` handles secure login and session management.
-2. **Scraping**: The `InstapaperClient` iterates through all pages of your bookmarks, fetching the metadata for each article with robust error handling and retries.
+2. **Scraping**: The `InstapaperClient` iterates through all pages of your bookmarks, fetching the metadata for each article with robust error handling and retries. Shared constants, like the Instapaper base URL, are managed through `src/instapaper_scraper/constants.py`.
 3. **Data Collection**: All fetched articles are aggregated into a single list.
 4. **Export**: Finally, the collected data is written to a file in your chosen format (`.csv`, `.json`, or `.db`).
 ## Example Output
-### CSV (`output/bookmarks.csv`)
+### CSV (`output/bookmarks.csv`) (with --add-instapaper-url)
 ```csv
-id,title,url
-999901234,"Article 1",https://www.example.com/page-1/
-999002345,"Article 2",https://www.example.com/page-2/
+"id","instapaper_url","title","url"
+"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/"
+"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/"
 ```
-### JSON (`output/bookmarks.json`)
+### JSON (`output/bookmarks.json`) (with --add-instapaper-url)
 ```json
 [
     {
         "id": "999901234",
         "title": "Article 1",
-        "url": "https://www.example.com/page-1/"
+        "url": "https://www.example.com/page-1/",
+        "instapaper_url": "https://www.instapaper.com/read/999901234"
     },
     {
         "id": "999002345",
         "title": "Article 2",
-        "url": "https://www.example.com/page-2/"
+        "url": "https://www.example.com/page-2/",
+        "instapaper_url": "https://www.instapaper.com/read/999002345"
     }
 ]
 ```
 ### SQLite (`output/bookmarks.db`)
-A SQLite database file is created with an `articles` table containing `id`, `title`, and `url` columns.
+A SQLite database file is created with an `articles` table. The table includes `id`, `title`, and `url` columns. If the `--add-instapaper-url` flag is used, an `instapaper_url` column is also included. This feature is fully backward-compatible and will automatically adapt to the user's installed SQLite version, using an efficient generated column on modern versions (3.31.0+) and a fallback for older versions.
 ## Development & Testing

instapaper_scraper-1.1.0rc1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,13 @@
+instapaper_scraper/__init__.py,sha256=qdcT3tp4KLufWH1u6tOuPVUQaXwakQD0gdjkwY4ljfg,206
+instapaper_scraper/api.py,sha256=-Dq5fOAGSGopb-qonIbETd9ZlxWdULKRgl1DCOuVemY,11618
+instapaper_scraper/auth.py,sha256=VTBE9KhGGJm0KbMT5DCTMCbh-N3HiJuJ9wMDb8CyZT4,7015
+instapaper_scraper/cli.py,sha256=wsQxTVFIyJq3EQiAtz7dCjg1vI2_Y9quZv4ifuEPDU8,7495
+instapaper_scraper/constants.py,sha256=ubFWa47985lIz58qokMC0xQzTmCB6NOa17KFgWLn65E,403
+instapaper_scraper/exceptions.py,sha256=CptHoZe4NOhdjOoyXkZEMFgQC6oKtzjRljywwDEtsTg,134
+instapaper_scraper/output.py,sha256=lxJgW71-m1YuMYJHeK6nu479pk_3bQGc0axzNCvxtZQ,5338
+instapaper_scraper-1.1.0rc1.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+instapaper_scraper-1.1.0rc1.dist-info/METADATA,sha256=O-VJZg1yN3cuPRfBCevmD9_IrOR07NGpzrgZXI2-6hk,11637
+instapaper_scraper-1.1.0rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+instapaper_scraper-1.1.0rc1.dist-info/entry_points.txt,sha256=7AvRgN5fvtas_Duxdz-JPbDN6A1Lq2GaTfTSv54afxA,67
+instapaper_scraper-1.1.0rc1.dist-info/top_level.txt,sha256=kiU9nLkqPOVPLsP4QMHuBFjAmoIKfftYmGV05daLrcc,19
+instapaper_scraper-1.1.0rc1.dist-info/RECORD,,

instapaper_scraper-1.0.0.post1.dist-info/RECORD DELETED Viewed

@@ -1,12 +0,0 @@
-instapaper_scraper/__init__.py,sha256=qdcT3tp4KLufWH1u6tOuPVUQaXwakQD0gdjkwY4ljfg,206
-instapaper_scraper/api.py,sha256=KvGxK2P35-3TsONPWcQTVBZT-q70p7hobeQ7E9PhXwA,11740
-instapaper_scraper/auth.py,sha256=DepQKDdVSm1dMFNIkpK_LIlaI0JllAYZb3_LJWhMe-g,7554
-instapaper_scraper/cli.py,sha256=Pxf1cAoLW9N-X1BP73HE0i2Qv7rPTaIyrPqG3cgdSTI,6860
-instapaper_scraper/exceptions.py,sha256=CptHoZe4NOhdjOoyXkZEMFgQC6oKtzjRljywwDEtsTg,134
-instapaper_scraper/output.py,sha256=0vQQ4AHZwFJg3O5O2zzvKUf0cOS1fTjXdivFqEHAun0,3081
-instapaper_scraper-1.0.0.post1.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
-instapaper_scraper-1.0.0.post1.dist-info/METADATA,sha256=rWkPxBIY-Vo2opYPJ6KiSGiGfmrklMkI-CM9HwOf9to,10353
-instapaper_scraper-1.0.0.post1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-instapaper_scraper-1.0.0.post1.dist-info/entry_points.txt,sha256=7AvRgN5fvtas_Duxdz-JPbDN6A1Lq2GaTfTSv54afxA,67
-instapaper_scraper-1.0.0.post1.dist-info/top_level.txt,sha256=kiU9nLkqPOVPLsP4QMHuBFjAmoIKfftYmGV05daLrcc,19
-instapaper_scraper-1.0.0.post1.dist-info/RECORD,,

{instapaper_scraper-1.0.0.post1.dist-info → instapaper_scraper-1.1.0rc1.dist-info}/WHEEL RENAMED Viewed

File without changes

{instapaper_scraper-1.0.0.post1.dist-info → instapaper_scraper-1.1.0rc1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{instapaper_scraper-1.0.0.post1.dist-info → instapaper_scraper-1.1.0rc1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{instapaper_scraper-1.0.0.post1.dist-info → instapaper_scraper-1.1.0rc1.dist-info}/top_level.txt RENAMED Viewed

File without changes

instapaper-scraper 1.0.0.post1__py3-none-any.whl → 1.1.0rc1__py3-none-any.whl

instapaper-scraper 1.0.0.post1__py3-none-any.whl → 1.1.0rc1__py3-none-any.whl