instapaper-scraper 1.0.0.post1__py3-none-any.whl → 1.1.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
instapaper_scraper/api.py CHANGED
@@ -7,6 +7,7 @@ import requests
  from bs4 import BeautifulSoup

  from .exceptions import ScraperStructureChanged
+ from .constants import INSTAPAPER_BASE_URL, KEY_ID, KEY_TITLE, KEY_URL


  class InstapaperClient:
@@ -14,8 +15,6 @@ class InstapaperClient:
      A client for interacting with the Instapaper website to fetch articles.
      """

-     BASE_URL = "https://www.instapaper.com"
-
      # Environment variable names
      ENV_MAX_RETRIES = "MAX_RETRIES"
      ENV_BACKOFF_FACTOR = "BACKOFF_FACTOR"
@@ -39,11 +38,6 @@ class InstapaperClient:
      URL_PATH_USER = "/u/"
      URL_PATH_FOLDER = "/u/folder/"

-     # Dictionary keys for article data
-     KEY_ID = "id"
-     KEY_TITLE = "title"
-     KEY_URL = "url"
-
      # HTTP status codes
      HTTP_TOO_MANY_REQUESTS = 429
      HTTP_SERVER_ERROR_START = 500
@@ -134,7 +128,7 @@ class InstapaperClient:

          articles = article_list.find_all(self.ARTICLE_TAG)
          article_ids = [
-             article[self.KEY_ID].replace(self.ARTICLE_ID_PREFIX, "")
+             article[KEY_ID].replace(self.ARTICLE_ID_PREFIX, "")
              for article in articles
          ]

@@ -204,8 +198,8 @@ class InstapaperClient:
      ) -> str:
          """Constructs the URL for the given page, considering folder mode."""
          if folder_info and folder_info.get("id") and folder_info.get("slug"):
-             return f"{self.BASE_URL}{self.URL_PATH_FOLDER}{folder_info['id']}/{folder_info['slug']}/{page}"
-         return f"{self.BASE_URL}{self.URL_PATH_USER}{page}"
+             return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_FOLDER}{folder_info['id']}/{folder_info['slug']}/{page}"
+             return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_USER}{page}"

      def _parse_article_data(
          self, soup: BeautifulSoup, article_ids: List[str], page: int
@@ -235,9 +229,7 @@ class InstapaperClient:
                  raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
              link = link_element["href"]

-             data.append(
-                 {self.KEY_ID: article_id, self.KEY_TITLE: title, self.KEY_URL: link}
-             )
+             data.append({KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link})
          except AttributeError as e:
              logging.warning(
                  self.MSG_PARSE_ARTICLE_WARNING.format(
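The `api.py` changes are a mechanical switch from class-level constants to the shared `constants` module; the only behavioural surface is URL construction in `_get_page_url`. Below is a minimal sketch of the URL shapes that method produces, assuming `URL_PATH_USER` and `URL_PATH_FOLDER` keep the values shown above; the standalone `page_url` helper is hypothetical and not part of the package.

```python
# Hypothetical stand-in for InstapaperClient._get_page_url, assuming the path
# constants keep the values shown in the diff above.
from typing import Optional

INSTAPAPER_BASE_URL = "https://www.instapaper.com"

def page_url(page: int, folder_info: Optional[dict] = None) -> str:
    if folder_info and folder_info.get("id") and folder_info.get("slug"):
        return f"{INSTAPAPER_BASE_URL}/u/folder/{folder_info['id']}/{folder_info['slug']}/{page}"
    return f"{INSTAPAPER_BASE_URL}/u/{page}"

print(page_url(2))                               # https://www.instapaper.com/u/2
print(page_url(1, {"id": 123, "slug": "tech"}))  # https://www.instapaper.com/u/folder/123/tech/1
```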
instapaper_scraper/auth.py CHANGED
@@ -8,11 +8,33 @@ from typing import Union
  from cryptography.fernet import Fernet
  import requests

+ from .constants import INSTAPAPER_BASE_URL

- # --- Constants ---
- class InstapaperConstants:
+
+ # --- Encryption Helper ---
+ def get_encryption_key(key_file: Union[str, Path]) -> bytes:
+     """
+     Loads the encryption key from a file or generates a new one.
+     Sets strict file permissions for the key file.
+     """
+     key_path = Path(key_file)
+     key_path.parent.mkdir(parents=True, exist_ok=True)
+
+     if key_path.exists():
+         with open(key_path, "rb") as f:
+             key = f.read()
+     else:
+         key = Fernet.generate_key()
+         with open(key_path, "wb") as f:
+             f.write(key)
+         # Set file permissions to 0600 (owner read/write only)
+         os.chmod(key_path, stat.S_IRUSR | stat.S_IWUSR)
+         logging.info(f"Generated new encryption key at {key_path}.")
+     return key
+
+
+ class InstapaperAuthenticator:
      # URLs
-     INSTAPAPER_BASE_URL = "https://www.instapaper.com"
      INSTAPAPER_VERIFY_URL = f"{INSTAPAPER_BASE_URL}/u"
      INSTAPAPER_LOGIN_URL = f"{INSTAPAPER_BASE_URL}/user/login"

@@ -25,10 +47,6 @@ class InstapaperConstants:
      # Request related
      REQUEST_TIMEOUT = 10

-     # App config
-     APP_NAME = "instapaper-scraper"
-     CONFIG_DIR = Path.home() / ".config" / APP_NAME
-
      # Prompts
      PROMPT_USERNAME = "Enter your Instapaper username: "
      PROMPT_PASSWORD = "Enter your Instapaper password: "
@@ -44,30 +62,6 @@ class InstapaperConstants:
      LOG_NO_KNOWN_COOKIE_TO_SAVE = "Could not find a known session cookie to save."
      LOG_SAVED_SESSION = "Saved encrypted session to {session_file}."

-
- # --- Encryption Helper ---
- def get_encryption_key(key_file: Union[str, Path]) -> bytes:
-     """
-     Loads the encryption key from a file or generates a new one.
-     Sets strict file permissions for the key file.
-     """
-     key_path = Path(key_file)
-     key_path.parent.mkdir(parents=True, exist_ok=True)
-
-     if key_path.exists():
-         with open(key_path, "rb") as f:
-             key = f.read()
-     else:
-         key = Fernet.generate_key()
-         with open(key_path, "wb") as f:
-             f.write(key)
-         # Set file permissions to 0600 (owner read/write only)
-         os.chmod(key_path, stat.S_IRUSR | stat.S_IWUSR)
-         logging.info(f"Generated new encryption key at {key_path}.")
-     return key
-
-
- class InstapaperAuthenticator:
      def __init__(
          self,
          session: requests.Session,
@@ -116,24 +110,22 @@ class InstapaperAuthenticator:
                  if not line:
                      continue
                  parts = line.split(":", 2)
-                 if len(parts) == InstapaperConstants.COOKIE_PART_COUNT:
+                 if len(parts) == self.COOKIE_PART_COUNT:
                      name, value, domain = parts
                      self.session.cookies.set(name, value, domain=domain)

              if self.session.cookies and self._verify_session():
-                 logging.info(InstapaperConstants.LOG_SESSION_LOAD_SUCCESS)
+                 logging.info(self.LOG_SESSION_LOAD_SUCCESS)
                  return True
              else:
-                 logging.warning(InstapaperConstants.LOG_SESSION_LOAD_FAILED)
+                 logging.warning(self.LOG_SESSION_LOAD_FAILED)
                  # Clear cookies if verification fails
                  self.session.cookies.clear()
                  return False

          except Exception as e:
              logging.warning(
-                 InstapaperConstants.LOG_SESSION_LOAD_ERROR.format(
-                     session_file=self.session_file, e=e
-                 )
+                 self.LOG_SESSION_LOAD_ERROR.format(session_file=self.session_file, e=e)
              )
              self.session_file.unlink(missing_ok=True)
              return False
@@ -142,57 +134,56 @@ class InstapaperAuthenticator:
          """Checks if the current session is valid by making a request."""
          try:
              verify_response = self.session.get(
-                 InstapaperConstants.INSTAPAPER_VERIFY_URL,
-                 timeout=InstapaperConstants.REQUEST_TIMEOUT,
+                 self.INSTAPAPER_VERIFY_URL,
+                 timeout=self.REQUEST_TIMEOUT,
              )
              verify_response.raise_for_status()
-             return InstapaperConstants.LOGIN_FORM_IDENTIFIER not in verify_response.text
+             return self.LOGIN_FORM_IDENTIFIER not in verify_response.text
          except requests.RequestException as e:
-             logging.error(InstapaperConstants.LOG_SESSION_VERIFY_FAILED.format(e=e))
+             logging.error(self.LOG_SESSION_VERIFY_FAILED.format(e=e))
              return False

      def _login_with_credentials(self) -> bool:
          """Logs in using username/password from arguments or user prompt."""
-         logging.info(InstapaperConstants.LOG_NO_VALID_SESSION)
+         logging.info(self.LOG_NO_VALID_SESSION)
          username = self.username
          password = self.password

          if not username or not password:
-             username = input(InstapaperConstants.PROMPT_USERNAME)
-             password = getpass.getpass(InstapaperConstants.PROMPT_PASSWORD)
+             username = input(self.PROMPT_USERNAME)
+             password = getpass.getpass(self.PROMPT_PASSWORD)
          elif self.username:
              logging.info(
                  f"Using username '{self.username}' from command-line arguments."
              )

          login_response = self.session.post(
-             InstapaperConstants.INSTAPAPER_LOGIN_URL,
+             self.INSTAPAPER_LOGIN_URL,
              data={"username": username, "password": password, "keep_logged_in": "yes"},
-             timeout=InstapaperConstants.REQUEST_TIMEOUT,
+             timeout=self.REQUEST_TIMEOUT,
          )

-         required_cookies = InstapaperConstants.REQUIRED_COOKIES
+         required_cookies = self.REQUIRED_COOKIES
          found_cookies = {c.name for c in self.session.cookies}

-         if (
-             InstapaperConstants.LOGIN_SUCCESS_PATH in login_response.url
-             and required_cookies.issubset(found_cookies)
+         if self.LOGIN_SUCCESS_PATH in login_response.url and required_cookies.issubset(
+             found_cookies
          ):
-             logging.info(InstapaperConstants.LOG_LOGIN_SUCCESS)
+             logging.info(self.LOG_LOGIN_SUCCESS)
              return True
          else:
-             logging.error(InstapaperConstants.LOG_LOGIN_FAILED)
+             logging.error(self.LOG_LOGIN_FAILED)
              return False

      def _save_session(self):
          """Saves the current session cookies to an encrypted file."""
-         required_cookies = InstapaperConstants.REQUIRED_COOKIES
+         required_cookies = self.REQUIRED_COOKIES
          cookies_to_save = [
              c for c in self.session.cookies if c.name in required_cookies
          ]

          if not cookies_to_save:
-             logging.warning(InstapaperConstants.LOG_NO_KNOWN_COOKIE_TO_SAVE)
+             logging.warning(self.LOG_NO_KNOWN_COOKIE_TO_SAVE)
              return

          cookie_data = ""
@@ -206,6 +197,4 @@ class InstapaperAuthenticator:
              f.write(encrypted_data)

          os.chmod(self.session_file, stat.S_IRUSR | stat.S_IWUSR)
-         logging.info(
-             InstapaperConstants.LOG_SAVED_SESSION.format(session_file=self.session_file)
-         )
+         logging.info(self.LOG_SAVED_SESSION.format(session_file=self.session_file))
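The `get_encryption_key()` helper is now a module-level function in `auth.py` and feeds a Fernet cipher for the encrypted session file. The sketch below shows how the pieces fit together; the `demo_config/.session_key` path and the cookie line are made-up values, and the real read/write logic stays inside `InstapaperAuthenticator`.

```python
# Illustrative only: how the module-level helper pairs with Fernet.
# The path and the cookie line are hypothetical examples.
from cryptography.fernet import Fernet
from instapaper_scraper.auth import get_encryption_key

key = get_encryption_key("demo_config/.session_key")  # created on first call, chmod 0600
fernet = Fernet(key)

cookie_line = "session_id:example-value:.instapaper.com"  # name:value:domain, as split in the loader above
token = fernet.encrypt(cookie_line.encode("utf-8"))
assert fernet.decrypt(token).decode("utf-8") == cookie_line
```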
instapaper_scraper/cli.py CHANGED
@@ -15,6 +15,13 @@ from .auth import InstapaperAuthenticator
  from .api import InstapaperClient
  from .output import save_articles
  from .exceptions import ScraperStructureChanged
+ from .constants import CONFIG_DIR
+
+ # --- Constants ---
+ CONFIG_FILENAME = "config.toml"
+ DEFAULT_SESSION_FILENAME = ".instapaper_session"
+ DEFAULT_KEY_FILENAME = ".session_key"
+ DEFAULT_OUTPUT_FILENAME = "output/bookmarks.{ext}"


  def _resolve_path(
@@ -38,10 +45,9 @@ def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
      It checks the provided path, then config.toml in the project root,
      and finally ~/.config/instapaper-scraper/config.toml.
      """
-     app_name = "instapaper-scraper"
      default_paths = [
-         Path("config.toml"),
-         Path.home() / ".config" / app_name / "config.toml",
+         Path(CONFIG_FILENAME),
+         CONFIG_DIR / CONFIG_FILENAME,
      ]

      paths_to_check = []
@@ -95,6 +101,11 @@ def main():
      parser.add_argument("--key-file", help="Path to the session key file.")
      parser.add_argument("--username", help="Instapaper username.")
      parser.add_argument("--password", help="Instapaper password.")
+     parser.add_argument(
+         "--add-instapaper-url",
+         action="store_true",
+         help="Add an 'instapaper_url' column to the output with the full Instapaper read URL.",
+     )
      parser.add_argument(
          "--limit",
          type=int,
@@ -153,18 +164,21 @@ def main():
          output_filename = config["output_filename"]
      else:
          ext = "db" if args.format == "sqlite" else args.format
-         output_filename = f"output/bookmarks.{ext}"
+         output_filename = DEFAULT_OUTPUT_FILENAME.format(ext=ext)

      session = requests.Session()

      # Resolve session and key file paths
-     app_name = "instapaper-scraper"
-     user_config_dir = Path.home() / ".config" / app_name
-
      session_file = _resolve_path(
-         args.session_file, ".instapaper_session", user_config_dir / ".instapaper_session"
+         args.session_file,
+         DEFAULT_SESSION_FILENAME,
+         CONFIG_DIR / DEFAULT_SESSION_FILENAME,
+     )
+     key_file = _resolve_path(
+         args.key_file,
+         DEFAULT_KEY_FILENAME,
+         CONFIG_DIR / DEFAULT_KEY_FILENAME,
      )
-     key_file = _resolve_path(args.key_file, ".session_key", user_config_dir / ".session_key")

      # 1. Authenticate
      authenticator = InstapaperAuthenticator(
@@ -195,7 +209,17 @@ def main():
          sys.exit(1)

      # 3. Save Articles
-     save_articles(all_articles, args.format, output_filename)
+     try:
+         save_articles(
+             all_articles,
+             args.format,
+             output_filename,
+             add_instapaper_url=args.add_instapaper_url,
+         )
+         logging.info("Articles scraped and saved successfully.")
+     except Exception as e:
+         logging.error(f"An unexpected error occurred during saving: {e}")
+         sys.exit(1)


  if __name__ == "__main__":
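The calls above pass three candidates to `_resolve_path`: the explicit CLI value, a filename in the working directory, and a fallback under `CONFIG_DIR`. The resolver's body is not part of this diff, so the following is only a plausible sketch of that precedence, with a hypothetical `resolve_path` name.

```python
# Hypothetical sketch of the precedence implied by the three-argument calls in
# cli.py above; the real _resolve_path implementation is not shown in this diff.
from pathlib import Path
from typing import Optional, Union

def resolve_path(cli_value: Optional[str], local_default: str, user_fallback: Union[str, Path]) -> Path:
    if cli_value:                  # 1. an explicit --session-file / --key-file always wins
        return Path(cli_value)
    local = Path(local_default)
    if local.exists():             # 2. then a file sitting in the current directory
        return local
    return Path(user_fallback)     # 3. otherwise the per-user location under ~/.config/instapaper-scraper
```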
instapaper_scraper/constants.py ADDED
@@ -0,0 +1,17 @@
+ # Shared constants used across the instapaper-scraper project.
+ from pathlib import Path
+
+ # --- General ---
+ APP_NAME = "instapaper-scraper"
+
+ # --- URLS ---
+ INSTAPAPER_BASE_URL = "https://www.instapaper.com"
+ INSTAPAPER_READ_URL = f"{INSTAPAPER_BASE_URL}/read/"
+
+ # --- Paths ---
+ CONFIG_DIR = Path.home() / ".config" / APP_NAME
+
+ # --- Article Data Keys ---
+ KEY_ID = "id"
+ KEY_TITLE = "title"
+ KEY_URL = "url"
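The new module centralizes values that were previously duplicated across `api.py`, `auth.py`, and `cli.py`. For instance, `output.py` builds the read link for `--add-instapaper-url` directly from these constants, as in this small example (the sample id is taken from the README example below):

```python
# Composing an Instapaper read link from the shared constants above.
from instapaper_scraper.constants import INSTAPAPER_READ_URL, KEY_ID

article = {KEY_ID: "999901234"}                   # sample id from the README example
print(f"{INSTAPAPER_READ_URL}{article[KEY_ID]}")  # https://www.instapaper.com/read/999901234
```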
instapaper_scraper/output.py CHANGED
@@ -2,29 +2,17 @@ import os
  import json
  import sqlite3
  import logging
+ import csv
  from typing import List, Dict, Any

+ from .constants import INSTAPAPER_READ_URL, KEY_ID, KEY_TITLE, KEY_URL
+
  # Constants for file operations
  JSON_INDENT = 4

- # Constants for CSV output
- CSV_HEADER = "id,title,url\n"
- CSV_DELIMITER = ","
- CSV_ROW_FORMAT = "{id},{title},{url}\n"
-
  # Constants for SQLite output
  SQLITE_TABLE_NAME = "articles"
- SQLITE_ID_COL = "id"
- SQLITE_TITLE_COL = "title"
- SQLITE_URL_COL = "url"
- SQLITE_CREATE_TABLE_SQL = f"""
- CREATE TABLE IF NOT EXISTS {SQLITE_TABLE_NAME} (
-     {SQLITE_ID_COL} TEXT PRIMARY KEY,
-     {SQLITE_TITLE_COL} TEXT NOT NULL,
-     {SQLITE_URL_COL} TEXT NOT NULL
- )
- """
- SQLITE_INSERT_SQL = f"INSERT OR REPLACE INTO {SQLITE_TABLE_NAME} ({SQLITE_ID_COL}, {SQLITE_TITLE_COL}, {SQLITE_URL_COL}) VALUES (:{SQLITE_ID_COL}, :{SQLITE_TITLE_COL}, :{SQLITE_URL_COL})"
+ SQLITE_INSTAPAPER_URL_COL = "instapaper_url"

  # Constants for logging messages
  LOG_NO_ARTICLES = "No articles found to save."
@@ -32,21 +20,52 @@ LOG_SAVED_ARTICLES = "Saved {count} articles to {filename}"
  LOG_UNKNOWN_FORMAT = "Unknown output format: {format}"


- def save_to_csv(data: List[Dict[str, Any]], filename: str):
+ def get_sqlite_create_table_sql(add_instapaper_url: bool = False) -> str:
+     """Returns the SQL statement to create the articles table."""
+     columns = [
+         f"{KEY_ID} TEXT PRIMARY KEY",
+         f"{KEY_TITLE} TEXT NOT NULL",
+         f"{KEY_URL} TEXT NOT NULL",
+     ]
+     if add_instapaper_url:
+         # The GENERATED ALWAYS AS syntax was added in SQLite 3.31.0
+         if sqlite3.sqlite_version_info >= (3, 31, 0):
+             columns.append(
+                 f"{SQLITE_INSTAPAPER_URL_COL} TEXT GENERATED ALWAYS AS ('{INSTAPAPER_READ_URL}' || {KEY_ID}) VIRTUAL"
+             )
+         else:
+             columns.append(f"{SQLITE_INSTAPAPER_URL_COL} TEXT")
+
+     return f"CREATE TABLE IF NOT EXISTS {SQLITE_TABLE_NAME} ({', '.join(columns)})"
+
+
+ def get_sqlite_insert_sql(add_instapaper_url_manually: bool = False) -> str:
+     """Returns the SQL statement to insert an article."""
+     cols = [KEY_ID, KEY_TITLE, KEY_URL]
+     placeholders = [f":{KEY_ID}", f":{KEY_TITLE}", f":{KEY_URL}"]
+
+     if add_instapaper_url_manually:
+         cols.append(SQLITE_INSTAPAPER_URL_COL)
+         placeholders.append(f":{SQLITE_INSTAPAPER_URL_COL}")
+
+     return f"INSERT OR REPLACE INTO {SQLITE_TABLE_NAME} ({', '.join(cols)}) VALUES ({', '.join(placeholders)})"
+
+
+ def save_to_csv(
+     data: List[Dict[str, Any]], filename: str, add_instapaper_url: bool = False
+ ):
      """Saves a list of articles to a CSV file."""
      os.makedirs(os.path.dirname(filename), exist_ok=True)
      with open(filename, "w", newline="", encoding="utf-8") as f:
-         f.write(CSV_HEADER)
-         for article in data:
-             # Basic CSV quoting for titles with commas
-             title = article[SQLITE_TITLE_COL]
-             if CSV_DELIMITER in title:
-                 title = f'"{title}"'
-             f.write(
-                 CSV_ROW_FORMAT.format(
-                     id=article[SQLITE_ID_COL], title=title, url=article[SQLITE_URL_COL]
-                 )
-             )
+         fieldnames = [KEY_ID, KEY_TITLE, KEY_URL]
+         if add_instapaper_url:
+             # Insert instapaper_url after the id column
+             fieldnames.insert(1, SQLITE_INSTAPAPER_URL_COL)
+
+         writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+         writer.writeheader()
+         writer.writerows(data)
+
      logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=filename))


@@ -58,19 +77,61 @@ def save_to_json(data: List[Dict[str, Any]], filename: str):
      logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=filename))


- def save_to_sqlite(data: List[Dict[str, Any]], db_name: str):
+ def save_to_sqlite(
+     data: List[Dict[str, Any]], db_name: str, add_instapaper_url: bool = False
+ ):
      """Saves a list of articles to a SQLite database."""
      os.makedirs(os.path.dirname(db_name), exist_ok=True)
      conn = sqlite3.connect(db_name)
      cursor = conn.cursor()
-     cursor.execute(SQLITE_CREATE_TABLE_SQL)
-     cursor.executemany(SQLITE_INSERT_SQL, data)
+     cursor.execute(get_sqlite_create_table_sql(add_instapaper_url))
+
+     # For older SQLite versions, we need to manually add the URL
+     manual_insert_required = add_instapaper_url and sqlite3.sqlite_version_info < (
+         3,
+         31,
+         0,
+     )
+     if manual_insert_required:
+         data_to_insert = [
+             {
+                 **article,
+                 SQLITE_INSTAPAPER_URL_COL: f"{INSTAPAPER_READ_URL}{article[KEY_ID]}",
+             }
+             for article in data
+         ]
+     else:
+         data_to_insert = data
+
+     insert_sql = get_sqlite_insert_sql(
+         add_instapaper_url_manually=manual_insert_required
+     )
+     cursor.executemany(insert_sql, data_to_insert)
+
      conn.commit()
      conn.close()
      logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=db_name))


- def save_articles(data: List[Dict[str, Any]], format: str, filename: str):
+ def _correct_ext(filename: str, format: str) -> str:
+     """Corrects the filename extension based on the specified format."""
+     extension_map = {
+         "csv": ".csv",
+         "json": ".json",
+         "sqlite": ".db",
+     }
+     if format in extension_map:
+         name, _ = os.path.splitext(filename)
+         return name + extension_map[format]
+     return filename
+
+
+ def save_articles(
+     data: List[Dict[str, Any]],
+     format: str,
+     filename: str,
+     add_instapaper_url: bool = False,
+ ):
      """
      Dispatches to the correct save function based on the format.
      """
@@ -78,11 +139,23 @@ def save_articles(data: List[Dict[str, Any]], format: str, filename: str):
          logging.info(LOG_NO_ARTICLES)
          return

+     filename = _correct_ext(filename, format)
+
+     # Add the instapaper_url to the data for formats that don't auto-generate it
+     if add_instapaper_url and format in ("csv", "json"):
+         data = [
+             {
+                 **article,
+                 SQLITE_INSTAPAPER_URL_COL: f"{INSTAPAPER_READ_URL}{article[KEY_ID]}",
+             }
+             for article in data
+         ]
+
      if format == "csv":
-         save_to_csv(data, filename=filename)
+         save_to_csv(data, filename=filename, add_instapaper_url=add_instapaper_url)
      elif format == "json":
          save_to_json(data, filename=filename)
      elif format == "sqlite":
-         save_to_sqlite(data, db_name=filename)
+         save_to_sqlite(data, db_name=filename, add_instapaper_url=add_instapaper_url)
      else:
          logging.error(LOG_UNKNOWN_FORMAT.format(format=format))
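The SQLite path above branches on `sqlite3.sqlite_version_info`. The following self-contained sketch (in-memory database, made-up rows; table and column names mirror the diff) shows both branches producing the same `instapaper_url` values:

```python
# Sketch of the generated-column strategy used in save_to_sqlite above: on
# SQLite 3.31.0+ the instapaper_url column is computed from id, otherwise the
# value is inserted explicitly. The rows are made-up sample data.
import sqlite3

READ_URL = "https://www.instapaper.com/read/"
rows = [{"id": "999901234", "title": "Article 1", "url": "https://www.example.com/page-1/"}]

conn = sqlite3.connect(":memory:")
cur = conn.cursor()

if sqlite3.sqlite_version_info >= (3, 31, 0):
    cur.execute(
        "CREATE TABLE IF NOT EXISTS articles ("
        "id TEXT PRIMARY KEY, title TEXT NOT NULL, url TEXT NOT NULL, "
        f"instapaper_url TEXT GENERATED ALWAYS AS ('{READ_URL}' || id) VIRTUAL)"
    )
    cur.executemany(
        "INSERT OR REPLACE INTO articles (id, title, url) VALUES (:id, :title, :url)", rows
    )
else:
    cur.execute(
        "CREATE TABLE IF NOT EXISTS articles ("
        "id TEXT PRIMARY KEY, title TEXT NOT NULL, url TEXT NOT NULL, instapaper_url TEXT)"
    )
    cur.executemany(
        "INSERT OR REPLACE INTO articles (id, title, url, instapaper_url) "
        "VALUES (:id, :title, :url, :instapaper_url)",
        [{**r, "instapaper_url": f"{READ_URL}{r['id']}"} for r in rows],
    )

print(cur.execute("SELECT instapaper_url FROM articles").fetchall())
# [('https://www.instapaper.com/read/999901234',)]
conn.close()
```

The CSV writer change replaces the hand-rolled quoting with `csv.DictWriter` and `QUOTE_ALL`, which is what produces the fully quoted header and rows shown in the README example further down. A quick stdlib-only illustration:

```python
# Reproducing the README's CSV example with DictWriter + QUOTE_ALL.
import csv
import io

rows = [
    {
        "id": "999901234",
        "title": "Article 1",
        "url": "https://www.example.com/page-1/",
        "instapaper_url": "https://www.instapaper.com/read/999901234",
    },
]
buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=["id", "instapaper_url", "title", "url"], quoting=csv.QUOTE_ALL)
writer.writeheader()
writer.writerows(rows)
print(buf.getvalue())
# "id","instapaper_url","title","url"
# "999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/"
```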
instapaper_scraper-1.1.0rc1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: instapaper-scraper
- Version: 1.0.0.post1
+ Version: 1.1.0rc1
  Summary: A tool to scrape articles from Instapaper.
  Project-URL: Homepage, https://github.com/chriskyfung/InstapaperScraper
  Project-URL: Source, https://github.com/chriskyfung/InstapaperScraper
@@ -30,7 +30,7 @@ Requires-Dist: python-dotenv~=1.2.1
  Requires-Dist: requests~=2.32.5
  Requires-Dist: soupsieve~=2.8
  Requires-Dist: typing_extensions~=4.15.0
- Requires-Dist: urllib3~=2.5.0
+ Requires-Dist: urllib3<2.7,>=2.5
  Requires-Dist: tomli~=2.0.1; python_version < "3.11"
  Provides-Extra: dev
  Requires-Dist: pytest; extra == "dev"
@@ -49,6 +49,7 @@ Dynamic: license-file
  ![Python Version from PEP 621 TOML](https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fchriskyfung%2FInstapaperScraper%2Frefs%2Fheads%2Fmaster%2Fpyproject.toml)
  [![CI](https://github.com/chriskyfung/InstapaperScraper/actions/workflows/ci.yml/badge.svg)](https://github.com/chriskyfung/InstapaperScraper/actions/workflows/ci.yml)
  [![PyPI version](https://img.shields.io/pypi/v/instapaper-scraper.svg)](https://pypi.org/project/instapaper-scraper/)
+ [![PyPI Downloads](https://static.pepy.tech/personalized-badge/instapaper-scraper?period=total&left_text=downloads)](https://pepy.tech/projects/instapaper-scraper)
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
  [![GitHub License](https://img.shields.io/github/license/chriskyfung/InstapaperScraper)
@@ -68,6 +69,7 @@ A Python tool to scrape all your saved Instapaper bookmarks and export them to v
  ## Getting Started

  ### 1. Requirements
+
  - Python 3.9+

  ### 2. Installation
@@ -152,14 +154,15 @@ When a `config.toml` file is present and no `--folder` argument is provided, the

  ### Command-line Arguments

- | Argument | Description |
- | --------------------- | ------------------------------------------------------------------------ |
+ | Argument | Description |
+ | --- | --- |
  | `--config-path <path>`| Path to the configuration file. Searches `~/.config/instapaper-scraper/config.toml` and `config.toml` in the current directory by default. |
- | `--folder <value>` | Specify a folder by key, ID, or slug from your `config.toml`. **Requires a configuration file to be loaded.** Use `none` to explicitly disable folder mode. If a configuration file is not found or fails to load, and this option is used (not set to `none`), the program will exit. |
- | `--format <format>` | Output format (`csv`, `json`, `sqlite`). Default: `csv`. |
- | `--output <filename>` | Specify a custom output filename. |
- | `--username <user>` | Your Instapaper account username. |
- | `--password <pass>` | Your Instapaper account password. |
+ | `--folder <value>` | Specify a folder by key, ID, or slug from your `config.toml`. **Requires a configuration file to be loaded.** Use `none` to explicitly disable folder mode. If a configuration file is not found or fails to load, and this option is used (not set to `none`), the program will exit. |
+ | `--format <format>` | Output format (`csv`, `json`, `sqlite`). Default: `csv`. |
+ | `--output <filename>` | Specify a custom output filename. The file extension will be automatically corrected to match the selected format. |
+ | `--username <user>` | Your Instapaper account username. |
+ | `--password <pass>` | Your Instapaper account password. |
+ | `--add-instapaper-url` | Adds a `instapaper_url` column to the output, containing a full, clickable URL for each article. |

  ### Output Formats

@@ -168,54 +171,64 @@ You can control the output format using the `--format` argument. The supported f
  - `csv` (default): Exports data to `output/bookmarks.csv`.
  - `json`: Exports data to `output/bookmarks.json`.
  - `sqlite`: Exports data to an `articles` table in `output/bookmarks.db`.
- - `--output <filename>`: Specify a custom output filename.

  If the `--format` flag is omitted, the script will default to `csv`.

+ When using `--output <filename>`, the file extension is automatically corrected to match the chosen format. For example, `instapaper-scraper --format json --output my_articles.txt` will create `my_articles.json`.
+
  #### Opening Articles in Instapaper

- The output data includes a unique `id` for each article. To open an article directly in Instapaper's reader view, append this ID to the base URL:
- `https://www.instapaper.com/read/<article_id>`
+ The output data includes a unique `id` for each article. You can use this ID to construct a URL to the article's reader view: `https://www.instapaper.com/read/<article_id>`.
+
+ For convenience, you can use the `--add-instapaper-url` flag to have the script include a full, clickable URL in the output.
+
+ ```sh
+ instapaper-scraper --add-instapaper-url
+ ```
+
+ This adds a `instapaper_url` field to each article in the JSON output and a `instapaper_url` column in the CSV and SQLite outputs. The original `id` field is preserved.

  ## How It Works

  The tool is designed with a modular architecture for reliability and maintainability.

  1. **Authentication**: The `InstapaperAuthenticator` handles secure login and session management.
- 2. **Scraping**: The `InstapaperClient` iterates through all pages of your bookmarks, fetching the metadata for each article with robust error handling and retries.
+ 2. **Scraping**: The `InstapaperClient` iterates through all pages of your bookmarks, fetching the metadata for each article with robust error handling and retries. Shared constants, like the Instapaper base URL, are managed through `src/instapaper_scraper/constants.py`.
  3. **Data Collection**: All fetched articles are aggregated into a single list.
  4. **Export**: Finally, the collected data is written to a file in your chosen format (`.csv`, `.json`, or `.db`).

  ## Example Output

- ### CSV (`output/bookmarks.csv`)
+ ### CSV (`output/bookmarks.csv`) (with --add-instapaper-url)

  ```csv
- id,title,url
- 999901234,"Article 1",https://www.example.com/page-1/
- 999002345,"Article 2",https://www.example.com/page-2/
+ "id","instapaper_url","title","url"
+ "999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/"
+ "999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/"
  ```

- ### JSON (`output/bookmarks.json`)
+ ### JSON (`output/bookmarks.json`) (with --add-instapaper-url)

  ```json
  [
      {
          "id": "999901234",
          "title": "Article 1",
-         "url": "https://www.example.com/page-1/"
+         "url": "https://www.example.com/page-1/",
+         "instapaper_url": "https://www.instapaper.com/read/999901234"
      },
      {
          "id": "999002345",
          "title": "Article 2",
-         "url": "https://www.example.com/page-2/"
+         "url": "https://www.example.com/page-2/",
+         "instapaper_url": "https://www.instapaper.com/read/999002345"
      }
  ]
  ```

  ### SQLite (`output/bookmarks.db`)

- A SQLite database file is created with an `articles` table containing `id`, `title`, and `url` columns.
+ A SQLite database file is created with an `articles` table. The table includes `id`, `title`, and `url` columns. If the `--add-instapaper-url` flag is used, a `instapaper_url` column is also included. This feature is fully backward-compatible and will automatically adapt to the user's installed SQLite version, using an efficient generated column on modern versions (3.31.0+) and a fallback for older versions.

  ## Development & Testing

instapaper_scraper-1.1.0rc1.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+ instapaper_scraper/__init__.py,sha256=qdcT3tp4KLufWH1u6tOuPVUQaXwakQD0gdjkwY4ljfg,206
+ instapaper_scraper/api.py,sha256=-Dq5fOAGSGopb-qonIbETd9ZlxWdULKRgl1DCOuVemY,11618
+ instapaper_scraper/auth.py,sha256=VTBE9KhGGJm0KbMT5DCTMCbh-N3HiJuJ9wMDb8CyZT4,7015
+ instapaper_scraper/cli.py,sha256=wsQxTVFIyJq3EQiAtz7dCjg1vI2_Y9quZv4ifuEPDU8,7495
+ instapaper_scraper/constants.py,sha256=ubFWa47985lIz58qokMC0xQzTmCB6NOa17KFgWLn65E,403
+ instapaper_scraper/exceptions.py,sha256=CptHoZe4NOhdjOoyXkZEMFgQC6oKtzjRljywwDEtsTg,134
+ instapaper_scraper/output.py,sha256=lxJgW71-m1YuMYJHeK6nu479pk_3bQGc0axzNCvxtZQ,5338
+ instapaper_scraper-1.1.0rc1.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+ instapaper_scraper-1.1.0rc1.dist-info/METADATA,sha256=O-VJZg1yN3cuPRfBCevmD9_IrOR07NGpzrgZXI2-6hk,11637
+ instapaper_scraper-1.1.0rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ instapaper_scraper-1.1.0rc1.dist-info/entry_points.txt,sha256=7AvRgN5fvtas_Duxdz-JPbDN6A1Lq2GaTfTSv54afxA,67
+ instapaper_scraper-1.1.0rc1.dist-info/top_level.txt,sha256=kiU9nLkqPOVPLsP4QMHuBFjAmoIKfftYmGV05daLrcc,19
+ instapaper_scraper-1.1.0rc1.dist-info/RECORD,,
instapaper_scraper-1.0.0.post1.dist-info/RECORD REMOVED
@@ -1,12 +0,0 @@
- instapaper_scraper/__init__.py,sha256=qdcT3tp4KLufWH1u6tOuPVUQaXwakQD0gdjkwY4ljfg,206
- instapaper_scraper/api.py,sha256=KvGxK2P35-3TsONPWcQTVBZT-q70p7hobeQ7E9PhXwA,11740
- instapaper_scraper/auth.py,sha256=DepQKDdVSm1dMFNIkpK_LIlaI0JllAYZb3_LJWhMe-g,7554
- instapaper_scraper/cli.py,sha256=Pxf1cAoLW9N-X1BP73HE0i2Qv7rPTaIyrPqG3cgdSTI,6860
- instapaper_scraper/exceptions.py,sha256=CptHoZe4NOhdjOoyXkZEMFgQC6oKtzjRljywwDEtsTg,134
- instapaper_scraper/output.py,sha256=0vQQ4AHZwFJg3O5O2zzvKUf0cOS1fTjXdivFqEHAun0,3081
- instapaper_scraper-1.0.0.post1.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
- instapaper_scraper-1.0.0.post1.dist-info/METADATA,sha256=rWkPxBIY-Vo2opYPJ6KiSGiGfmrklMkI-CM9HwOf9to,10353
- instapaper_scraper-1.0.0.post1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- instapaper_scraper-1.0.0.post1.dist-info/entry_points.txt,sha256=7AvRgN5fvtas_Duxdz-JPbDN6A1Lq2GaTfTSv54afxA,67
- instapaper_scraper-1.0.0.post1.dist-info/top_level.txt,sha256=kiU9nLkqPOVPLsP4QMHuBFjAmoIKfftYmGV05daLrcc,19
- instapaper_scraper-1.0.0.post1.dist-info/RECORD,,
- instapaper_scraper-1.0.0.post1.dist-info/RECORD,,