instapaper-scraper 1.0.0.post1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
instapaper_scraper/api.py CHANGED
@@ -1,12 +1,14 @@
1
1
  import os
2
2
  import logging
3
3
  import time
4
- from typing import List, Dict, Tuple, Optional
4
+ from typing import List, Dict, Tuple, Optional, Any
5
5
 
6
6
  import requests
7
7
  from bs4 import BeautifulSoup
8
+ from bs4.element import Tag
8
9
 
9
10
  from .exceptions import ScraperStructureChanged
11
+ from .constants import INSTAPAPER_BASE_URL, KEY_ID, KEY_TITLE, KEY_URL
10
12
 
11
13
 
12
14
  class InstapaperClient:
@@ -14,8 +16,6 @@ class InstapaperClient:
14
16
  A client for interacting with the Instapaper website to fetch articles.
15
17
  """
16
18
 
17
- BASE_URL = "https://www.instapaper.com"
18
-
19
19
  # Environment variable names
20
20
  ENV_MAX_RETRIES = "MAX_RETRIES"
21
21
  ENV_BACKOFF_FACTOR = "BACKOFF_FACTOR"
@@ -39,11 +39,6 @@ class InstapaperClient:
39
39
  URL_PATH_USER = "/u/"
40
40
  URL_PATH_FOLDER = "/u/folder/"
41
41
 
42
- # Dictionary keys for article data
43
- KEY_ID = "id"
44
- KEY_TITLE = "title"
45
- KEY_URL = "url"
46
-
47
42
  # HTTP status codes
48
43
  HTTP_TOO_MANY_REQUESTS = 429
49
44
  HTTP_SERVER_ERROR_START = 500
@@ -129,14 +124,28 @@ class InstapaperClient:
129
124
  soup = BeautifulSoup(response.text, self.HTML_PARSER)
130
125
 
131
126
  article_list = soup.find(id=self.ARTICLE_LIST_ID)
132
- if not article_list:
127
+ if not isinstance(article_list, Tag):
133
128
  raise ScraperStructureChanged(self.MSG_ARTICLE_LIST_NOT_FOUND)
134
129
 
135
130
  articles = article_list.find_all(self.ARTICLE_TAG)
136
- article_ids = [
137
- article[self.KEY_ID].replace(self.ARTICLE_ID_PREFIX, "")
138
- for article in articles
139
- ]
131
+ article_ids = []
132
+ for article in articles:
133
+ if not isinstance(article, Tag):
134
+ continue
135
+ article_id_val = article.get(KEY_ID)
136
+
137
+ # Ensure article_id_val is a string before calling replace
138
+ # If it's a list, take the first element. This is a pragmatic
139
+ # approach since 'id' attributes should ideally be unique strings.
140
+ if isinstance(article_id_val, list):
141
+ article_id_val = article_id_val[0] if article_id_val else None
142
+
143
+ if isinstance(article_id_val, str) and article_id_val.startswith(
144
+ self.ARTICLE_ID_PREFIX
145
+ ):
146
+ article_ids.append(
147
+ article_id_val.replace(self.ARTICLE_ID_PREFIX, "")
148
+ )
140
149
 
141
150
  data = self._parse_article_data(soup, article_ids, page)
142
151
  has_more = soup.find(class_=self.PAGINATE_OLDER_CLASS) is not None
@@ -204,19 +213,19 @@ class InstapaperClient:
204
213
  ) -> str:
205
214
  """Constructs the URL for the given page, considering folder mode."""
206
215
  if folder_info and folder_info.get("id") and folder_info.get("slug"):
207
- return f"{self.BASE_URL}{self.URL_PATH_FOLDER}{folder_info['id']}/{folder_info['slug']}/{page}"
208
- return f"{self.BASE_URL}{self.URL_PATH_USER}{page}"
216
+ return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_FOLDER}{folder_info['id']}/{folder_info['slug']}/{page}"
217
+ return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_USER}{page}"
209
218
 
210
219
  def _parse_article_data(
211
220
  self, soup: BeautifulSoup, article_ids: List[str], page: int
212
- ) -> List[Dict[str, str]]:
221
+ ) -> List[Dict[str, Any]]:
213
222
  """Parses the raw HTML to extract structured data for each article."""
214
223
  data = []
215
224
  for article_id in article_ids:
216
225
  article_id_full = f"{self.ARTICLE_ID_PREFIX}{article_id}"
217
226
  article_element = soup.find(id=article_id_full)
218
227
  try:
219
- if not article_element:
228
+ if not isinstance(article_element, Tag):
220
229
  raise AttributeError(
221
230
  self.MSG_ARTICLE_ELEMENT_NOT_FOUND.format(
222
231
  article_id_full=article_id_full
@@ -224,20 +233,23 @@ class InstapaperClient:
224
233
  )
225
234
 
226
235
  title_element = article_element.find(class_=self.ARTICLE_TITLE_CLASS)
227
- if not title_element:
236
+ if not isinstance(title_element, Tag):
228
237
  raise AttributeError(self.MSG_TITLE_ELEMENT_NOT_FOUND)
229
238
  title = title_element.get_text().strip()
230
239
 
231
- link_element = article_element.find(class_=self.TITLE_META_CLASS).find(
232
- "a"
233
- )
234
- if not link_element or "href" not in link_element.attrs:
240
+ meta_element = article_element.find(class_=self.TITLE_META_CLASS)
241
+ if not isinstance(meta_element, Tag):
242
+ raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
243
+
244
+ link_element = meta_element.find("a")
245
+ if (
246
+ not isinstance(link_element, Tag)
247
+ or "href" not in link_element.attrs
248
+ ):
235
249
  raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
236
250
  link = link_element["href"]
237
251
 
238
- data.append(
239
- {self.KEY_ID: article_id, self.KEY_TITLE: title, self.KEY_URL: link}
240
- )
252
+ data.append({KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link})
241
253
  except AttributeError as e:
242
254
  logging.warning(
243
255
  self.MSG_PARSE_ARTICLE_WARNING.format(
@@ -289,7 +301,7 @@ class InstapaperClient:
289
301
  )
290
302
  return False
291
303
 
292
- def _wait_for_retry(self, attempt: int, reason: str):
304
+ def _wait_for_retry(self, attempt: int, reason: str) -> None:
293
305
  """Calculates and waits for an exponential backoff period."""
294
306
  sleep_time = self.backoff_factor * (2**attempt)
295
307
  logging.warning(
instapaper_scraper/auth.py CHANGED
@@ -3,16 +3,38 @@ import getpass
3
3
  import logging
4
4
  import stat
5
5
  from pathlib import Path
6
- from typing import Union
6
+ from typing import Union, Optional
7
7
 
8
8
  from cryptography.fernet import Fernet
9
9
  import requests
10
10
 
11
+ from .constants import INSTAPAPER_BASE_URL
11
12
 
12
- # --- Constants ---
13
- class InstapaperConstants:
13
+
14
+ # --- Encryption Helper ---
15
+ def get_encryption_key(key_file: Union[str, Path]) -> bytes:
16
+ """
17
+ Loads the encryption key from a file or generates a new one.
18
+ Sets strict file permissions for the key file.
19
+ """
20
+ key_path = Path(key_file)
21
+ key_path.parent.mkdir(parents=True, exist_ok=True)
22
+
23
+ if key_path.exists():
24
+ with open(key_path, "rb") as f:
25
+ key = f.read()
26
+ else:
27
+ key = Fernet.generate_key()
28
+ with open(key_path, "wb") as f:
29
+ f.write(key)
30
+ # Set file permissions to 0600 (owner read/write only)
31
+ os.chmod(key_path, stat.S_IRUSR | stat.S_IWUSR)
32
+ logging.info(f"Generated new encryption key at {key_path}.")
33
+ return key
34
+
35
+
36
+ class InstapaperAuthenticator:
14
37
  # URLs
15
- INSTAPAPER_BASE_URL = "https://www.instapaper.com"
16
38
  INSTAPAPER_VERIFY_URL = f"{INSTAPAPER_BASE_URL}/u"
17
39
  INSTAPAPER_LOGIN_URL = f"{INSTAPAPER_BASE_URL}/user/login"
18
40
 
@@ -25,10 +47,6 @@ class InstapaperConstants:
25
47
  # Request related
26
48
  REQUEST_TIMEOUT = 10
27
49
 
28
- # App config
29
- APP_NAME = "instapaper-scraper"
30
- CONFIG_DIR = Path.home() / ".config" / APP_NAME
31
-
32
50
  # Prompts
33
51
  PROMPT_USERNAME = "Enter your Instapaper username: "
34
52
  PROMPT_PASSWORD = "Enter your Instapaper password: "
@@ -44,40 +62,17 @@ class InstapaperConstants:
44
62
  LOG_NO_KNOWN_COOKIE_TO_SAVE = "Could not find a known session cookie to save."
45
63
  LOG_SAVED_SESSION = "Saved encrypted session to {session_file}."
46
64
 
47
-
48
- # --- Encryption Helper ---
49
- def get_encryption_key(key_file: Union[str, Path]) -> bytes:
50
- """
51
- Loads the encryption key from a file or generates a new one.
52
- Sets strict file permissions for the key file.
53
- """
54
- key_path = Path(key_file)
55
- key_path.parent.mkdir(parents=True, exist_ok=True)
56
-
57
- if key_path.exists():
58
- with open(key_path, "rb") as f:
59
- key = f.read()
60
- else:
61
- key = Fernet.generate_key()
62
- with open(key_path, "wb") as f:
63
- f.write(key)
64
- # Set file permissions to 0600 (owner read/write only)
65
- os.chmod(key_path, stat.S_IRUSR | stat.S_IWUSR)
66
- logging.info(f"Generated new encryption key at {key_path}.")
67
- return key
68
-
69
-
70
- class InstapaperAuthenticator:
71
65
  def __init__(
72
66
  self,
73
67
  session: requests.Session,
74
68
  session_file: Union[str, Path],
75
69
  key_file: Union[str, Path],
76
- username: str = None,
77
- password: str = None,
70
+ username: Optional[str] = None,
71
+ password: Optional[str] = None,
78
72
  ):
79
73
  self.session = session
80
74
  self.session_file = Path(session_file)
75
+ self.key_file = Path(key_file)
81
76
  self.key = get_encryption_key(key_file)
82
77
  self.fernet = Fernet(self.key)
83
78
  self.username = username
@@ -116,24 +111,22 @@ class InstapaperAuthenticator:
116
111
  if not line:
117
112
  continue
118
113
  parts = line.split(":", 2)
119
- if len(parts) == InstapaperConstants.COOKIE_PART_COUNT:
114
+ if len(parts) == self.COOKIE_PART_COUNT:
120
115
  name, value, domain = parts
121
116
  self.session.cookies.set(name, value, domain=domain)
122
117
 
123
118
  if self.session.cookies and self._verify_session():
124
- logging.info(InstapaperConstants.LOG_SESSION_LOAD_SUCCESS)
119
+ logging.info(self.LOG_SESSION_LOAD_SUCCESS)
125
120
  return True
126
121
  else:
127
- logging.warning(InstapaperConstants.LOG_SESSION_LOAD_FAILED)
122
+ logging.warning(self.LOG_SESSION_LOAD_FAILED)
128
123
  # Clear cookies if verification fails
129
124
  self.session.cookies.clear()
130
125
  return False
131
126
 
132
127
  except Exception as e:
133
128
  logging.warning(
134
- InstapaperConstants.LOG_SESSION_LOAD_ERROR.format(
135
- session_file=self.session_file, e=e
136
- )
129
+ self.LOG_SESSION_LOAD_ERROR.format(session_file=self.session_file, e=e)
137
130
  )
138
131
  self.session_file.unlink(missing_ok=True)
139
132
  return False
@@ -142,57 +135,56 @@ class InstapaperAuthenticator:
142
135
  """Checks if the current session is valid by making a request."""
143
136
  try:
144
137
  verify_response = self.session.get(
145
- InstapaperConstants.INSTAPAPER_VERIFY_URL,
146
- timeout=InstapaperConstants.REQUEST_TIMEOUT,
138
+ self.INSTAPAPER_VERIFY_URL,
139
+ timeout=self.REQUEST_TIMEOUT,
147
140
  )
148
141
  verify_response.raise_for_status()
149
- return InstapaperConstants.LOGIN_FORM_IDENTIFIER not in verify_response.text
142
+ return self.LOGIN_FORM_IDENTIFIER not in verify_response.text
150
143
  except requests.RequestException as e:
151
- logging.error(InstapaperConstants.LOG_SESSION_VERIFY_FAILED.format(e=e))
144
+ logging.error(self.LOG_SESSION_VERIFY_FAILED.format(e=e))
152
145
  return False
153
146
 
154
147
  def _login_with_credentials(self) -> bool:
155
148
  """Logs in using username/password from arguments or user prompt."""
156
- logging.info(InstapaperConstants.LOG_NO_VALID_SESSION)
149
+ logging.info(self.LOG_NO_VALID_SESSION)
157
150
  username = self.username
158
151
  password = self.password
159
152
 
160
153
  if not username or not password:
161
- username = input(InstapaperConstants.PROMPT_USERNAME)
162
- password = getpass.getpass(InstapaperConstants.PROMPT_PASSWORD)
154
+ username = input(self.PROMPT_USERNAME)
155
+ password = getpass.getpass(self.PROMPT_PASSWORD)
163
156
  elif self.username:
164
157
  logging.info(
165
158
  f"Using username '{self.username}' from command-line arguments."
166
159
  )
167
160
 
168
161
  login_response = self.session.post(
169
- InstapaperConstants.INSTAPAPER_LOGIN_URL,
162
+ self.INSTAPAPER_LOGIN_URL,
170
163
  data={"username": username, "password": password, "keep_logged_in": "yes"},
171
- timeout=InstapaperConstants.REQUEST_TIMEOUT,
164
+ timeout=self.REQUEST_TIMEOUT,
172
165
  )
173
166
 
174
- required_cookies = InstapaperConstants.REQUIRED_COOKIES
167
+ required_cookies = self.REQUIRED_COOKIES
175
168
  found_cookies = {c.name for c in self.session.cookies}
176
169
 
177
- if (
178
- InstapaperConstants.LOGIN_SUCCESS_PATH in login_response.url
179
- and required_cookies.issubset(found_cookies)
170
+ if self.LOGIN_SUCCESS_PATH in login_response.url and required_cookies.issubset(
171
+ found_cookies
180
172
  ):
181
- logging.info(InstapaperConstants.LOG_LOGIN_SUCCESS)
173
+ logging.info(self.LOG_LOGIN_SUCCESS)
182
174
  return True
183
175
  else:
184
- logging.error(InstapaperConstants.LOG_LOGIN_FAILED)
176
+ logging.error(self.LOG_LOGIN_FAILED)
185
177
  return False
186
178
 
187
- def _save_session(self):
179
+ def _save_session(self) -> None:
188
180
  """Saves the current session cookies to an encrypted file."""
189
- required_cookies = InstapaperConstants.REQUIRED_COOKIES
181
+ required_cookies = self.REQUIRED_COOKIES
190
182
  cookies_to_save = [
191
183
  c for c in self.session.cookies if c.name in required_cookies
192
184
  ]
193
185
 
194
186
  if not cookies_to_save:
195
- logging.warning(InstapaperConstants.LOG_NO_KNOWN_COOKIE_TO_SAVE)
187
+ logging.warning(self.LOG_NO_KNOWN_COOKIE_TO_SAVE)
196
188
  return
197
189
 
198
190
  cookie_data = ""
@@ -206,6 +198,4 @@ class InstapaperAuthenticator:
206
198
  f.write(encrypted_data)
207
199
 
208
200
  os.chmod(self.session_file, stat.S_IRUSR | stat.S_IWUSR)
209
- logging.info(
210
- InstapaperConstants.LOG_SAVED_SESSION.format(session_file=self.session_file)
211
- )
201
+ logging.info(self.LOG_SAVED_SESSION.format(session_file=self.session_file))
instapaper_scraper/cli.py CHANGED
@@ -3,7 +3,7 @@ import logging
3
3
  import argparse
4
4
  import requests
5
5
  from pathlib import Path
6
- from typing import Union
6
+ from typing import Union, List, Dict, Any, Optional, cast
7
7
 
8
8
  if sys.version_info >= (3, 11):
9
9
  import tomllib
@@ -15,6 +15,13 @@ from .auth import InstapaperAuthenticator
15
15
  from .api import InstapaperClient
16
16
  from .output import save_articles
17
17
  from .exceptions import ScraperStructureChanged
18
+ from .constants import CONFIG_DIR
19
+
20
+ # --- Constants ---
21
+ CONFIG_FILENAME = "config.toml"
22
+ DEFAULT_SESSION_FILENAME = ".instapaper_session"
23
+ DEFAULT_KEY_FILENAME = ".session_key"
24
+ DEFAULT_OUTPUT_FILENAME = "output/bookmarks.{ext}"
18
25
 
19
26
 
20
27
  def _resolve_path(
@@ -32,19 +39,18 @@ def _resolve_path(
32
39
  return user_dir_filename
33
40
 
34
41
 
35
- def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
42
+ def load_config(config_path_str: Union[str, None] = None) -> Optional[Dict[str, Any]]:
36
43
  """
37
44
  Loads configuration from a TOML file.
38
45
  It checks the provided path, then config.toml in the project root,
39
46
  and finally ~/.config/instapaper-scraper/config.toml.
40
47
  """
41
- app_name = "instapaper-scraper"
42
48
  default_paths = [
43
- Path("config.toml"),
44
- Path.home() / ".config" / app_name / "config.toml",
49
+ Path(CONFIG_FILENAME),
50
+ CONFIG_DIR / CONFIG_FILENAME,
45
51
  ]
46
52
 
47
- paths_to_check = []
53
+ paths_to_check: List[Path] = []
48
54
  if config_path_str:
49
55
  paths_to_check.insert(0, Path(config_path_str).expanduser())
50
56
  paths_to_check.extend(default_paths)
@@ -54,7 +60,7 @@ def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
54
60
  try:
55
61
  with open(path, "rb") as f:
56
62
  logging.info(f"Loading configuration from {path}")
57
- return tomllib.load(f)
63
+ return cast(Dict[str, Any], tomllib.load(f))
58
64
  except tomllib.TOMLDecodeError as e:
59
65
  logging.error(f"Error decoding TOML file at {path}: {e}")
60
66
  return None
@@ -62,7 +68,7 @@ def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
62
68
  return None
63
69
 
64
70
 
65
- def main():
71
+ def main() -> None:
66
72
  """
67
73
  Main entry point for the Instapaper scraper CLI.
68
74
  """
@@ -95,6 +101,11 @@ def main():
95
101
  parser.add_argument("--key-file", help="Path to the session key file.")
96
102
  parser.add_argument("--username", help="Instapaper username.")
97
103
  parser.add_argument("--password", help="Instapaper password.")
104
+ parser.add_argument(
105
+ "--add-instapaper-url",
106
+ action="store_true",
107
+ help="Add an 'instapaper_url' column to the output with the full Instapaper read URL.",
108
+ )
98
109
  parser.add_argument(
99
110
  "--limit",
100
111
  type=int,
@@ -133,7 +144,7 @@ def main():
133
144
  print(" 0: none (non-folder mode)")
134
145
  for i, folder in enumerate(folders):
135
146
  display_name = folder.get("key") or folder.get("slug") or folder.get("id")
136
- print(f" {i+1}: {display_name}")
147
+ print(f" {i + 1}: {display_name}")
137
148
 
138
149
  try:
139
150
  choice = int(input("Select a folder (enter a number): "))
@@ -153,18 +164,21 @@ def main():
153
164
  output_filename = config["output_filename"]
154
165
  else:
155
166
  ext = "db" if args.format == "sqlite" else args.format
156
- output_filename = f"output/bookmarks.{ext}"
167
+ output_filename = DEFAULT_OUTPUT_FILENAME.format(ext=ext)
157
168
 
158
169
  session = requests.Session()
159
170
 
160
171
  # Resolve session and key file paths
161
- app_name = "instapaper-scraper"
162
- user_config_dir = Path.home() / ".config" / app_name
163
-
164
172
  session_file = _resolve_path(
165
- args.session_file, ".instapaper_session", user_config_dir / ".instapaper_session"
173
+ args.session_file,
174
+ DEFAULT_SESSION_FILENAME,
175
+ CONFIG_DIR / DEFAULT_SESSION_FILENAME,
176
+ )
177
+ key_file = _resolve_path(
178
+ args.key_file,
179
+ DEFAULT_KEY_FILENAME,
180
+ CONFIG_DIR / DEFAULT_KEY_FILENAME,
166
181
  )
167
- key_file = _resolve_path(args.key_file, ".session_key", user_config_dir / ".session_key")
168
182
 
169
183
  # 1. Authenticate
170
184
  authenticator = InstapaperAuthenticator(
@@ -195,7 +209,17 @@ def main():
195
209
  sys.exit(1)
196
210
 
197
211
  # 3. Save Articles
198
- save_articles(all_articles, args.format, output_filename)
212
+ try:
213
+ save_articles(
214
+ all_articles,
215
+ args.format,
216
+ output_filename,
217
+ add_instapaper_url=args.add_instapaper_url,
218
+ )
219
+ logging.info("Articles scraped and saved successfully.")
220
+ except Exception as e:
221
+ logging.error(f"An unexpected error occurred during saving: {e}")
222
+ sys.exit(1)
199
223
 
200
224
 
201
225
  if __name__ == "__main__":
instapaper_scraper/constants.py ADDED
@@ -0,0 +1,17 @@
1
+ # Shared constants used across the instapaper-scraper project.
2
+ from pathlib import Path
3
+
4
+ # --- General ---
5
+ APP_NAME = "instapaper-scraper"
6
+
7
+ # --- URLS ---
8
+ INSTAPAPER_BASE_URL = "https://www.instapaper.com"
9
+ INSTAPAPER_READ_URL = f"{INSTAPAPER_BASE_URL}/read/"
10
+
11
+ # --- Paths ---
12
+ CONFIG_DIR = Path.home() / ".config" / APP_NAME
13
+
14
+ # --- Article Data Keys ---
15
+ KEY_ID = "id"
16
+ KEY_TITLE = "title"
17
+ KEY_URL = "url"