instapaper_scraper-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
+ import importlib.metadata
+
+ try:
+     __version__ = importlib.metadata.version("instapaper-scraper")
+ except importlib.metadata.PackageNotFoundError:
+     # package is not installed
+     __version__ = "unknown"
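
The importlib.metadata lookup above resolves __version__ from the installed distribution's metadata, falling back to "unknown" when the code runs from a source checkout that was never pip-installed. A quick interactive check (hypothetical session; the import name instapaper_scraper is an assumption, since the diff does not show the package directory):

>>> import instapaper_scraper
>>> instapaper_scraper.__version__  # '1.0.0' if installed, 'unknown' otherwise
'1.0.0'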
@@ -0,0 +1,303 @@
+ import os
+ import logging
+ import time
+ from typing import List, Dict, Tuple, Optional
+
+ import requests
+ from bs4 import BeautifulSoup
+
+ from .exceptions import ScraperStructureChanged
+
+
+ class InstapaperClient:
+     """
+     A client for interacting with the Instapaper website to fetch articles.
+     """
+
+     BASE_URL = "https://www.instapaper.com"
+
+     # Environment variable names
+     ENV_MAX_RETRIES = "MAX_RETRIES"
+     ENV_BACKOFF_FACTOR = "BACKOFF_FACTOR"
+
+     # Default values
+     DEFAULT_MAX_RETRIES = 3
+     DEFAULT_BACKOFF_FACTOR = 1.0
+     DEFAULT_REQUEST_TIMEOUT = 30
+     DEFAULT_PAGE_START = 1
+
+     # HTML parsing constants
+     HTML_PARSER = "html.parser"
+     ARTICLE_LIST_ID = "article_list"
+     ARTICLE_TAG = "article"
+     ARTICLE_ID_PREFIX = "article_"
+     PAGINATE_OLDER_CLASS = "paginate_older"
+     ARTICLE_TITLE_CLASS = "article_title"
+     TITLE_META_CLASS = "title_meta"
+
+     # URL paths
+     URL_PATH_USER = "/u/"
+     URL_PATH_FOLDER = "/u/folder/"
+
+     # Dictionary keys for article data
+     KEY_ID = "id"
+     KEY_TITLE = "title"
+     KEY_URL = "url"
+
+     # HTTP status codes
+     HTTP_TOO_MANY_REQUESTS = 429
+     HTTP_SERVER_ERROR_START = 500
+     HTTP_SERVER_ERROR_END = 600
+
+     # Logging and error messages
+     MSG_ARTICLE_LIST_NOT_FOUND = "Could not find article list ('#article_list')."
+     MSG_SCRAPING_PAGE = "Scraping page {page}..."
+     MSG_ARTICLE_ELEMENT_NOT_FOUND = "Article element '{article_id_full}' not found."
+     MSG_TITLE_ELEMENT_NOT_FOUND = "Title element not found"
+     MSG_LINK_ELEMENT_NOT_FOUND = "Link element or href not found"
+     MSG_PARSE_ARTICLE_WARNING = (
+         "Could not parse article with id {article_id} on page {page}. Details: {e}"
+     )
+     MSG_RATE_LIMITED_RETRY = (
+         "Rate limited ({status_code}). Retrying after {wait_time} seconds."
+     )
+     MSG_RATE_LIMITED_REASON = "Rate limited ({status_code})"
+     MSG_REQUEST_FAILED_STATUS_REASON = "Request failed with status {status_code}"
+     MSG_REQUEST_FAILED_UNRECOVERABLE = (
+         "Request failed with unrecoverable status code {status_code}."
+     )
+     MSG_NETWORK_ERROR_REASON = "Network error ({error_type})"
+     MSG_SCRAPING_FAILED_STRUCTURE_CHANGE = (
+         "Scraping failed due to HTML structure change: {e}"
+     )
+     MSG_ALL_RETRIES_FAILED = "All {max_retries} retries failed."
+     MSG_SCRAPING_FAILED_UNKNOWN = (
+         "Scraping failed after multiple retries for an unknown reason."
+     )
+     MSG_RETRY_ATTEMPT = "{reason} (attempt {attempt_num}/{max_retries}). Retrying in {sleep_time:.2f} seconds."
+
+     def __init__(self, session: requests.Session):
+         """
+         Initializes the client with a requests Session.
+         Args:
+             session: A requests.Session object, presumably authenticated.
+         """
+         self.session = session
+         try:
+             self.max_retries = int(
+                 os.getenv(self.ENV_MAX_RETRIES, str(self.DEFAULT_MAX_RETRIES))
+             )
+         except ValueError:
+             logging.warning(
+                 f"Invalid value for {self.ENV_MAX_RETRIES}, using default {self.DEFAULT_MAX_RETRIES}"
+             )
+             self.max_retries = self.DEFAULT_MAX_RETRIES
+
+         try:
+             self.backoff_factor = float(
+                 os.getenv(self.ENV_BACKOFF_FACTOR, str(self.DEFAULT_BACKOFF_FACTOR))
+             )
+         except ValueError:
+             logging.warning(
+                 f"Invalid value for {self.ENV_BACKOFF_FACTOR}, using default {self.DEFAULT_BACKOFF_FACTOR}"
+             )
+             self.backoff_factor = self.DEFAULT_BACKOFF_FACTOR
+
+     def get_articles(
+         self,
+         page: int = DEFAULT_PAGE_START,
+         folder_info: Optional[Dict[str, str]] = None,
+     ) -> Tuple[List[Dict[str, str]], bool]:
+         """
+         Fetches a single page of articles and determines if there are more pages.
+         Args:
+             page: The page number to fetch.
+             folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
+         Returns:
+             A tuple containing:
+                 - A list of article data (dictionaries with id, title, url).
+                 - A boolean indicating if there is a next page.
+         """
+         url = self._get_page_url(page, folder_info)
+         last_exception: Optional[Exception] = None
+
+         for attempt in range(self.max_retries):
+             try:
+                 response = self.session.get(url, timeout=self.DEFAULT_REQUEST_TIMEOUT)
+                 response.raise_for_status()
+
+                 soup = BeautifulSoup(response.text, self.HTML_PARSER)
+
+                 article_list = soup.find(id=self.ARTICLE_LIST_ID)
+                 if not article_list:
+                     raise ScraperStructureChanged(self.MSG_ARTICLE_LIST_NOT_FOUND)
+
+                 articles = article_list.find_all(self.ARTICLE_TAG)
+                 article_ids = [
+                     article[self.KEY_ID].replace(self.ARTICLE_ID_PREFIX, "")
+                     for article in articles
+                 ]
+
+                 data = self._parse_article_data(soup, article_ids, page)
+                 has_more = soup.find(class_=self.PAGINATE_OLDER_CLASS) is not None
+
+                 return data, has_more
+
+             except requests.exceptions.HTTPError as e:
+                 last_exception = e
+                 if self._handle_http_error(e, attempt):
+                     continue  # Retry if the handler decided to wait
+                 else:
+                     raise e  # Re-raise if the error is unrecoverable
+
+             except (
+                 requests.exceptions.ConnectionError,
+                 requests.exceptions.Timeout,
+             ) as e:
+                 last_exception = e
+                 self._wait_for_retry(
+                     attempt,
+                     self.MSG_NETWORK_ERROR_REASON.format(error_type=type(e).__name__),
+                 )
+
+             except ScraperStructureChanged as e:
+                 logging.error(self.MSG_SCRAPING_FAILED_STRUCTURE_CHANGE.format(e=e))
+                 raise e
+             except Exception as e:
+                 last_exception = e
+                 self._wait_for_retry(
+                     attempt,
+                     self.MSG_SCRAPING_FAILED_UNKNOWN,
+                 )
+
+         logging.error(self.MSG_ALL_RETRIES_FAILED.format(max_retries=self.max_retries))
+         if last_exception:
+             raise last_exception
+         raise Exception(self.MSG_SCRAPING_FAILED_UNKNOWN)
+
+     def get_all_articles(
+         self, limit: Optional[int] = None, folder_info: Optional[Dict[str, str]] = None
+     ) -> List[Dict[str, str]]:
+         """
+         Iterates through pages and fetches articles up to a specified limit.
+         Args:
+             limit: The maximum number of pages to scrape. If None, scrapes all pages.
+             folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
+         """
+         all_articles = []
+         page = self.DEFAULT_PAGE_START
+         has_more = True
+         while has_more:
+             if limit is not None and page > limit:
+                 logging.info(f"Reached page limit of {limit}.")
+                 break
+
+             logging.info(self.MSG_SCRAPING_PAGE.format(page=page))
+             data, has_more = self.get_articles(page=page, folder_info=folder_info)
+             if data:
+                 all_articles.extend(data)
+             page += 1
+         return all_articles
+
+     def _get_page_url(
+         self, page: int, folder_info: Optional[Dict[str, str]] = None
+     ) -> str:
+         """Constructs the URL for the given page, considering folder mode."""
+         if folder_info and folder_info.get("id") and folder_info.get("slug"):
+             return f"{self.BASE_URL}{self.URL_PATH_FOLDER}{folder_info['id']}/{folder_info['slug']}/{page}"
+         return f"{self.BASE_URL}{self.URL_PATH_USER}{page}"
+
+     def _parse_article_data(
+         self, soup: BeautifulSoup, article_ids: List[str], page: int
+     ) -> List[Dict[str, str]]:
+         """Parses the raw HTML to extract structured data for each article."""
+         data = []
+         for article_id in article_ids:
+             article_id_full = f"{self.ARTICLE_ID_PREFIX}{article_id}"
+             article_element = soup.find(id=article_id_full)
+             try:
+                 if not article_element:
+                     raise AttributeError(
+                         self.MSG_ARTICLE_ELEMENT_NOT_FOUND.format(
+                             article_id_full=article_id_full
+                         )
+                     )
+
+                 title_element = article_element.find(class_=self.ARTICLE_TITLE_CLASS)
+                 if not title_element:
+                     raise AttributeError(self.MSG_TITLE_ELEMENT_NOT_FOUND)
+                 title = title_element.get_text().strip()
+
+                 link_element = article_element.find(class_=self.TITLE_META_CLASS).find(
+                     "a"
+                 )
+                 if not link_element or "href" not in link_element.attrs:
+                     raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
+                 link = link_element["href"]
+
+                 data.append(
+                     {self.KEY_ID: article_id, self.KEY_TITLE: title, self.KEY_URL: link}
+                 )
+             except AttributeError as e:
+                 logging.warning(
+                     self.MSG_PARSE_ARTICLE_WARNING.format(
+                         article_id=article_id, page=page, e=e
+                     )
+                 )
+                 continue
+         return data
+
+     def _handle_http_error(
+         self, e: requests.exceptions.HTTPError, attempt: int
+     ) -> bool:
+         """Handles HTTP errors, returns True if a retry should be attempted."""
+         status_code = e.response.status_code
+         if status_code == self.HTTP_TOO_MANY_REQUESTS:  # Too Many Requests
+             wait_time_str = e.response.headers.get("Retry-After")
+             try:
+                 wait_time = int(wait_time_str) if wait_time_str else 0
+                 if wait_time > 0:
+                     logging.warning(
+                         self.MSG_RATE_LIMITED_RETRY.format(
+                             status_code=status_code, wait_time=wait_time
+                         )
+                     )
+                     time.sleep(wait_time)
+                     return True
+             except (ValueError, TypeError):
+                 pass  # Fall back to exponential backoff
+             self._wait_for_retry(
+                 attempt, self.MSG_RATE_LIMITED_REASON.format(status_code=status_code)
+             )
+             return True
+         elif (
+             self.HTTP_SERVER_ERROR_START <= status_code < self.HTTP_SERVER_ERROR_END
+         ):  # Server-side errors
+             self._wait_for_retry(
+                 attempt,
+                 self.MSG_REQUEST_FAILED_STATUS_REASON.format(status_code=status_code),
+             )
+             return True
+         elif status_code == 404:
+             logging.error(
+                 f"Error 404: Not Found. This might indicate an invalid folder ID or slug. URL: {e.response.url}"
+             )
+             return False  # Do not retry, unrecoverable
+         else:  # Other client-side errors (4xx) are not worth retrying
+             logging.error(
+                 self.MSG_REQUEST_FAILED_UNRECOVERABLE.format(status_code=status_code)
+             )
+             return False
+
+     def _wait_for_retry(self, attempt: int, reason: str):
+         """Calculates and waits for an exponential backoff period."""
+         sleep_time = self.backoff_factor * (2**attempt)
+         logging.warning(
+             self.MSG_RETRY_ATTEMPT.format(
+                 reason=reason,
+                 attempt_num=attempt + 1,
+                 max_retries=self.max_retries,
+                 sleep_time=sleep_time,
+             )
+         )
+         time.sleep(sleep_time)
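
The retry machinery above combines three strategies: honoring a Retry-After header on 429 responses, exponential backoff (sleep = backoff_factor * 2**attempt, i.e. 1s, 2s, 4s at the defaults) for server errors and network failures, and an immediate raise for unrecoverable 4xx statuses. A minimal usage sketch, assuming an already-authenticated requests.Session; the module path and the folder values are assumptions, not confirmed by this diff:

import requests
from instapaper_scraper.client import InstapaperClient  # module path is an assumption

session = requests.Session()  # must already carry Instapaper login cookies
client = InstapaperClient(session)

# Scrape at most 5 pages of the default (unread) list.
articles = client.get_all_articles(limit=5)

# Or target a folder; the id and slug here are purely illustrative.
articles = client.get_all_articles(folder_info={"id": "1234567", "slug": "reading"})

for article in articles:
    print(article["id"], article["title"], article["url"])

Note that MAX_RETRIES and BACKOFF_FACTOR are read from the environment in __init__, so they must be set before the client is constructed.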
@@ -0,0 +1,211 @@
+ import os
+ import getpass
+ import logging
+ import stat
+ from pathlib import Path
+ from typing import Optional, Union
+
+ from cryptography.fernet import Fernet
+ import requests
+
+
+ # --- Constants ---
+ class InstapaperConstants:
+     # URLs
+     INSTAPAPER_BASE_URL = "https://www.instapaper.com"
+     INSTAPAPER_VERIFY_URL = f"{INSTAPAPER_BASE_URL}/u"
+     INSTAPAPER_LOGIN_URL = f"{INSTAPAPER_BASE_URL}/user/login"
+
+     # Session/Cookie related
+     COOKIE_PART_COUNT = 3
+     REQUIRED_COOKIES = {"pfu", "pfp", "pfh"}
+     LOGIN_FORM_IDENTIFIER = "login_form"
+     LOGIN_SUCCESS_PATH = "/u"
+
+     # Request related
+     REQUEST_TIMEOUT = 10
+
+     # App config
+     APP_NAME = "instapaper-scraper"
+     CONFIG_DIR = Path.home() / ".config" / APP_NAME
+
+     # Prompts
+     PROMPT_USERNAME = "Enter your Instapaper username: "
+     PROMPT_PASSWORD = "Enter your Instapaper password: "
+
+     # Log messages
+     LOG_NO_VALID_SESSION = "No valid session found. Please log in."
+     LOG_LOGIN_SUCCESS = "Login successful."
+     LOG_LOGIN_FAILED = "Login failed. Please check your credentials."
+     LOG_SESSION_LOAD_SUCCESS = "Successfully logged in using the loaded session data."
+     LOG_SESSION_LOAD_FAILED = "Session loaded but verification failed."
+     LOG_SESSION_LOAD_ERROR = "Could not load session from {session_file}: {e}. A new session will be created."
+     LOG_SESSION_VERIFY_FAILED = "Session verification request failed: {e}"
+     LOG_NO_KNOWN_COOKIE_TO_SAVE = "Could not find a known session cookie to save."
+     LOG_SAVED_SESSION = "Saved encrypted session to {session_file}."
+
+
+ # --- Encryption Helper ---
+ def get_encryption_key(key_file: Union[str, Path]) -> bytes:
+     """
+     Loads the encryption key from a file or generates a new one.
+     Sets strict file permissions for the key file.
+     """
+     key_path = Path(key_file)
+     key_path.parent.mkdir(parents=True, exist_ok=True)
+
+     if key_path.exists():
+         with open(key_path, "rb") as f:
+             key = f.read()
+     else:
+         key = Fernet.generate_key()
+         with open(key_path, "wb") as f:
+             f.write(key)
+         # Set file permissions to 0600 (owner read/write only)
+         os.chmod(key_path, stat.S_IRUSR | stat.S_IWUSR)
+         logging.info(f"Generated new encryption key at {key_path}.")
+     return key
+
+
+ class InstapaperAuthenticator:
+     def __init__(
+         self,
+         session: requests.Session,
+         session_file: Union[str, Path],
+         key_file: Union[str, Path],
+         username: Optional[str] = None,
+         password: Optional[str] = None,
+     ):
+         self.session = session
+         self.session_file = Path(session_file)
+         self.key = get_encryption_key(key_file)
+         self.fernet = Fernet(self.key)
+         self.username = username
+         self.password = password
+
+     def login(self) -> bool:
+         """
+         Handles the complete login process:
+         1. Tries to load an existing session.
+         2. If that fails, prompts for credentials and logs in.
+         3. Saves the new session.
+         """
+         if self._load_session():
+             return True
+
+         if self._login_with_credentials():
+             self._save_session()
+             return True
+
+         return False
+
+     def _load_session(self) -> bool:
+         """Tries to load and verify a session from the session file."""
+         if not self.session_file.exists():
+             return False
+
+         logging.info(f"Loading encrypted session from {self.session_file}...")
+         try:
+             with open(self.session_file, "rb") as f:
+                 encrypted_data = f.read()
+
+             decrypted_data = self.fernet.decrypt(encrypted_data).decode("utf-8")
+
+             for line in decrypted_data.splitlines():
+                 line = line.strip()
+                 if not line:
+                     continue
+                 parts = line.split(":", 2)
+                 if len(parts) == InstapaperConstants.COOKIE_PART_COUNT:
+                     name, value, domain = parts
+                     self.session.cookies.set(name, value, domain=domain)
+
+             if self.session.cookies and self._verify_session():
+                 logging.info(InstapaperConstants.LOG_SESSION_LOAD_SUCCESS)
+                 return True
+             else:
+                 logging.warning(InstapaperConstants.LOG_SESSION_LOAD_FAILED)
+                 # Clear cookies if verification fails
+                 self.session.cookies.clear()
+                 return False
+
+         except Exception as e:
+             logging.warning(
+                 InstapaperConstants.LOG_SESSION_LOAD_ERROR.format(
+                     session_file=self.session_file, e=e
+                 )
+             )
+             self.session_file.unlink(missing_ok=True)
+             return False
+
+     def _verify_session(self) -> bool:
+         """Checks if the current session is valid by making a request."""
+         try:
+             verify_response = self.session.get(
+                 InstapaperConstants.INSTAPAPER_VERIFY_URL,
+                 timeout=InstapaperConstants.REQUEST_TIMEOUT,
+             )
+             verify_response.raise_for_status()
+             return InstapaperConstants.LOGIN_FORM_IDENTIFIER not in verify_response.text
+         except requests.RequestException as e:
+             logging.error(InstapaperConstants.LOG_SESSION_VERIFY_FAILED.format(e=e))
+             return False
+
+     def _login_with_credentials(self) -> bool:
+         """Logs in using username/password from arguments or user prompt."""
+         logging.info(InstapaperConstants.LOG_NO_VALID_SESSION)
+         username = self.username
+         password = self.password
+
+         if not username or not password:
+             username = input(InstapaperConstants.PROMPT_USERNAME)
+             password = getpass.getpass(InstapaperConstants.PROMPT_PASSWORD)
+         elif self.username:
+             logging.info(
+                 f"Using username '{self.username}' from command-line arguments."
+             )
+
+         login_response = self.session.post(
+             InstapaperConstants.INSTAPAPER_LOGIN_URL,
+             data={"username": username, "password": password, "keep_logged_in": "yes"},
+             timeout=InstapaperConstants.REQUEST_TIMEOUT,
+         )
+
+         required_cookies = InstapaperConstants.REQUIRED_COOKIES
+         found_cookies = {c.name for c in self.session.cookies}
+
+         if (
+             InstapaperConstants.LOGIN_SUCCESS_PATH in login_response.url
+             and required_cookies.issubset(found_cookies)
+         ):
+             logging.info(InstapaperConstants.LOG_LOGIN_SUCCESS)
+             return True
+         else:
+             logging.error(InstapaperConstants.LOG_LOGIN_FAILED)
+             return False
+
+     def _save_session(self):
+         """Saves the current session cookies to an encrypted file."""
+         required_cookies = InstapaperConstants.REQUIRED_COOKIES
+         cookies_to_save = [
+             c for c in self.session.cookies if c.name in required_cookies
+         ]
+
+         if not cookies_to_save:
+             logging.warning(InstapaperConstants.LOG_NO_KNOWN_COOKIE_TO_SAVE)
+             return
+
+         cookie_data = ""
+         for cookie in cookies_to_save:
+             cookie_data += f"{cookie.name}:{cookie.value}:{cookie.domain}\n"
+
+         encrypted_data = self.fernet.encrypt(cookie_data.encode("utf-8"))
+
+         self.session_file.parent.mkdir(parents=True, exist_ok=True)
+         with open(self.session_file, "wb") as f:
+             f.write(encrypted_data)
+
+         os.chmod(self.session_file, stat.S_IRUSR | stat.S_IWUSR)
+         logging.info(
+             InstapaperConstants.LOG_SAVED_SESSION.format(session_file=self.session_file)
+         )
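
End to end, the authenticator produces the logged-in session that InstapaperClient consumes. A minimal wiring sketch, assuming module paths instapaper_scraper.auth and instapaper_scraper.client and illustrative file names under CONFIG_DIR (none of these names are confirmed by the diff):

import requests
from instapaper_scraper.auth import InstapaperAuthenticator, InstapaperConstants
from instapaper_scraper.client import InstapaperClient

session = requests.Session()
auth = InstapaperAuthenticator(
    session,
    session_file=InstapaperConstants.CONFIG_DIR / "session.enc",  # illustrative name
    key_file=InstapaperConstants.CONFIG_DIR / "key.bin",  # illustrative name
)

if auth.login():  # reuses a saved encrypted session, else prompts for credentials
    client = InstapaperClient(session)
    articles = client.get_all_articles(limit=1)
    print(f"Fetched {len(articles)} articles from the first page.")

Because _save_session persists only the pfu/pfp/pfh cookies and chmods both the key and session files to 0600, the saved state is per-user and survives across runs.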