instapaper-scraper 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- instapaper_scraper/api.py +38 -26
- instapaper_scraper/auth.py +51 -61
- instapaper_scraper/cli.py +40 -16
- instapaper_scraper/constants.py +17 -0
- instapaper_scraper/output.py +108 -35
- instapaper_scraper-1.1.0.dist-info/METADATA +352 -0
- instapaper_scraper-1.1.0.dist-info/RECORD +13 -0
- instapaper_scraper-1.0.0.dist-info/METADATA +0 -280
- instapaper_scraper-1.0.0.dist-info/RECORD +0 -12
- {instapaper_scraper-1.0.0.dist-info → instapaper_scraper-1.1.0.dist-info}/WHEEL +0 -0
- {instapaper_scraper-1.0.0.dist-info → instapaper_scraper-1.1.0.dist-info}/entry_points.txt +0 -0
- {instapaper_scraper-1.0.0.dist-info → instapaper_scraper-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {instapaper_scraper-1.0.0.dist-info → instapaper_scraper-1.1.0.dist-info}/top_level.txt +0 -0
instapaper_scraper/api.py
CHANGED
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import logging
|
|
3
3
|
import time
|
|
4
|
-
from typing import List, Dict, Tuple, Optional
|
|
4
|
+
from typing import List, Dict, Tuple, Optional, Any
|
|
5
5
|
|
|
6
6
|
import requests
|
|
7
7
|
from bs4 import BeautifulSoup
|
|
8
|
+
from bs4.element import Tag
|
|
8
9
|
|
|
9
10
|
from .exceptions import ScraperStructureChanged
|
|
11
|
+
from .constants import INSTAPAPER_BASE_URL, KEY_ID, KEY_TITLE, KEY_URL
|
|
10
12
|
|
|
11
13
|
|
|
12
14
|
class InstapaperClient:
|
|
@@ -14,8 +16,6 @@ class InstapaperClient:
|
|
|
14
16
|
A client for interacting with the Instapaper website to fetch articles.
|
|
15
17
|
"""
|
|
16
18
|
|
|
17
|
-
BASE_URL = "https://www.instapaper.com"
|
|
18
|
-
|
|
19
19
|
# Environment variable names
|
|
20
20
|
ENV_MAX_RETRIES = "MAX_RETRIES"
|
|
21
21
|
ENV_BACKOFF_FACTOR = "BACKOFF_FACTOR"
|
|
@@ -39,11 +39,6 @@ class InstapaperClient:
|
|
|
39
39
|
URL_PATH_USER = "/u/"
|
|
40
40
|
URL_PATH_FOLDER = "/u/folder/"
|
|
41
41
|
|
|
42
|
-
# Dictionary keys for article data
|
|
43
|
-
KEY_ID = "id"
|
|
44
|
-
KEY_TITLE = "title"
|
|
45
|
-
KEY_URL = "url"
|
|
46
|
-
|
|
47
42
|
# HTTP status codes
|
|
48
43
|
HTTP_TOO_MANY_REQUESTS = 429
|
|
49
44
|
HTTP_SERVER_ERROR_START = 500
|
|
@@ -129,14 +124,28 @@ class InstapaperClient:
|
|
|
129
124
|
soup = BeautifulSoup(response.text, self.HTML_PARSER)
|
|
130
125
|
|
|
131
126
|
article_list = soup.find(id=self.ARTICLE_LIST_ID)
|
|
132
|
-
if not article_list:
|
|
127
|
+
if not isinstance(article_list, Tag):
|
|
133
128
|
raise ScraperStructureChanged(self.MSG_ARTICLE_LIST_NOT_FOUND)
|
|
134
129
|
|
|
135
130
|
articles = article_list.find_all(self.ARTICLE_TAG)
|
|
136
|
-
article_ids = [
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
131
|
+
article_ids = []
|
|
132
|
+
for article in articles:
|
|
133
|
+
if not isinstance(article, Tag):
|
|
134
|
+
continue
|
|
135
|
+
article_id_val = article.get(KEY_ID)
|
|
136
|
+
|
|
137
|
+
# Ensure article_id_val is a string before calling replace
|
|
138
|
+
# If it's a list, take the first element. This is a pragmatic
|
|
139
|
+
# approach since 'id' attributes should ideally be unique strings.
|
|
140
|
+
if isinstance(article_id_val, list):
|
|
141
|
+
article_id_val = article_id_val[0] if article_id_val else None
|
|
142
|
+
|
|
143
|
+
if isinstance(article_id_val, str) and article_id_val.startswith(
|
|
144
|
+
self.ARTICLE_ID_PREFIX
|
|
145
|
+
):
|
|
146
|
+
article_ids.append(
|
|
147
|
+
article_id_val.replace(self.ARTICLE_ID_PREFIX, "")
|
|
148
|
+
)
|
|
140
149
|
|
|
141
150
|
data = self._parse_article_data(soup, article_ids, page)
|
|
142
151
|
has_more = soup.find(class_=self.PAGINATE_OLDER_CLASS) is not None
|
|
@@ -204,19 +213,19 @@ class InstapaperClient:
|
|
|
204
213
|
) -> str:
|
|
205
214
|
"""Constructs the URL for the given page, considering folder mode."""
|
|
206
215
|
if folder_info and folder_info.get("id") and folder_info.get("slug"):
|
|
207
|
-
return f"{
|
|
208
|
-
return f"{
|
|
216
|
+
return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_FOLDER}{folder_info['id']}/{folder_info['slug']}/{page}"
|
|
217
|
+
return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_USER}{page}"
|
|
209
218
|
|
|
210
219
|
def _parse_article_data(
|
|
211
220
|
self, soup: BeautifulSoup, article_ids: List[str], page: int
|
|
212
|
-
) -> List[Dict[str,
|
|
221
|
+
) -> List[Dict[str, Any]]:
|
|
213
222
|
"""Parses the raw HTML to extract structured data for each article."""
|
|
214
223
|
data = []
|
|
215
224
|
for article_id in article_ids:
|
|
216
225
|
article_id_full = f"{self.ARTICLE_ID_PREFIX}{article_id}"
|
|
217
226
|
article_element = soup.find(id=article_id_full)
|
|
218
227
|
try:
|
|
219
|
-
if not article_element:
|
|
228
|
+
if not isinstance(article_element, Tag):
|
|
220
229
|
raise AttributeError(
|
|
221
230
|
self.MSG_ARTICLE_ELEMENT_NOT_FOUND.format(
|
|
222
231
|
article_id_full=article_id_full
|
|
@@ -224,20 +233,23 @@ class InstapaperClient:
|
|
|
224
233
|
)
|
|
225
234
|
|
|
226
235
|
title_element = article_element.find(class_=self.ARTICLE_TITLE_CLASS)
|
|
227
|
-
if not title_element:
|
|
236
|
+
if not isinstance(title_element, Tag):
|
|
228
237
|
raise AttributeError(self.MSG_TITLE_ELEMENT_NOT_FOUND)
|
|
229
238
|
title = title_element.get_text().strip()
|
|
230
239
|
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
240
|
+
meta_element = article_element.find(class_=self.TITLE_META_CLASS)
|
|
241
|
+
if not isinstance(meta_element, Tag):
|
|
242
|
+
raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
|
|
243
|
+
|
|
244
|
+
link_element = meta_element.find("a")
|
|
245
|
+
if (
|
|
246
|
+
not isinstance(link_element, Tag)
|
|
247
|
+
or "href" not in link_element.attrs
|
|
248
|
+
):
|
|
235
249
|
raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
|
|
236
250
|
link = link_element["href"]
|
|
237
251
|
|
|
238
|
-
data.append(
|
|
239
|
-
{self.KEY_ID: article_id, self.KEY_TITLE: title, self.KEY_URL: link}
|
|
240
|
-
)
|
|
252
|
+
data.append({KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link})
|
|
241
253
|
except AttributeError as e:
|
|
242
254
|
logging.warning(
|
|
243
255
|
self.MSG_PARSE_ARTICLE_WARNING.format(
|
|
@@ -289,7 +301,7 @@ class InstapaperClient:
|
|
|
289
301
|
)
|
|
290
302
|
return False
|
|
291
303
|
|
|
292
|
-
def _wait_for_retry(self, attempt: int, reason: str):
|
|
304
|
+
def _wait_for_retry(self, attempt: int, reason: str) -> None:
|
|
293
305
|
"""Calculates and waits for an exponential backoff period."""
|
|
294
306
|
sleep_time = self.backoff_factor * (2**attempt)
|
|
295
307
|
logging.warning(
|
instapaper_scraper/auth.py
CHANGED
|
@@ -3,16 +3,38 @@ import getpass
|
|
|
3
3
|
import logging
|
|
4
4
|
import stat
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Union
|
|
6
|
+
from typing import Union, Optional
|
|
7
7
|
|
|
8
8
|
from cryptography.fernet import Fernet
|
|
9
9
|
import requests
|
|
10
10
|
|
|
11
|
+
from .constants import INSTAPAPER_BASE_URL
|
|
11
12
|
|
|
12
|
-
|
|
13
|
-
|
|
13
|
+
|
|
14
|
+
# --- Encryption Helper ---
|
|
15
|
+
def get_encryption_key(key_file: Union[str, Path]) -> bytes:
|
|
16
|
+
"""
|
|
17
|
+
Loads the encryption key from a file or generates a new one.
|
|
18
|
+
Sets strict file permissions for the key file.
|
|
19
|
+
"""
|
|
20
|
+
key_path = Path(key_file)
|
|
21
|
+
key_path.parent.mkdir(parents=True, exist_ok=True)
|
|
22
|
+
|
|
23
|
+
if key_path.exists():
|
|
24
|
+
with open(key_path, "rb") as f:
|
|
25
|
+
key = f.read()
|
|
26
|
+
else:
|
|
27
|
+
key = Fernet.generate_key()
|
|
28
|
+
with open(key_path, "wb") as f:
|
|
29
|
+
f.write(key)
|
|
30
|
+
# Set file permissions to 0600 (owner read/write only)
|
|
31
|
+
os.chmod(key_path, stat.S_IRUSR | stat.S_IWUSR)
|
|
32
|
+
logging.info(f"Generated new encryption key at {key_path}.")
|
|
33
|
+
return key
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class InstapaperAuthenticator:
|
|
14
37
|
# URLs
|
|
15
|
-
INSTAPAPER_BASE_URL = "https://www.instapaper.com"
|
|
16
38
|
INSTAPAPER_VERIFY_URL = f"{INSTAPAPER_BASE_URL}/u"
|
|
17
39
|
INSTAPAPER_LOGIN_URL = f"{INSTAPAPER_BASE_URL}/user/login"
|
|
18
40
|
|
|
@@ -25,10 +47,6 @@ class InstapaperConstants:
|
|
|
25
47
|
# Request related
|
|
26
48
|
REQUEST_TIMEOUT = 10
|
|
27
49
|
|
|
28
|
-
# App config
|
|
29
|
-
APP_NAME = "instapaper-scraper"
|
|
30
|
-
CONFIG_DIR = Path.home() / ".config" / APP_NAME
|
|
31
|
-
|
|
32
50
|
# Prompts
|
|
33
51
|
PROMPT_USERNAME = "Enter your Instapaper username: "
|
|
34
52
|
PROMPT_PASSWORD = "Enter your Instapaper password: "
|
|
@@ -44,40 +62,17 @@ class InstapaperConstants:
|
|
|
44
62
|
LOG_NO_KNOWN_COOKIE_TO_SAVE = "Could not find a known session cookie to save."
|
|
45
63
|
LOG_SAVED_SESSION = "Saved encrypted session to {session_file}."
|
|
46
64
|
|
|
47
|
-
|
|
48
|
-
# --- Encryption Helper ---
|
|
49
|
-
def get_encryption_key(key_file: Union[str, Path]) -> bytes:
|
|
50
|
-
"""
|
|
51
|
-
Loads the encryption key from a file or generates a new one.
|
|
52
|
-
Sets strict file permissions for the key file.
|
|
53
|
-
"""
|
|
54
|
-
key_path = Path(key_file)
|
|
55
|
-
key_path.parent.mkdir(parents=True, exist_ok=True)
|
|
56
|
-
|
|
57
|
-
if key_path.exists():
|
|
58
|
-
with open(key_path, "rb") as f:
|
|
59
|
-
key = f.read()
|
|
60
|
-
else:
|
|
61
|
-
key = Fernet.generate_key()
|
|
62
|
-
with open(key_path, "wb") as f:
|
|
63
|
-
f.write(key)
|
|
64
|
-
# Set file permissions to 0600 (owner read/write only)
|
|
65
|
-
os.chmod(key_path, stat.S_IRUSR | stat.S_IWUSR)
|
|
66
|
-
logging.info(f"Generated new encryption key at {key_path}.")
|
|
67
|
-
return key
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
class InstapaperAuthenticator:
|
|
71
65
|
def __init__(
|
|
72
66
|
self,
|
|
73
67
|
session: requests.Session,
|
|
74
68
|
session_file: Union[str, Path],
|
|
75
69
|
key_file: Union[str, Path],
|
|
76
|
-
username: str = None,
|
|
77
|
-
password: str = None,
|
|
70
|
+
username: Optional[str] = None,
|
|
71
|
+
password: Optional[str] = None,
|
|
78
72
|
):
|
|
79
73
|
self.session = session
|
|
80
74
|
self.session_file = Path(session_file)
|
|
75
|
+
self.key_file = Path(key_file)
|
|
81
76
|
self.key = get_encryption_key(key_file)
|
|
82
77
|
self.fernet = Fernet(self.key)
|
|
83
78
|
self.username = username
|
|
@@ -116,24 +111,22 @@ class InstapaperAuthenticator:
|
|
|
116
111
|
if not line:
|
|
117
112
|
continue
|
|
118
113
|
parts = line.split(":", 2)
|
|
119
|
-
if len(parts) ==
|
|
114
|
+
if len(parts) == self.COOKIE_PART_COUNT:
|
|
120
115
|
name, value, domain = parts
|
|
121
116
|
self.session.cookies.set(name, value, domain=domain)
|
|
122
117
|
|
|
123
118
|
if self.session.cookies and self._verify_session():
|
|
124
|
-
logging.info(
|
|
119
|
+
logging.info(self.LOG_SESSION_LOAD_SUCCESS)
|
|
125
120
|
return True
|
|
126
121
|
else:
|
|
127
|
-
logging.warning(
|
|
122
|
+
logging.warning(self.LOG_SESSION_LOAD_FAILED)
|
|
128
123
|
# Clear cookies if verification fails
|
|
129
124
|
self.session.cookies.clear()
|
|
130
125
|
return False
|
|
131
126
|
|
|
132
127
|
except Exception as e:
|
|
133
128
|
logging.warning(
|
|
134
|
-
|
|
135
|
-
session_file=self.session_file, e=e
|
|
136
|
-
)
|
|
129
|
+
self.LOG_SESSION_LOAD_ERROR.format(session_file=self.session_file, e=e)
|
|
137
130
|
)
|
|
138
131
|
self.session_file.unlink(missing_ok=True)
|
|
139
132
|
return False
|
|
@@ -142,57 +135,56 @@ class InstapaperAuthenticator:
|
|
|
142
135
|
"""Checks if the current session is valid by making a request."""
|
|
143
136
|
try:
|
|
144
137
|
verify_response = self.session.get(
|
|
145
|
-
|
|
146
|
-
timeout=
|
|
138
|
+
self.INSTAPAPER_VERIFY_URL,
|
|
139
|
+
timeout=self.REQUEST_TIMEOUT,
|
|
147
140
|
)
|
|
148
141
|
verify_response.raise_for_status()
|
|
149
|
-
return
|
|
142
|
+
return self.LOGIN_FORM_IDENTIFIER not in verify_response.text
|
|
150
143
|
except requests.RequestException as e:
|
|
151
|
-
logging.error(
|
|
144
|
+
logging.error(self.LOG_SESSION_VERIFY_FAILED.format(e=e))
|
|
152
145
|
return False
|
|
153
146
|
|
|
154
147
|
def _login_with_credentials(self) -> bool:
|
|
155
148
|
"""Logs in using username/password from arguments or user prompt."""
|
|
156
|
-
logging.info(
|
|
149
|
+
logging.info(self.LOG_NO_VALID_SESSION)
|
|
157
150
|
username = self.username
|
|
158
151
|
password = self.password
|
|
159
152
|
|
|
160
153
|
if not username or not password:
|
|
161
|
-
username = input(
|
|
162
|
-
password = getpass.getpass(
|
|
154
|
+
username = input(self.PROMPT_USERNAME)
|
|
155
|
+
password = getpass.getpass(self.PROMPT_PASSWORD)
|
|
163
156
|
elif self.username:
|
|
164
157
|
logging.info(
|
|
165
158
|
f"Using username '{self.username}' from command-line arguments."
|
|
166
159
|
)
|
|
167
160
|
|
|
168
161
|
login_response = self.session.post(
|
|
169
|
-
|
|
162
|
+
self.INSTAPAPER_LOGIN_URL,
|
|
170
163
|
data={"username": username, "password": password, "keep_logged_in": "yes"},
|
|
171
|
-
timeout=
|
|
164
|
+
timeout=self.REQUEST_TIMEOUT,
|
|
172
165
|
)
|
|
173
166
|
|
|
174
|
-
required_cookies =
|
|
167
|
+
required_cookies = self.REQUIRED_COOKIES
|
|
175
168
|
found_cookies = {c.name for c in self.session.cookies}
|
|
176
169
|
|
|
177
|
-
if (
|
|
178
|
-
|
|
179
|
-
and required_cookies.issubset(found_cookies)
|
|
170
|
+
if self.LOGIN_SUCCESS_PATH in login_response.url and required_cookies.issubset(
|
|
171
|
+
found_cookies
|
|
180
172
|
):
|
|
181
|
-
logging.info(
|
|
173
|
+
logging.info(self.LOG_LOGIN_SUCCESS)
|
|
182
174
|
return True
|
|
183
175
|
else:
|
|
184
|
-
logging.error(
|
|
176
|
+
logging.error(self.LOG_LOGIN_FAILED)
|
|
185
177
|
return False
|
|
186
178
|
|
|
187
|
-
def _save_session(self):
|
|
179
|
+
def _save_session(self) -> None:
|
|
188
180
|
"""Saves the current session cookies to an encrypted file."""
|
|
189
|
-
required_cookies =
|
|
181
|
+
required_cookies = self.REQUIRED_COOKIES
|
|
190
182
|
cookies_to_save = [
|
|
191
183
|
c for c in self.session.cookies if c.name in required_cookies
|
|
192
184
|
]
|
|
193
185
|
|
|
194
186
|
if not cookies_to_save:
|
|
195
|
-
logging.warning(
|
|
187
|
+
logging.warning(self.LOG_NO_KNOWN_COOKIE_TO_SAVE)
|
|
196
188
|
return
|
|
197
189
|
|
|
198
190
|
cookie_data = ""
|
|
@@ -206,6 +198,4 @@ class InstapaperAuthenticator:
|
|
|
206
198
|
f.write(encrypted_data)
|
|
207
199
|
|
|
208
200
|
os.chmod(self.session_file, stat.S_IRUSR | stat.S_IWUSR)
|
|
209
|
-
logging.info(
|
|
210
|
-
InstapaperConstants.LOG_SAVED_SESSION.format(session_file=self.session_file)
|
|
211
|
-
)
|
|
201
|
+
logging.info(self.LOG_SAVED_SESSION.format(session_file=self.session_file))
|
instapaper_scraper/cli.py
CHANGED
|
@@ -3,7 +3,7 @@ import logging
|
|
|
3
3
|
import argparse
|
|
4
4
|
import requests
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Union
|
|
6
|
+
from typing import Union, List, Dict, Any, Optional, cast
|
|
7
7
|
|
|
8
8
|
if sys.version_info >= (3, 11):
|
|
9
9
|
import tomllib
|
|
@@ -15,6 +15,13 @@ from .auth import InstapaperAuthenticator
|
|
|
15
15
|
from .api import InstapaperClient
|
|
16
16
|
from .output import save_articles
|
|
17
17
|
from .exceptions import ScraperStructureChanged
|
|
18
|
+
from .constants import CONFIG_DIR
|
|
19
|
+
|
|
20
|
+
# --- Constants ---
|
|
21
|
+
CONFIG_FILENAME = "config.toml"
|
|
22
|
+
DEFAULT_SESSION_FILENAME = ".instapaper_session"
|
|
23
|
+
DEFAULT_KEY_FILENAME = ".session_key"
|
|
24
|
+
DEFAULT_OUTPUT_FILENAME = "output/bookmarks.{ext}"
|
|
18
25
|
|
|
19
26
|
|
|
20
27
|
def _resolve_path(
|
|
@@ -32,19 +39,18 @@ def _resolve_path(
|
|
|
32
39
|
return user_dir_filename
|
|
33
40
|
|
|
34
41
|
|
|
35
|
-
def load_config(config_path_str: Union[str, None] = None) ->
|
|
42
|
+
def load_config(config_path_str: Union[str, None] = None) -> Optional[Dict[str, Any]]:
|
|
36
43
|
"""
|
|
37
44
|
Loads configuration from a TOML file.
|
|
38
45
|
It checks the provided path, then config.toml in the project root,
|
|
39
46
|
and finally ~/.config/instapaper-scraper/config.toml.
|
|
40
47
|
"""
|
|
41
|
-
app_name = "instapaper-scraper"
|
|
42
48
|
default_paths = [
|
|
43
|
-
Path(
|
|
44
|
-
|
|
49
|
+
Path(CONFIG_FILENAME),
|
|
50
|
+
CONFIG_DIR / CONFIG_FILENAME,
|
|
45
51
|
]
|
|
46
52
|
|
|
47
|
-
paths_to_check = []
|
|
53
|
+
paths_to_check: List[Path] = []
|
|
48
54
|
if config_path_str:
|
|
49
55
|
paths_to_check.insert(0, Path(config_path_str).expanduser())
|
|
50
56
|
paths_to_check.extend(default_paths)
|
|
@@ -54,7 +60,7 @@ def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
|
|
|
54
60
|
try:
|
|
55
61
|
with open(path, "rb") as f:
|
|
56
62
|
logging.info(f"Loading configuration from {path}")
|
|
57
|
-
return tomllib.load(f)
|
|
63
|
+
return cast(Dict[str, Any], tomllib.load(f))
|
|
58
64
|
except tomllib.TOMLDecodeError as e:
|
|
59
65
|
logging.error(f"Error decoding TOML file at {path}: {e}")
|
|
60
66
|
return None
|
|
@@ -62,7 +68,7 @@ def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
|
|
|
62
68
|
return None
|
|
63
69
|
|
|
64
70
|
|
|
65
|
-
def main():
|
|
71
|
+
def main() -> None:
|
|
66
72
|
"""
|
|
67
73
|
Main entry point for the Instapaper scraper CLI.
|
|
68
74
|
"""
|
|
@@ -95,6 +101,11 @@ def main():
|
|
|
95
101
|
parser.add_argument("--key-file", help="Path to the session key file.")
|
|
96
102
|
parser.add_argument("--username", help="Instapaper username.")
|
|
97
103
|
parser.add_argument("--password", help="Instapaper password.")
|
|
104
|
+
parser.add_argument(
|
|
105
|
+
"--add-instapaper-url",
|
|
106
|
+
action="store_true",
|
|
107
|
+
help="Add an 'instapaper_url' column to the output with the full Instapaper read URL.",
|
|
108
|
+
)
|
|
98
109
|
parser.add_argument(
|
|
99
110
|
"--limit",
|
|
100
111
|
type=int,
|
|
@@ -133,7 +144,7 @@ def main():
|
|
|
133
144
|
print(" 0: none (non-folder mode)")
|
|
134
145
|
for i, folder in enumerate(folders):
|
|
135
146
|
display_name = folder.get("key") or folder.get("slug") or folder.get("id")
|
|
136
|
-
print(f" {i+1}: {display_name}")
|
|
147
|
+
print(f" {i + 1}: {display_name}")
|
|
137
148
|
|
|
138
149
|
try:
|
|
139
150
|
choice = int(input("Select a folder (enter a number): "))
|
|
@@ -153,18 +164,21 @@ def main():
|
|
|
153
164
|
output_filename = config["output_filename"]
|
|
154
165
|
else:
|
|
155
166
|
ext = "db" if args.format == "sqlite" else args.format
|
|
156
|
-
output_filename =
|
|
167
|
+
output_filename = DEFAULT_OUTPUT_FILENAME.format(ext=ext)
|
|
157
168
|
|
|
158
169
|
session = requests.Session()
|
|
159
170
|
|
|
160
171
|
# Resolve session and key file paths
|
|
161
|
-
app_name = "instapaper-scraper"
|
|
162
|
-
user_config_dir = Path.home() / ".config" / app_name
|
|
163
|
-
|
|
164
172
|
session_file = _resolve_path(
|
|
165
|
-
args.session_file,
|
|
173
|
+
args.session_file,
|
|
174
|
+
DEFAULT_SESSION_FILENAME,
|
|
175
|
+
CONFIG_DIR / DEFAULT_SESSION_FILENAME,
|
|
176
|
+
)
|
|
177
|
+
key_file = _resolve_path(
|
|
178
|
+
args.key_file,
|
|
179
|
+
DEFAULT_KEY_FILENAME,
|
|
180
|
+
CONFIG_DIR / DEFAULT_KEY_FILENAME,
|
|
166
181
|
)
|
|
167
|
-
key_file = _resolve_path(args.key_file, ".session_key", user_config_dir / ".session_key")
|
|
168
182
|
|
|
169
183
|
# 1. Authenticate
|
|
170
184
|
authenticator = InstapaperAuthenticator(
|
|
@@ -195,7 +209,17 @@ def main():
|
|
|
195
209
|
sys.exit(1)
|
|
196
210
|
|
|
197
211
|
# 3. Save Articles
|
|
198
|
-
|
|
212
|
+
try:
|
|
213
|
+
save_articles(
|
|
214
|
+
all_articles,
|
|
215
|
+
args.format,
|
|
216
|
+
output_filename,
|
|
217
|
+
add_instapaper_url=args.add_instapaper_url,
|
|
218
|
+
)
|
|
219
|
+
logging.info("Articles scraped and saved successfully.")
|
|
220
|
+
except Exception as e:
|
|
221
|
+
logging.error(f"An unexpected error occurred during saving: {e}")
|
|
222
|
+
sys.exit(1)
|
|
199
223
|
|
|
200
224
|
|
|
201
225
|
if __name__ == "__main__":
|
|
instapaper_scraper/constants.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Shared constants used across the instapaper-scraper project.
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
# --- General ---
|
|
5
|
+
APP_NAME = "instapaper-scraper"
|
|
6
|
+
|
|
7
|
+
# --- URLS ---
|
|
8
|
+
INSTAPAPER_BASE_URL = "https://www.instapaper.com"
|
|
9
|
+
INSTAPAPER_READ_URL = f"{INSTAPAPER_BASE_URL}/read/"
|
|
10
|
+
|
|
11
|
+
# --- Paths ---
|
|
12
|
+
CONFIG_DIR = Path.home() / ".config" / APP_NAME
|
|
13
|
+
|
|
14
|
+
# --- Article Data Keys ---
|
|
15
|
+
KEY_ID = "id"
|
|
16
|
+
KEY_TITLE = "title"
|
|
17
|
+
KEY_URL = "url"
|