instapaper-scraper 1.1.0rc1__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- instapaper_scraper/api.py +75 -20
- instapaper_scraper/auth.py +5 -4
- instapaper_scraper/cli.py +36 -11
- instapaper_scraper/constants.py +1 -0
- instapaper_scraper/output.py +69 -17
- {instapaper_scraper-1.1.0rc1.dist-info → instapaper_scraper-1.2.0.dist-info}/METADATA +156 -65
- instapaper_scraper-1.2.0.dist-info/RECORD +13 -0
- {instapaper_scraper-1.1.0rc1.dist-info → instapaper_scraper-1.2.0.dist-info}/WHEEL +1 -1
- instapaper_scraper-1.1.0rc1.dist-info/RECORD +0 -13
- {instapaper_scraper-1.1.0rc1.dist-info → instapaper_scraper-1.2.0.dist-info}/entry_points.txt +0 -0
- {instapaper_scraper-1.1.0rc1.dist-info → instapaper_scraper-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {instapaper_scraper-1.1.0rc1.dist-info → instapaper_scraper-1.2.0.dist-info}/top_level.txt +0 -0
instapaper_scraper/api.py
CHANGED

@@ -1,13 +1,20 @@
 import os
 import logging
 import time
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple, Optional, Any
 
 import requests
 from bs4 import BeautifulSoup
+from bs4.element import Tag
 
 from .exceptions import ScraperStructureChanged
-from .constants import
+from .constants import (
+    INSTAPAPER_BASE_URL,
+    KEY_ID,
+    KEY_TITLE,
+    KEY_URL,
+    KEY_ARTICLE_PREVIEW,
+)
 
 
 class InstapaperClient:
@@ -33,6 +40,7 @@ class InstapaperClient:
     PAGINATE_OLDER_CLASS = "paginate_older"
     ARTICLE_TITLE_CLASS = "article_title"
     TITLE_META_CLASS = "title_meta"
+    ARTICLE_PREVIEW_CLASS = "article_preview"
 
     # URL paths
     URL_PATH_USER = "/u/"
@@ -101,12 +109,14 @@ class InstapaperClient:
         self,
         page: int = DEFAULT_PAGE_START,
         folder_info: Optional[Dict[str, str]] = None,
+        add_article_preview: bool = False,
     ) -> Tuple[List[Dict[str, str]], bool]:
         """
         Fetches a single page of articles and determines if there are more pages.
         Args:
             page: The page number to fetch.
             folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
+            add_article_preview: Whether to include the article preview.
         Returns:
             A tuple containing:
             - A list of article data (dictionaries with id, title, url).
@@ -123,16 +133,32 @@ class InstapaperClient:
         soup = BeautifulSoup(response.text, self.HTML_PARSER)
 
         article_list = soup.find(id=self.ARTICLE_LIST_ID)
-        if not article_list:
+        if not isinstance(article_list, Tag):
             raise ScraperStructureChanged(self.MSG_ARTICLE_LIST_NOT_FOUND)
 
         articles = article_list.find_all(self.ARTICLE_TAG)
-        article_ids = [
-
-
-
+        article_ids = []
+        for article in articles:
+            if not isinstance(article, Tag):
+                continue
+            article_id_val = article.get(KEY_ID)
+
+            # Ensure article_id_val is a string before calling replace
+            # If it's a list, take the first element. This is a pragmatic
+            # approach since 'id' attributes should ideally be unique strings.
+            if isinstance(article_id_val, list):
+                article_id_val = article_id_val[0] if article_id_val else None
+
+            if isinstance(article_id_val, str) and article_id_val.startswith(
+                self.ARTICLE_ID_PREFIX
+            ):
+                article_ids.append(
+                    article_id_val.replace(self.ARTICLE_ID_PREFIX, "")
+                )
 
-        data = self._parse_article_data(
+        data = self._parse_article_data(
+            soup, article_ids, page, add_article_preview
+        )
         has_more = soup.find(class_=self.PAGINATE_OLDER_CLASS) is not None
 
         return data, has_more
@@ -170,13 +196,17 @@ class InstapaperClient:
             raise Exception(self.MSG_SCRAPING_FAILED_UNKNOWN)
 
     def get_all_articles(
-        self,
+        self,
+        limit: Optional[int] = None,
+        folder_info: Optional[Dict[str, str]] = None,
+        add_article_preview: bool = False,
     ) -> List[Dict[str, str]]:
         """
         Iterates through pages and fetches articles up to a specified limit.
         Args:
             limit: The maximum number of pages to scrape. If None, scrapes all pages.
             folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
+            add_article_preview: Whether to include the article preview.
         """
         all_articles = []
         page = self.DEFAULT_PAGE_START
@@ -187,7 +217,11 @@ class InstapaperClient:
                 break
 
             logging.info(self.MSG_SCRAPING_PAGE.format(page=page))
-            data, has_more = self.get_articles(
+            data, has_more = self.get_articles(
+                page=page,
+                folder_info=folder_info,
+                add_article_preview=add_article_preview,
+            )
             if data:
                 all_articles.extend(data)
             page += 1
@@ -202,15 +236,19 @@ class InstapaperClient:
         return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_USER}{page}"
 
     def _parse_article_data(
-        self,
-
+        self,
+        soup: BeautifulSoup,
+        article_ids: List[str],
+        page: int,
+        add_article_preview: bool = False,
+    ) -> List[Dict[str, Any]]:
         """Parses the raw HTML to extract structured data for each article."""
         data = []
         for article_id in article_ids:
             article_id_full = f"{self.ARTICLE_ID_PREFIX}{article_id}"
             article_element = soup.find(id=article_id_full)
             try:
-                if not article_element:
+                if not isinstance(article_element, Tag):
                     raise AttributeError(
                         self.MSG_ARTICLE_ELEMENT_NOT_FOUND.format(
                             article_id_full=article_id_full
@@ -218,18 +256,35 @@ class InstapaperClient:
                 )
 
                 title_element = article_element.find(class_=self.ARTICLE_TITLE_CLASS)
-                if not title_element:
+                if not isinstance(title_element, Tag):
                     raise AttributeError(self.MSG_TITLE_ELEMENT_NOT_FOUND)
                 title = title_element.get_text().strip()
 
-
-
-
-
+                meta_element = article_element.find(class_=self.TITLE_META_CLASS)
+                if not isinstance(meta_element, Tag):
+                    raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
+
+                link_element = meta_element.find("a")
+                if (
+                    not isinstance(link_element, Tag)
+                    or "href" not in link_element.attrs
+                ):
                     raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
                 link = link_element["href"]
 
-
+                article_data = {KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link}
+
+                if add_article_preview:
+                    preview_element = article_element.find(
+                        class_=self.ARTICLE_PREVIEW_CLASS
+                    )
+                    article_data[KEY_ARTICLE_PREVIEW] = (
+                        preview_element.get_text().strip()
+                        if isinstance(preview_element, Tag)
+                        else ""
+                    )
+
+                data.append(article_data)
             except AttributeError as e:
                 logging.warning(
                     self.MSG_PARSE_ARTICLE_WARNING.format(
@@ -281,7 +336,7 @@ class InstapaperClient:
             )
             return False
 
-    def _wait_for_retry(self, attempt: int, reason: str):
+    def _wait_for_retry(self, attempt: int, reason: str) -> None:
         """Calculates and waits for an exponential backoff period."""
         sleep_time = self.backoff_factor * (2**attempt)
         logging.warning(
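Most of the api.py churn swaps truthiness checks for `isinstance(..., Tag)` guards. That is the standard pattern for passing mypy with `types-beautifulsoup4`: `find()` is annotated as returning `Tag | NavigableString | None`, and `Tag.get()` as returning `str | list[str] | None`, so neither result can be used without narrowing first. A minimal sketch of the pattern the diff applies throughout (the HTML snippet is illustrative, not Instapaper's actual markup):

```python
from bs4 import BeautifulSoup
from bs4.element import Tag

html = '<div id="article_999901234"><a class="article_title" href="/read/999901234">t</a></div>'
soup = BeautifulSoup(html, "html.parser")

element = soup.find(id="article_999901234")
# A plain `if not element:` does not narrow Tag | NavigableString | None
# for the type checker; isinstance() does, unlocking .get() and .find().
if isinstance(element, Tag):
    id_val = element.get("id")  # annotated as str | list[str] | None
    if isinstance(id_val, list):  # multi-valued attributes arrive as lists
        id_val = id_val[0] if id_val else None
    if isinstance(id_val, str) and id_val.startswith("article_"):
        print(id_val.replace("article_", ""))  # -> 999901234
```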
instapaper_scraper/auth.py
CHANGED

@@ -3,7 +3,7 @@ import getpass
 import logging
 import stat
 from pathlib import Path
-from typing import Union
+from typing import Union, Optional
 
 from cryptography.fernet import Fernet
 import requests
@@ -67,11 +67,12 @@ class InstapaperAuthenticator:
         session: requests.Session,
         session_file: Union[str, Path],
         key_file: Union[str, Path],
-        username: str = None,
-        password: str = None,
+        username: Optional[str] = None,
+        password: Optional[str] = None,
     ):
         self.session = session
         self.session_file = Path(session_file)
+        self.key_file = Path(key_file)
         self.key = get_encryption_key(key_file)
         self.fernet = Fernet(self.key)
         self.username = username
@@ -175,7 +176,7 @@ class InstapaperAuthenticator:
         logging.error(self.LOG_LOGIN_FAILED)
         return False
 
-    def _save_session(self):
+    def _save_session(self) -> None:
         """Saves the current session cookies to an encrypted file."""
         required_cookies = self.REQUIRED_COOKIES
         cookies_to_save = [
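The `username: str = None` → `username: Optional[str] = None` change reflects the retirement of implicit Optional: PEP 484 deprecated that shorthand, and mypy rejects it under its default configuration (since release 0.990). A quick illustration, not code from the package:

```python
from typing import Optional

# Flagged by default-configuration mypy (implicit Optional is disallowed):
#     def greet(name: str = None) -> str: ...

def greet(name: Optional[str] = None) -> str:
    # The explicit Optional makes the None branch visible to the checker.
    return f"Hello, {name}" if name is not None else "Hello, stranger"

print(greet())       # Hello, stranger
print(greet("Ada"))  # Hello, Ada
```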
instapaper_scraper/cli.py
CHANGED

@@ -3,7 +3,7 @@ import logging
 import argparse
 import requests
 from pathlib import Path
-from typing import Union
+from typing import Union, List, Dict, Any, Optional, cast
 
 if sys.version_info >= (3, 11):
     import tomllib
@@ -39,7 +39,7 @@ def _resolve_path(
     return user_dir_filename
 
 
-def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
+def load_config(config_path_str: Union[str, None] = None) -> Optional[Dict[str, Any]]:
     """
     Loads configuration from a TOML file.
     It checks the provided path, then config.toml in the project root,
@@ -50,7 +50,7 @@ def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
         CONFIG_DIR / CONFIG_FILENAME,
     ]
 
-    paths_to_check = []
+    paths_to_check: List[Path] = []
     if config_path_str:
         paths_to_check.insert(0, Path(config_path_str).expanduser())
     paths_to_check.extend(default_paths)
@@ -60,7 +60,7 @@ def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
     try:
         with open(path, "rb") as f:
             logging.info(f"Loading configuration from {path}")
-            return tomllib.load(f)
+            return cast(Dict[str, Any], tomllib.load(f))
     except tomllib.TOMLDecodeError as e:
         logging.error(f"Error decoding TOML file at {path}: {e}")
         return None
@@ -68,7 +68,7 @@ def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
     return None
 
 
-def main():
+def main() -> None:
     """
     Main entry point for the Instapaper scraper CLI.
     """
@@ -102,9 +102,18 @@ def main():
     parser.add_argument("--username", help="Instapaper username.")
     parser.add_argument("--password", help="Instapaper password.")
     parser.add_argument(
-        "--
-
-
+        "--read-url",  # New, preferred flag
+        "--add-instapaper-url",  # Old, for backward compatibility
+        dest="add_instapaper_url",
+        action=argparse.BooleanOptionalAction,
+        help="Include the Instapaper read URL. Overrides config.",
+    )
+    parser.add_argument(
+        "--article-preview",  # New, preferred flag
+        "--add-article-preview",  # Old, for backward compatibility
+        dest="add_article_preview",
+        action=argparse.BooleanOptionalAction,
+        help="Include the article preview text. Overrides config.",
     )
     parser.add_argument(
         "--limit",
@@ -120,8 +129,21 @@ def main():
 
     config = load_config(args.config_path)
     folders = config.get("folders", []) if config else []
+    fields_config = config.get("fields", {}) if config else {}
     selected_folder = None
 
+    # Resolve boolean flags, giving CLI priority over config
+    final_add_instapaper_url = (
+        args.add_instapaper_url
+        if args.add_instapaper_url is not None
+        else fields_config.get("read_url", False)
+    )
+    final_add_article_preview = (
+        args.add_article_preview
+        if args.add_article_preview is not None
+        else fields_config.get("article_preview", False)
+    )
+
     if args.folder:
         if args.folder.lower() == "none":
             selected_folder = None
@@ -144,7 +166,7 @@ def main():
         print("  0: none (non-folder mode)")
         for i, folder in enumerate(folders):
             display_name = folder.get("key") or folder.get("slug") or folder.get("id")
-            print(f"  {i+1}: {display_name}")
+            print(f"  {i + 1}: {display_name}")
 
         try:
             choice = int(input("Select a folder (enter a number): "))
@@ -196,7 +218,9 @@ def main():
     try:
         folder_info = selected_folder if selected_folder else None
         all_articles = client.get_all_articles(
-            limit=args.limit,
+            limit=args.limit,
+            folder_info=folder_info,
+            add_article_preview=final_add_article_preview,
        )
     except ScraperStructureChanged as e:
         logging.error(f"Stopping scraper due to an unrecoverable error: {e}")
@@ -214,7 +238,8 @@ def main():
             all_articles,
             args.format,
             output_filename,
-            add_instapaper_url=
+            add_instapaper_url=final_add_instapaper_url,
+            add_article_preview=final_add_article_preview,
         )
         logging.info("Articles scraped and saved successfully.")
     except Exception as e:
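The new flag handling leans on `argparse.BooleanOptionalAction` (available since Python 3.9, matching the package's floor): it registers a `--no-` variant for every alias and, with no explicit default, leaves the destination as `None` when neither form is passed. That `None` is what lets the CLI distinguish "unset" from "explicitly disabled" and fall back to `config.toml`. A condensed sketch of the resolution logic:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--read-url",            # preferred spelling
    "--add-instapaper-url",  # legacy alias kept for compatibility
    dest="add_instapaper_url",
    action=argparse.BooleanOptionalAction,  # also registers --no-read-url
)

fields_config = {"read_url": True}  # stand-in for the [fields] table in config.toml

args = parser.parse_args([])  # neither --read-url nor --no-read-url given
# None means "not set on the command line", so the config value wins;
# an explicit --no-read-url would yield False and override the config.
resolved = (
    args.add_instapaper_url
    if args.add_instapaper_url is not None
    else fields_config.get("read_url", False)
)
print(resolved)  # True, taken from the config
print(parser.parse_args(["--no-read-url"]).add_instapaper_url)  # False
```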
instapaper_scraper/constants.py
CHANGED
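The viewer collapses this one-line change (+1, −0). The new `KEY_ARTICLE_PREVIEW` imports in api.py and output.py, plus the constants.py size change in the RECORD files (403 → 443 bytes, exactly a 40-byte line), strongly suggest what was added — reconstructed here as an assumption, since the hunk itself is not shown:

```python
# Inferred addition; the actual line is collapsed in this diff view.
KEY_ARTICLE_PREVIEW = "article_preview"
```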
instapaper_scraper/output.py
CHANGED

@@ -1,11 +1,14 @@
 import os
-import json
-import sqlite3
 import logging
-import csv
-from typing import List, Dict, Any
+from typing import List, Dict, Any, TYPE_CHECKING
 
-from .constants import
+from .constants import (
+    INSTAPAPER_READ_URL,
+    KEY_ID,
+    KEY_TITLE,
+    KEY_URL,
+    KEY_ARTICLE_PREVIEW,
+)
 
 # Constants for file operations
 JSON_INDENT = 4
@@ -19,8 +22,17 @@ LOG_NO_ARTICLES = "No articles found to save."
 LOG_SAVED_ARTICLES = "Saved {count} articles to {filename}"
 LOG_UNKNOWN_FORMAT = "Unknown output format: {format}"
 
+if TYPE_CHECKING:
+    # Import for type-checking purposes, and use an alias
+    # to signal to linters like ruff that it is being used.
+    import sqlite3 as sqlite3
 
-def get_sqlite_create_table_sql(add_instapaper_url: bool = False) -> str:
+    __all__ = ["sqlite3"]
+
+
+def get_sqlite_create_table_sql(
+    add_instapaper_url: bool = False, add_article_preview: bool = False
+) -> str:
     """Returns the SQL statement to create the articles table."""
     columns = [
         f"{KEY_ID} TEXT PRIMARY KEY",
@@ -28,6 +40,8 @@ def get_sqlite_create_table_sql(add_instapaper_url: bool = False) -> str:
         f"{KEY_URL} TEXT NOT NULL",
     ]
     if add_instapaper_url:
+        import sqlite3
+
         # The GENERATED ALWAYS AS syntax was added in SQLite 3.31.0
         if sqlite3.sqlite_version_info >= (3, 31, 0):
             columns.append(
@@ -36,10 +50,15 @@ def get_sqlite_create_table_sql(add_instapaper_url: bool = False) -> str:
         else:
             columns.append(f"{SQLITE_INSTAPAPER_URL_COL} TEXT")
 
+    if add_article_preview:
+        columns.append(f"{KEY_ARTICLE_PREVIEW} TEXT")
+
     return f"CREATE TABLE IF NOT EXISTS {SQLITE_TABLE_NAME} ({', '.join(columns)})"
 
 
-def get_sqlite_insert_sql(add_instapaper_url_manually: bool = False) -> str:
+def get_sqlite_insert_sql(
+    add_instapaper_url_manually: bool = False, add_article_preview: bool = False
+) -> str:
     """Returns the SQL statement to insert an article."""
     cols = [KEY_ID, KEY_TITLE, KEY_URL]
     placeholders = [f":{KEY_ID}", f":{KEY_TITLE}", f":{KEY_URL}"]
@@ -48,19 +67,30 @@ def get_sqlite_insert_sql(add_instapaper_url_manually: bool = False) -> str:
         cols.append(SQLITE_INSTAPAPER_URL_COL)
         placeholders.append(f":{SQLITE_INSTAPAPER_URL_COL}")
 
+    if add_article_preview:
+        cols.append(KEY_ARTICLE_PREVIEW)
+        placeholders.append(f":{KEY_ARTICLE_PREVIEW}")
+
     return f"INSERT OR REPLACE INTO {SQLITE_TABLE_NAME} ({', '.join(cols)}) VALUES ({', '.join(placeholders)})"
 
 
 def save_to_csv(
-    data: List[Dict[str, Any]],
-
+    data: List[Dict[str, Any]],
+    filename: str,
+    add_instapaper_url: bool = False,
+    add_article_preview: bool = False,
+) -> None:
     """Saves a list of articles to a CSV file."""
+    import csv
+
     os.makedirs(os.path.dirname(filename), exist_ok=True)
     with open(filename, "w", newline="", encoding="utf-8") as f:
         fieldnames = [KEY_ID, KEY_TITLE, KEY_URL]
         if add_instapaper_url:
             # Insert instapaper_url after the id column
             fieldnames.insert(1, SQLITE_INSTAPAPER_URL_COL)
+        if add_article_preview:
+            fieldnames.append(KEY_ARTICLE_PREVIEW)
 
         writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
         writer.writeheader()
@@ -69,8 +99,13 @@ def save_to_csv(
     logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=filename))
 
 
-def save_to_json(data: List[Dict[str, Any]], filename: str):
+def save_to_json(
+    data: List[Dict[str, Any]],
+    filename: str,
+) -> None:
     """Saves a list of articles to a JSON file."""
+    import json
+
     os.makedirs(os.path.dirname(filename), exist_ok=True)
     with open(filename, "w", encoding="utf-8") as f:
         json.dump(data, f, indent=JSON_INDENT, ensure_ascii=False)
@@ -78,13 +113,18 @@ def save_to_json(data: List[Dict[str, Any]], filename: str):
 
 
 def save_to_sqlite(
-    data: List[Dict[str, Any]],
-
+    data: List[Dict[str, Any]],
+    db_name: str,
+    add_instapaper_url: bool = False,
+    add_article_preview: bool = False,
+) -> None:
     """Saves a list of articles to a SQLite database."""
+    import sqlite3
+
     os.makedirs(os.path.dirname(db_name), exist_ok=True)
     conn = sqlite3.connect(db_name)
     cursor = conn.cursor()
-    cursor.execute(get_sqlite_create_table_sql(add_instapaper_url))
+    cursor.execute(get_sqlite_create_table_sql(add_instapaper_url, add_article_preview))
 
     # For older SQLite versions, we need to manually add the URL
     manual_insert_required = add_instapaper_url and sqlite3.sqlite_version_info < (
@@ -104,7 +144,8 @@ def save_to_sqlite(
         data_to_insert = data
 
     insert_sql = get_sqlite_insert_sql(
-        add_instapaper_url_manually=manual_insert_required
+        add_instapaper_url_manually=manual_insert_required,
+        add_article_preview=add_article_preview,
     )
     cursor.executemany(insert_sql, data_to_insert)
 
@@ -131,7 +172,8 @@ def save_articles(
     format: str,
     filename: str,
     add_instapaper_url: bool = False,
-
+    add_article_preview: bool = False,
+) -> None:
     """
     Dispatches to the correct save function based on the format.
     """
@@ -152,10 +194,20 @@ def save_articles(
     ]
 
     if format == "csv":
-        save_to_csv(
+        save_to_csv(
+            data,
+            filename=filename,
+            add_instapaper_url=add_instapaper_url,
+            add_article_preview=add_article_preview,
+        )
     elif format == "json":
         save_to_json(data, filename=filename)
     elif format == "sqlite":
-        save_to_sqlite(
+        save_to_sqlite(
+            data,
+            db_name=filename,
+            add_instapaper_url=add_instapaper_url,
+            add_article_preview=add_article_preview,
+        )
     else:
         logging.error(LOG_UNKNOWN_FORMAT.format(format=format))
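Two things in the output.py diff deserve a note. The deferred `import sqlite3` / `import csv` / `import json` statements move stdlib imports into the functions that need them (the `TYPE_CHECKING` block keeps the `sqlite3` name importable for annotations only). And the schema builder branches on `sqlite3.sqlite_version_info` because `GENERATED ALWAYS AS` columns only exist from SQLite 3.31.0. A standalone sketch of that version-gated schema, using illustrative table and column names rather than the package's constants:

```python
import sqlite3

READ_URL = "https://www.instapaper.com/read/"

if sqlite3.sqlite_version_info >= (3, 31, 0):
    # Modern SQLite: derive the column from `id` on the fly; nothing extra
    # is stored and the value can never drift out of sync.
    create = (
        "CREATE TABLE IF NOT EXISTS articles (id TEXT PRIMARY KEY, title TEXT, url TEXT, "
        f"instapaper_url TEXT GENERATED ALWAYS AS ('{READ_URL}' || id) VIRTUAL)"
    )
    insert = "INSERT OR REPLACE INTO articles (id, title, url) VALUES (:id, :title, :url)"
else:
    # Older SQLite: fall back to a plain column that the caller fills in.
    create = (
        "CREATE TABLE IF NOT EXISTS articles "
        "(id TEXT PRIMARY KEY, title TEXT, url TEXT, instapaper_url TEXT)"
    )
    insert = (
        "INSERT OR REPLACE INTO articles (id, title, url, instapaper_url) "
        "VALUES (:id, :title, :url, :instapaper_url)"
    )

conn = sqlite3.connect(":memory:")
conn.execute(create)
row = {"id": "999901234", "title": "Article 1", "url": "https://www.example.com/page-1/"}
if ":instapaper_url" in insert:  # manual-fallback branch only
    row["instapaper_url"] = READ_URL + row["id"]
conn.execute(insert, row)
print(conn.execute("SELECT instapaper_url FROM articles").fetchone()[0])
# -> https://www.instapaper.com/read/999901234
```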
{instapaper_scraper-1.1.0rc1.dist-info → instapaper_scraper-1.2.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: instapaper-scraper
-Version: 1.1.0rc1
+Version: 1.2.0
 Summary: A tool to scrape articles from Instapaper.
 Project-URL: Homepage, https://github.com/chriskyfung/InstapaperScraper
 Project-URL: Source, https://github.com/chriskyfung/InstapaperScraper
@@ -21,7 +21,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: beautifulsoup4~=4.14.2
-Requires-Dist: certifi
+Requires-Dist: certifi<2026.2.0,>=2025.11.12
 Requires-Dist: charset-normalizer~=3.4.3
 Requires-Dist: cryptography~=46.0.3
 Requires-Dist: guara~=0.0.14
@@ -35,30 +35,58 @@ Requires-Dist: tomli~=2.0.1; python_version < "3.11"
 Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
-Requires-Dist: black; extra == "dev"
 Requires-Dist: ruff; extra == "dev"
 Requires-Dist: types-requests; extra == "dev"
 Requires-Dist: types-beautifulsoup4; extra == "dev"
 Requires-Dist: requests-mock; extra == "dev"
 Requires-Dist: build; extra == "dev"
 Requires-Dist: twine; extra == "dev"
+Requires-Dist: mypy; extra == "dev"
+Requires-Dist: pre-commit; extra == "dev"
+Requires-Dist: licensecheck; extra == "dev"
 Dynamic: license-file
 
 # Instapaper Scraper
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+<!-- Badges -->
+<p align="center">
+  <a href="https://pypi.org/project/instapaper-scraper/">
+    <img src="https://img.shields.io/pypi/v/instapaper-scraper.svg" alt="PyPI version">
+  </a>
+  <a href="https://pepy.tech/projects/instapaper-scraper">
+    <img src="https://static.pepy.tech/personalized-badge/instapaper-scraper?period=total&left_text=downloads" alt="PyPI Downloads">
+  </a>
+  <a href="https://github.com/chriskyfung/InstapaperScraper">
+    <img src="https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fchriskyfung%2FInstapaperScraper%2Frefs%2Fheads%2Fmaster%2Fpyproject.toml" alt="Python Version from PEP 621 TOML">
+  </a>
+  <a href="https://github.com/astral-sh/ruff">
+    <img src="https://img.shields.io/endpoint?url=https%3A%2F%2Fraw.githubusercontent.com%2Fastral-sh%2Fruff%2Fmain%2Fassets%2Fbadge%2Fv2.json" alt="Ruff">
+  </a>
+  <a href="https://codecov.io/gh/chriskyfung/InstapaperScraper">
+    <img src="https://codecov.io/gh/chriskyfung/InstapaperScraper/graph/badge.svg" alt="Code Coverage">
+  </a>
+  <wbr />
+  <a href="https://github.com/chriskyfung/InstapaperScraper/actions/workflows/ci.yml">
+    <img src="https://github.com/chriskyfung/InstapaperScraper/actions/workflows/ci.yml/badge.svg" alt="CI Status">
+  </a>
+  <a href="https://www.gnu.org/licenses/gpl-3.0.en.html">
+    <img src="https://img.shields.io/github/license/chriskyfung/InstapaperScraper" alt="GitHub License">
+  </a>
+</p>
+
+A powerful and reliable Python tool to automate the export of all your saved Instapaper bookmarks into various formats, giving you full ownership of your data.
+
+<!-- Sponsors -->
+<p align="center">
+  <a href="https://github.com/sponsors/chriskyfung" title="Sponsor on GitHub">
+    <img src="https://img.shields.io/badge/Sponsor-GitHub-blue?style=for-the-badge&logo=github-sponsors&colorA=263238&colorB=EC407A" alt="GitHub Sponsors Default">
+  </a>
+  <a href="https://www.buymeacoffee.com/chriskyfung" title="Support Coffee">
+    <img src="https://img.shields.io/badge/Support-Coffee-ffdd00?style=for-the-badge&logo=buy-me-a-coffee&logoColor=ffdd00&colorA=263238" alt="Buy Me A Coffee">
+  </a>
+</p>
+
+## ✨ Features
 
 - Scrapes all bookmarks from your Instapaper account.
 - Supports scraping from specific folders.
@@ -66,13 +94,13 @@ A Python tool to scrape all your saved Instapaper bookmarks and export them to various formats.
 - Securely stores your session for future runs.
 - Modern, modular, and tested architecture.
 
-## Getting Started
+## 🚀 Getting Started
 
-### 1. Requirements
+### 📋 1. Requirements
 
 - Python 3.9+
 
-### 2. Installation
+### 📦 2. Installation
 
 This package is available on PyPI and can be installed with pip:
 
@@ -80,7 +108,7 @@ This package is available on PyPI and can be installed with pip:
 pip install instapaper-scraper
 ```
 
-### 3. Usage
+### 💻 3. Usage
 
 Run the tool from the command line, specifying your desired output format:
 
@@ -95,35 +123,35 @@ instapaper-scraper --format json
 instapaper-scraper --format sqlite --output my_articles.db
 ```
 
-## Configuration
+## ⚙️ Configuration
 
-### Authentication
+### 🔐 Authentication
 
 The script authenticates using one of the following methods, in order of priority:
 
-1.
+1. **Command-line Arguments**: Provide your username and password directly when running the script:
 
    ```sh
   instapaper-scraper --username your_username --password your_password
   ```
 
-2.
+2. **Session Files (`.session_key`, `.instapaper_session`)**: The script attempts to load these files in the following order:
   a. Path specified by `--session-file` or `--key-file` arguments.
   b. Files in the current working directory (e.g., `./.session_key`).
   c. Files in the user's configuration directory (`~/.config/instapaper-scraper/`).
   After the first successful login, the script creates an encrypted `.instapaper_session` file and a `.session_key` file to reuse your session securely.
 
-3.
+3. **Interactive Prompt**: If no other method is available, the script will prompt you for your username and password.
 
 > **Note on Security:** Your session file (`.instapaper_session`) and the encryption key (`.session_key`) are stored with secure permissions (read/write for the owner only) to protect your credentials.
 
-### Folder Configuration
+### 📁 Folder and Field Configuration
 
-You can define and quickly access your Instapaper folders using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):
+You can define and quickly access your Instapaper folders and set default output fields using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):
 
-1.
-2.
-3.
+1. The path specified by the `--config-path` argument.
+2. `config.toml` in the current working directory.
+3. `~/.config/instapaper-scraper/config.toml`
 
 Here is an example of `config.toml`:
 
@@ -131,6 +159,12 @@ Here is an example of `config.toml`:
 # Default output filename for non-folder mode
 output_filename = "home-articles.csv"
 
+# Optional fields to include in the output.
+# These can be overridden by command-line flags.
+[fields]
+read_url = false
+article_preview = false
+
 [[folders]]
 key = "ml"
 id = "1234567"
@@ -145,14 +179,18 @@ output_filename = "python-articles.db"
 ```
 
 - **output_filename (top-level)**: The default output filename to use when not in folder mode.
-- **
-  -
-  -
-- **
+- **[fields]**: A section to control which optional data fields are included in the output.
+  - `read_url`: Set to `true` to include the Instapaper read URL for each article.
+  - `article_preview`: Set to `true` to include the article's text preview.
+- **[[folders]]**: Each `[[folders]]` block defines a specific folder.
+  - **key**: A short alias for the folder.
+  - **id**: The folder ID from the Instapaper URL.
+  - **slug**: The human-readable part of the folder URL.
+  - **output_filename (folder-specific)**: A preset output filename for scraped articles from this specific folder.
 
 When a `config.toml` file is present and no `--folder` argument is provided, the scraper will prompt you to select a folder. You can also specify a folder directly using the `--folder` argument with its key, ID, or slug. Use `--folder=none` to explicitly disable folder mode and scrape all articles.
 
-### Command-line Arguments
+### 💻 Command-line Arguments
 
 | Argument | Description |
 | --- | --- |
@@ -162,9 +200,10 @@ When a `config.toml` file is present and no `--folder` argument is provided, the scraper will prompt you to select a folder.
 | `--output <filename>` | Specify a custom output filename. The file extension will be automatically corrected to match the selected format. |
 | `--username <user>` | Your Instapaper account username. |
 | `--password <pass>` | Your Instapaper account password. |
-| `--
+| `--[no-]read-url` | Includes the Instapaper read URL. (Old flag `--add-instapaper-url` is deprecated but supported). Can be set in `config.toml`. Overrides config. |
+| `--[no-]article-preview` | Includes the article preview text. (Old flag `--add-article-preview` is deprecated but supported). Can be set in `config.toml`. Overrides config. |
 
-### Output Formats
+### 📄 Output Formats
 
 You can control the output format using the `--format` argument. The supported formats are:
 
@@ -176,19 +215,19 @@ If the `--format` flag is omitted, the script will default to `csv`.
 
 When using `--output <filename>`, the file extension is automatically corrected to match the chosen format. For example, `instapaper-scraper --format json --output my_articles.txt` will create `my_articles.json`.
 
-#### Opening Articles in Instapaper
+#### 📖 Opening Articles in Instapaper
 
 The output data includes a unique `id` for each article. You can use this ID to construct a URL to the article's reader view: `https://www.instapaper.com/read/<article_id>`.
 
-For convenience, you can use the `--
+For convenience, you can use the `--read-url` flag to have the script include a full, clickable URL in the output.
 
 ```sh
-instapaper-scraper --
+instapaper-scraper --read-url
 ```
 
 This adds a `instapaper_url` field to each article in the JSON output and a `instapaper_url` column in the CSV and SQLite outputs. The original `id` field is preserved.
 
-## How It Works
+## 🛠️ How It Works
 
 The tool is designed with a modular architecture for reliability and maintainability.
 
@@ -197,17 +236,17 @@ The tool is designed with a modular architecture for reliability and maintainability.
 3. **Data Collection**: All fetched articles are aggregated into a single list.
 4. **Export**: Finally, the collected data is written to a file in your chosen format (`.csv`, `.json`, or `.db`).
 
-## Example Output
+## 📊 Example Output
 
-### CSV (`output/bookmarks.csv`) (with --add-instapaper-url)
+### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url and --add-article-preview)
 
 ```csv
-"id","instapaper_url","title","url"
-"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/"
-"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/"
+"id","instapaper_url","title","url","article_preview"
+"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/","This is a preview of article 1."
+"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/","This is a preview of article 2."
 ```
 
-### JSON (`output/bookmarks.json`) (with --add-instapaper-url)
+### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url and --add-article-preview)
 
 ```json
 [
@@ -215,26 +254,57 @@ The tool is designed with a modular architecture for reliability and maintainability.
     "id": "999901234",
     "title": "Article 1",
     "url": "https://www.example.com/page-1/",
-    "instapaper_url": "https://www.instapaper.com/read/999901234"
+    "instapaper_url": "https://www.instapaper.com/read/999901234",
+    "article_preview": "This is a preview of article 1."
   },
   {
     "id": "999002345",
     "title": "Article 2",
     "url": "https://www.example.com/page-2/",
-    "instapaper_url": "https://www.instapaper.com/read/999002345"
+    "instapaper_url": "https://www.instapaper.com/read/999002345",
+    "article_preview": "This is a preview of article 2."
   }
 ]
 ```
 
-### SQLite (`output/bookmarks.db`)
+### 🗄️ SQLite (`output/bookmarks.db`)
 
 A SQLite database file is created with an `articles` table. The table includes `id`, `title`, and `url` columns. If the `--add-instapaper-url` flag is used, a `instapaper_url` column is also included. This feature is fully backward-compatible and will automatically adapt to the user's installed SQLite version, using an efficient generated column on modern versions (3.31.0+) and a fallback for older versions.
 
-##
+## 🤗 Support and Community
+
+- **🐛 Bug Reports:** For any bugs or unexpected behavior, please [open an issue on GitHub](https://github.com/chriskyfung/InstapaperScraper/issues).
+- **💬 Questions & General Discussion:** For questions, feature requests, or general discussion, please use our [GitHub Discussions](https://github.com/chriskyfung/InstapaperScraper/discussions).
+
+## 🙏 Support the Project
+
+`Instapaper Scraper` is a free and open-source project that requires significant time and effort to maintain and improve. If you find this tool useful, please consider supporting its development. Your contribution helps ensure the project stays healthy, active, and continuously updated.
+
+- **[Sponsor on GitHub](https://github.com/sponsors/chriskyfung):** The best way to support the project with recurring monthly donations. Tiers with special rewards like priority support are available!
+- **[Buy Me a Coffee](https://www.buymeacoffee.com/chriskyfung):** Perfect for a one-time thank you.
+
+## 🤝 Contributing
+
+Contributions are welcome! Whether it's a bug fix, a new feature, or documentation improvements, please feel free to open a pull request.
+
+Please read the **[Contribution Guidelines](CONTRIBUTING.md)** before you start.
+
+## 🧑‍💻 Development & Testing
+
+This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking. A `Makefile` is provided to simplify common development tasks.
+
+### 🚀 Using the Makefile
+
+The most common commands are:
+- `make install`: Installs development dependencies.
+- `make format`: Formats the entire codebase.
+- `make check`: Runs the linter, type checker, and test suite.
+- `make test`: Runs the test suite.
+- `make build`: Builds the distributable packages.
 
-
+Run `make help` to see all available commands.
 
-### Setup
+### 🔧 Setup
 
 To install the development dependencies:
 
@@ -242,7 +312,13 @@ To install the development dependencies:
 pip install -e .[dev]
 ```
 
-
+To set up the pre-commit hooks:
+
+```sh
+pre-commit install
+```
+
+### ▶️ Running the Scraper
 
 To run the scraper directly without installing the package:
 
@@ -250,26 +326,28 @@ To run the scraper directly without installing the package:
 python -m src.instapaper_scraper.cli
 ```
 
-### Testing
+### ✅ Testing
 
-To run the tests, execute the following command from the project root:
+To run the tests, execute the following command from the project root (or use `make test`):
 
 ```sh
 pytest
 ```
 
-To check test coverage:
+To check test coverage (or use `make test-cov`):
 
 ```sh
 pytest --cov=src/instapaper_scraper --cov-report=term-missing
 ```
 
-### Code Quality
+### ✨ Code Quality
 
-
+You can use the `Makefile` for convenience (e.g., `make format`, `make lint`).
+
+To format the code with `ruff`:
 
 ```sh
-
+ruff format .
 ```
 
 To check for linting errors with `ruff`:
@@ -278,16 +356,29 @@ To check for linting errors with `ruff`:
 ruff check .
 ```
 
-To
+To run static type checking with `mypy`:
 
 ```sh
-
+mypy src
 ```
 
-
+To run license checks:
+
+```sh
+licensecheck --zero
+```
+
+
+## 📜 Disclaimer
 
 This script requires valid Instapaper credentials. Use it responsibly and in accordance with Instapaper’s Terms of Service.
 
-## License
+## 📄 License
+
+This project is licensed under the terms of the **GNU General Public License v3.0**. See the [LICENSE](LICENSE) file for the full license text.
+
+## Contributors
+
+[![Contributors](https://contrib.rocks/image?repo=chriskyfung/InstapaperScraper)](https://github.com/chriskyfung/InstapaperScraper/graphs/contributors)
 
-
+Made with [contrib.rocks](https://contrib.rocks).
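The README above documents the new `[fields]` table next to `[[folders]]`. For reference, this is how that layout parses with stdlib `tomllib`, falling back to the `tomli` backport on Python 3.9/3.10 exactly as cli.py does; the folder values echo the README's example, and the `slug` value here is a made-up placeholder:

```python
import sys

if sys.version_info >= (3, 11):
    import tomllib  # standard library from Python 3.11
else:
    import tomli as tomllib  # backport; see the tomli dependency above

CONFIG = """\
output_filename = "home-articles.csv"

[fields]
read_url = false
article_preview = false

[[folders]]
key = "ml"
id = "1234567"
slug = "machine-learning"
output_filename = "ml-articles.csv"
"""

config = tomllib.loads(CONFIG)
print(config["fields"]["read_url"])   # False
print(config["folders"][0]["key"])    # ml
```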
instapaper_scraper-1.2.0.dist-info/RECORD
ADDED

@@ -0,0 +1,13 @@
+instapaper_scraper/__init__.py,sha256=qdcT3tp4KLufWH1u6tOuPVUQaXwakQD0gdjkwY4ljfg,206
+instapaper_scraper/api.py,sha256=q5cxikx3bmRfGUcgLbYjPtpMkrAE-A6vWjZ_KKwOmAU,13701
+instapaper_scraper/auth.py,sha256=OpgjbdI697FitumiyznWjey5-R2ZuxAEATaMz9NNnTc,7092
+instapaper_scraper/cli.py,sha256=MklUuxCVzoOGdT4jtMH0unY7D50qqJvU3XKatdfvGbg,8588
+instapaper_scraper/constants.py,sha256=hiWriGWAQjDlx_Jn14dTkJIg4I--5ltzOOwD0ywFmwg,443
+instapaper_scraper/exceptions.py,sha256=CptHoZe4NOhdjOoyXkZEMFgQC6oKtzjRljywwDEtsTg,134
+instapaper_scraper/output.py,sha256=6UdeKUubG_Yn-lCX0Pk8vG1zzc00xWg_5uNRWedOA30,6454
+instapaper_scraper-1.2.0.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+instapaper_scraper-1.2.0.dist-info/METADATA,sha256=5m285Un8lmlLiY6aIFH_ANg9w_Gyofnw2CV4XymPbF0,15887
+instapaper_scraper-1.2.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+instapaper_scraper-1.2.0.dist-info/entry_points.txt,sha256=7AvRgN5fvtas_Duxdz-JPbDN6A1Lq2GaTfTSv54afxA,67
+instapaper_scraper-1.2.0.dist-info/top_level.txt,sha256=kiU9nLkqPOVPLsP4QMHuBFjAmoIKfftYmGV05daLrcc,19
+instapaper_scraper-1.2.0.dist-info/RECORD,,
instapaper_scraper-1.1.0rc1.dist-info/RECORD
DELETED

@@ -1,13 +0,0 @@
-instapaper_scraper/__init__.py,sha256=qdcT3tp4KLufWH1u6tOuPVUQaXwakQD0gdjkwY4ljfg,206
-instapaper_scraper/api.py,sha256=-Dq5fOAGSGopb-qonIbETd9ZlxWdULKRgl1DCOuVemY,11618
-instapaper_scraper/auth.py,sha256=VTBE9KhGGJm0KbMT5DCTMCbh-N3HiJuJ9wMDb8CyZT4,7015
-instapaper_scraper/cli.py,sha256=wsQxTVFIyJq3EQiAtz7dCjg1vI2_Y9quZv4ifuEPDU8,7495
-instapaper_scraper/constants.py,sha256=ubFWa47985lIz58qokMC0xQzTmCB6NOa17KFgWLn65E,403
-instapaper_scraper/exceptions.py,sha256=CptHoZe4NOhdjOoyXkZEMFgQC6oKtzjRljywwDEtsTg,134
-instapaper_scraper/output.py,sha256=lxJgW71-m1YuMYJHeK6nu479pk_3bQGc0axzNCvxtZQ,5338
-instapaper_scraper-1.1.0rc1.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
-instapaper_scraper-1.1.0rc1.dist-info/METADATA,sha256=O-VJZg1yN3cuPRfBCevmD9_IrOR07NGpzrgZXI2-6hk,11637
-instapaper_scraper-1.1.0rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-instapaper_scraper-1.1.0rc1.dist-info/entry_points.txt,sha256=7AvRgN5fvtas_Duxdz-JPbDN6A1Lq2GaTfTSv54afxA,67
-instapaper_scraper-1.1.0rc1.dist-info/top_level.txt,sha256=kiU9nLkqPOVPLsP4QMHuBFjAmoIKfftYmGV05daLrcc,19
-instapaper_scraper-1.1.0rc1.dist-info/RECORD,,
{instapaper_scraper-1.1.0rc1.dist-info → instapaper_scraper-1.2.0.dist-info}/entry_points.txt
RENAMED
File without changes

{instapaper_scraper-1.1.0rc1.dist-info → instapaper_scraper-1.2.0.dist-info}/licenses/LICENSE
RENAMED
File without changes

{instapaper_scraper-1.1.0rc1.dist-info → instapaper_scraper-1.2.0.dist-info}/top_level.txt
RENAMED
File without changes