instapaper-scraper 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- instapaper_scraper/api.py +41 -6
- instapaper_scraper/cli.py +30 -5
- instapaper_scraper/constants.py +1 -0
- instapaper_scraper/output.py +50 -10
- {instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0.dist-info}/METADATA +56 -32
- instapaper_scraper-1.2.0.dist-info/RECORD +13 -0
- {instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0.dist-info}/WHEEL +1 -1
- instapaper_scraper-1.1.1.dist-info/RECORD +0 -13
- {instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0.dist-info}/entry_points.txt +0 -0
- {instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0.dist-info}/top_level.txt +0 -0
instapaper_scraper/api.py
CHANGED
```diff
@@ -8,7 +8,13 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag
 
 from .exceptions import ScraperStructureChanged
-from .constants import INSTAPAPER_BASE_URL, KEY_ID, KEY_TITLE, KEY_URL
+from .constants import (
+    INSTAPAPER_BASE_URL,
+    KEY_ID,
+    KEY_TITLE,
+    KEY_URL,
+    KEY_ARTICLE_PREVIEW,
+)
 
 
 class InstapaperClient:
@@ -34,6 +40,7 @@ class InstapaperClient:
     PAGINATE_OLDER_CLASS = "paginate_older"
     ARTICLE_TITLE_CLASS = "article_title"
     TITLE_META_CLASS = "title_meta"
+    ARTICLE_PREVIEW_CLASS = "article_preview"
 
     # URL paths
     URL_PATH_USER = "/u/"
@@ -102,12 +109,14 @@
         self,
         page: int = DEFAULT_PAGE_START,
         folder_info: Optional[Dict[str, str]] = None,
+        add_article_preview: bool = False,
     ) -> Tuple[List[Dict[str, str]], bool]:
         """
         Fetches a single page of articles and determines if there are more pages.
         Args:
             page: The page number to fetch.
             folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
+            add_article_preview: Whether to include the article preview.
         Returns:
             A tuple containing:
             - A list of article data (dictionaries with id, title, url).
@@ -147,7 +156,9 @@
                 article_id_val.replace(self.ARTICLE_ID_PREFIX, "")
             )
 
-        data = self._parse_article_data(soup, article_ids, page)
+        data = self._parse_article_data(
+            soup, article_ids, page, add_article_preview
+        )
         has_more = soup.find(class_=self.PAGINATE_OLDER_CLASS) is not None
 
         return data, has_more
@@ -185,13 +196,17 @@
         raise Exception(self.MSG_SCRAPING_FAILED_UNKNOWN)
 
     def get_all_articles(
-        self, limit: Optional[int] = None, folder_info: Optional[Dict[str, str]] = None
+        self,
+        limit: Optional[int] = None,
+        folder_info: Optional[Dict[str, str]] = None,
+        add_article_preview: bool = False,
     ) -> List[Dict[str, str]]:
         """
         Iterates through pages and fetches articles up to a specified limit.
         Args:
             limit: The maximum number of pages to scrape. If None, scrapes all pages.
             folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
+            add_article_preview: Whether to include the article preview.
         """
         all_articles = []
         page = self.DEFAULT_PAGE_START
@@ -202,7 +217,11 @@
                 break
 
             logging.info(self.MSG_SCRAPING_PAGE.format(page=page))
-            data, has_more = self.get_articles(page=page, folder_info=folder_info)
+            data, has_more = self.get_articles(
+                page=page,
+                folder_info=folder_info,
+                add_article_preview=add_article_preview,
+            )
             if data:
                 all_articles.extend(data)
             page += 1
@@ -217,7 +236,11 @@
         return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_USER}{page}"
 
     def _parse_article_data(
-        self, soup: BeautifulSoup, article_ids: List[str], page: int
+        self,
+        soup: BeautifulSoup,
+        article_ids: List[str],
+        page: int,
+        add_article_preview: bool = False,
     ) -> List[Dict[str, Any]]:
         """Parses the raw HTML to extract structured data for each article."""
         data = []
@@ -249,7 +272,19 @@
                     raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
                 link = link_element["href"]
 
-                data.append({KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link})
+                article_data = {KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link}
+
+                if add_article_preview:
+                    preview_element = article_element.find(
+                        class_=self.ARTICLE_PREVIEW_CLASS
+                    )
+                    article_data[KEY_ARTICLE_PREVIEW] = (
+                        preview_element.get_text().strip()
+                        if isinstance(preview_element, Tag)
+                        else ""
+                    )
+
+                data.append(article_data)
             except AttributeError as e:
                 logging.warning(
                     self.MSG_PARSE_ARTICLE_WARNING.format(
```
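The new preview extraction guards against a missing or non-`Tag` node before calling `.get_text()`. Below is a minimal, runnable sketch of that pattern in isolation; the HTML fragment and surrounding markup are invented for illustration, and only the `article_preview` class name comes from the diff:

```python
# Illustrative sketch of the optional preview extraction added in 1.2.0;
# the markup below is a stand-in, not real Instapaper HTML.
from bs4 import BeautifulSoup
from bs4.element import Tag

html = """
<div id="article_999901234">
  <a class="article_title" href="https://www.example.com/page-1/">Article 1</a>
  <div class="article_preview">  This is a preview of article 1.  </div>
</div>
"""

article_element = BeautifulSoup(html, "html.parser").div
article_data = {
    "id": "999901234",
    "title": "Article 1",
    "url": "https://www.example.com/page-1/",
}

# Same guard as in _parse_article_data: fall back to "" when the preview
# node is absent (find returns None) instead of raising AttributeError.
preview_element = article_element.find(class_="article_preview")
article_data["article_preview"] = (
    preview_element.get_text().strip() if isinstance(preview_element, Tag) else ""
)

print(article_data["article_preview"])  # This is a preview of article 1.
```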
instapaper_scraper/cli.py
CHANGED
```diff
@@ -102,9 +102,18 @@ def main() -> None:
     parser.add_argument("--username", help="Instapaper username.")
     parser.add_argument("--password", help="Instapaper password.")
     parser.add_argument(
-        "--add-instapaper-url",
-        …
-        …
+        "--read-url",  # New, preferred flag
+        "--add-instapaper-url",  # Old, for backward compatibility
+        dest="add_instapaper_url",
+        action=argparse.BooleanOptionalAction,
+        help="Include the Instapaper read URL. Overrides config.",
+    )
+    parser.add_argument(
+        "--article-preview",  # New, preferred flag
+        "--add-article-preview",  # Old, for backward compatibility
+        dest="add_article_preview",
+        action=argparse.BooleanOptionalAction,
+        help="Include the article preview text. Overrides config.",
     )
     parser.add_argument(
         "--limit",
@@ -120,8 +129,21 @@
 
     config = load_config(args.config_path)
     folders = config.get("folders", []) if config else []
+    fields_config = config.get("fields", {}) if config else {}
     selected_folder = None
 
+    # Resolve boolean flags, giving CLI priority over config
+    final_add_instapaper_url = (
+        args.add_instapaper_url
+        if args.add_instapaper_url is not None
+        else fields_config.get("read_url", False)
+    )
+    final_add_article_preview = (
+        args.add_article_preview
+        if args.add_article_preview is not None
+        else fields_config.get("article_preview", False)
+    )
+
     if args.folder:
         if args.folder.lower() == "none":
             selected_folder = None
@@ -196,7 +218,9 @@
     try:
         folder_info = selected_folder if selected_folder else None
         all_articles = client.get_all_articles(
-            limit=args.limit, folder_info=folder_info
+            limit=args.limit,
+            folder_info=folder_info,
+            add_article_preview=final_add_article_preview,
         )
     except ScraperStructureChanged as e:
         logging.error(f"Stopping scraper due to an unrecoverable error: {e}")
@@ -214,7 +238,8 @@
             all_articles,
             args.format,
             output_filename,
-            add_instapaper_url=args.add_instapaper_url,
+            add_instapaper_url=final_add_instapaper_url,
+            add_article_preview=final_add_article_preview,
         )
         logging.info("Articles scraped and saved successfully.")
     except Exception as e:
```
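The dual option strings plus `argparse.BooleanOptionalAction` give each flag three states: `True`, `False`, or `None` when omitted, which is what lets the config fall-back above work. A self-contained sketch of that resolution logic, with a hand-made dict standing in for the parsed `[fields]` table:

```python
# Tri-state flag resolution (argparse.BooleanOptionalAction requires
# Python 3.9+). Both spellings map to one dest, and argparse also
# generates --no-read-url / --no-add-instapaper-url variants.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--read-url",
    "--add-instapaper-url",  # legacy spelling
    dest="add_instapaper_url",
    action=argparse.BooleanOptionalAction,
)

fields_config = {"read_url": True}  # stand-in for config.toml's [fields]

for argv in ([], ["--read-url"], ["--no-read-url"]):
    args = parser.parse_args(argv)
    final = (
        args.add_instapaper_url
        if args.add_instapaper_url is not None  # CLI was given: it wins
        else fields_config.get("read_url", False)  # otherwise config default
    )
    print(argv, "->", final)
# [] -> True, ['--read-url'] -> True, ['--no-read-url'] -> False
```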
instapaper_scraper/constants.py
CHANGED
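The hunk body is not preserved in this view, but the summary shows exactly one added line, and both `api.py` and `output.py` now import `KEY_ARTICLE_PREVIEW`. The addition was presumably along these lines; the value is inferred from the `article_preview` field shown in the README examples:

```python
KEY_ARTICLE_PREVIEW = "article_preview"  # presumed; hunk body not shown in this diff
```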
instapaper_scraper/output.py
CHANGED
```diff
@@ -2,7 +2,13 @@ import os
 import logging
 from typing import List, Dict, Any, TYPE_CHECKING
 
-from .constants import INSTAPAPER_READ_URL, KEY_ID, KEY_TITLE, KEY_URL
+from .constants import (
+    INSTAPAPER_READ_URL,
+    KEY_ID,
+    KEY_TITLE,
+    KEY_URL,
+    KEY_ARTICLE_PREVIEW,
+)
 
 # Constants for file operations
 JSON_INDENT = 4
@@ -24,7 +30,9 @@ if TYPE_CHECKING:
     __all__ = ["sqlite3"]
 
 
-def get_sqlite_create_table_sql(add_instapaper_url: bool = False) -> str:
+def get_sqlite_create_table_sql(
+    add_instapaper_url: bool = False, add_article_preview: bool = False
+) -> str:
     """Returns the SQL statement to create the articles table."""
     columns = [
         f"{KEY_ID} TEXT PRIMARY KEY",
@@ -42,10 +50,15 @@ def get_sqlite_create_table_sql(add_instapaper_url: bool = False) -> str:
     else:
         columns.append(f"{SQLITE_INSTAPAPER_URL_COL} TEXT")
 
+    if add_article_preview:
+        columns.append(f"{KEY_ARTICLE_PREVIEW} TEXT")
+
     return f"CREATE TABLE IF NOT EXISTS {SQLITE_TABLE_NAME} ({', '.join(columns)})"
 
 
-def get_sqlite_insert_sql(add_instapaper_url_manually: bool = False) -> str:
+def get_sqlite_insert_sql(
+    add_instapaper_url_manually: bool = False, add_article_preview: bool = False
+) -> str:
     """Returns the SQL statement to insert an article."""
     cols = [KEY_ID, KEY_TITLE, KEY_URL]
     placeholders = [f":{KEY_ID}", f":{KEY_TITLE}", f":{KEY_URL}"]
@@ -54,11 +67,18 @@ def get_sqlite_insert_sql(add_instapaper_url_manually: bool = False) -> str:
         cols.append(SQLITE_INSTAPAPER_URL_COL)
         placeholders.append(f":{SQLITE_INSTAPAPER_URL_COL}")
 
+    if add_article_preview:
+        cols.append(KEY_ARTICLE_PREVIEW)
+        placeholders.append(f":{KEY_ARTICLE_PREVIEW}")
+
     return f"INSERT OR REPLACE INTO {SQLITE_TABLE_NAME} ({', '.join(cols)}) VALUES ({', '.join(placeholders)})"
 
 
 def save_to_csv(
-    data: List[Dict[str, Any]], filename: str, add_instapaper_url: bool = False
+    data: List[Dict[str, Any]],
+    filename: str,
+    add_instapaper_url: bool = False,
+    add_article_preview: bool = False,
 ) -> None:
     """Saves a list of articles to a CSV file."""
     import csv
@@ -69,6 +89,8 @@ def save_to_csv(
     if add_instapaper_url:
         # Insert instapaper_url after the id column
         fieldnames.insert(1, SQLITE_INSTAPAPER_URL_COL)
+    if add_article_preview:
+        fieldnames.append(KEY_ARTICLE_PREVIEW)
 
     writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
     writer.writeheader()
@@ -77,7 +99,10 @@
     logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=filename))
 
 
-def save_to_json(data: List[Dict[str, Any]], filename: str) -> None:
+def save_to_json(
+    data: List[Dict[str, Any]],
+    filename: str,
+) -> None:
     """Saves a list of articles to a JSON file."""
     import json
 
@@ -88,7 +113,10 @@ def save_to_json(data: List[Dict[str, Any]], filename: str) -> None:
 
 
 def save_to_sqlite(
-    data: List[Dict[str, Any]], db_name: str, add_instapaper_url: bool = False
+    data: List[Dict[str, Any]],
+    db_name: str,
+    add_instapaper_url: bool = False,
+    add_article_preview: bool = False,
 ) -> None:
     """Saves a list of articles to a SQLite database."""
     import sqlite3
@@ -96,7 +124,7 @@ def save_to_sqlite(
     os.makedirs(os.path.dirname(db_name), exist_ok=True)
     conn = sqlite3.connect(db_name)
     cursor = conn.cursor()
-    cursor.execute(get_sqlite_create_table_sql(add_instapaper_url))
+    cursor.execute(get_sqlite_create_table_sql(add_instapaper_url, add_article_preview))
 
     # For older SQLite versions, we need to manually add the URL
     manual_insert_required = add_instapaper_url and sqlite3.sqlite_version_info < (
@@ -116,7 +144,8 @@
         data_to_insert = data
 
     insert_sql = get_sqlite_insert_sql(
-        add_instapaper_url_manually=manual_insert_required
+        add_instapaper_url_manually=manual_insert_required,
+        add_article_preview=add_article_preview,
     )
     cursor.executemany(insert_sql, data_to_insert)
 
@@ -143,6 +172,7 @@ def save_articles(
     format: str,
     filename: str,
     add_instapaper_url: bool = False,
+    add_article_preview: bool = False,
 ) -> None:
     """
     Dispatches to the correct save function based on the format.
@@ -164,10 +194,20 @@ def save_articles(
     ]
 
     if format == "csv":
-        save_to_csv(data, filename=filename, add_instapaper_url=add_instapaper_url)
+        save_to_csv(
+            data,
+            filename=filename,
+            add_instapaper_url=add_instapaper_url,
+            add_article_preview=add_article_preview,
+        )
     elif format == "json":
         save_to_json(data, filename=filename)
     elif format == "sqlite":
-        save_to_sqlite(data, db_name=filename, add_instapaper_url=add_instapaper_url)
+        save_to_sqlite(
+            data,
+            db_name=filename,
+            add_instapaper_url=add_instapaper_url,
+            add_article_preview=add_article_preview,
+        )
     else:
         logging.error(LOG_UNKNOWN_FORMAT.format(format=format))
```
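Both SQL helpers keep the column list and the named-placeholder list in lockstep, so `executemany` can consume the article dicts directly. A standalone sketch of the statements they produce when both optional fields are enabled; the `articles` table name is an assumption, since `SQLITE_TABLE_NAME`'s value does not appear in this diff:

```python
# Illustrates the named-placeholder pattern behind get_sqlite_insert_sql;
# "articles" is an assumed table name.
import sqlite3

create_sql = (
    "CREATE TABLE IF NOT EXISTS articles ("
    "id TEXT PRIMARY KEY, title TEXT, url TEXT, "
    "instapaper_url TEXT, article_preview TEXT)"
)
insert_sql = (
    "INSERT OR REPLACE INTO articles "
    "(id, title, url, instapaper_url, article_preview) "
    "VALUES (:id, :title, :url, :instapaper_url, :article_preview)"
)

rows = [{
    "id": "999901234",
    "title": "Article 1",
    "url": "https://www.example.com/page-1/",
    "instapaper_url": "https://www.instapaper.com/read/999901234",
    "article_preview": "This is a preview of article 1.",
}]

conn = sqlite3.connect(":memory:")
conn.execute(create_sql)
# Named :placeholders pull values out of each dict by key, which is why the
# helpers append matching entries to cols and placeholders together.
conn.executemany(insert_sql, rows)
print(conn.execute("SELECT id, article_preview FROM articles").fetchall())
conn.close()
```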
{instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0.dist-info}/METADATA
CHANGED
````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: instapaper-scraper
-Version: 1.1.1
+Version: 1.2.0
 Summary: A tool to scrape articles from Instapaper.
 Project-URL: Homepage, https://github.com/chriskyfung/InstapaperScraper
 Project-URL: Source, https://github.com/chriskyfung/InstapaperScraper
@@ -21,7 +21,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: beautifulsoup4~=4.14.2
-Requires-Dist: certifi
+Requires-Dist: certifi<2026.2.0,>=2025.11.12
 Requires-Dist: charset-normalizer~=3.4.3
 Requires-Dist: cryptography~=46.0.3
 Requires-Dist: guara~=0.0.14
@@ -72,16 +72,20 @@ Dynamic: license-file
   <a href="https://www.gnu.org/licenses/gpl-3.0.en.html">
     <img src="https://img.shields.io/github/license/chriskyfung/InstapaperScraper" alt="GitHub License">
   </a>
+</p>
+
+A powerful and reliable Python tool to automate the export of all your saved Instapaper bookmarks into various formats, giving you full ownership of your data.
+
+<!-- Sponsors -->
+<p align="center">
   <a href="https://github.com/sponsors/chriskyfung" title="Sponsor on GitHub">
-    <img src="https://img.shields.io/badge/Sponsor-GitHub-blue?logo=github-sponsors&colorA=263238&colorB=EC407A" alt="GitHub Sponsors Default">
+    <img src="https://img.shields.io/badge/Sponsor-GitHub-blue?style=for-the-badge&logo=github-sponsors&colorA=263238&colorB=EC407A" alt="GitHub Sponsors Default">
   </a>
   <a href="https://www.buymeacoffee.com/chriskyfung" title="Support Coffee">
-    <img src="https://img.shields.io/badge/Support-Coffee-ffdd00?logo=buy-me-a-coffee&logoColor=ffdd00&colorA=263238" alt="Buy Me A Coffee">
+    <img src="https://img.shields.io/badge/Support-Coffee-ffdd00?style=for-the-badge&logo=buy-me-a-coffee&logoColor=ffdd00&colorA=263238" alt="Buy Me A Coffee">
   </a>
 </p>
 
-A powerful and reliable Python tool to automate the export of all your saved Instapaper bookmarks into various formats, giving you full ownership of your data.
-
 ## ✨ Features
 
 - Scrapes all bookmarks from your Instapaper account.
@@ -141,9 +145,9 @@ The script authenticates using one of the following methods, in order of priority:
 
 > **Note on Security:** Your session file (`.instapaper_session`) and the encryption key (`.session_key`) are stored with secure permissions (read/write for the owner only) to protect your credentials.
 
-### 📁 Folder Configuration
+### 📁 Folder and Field Configuration
 
-You can define and quickly access your Instapaper folders using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):
+You can define and quickly access your Instapaper folders and set default output fields using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):
 
 1. The path specified by the `--config-path` argument.
 2. `config.toml` in the current working directory.
@@ -155,6 +159,12 @@ Here is an example of `config.toml`:
 # Default output filename for non-folder mode
 output_filename = "home-articles.csv"
 
+# Optional fields to include in the output.
+# These can be overridden by command-line flags.
+[fields]
+read_url = false
+article_preview = false
+
 [[folders]]
 key = "ml"
 id = "1234567"
@@ -169,10 +179,14 @@ output_filename = "python-articles.db"
 ```
 
 - **output_filename (top-level)**: The default output filename to use when not in folder mode.
-- **…
--…
--…
-- **…
+- **[fields]**: A section to control which optional data fields are included in the output.
+  - `read_url`: Set to `true` to include the Instapaper read URL for each article.
+  - `article_preview`: Set to `true` to include the article's text preview.
+- **[[folders]]**: Each `[[folders]]` block defines a specific folder.
+  - **key**: A short alias for the folder.
+  - **id**: The folder ID from the Instapaper URL.
+  - **slug**: The human-readable part of the folder URL.
+- **output_filename (folder-specific)**: A preset output filename for scraped articles from this specific folder.
 
 When a `config.toml` file is present and no `--folder` argument is provided, the scraper will prompt you to select a folder. You can also specify a folder directly using the `--folder` argument with its key, ID, or slug. Use `--folder=none` to explicitly disable folder mode and scrape all articles.
 
@@ -186,7 +200,8 @@ When a `config.toml` file is present and no `--folder` argument is provided, the
 | `--output <filename>` | Specify a custom output filename. The file extension will be automatically corrected to match the selected format. |
 | `--username <user>` | Your Instapaper account username. |
 | `--password <pass>` | Your Instapaper account password. |
-| `--add-instapaper-url` | … |
+| `--[no-]read-url` | Includes the Instapaper read URL. (Old flag `--add-instapaper-url` is deprecated but supported). Can be set in `config.toml`. Overrides config. |
+| `--[no-]article-preview` | Includes the article preview text. (Old flag `--add-article-preview` is deprecated but supported). Can be set in `config.toml`. Overrides config. |
 
 ### 📄 Output Formats
 
@@ -204,10 +219,10 @@ When using `--output <filename>`, the file extension is automatically corrected
 
 The output data includes a unique `id` for each article. You can use this ID to construct a URL to the article's reader view: `https://www.instapaper.com/read/<article_id>`.
 
-For convenience, you can use the `--add-instapaper-url` flag to have the script include a full, clickable URL in the output.
+For convenience, you can use the `--read-url` flag to have the script include a full, clickable URL in the output.
 
 ```sh
-instapaper-scraper --add-instapaper-url
+instapaper-scraper --read-url
 ```
 
 This adds a `instapaper_url` field to each article in the JSON output and a `instapaper_url` column in the CSV and SQLite outputs. The original `id` field is preserved.
@@ -223,15 +238,15 @@ The tool is designed with a modular architecture for reliability and maintainability:
 
 ## 📊 Example Output
 
-### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url)
+### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url and --add-article-preview)
 
 ```csv
-"id","instapaper_url","title","url"
-"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/"
-"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/"
+"id","instapaper_url","title","url","article_preview"
+"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/","This is a preview of article 1."
+"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/","This is a preview of article 2."
 ```
 
-### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url)
+### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url and --add-article-preview)
 
 ```json
 [
@@ -239,13 +254,15 @@
     "id": "999901234",
     "title": "Article 1",
     "url": "https://www.example.com/page-1/",
-    "instapaper_url": "https://www.instapaper.com/read/999901234"
+    "instapaper_url": "https://www.instapaper.com/read/999901234",
+    "article_preview": "This is a preview of article 1."
   },
   {
     "id": "999002345",
     "title": "Article 2",
     "url": "https://www.example.com/page-2/",
-    "instapaper_url": "https://www.instapaper.com/read/999002345"
+    "instapaper_url": "https://www.instapaper.com/read/999002345",
+    "article_preview": "This is a preview of article 2."
   }
 ]
 ```
@@ -274,7 +291,18 @@ Please read the **[Contribution Guidelines](CONTRIBUTING.md)** before you start.
 
 ## 🧑‍💻 Development & Testing
 
-This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking.
+This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking. A `Makefile` is provided to simplify common development tasks.
+
+### 🚀 Using the Makefile
+
+The most common commands are:
+- `make install`: Installs development dependencies.
+- `make format`: Formats the entire codebase.
+- `make check`: Runs the linter, type checker, and test suite.
+- `make test`: Runs the test suite.
+- `make build`: Builds the distributable packages.
+
+Run `make help` to see all available commands.
 
 ### 🔧 Setup
 
@@ -300,13 +328,13 @@ python -m src.instapaper_scraper.cli
 
 ### ✅ Testing
 
-To run the tests, execute the following command from the project root:
+To run the tests, execute the following command from the project root (or use `make test`):
 
 ```sh
 pytest
 ```
 
-To check test coverage:
+To check test coverage (or use `make test-cov`):
 
 ```sh
 pytest --cov=src/instapaper_scraper --cov-report=term-missing
@@ -314,6 +342,8 @@ pytest --cov=src/instapaper_scraper --cov-report=term-missing
 
 ### ✨ Code Quality
 
+You can use the `Makefile` for convenience (e.g., `make format`, `make lint`).
+
 To format the code with `ruff`:
 
 ```sh
@@ -326,12 +356,6 @@ To check for linting errors with `ruff`:
 ruff check .
 ```
 
-To automatically fix linting errors:
-
-```sh
-ruff check . --fix
-```
-
 To run static type checking with `mypy`:
 
 ```sh
@@ -341,7 +365,7 @@ mypy src
 To run license checks:
 
 ```sh
-licensecheck --…
+licensecheck --zero
 ```
````
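The new `[fields]` table reads naturally with a standard TOML parser. Here is a quick sketch using `tomllib` (stdlib in Python 3.11+; the project targets >=3.9 and has its own `load_config`, so this loader is only illustrative):

```python
import tomllib  # Python 3.11+; illustrative stand-in for the project's load_config

doc = """
output_filename = "home-articles.csv"

[fields]
read_url = false
article_preview = true

[[folders]]
key = "ml"
id = "1234567"
"""

config = tomllib.loads(doc)
fields = config.get("fields", {})
print(fields.get("read_url", False))         # False
print(fields.get("article_preview", False))  # True
```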
instapaper_scraper-1.2.0.dist-info/RECORD
ADDED
```diff
@@ -0,0 +1,13 @@
+instapaper_scraper/__init__.py,sha256=qdcT3tp4KLufWH1u6tOuPVUQaXwakQD0gdjkwY4ljfg,206
+instapaper_scraper/api.py,sha256=q5cxikx3bmRfGUcgLbYjPtpMkrAE-A6vWjZ_KKwOmAU,13701
+instapaper_scraper/auth.py,sha256=OpgjbdI697FitumiyznWjey5-R2ZuxAEATaMz9NNnTc,7092
+instapaper_scraper/cli.py,sha256=MklUuxCVzoOGdT4jtMH0unY7D50qqJvU3XKatdfvGbg,8588
+instapaper_scraper/constants.py,sha256=hiWriGWAQjDlx_Jn14dTkJIg4I--5ltzOOwD0ywFmwg,443
+instapaper_scraper/exceptions.py,sha256=CptHoZe4NOhdjOoyXkZEMFgQC6oKtzjRljywwDEtsTg,134
+instapaper_scraper/output.py,sha256=6UdeKUubG_Yn-lCX0Pk8vG1zzc00xWg_5uNRWedOA30,6454
+instapaper_scraper-1.2.0.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+instapaper_scraper-1.2.0.dist-info/METADATA,sha256=5m285Un8lmlLiY6aIFH_ANg9w_Gyofnw2CV4XymPbF0,15887
+instapaper_scraper-1.2.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+instapaper_scraper-1.2.0.dist-info/entry_points.txt,sha256=7AvRgN5fvtas_Duxdz-JPbDN6A1Lq2GaTfTSv54afxA,67
+instapaper_scraper-1.2.0.dist-info/top_level.txt,sha256=kiU9nLkqPOVPLsP4QMHuBFjAmoIKfftYmGV05daLrcc,19
+instapaper_scraper-1.2.0.dist-info/RECORD,,
```
instapaper_scraper-1.1.1.dist-info/RECORD
DELETED
```diff
@@ -1,13 +0,0 @@
-instapaper_scraper/__init__.py,sha256=qdcT3tp4KLufWH1u6tOuPVUQaXwakQD0gdjkwY4ljfg,206
-instapaper_scraper/api.py,sha256=67ZeiVjsZpGspB8S3ni8FS6LBAOHXBc_oz3vEDWDNms,12672
-instapaper_scraper/auth.py,sha256=OpgjbdI697FitumiyznWjey5-R2ZuxAEATaMz9NNnTc,7092
-instapaper_scraper/cli.py,sha256=YL9c7kksmj5iGKRvVqG0KO4rBbhTg5c9Lgvsf_brRPA,7579
-instapaper_scraper/constants.py,sha256=ubFWa47985lIz58qokMC0xQzTmCB6NOa17KFgWLn65E,403
-instapaper_scraper/exceptions.py,sha256=CptHoZe4NOhdjOoyXkZEMFgQC6oKtzjRljywwDEtsTg,134
-instapaper_scraper/output.py,sha256=cadyUOaGQ5Ct5iLiEkHDvN2cqYc1WmJTvAa7OxFjg0w,5618
-instapaper_scraper-1.1.1.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
-instapaper_scraper-1.1.1.dist-info/METADATA,sha256=CDiUTjY5eu1OTlFhhBNA1irP6gTNTLw6Ra-RIbkJeKY,14320
-instapaper_scraper-1.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-instapaper_scraper-1.1.1.dist-info/entry_points.txt,sha256=7AvRgN5fvtas_Duxdz-JPbDN6A1Lq2GaTfTSv54afxA,67
-instapaper_scraper-1.1.1.dist-info/top_level.txt,sha256=kiU9nLkqPOVPLsP4QMHuBFjAmoIKfftYmGV05daLrcc,19
-instapaper_scraper-1.1.1.dist-info/RECORD,,
```
{instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0.dist-info}/entry_points.txt
File without changes

{instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0.dist-info}/licenses/LICENSE
File without changes

{instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0.dist-info}/top_level.txt
File without changes