instapaper-scraper 1.1.1__py3-none-any.whl → 1.2.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- instapaper_scraper/api.py +41 -6
- instapaper_scraper/cli.py +30 -5
- instapaper_scraper/constants.py +1 -0
- instapaper_scraper/output.py +50 -10
- {instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0rc1.dist-info}/METADATA +48 -28
- instapaper_scraper-1.2.0rc1.dist-info/RECORD +13 -0
- {instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0rc1.dist-info}/WHEEL +1 -1
- instapaper_scraper-1.1.1.dist-info/RECORD +0 -13
- {instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0rc1.dist-info}/entry_points.txt +0 -0
- {instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0rc1.dist-info}/licenses/LICENSE +0 -0
- {instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0rc1.dist-info}/top_level.txt +0 -0
instapaper_scraper/api.py
CHANGED

```diff
@@ -8,7 +8,13 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag
 
 from .exceptions import ScraperStructureChanged
-from .constants import INSTAPAPER_BASE_URL, KEY_ID, KEY_TITLE, KEY_URL
+from .constants import (
+    INSTAPAPER_BASE_URL,
+    KEY_ID,
+    KEY_TITLE,
+    KEY_URL,
+    KEY_ARTICLE_PREVIEW,
+)
 
 
 class InstapaperClient:
@@ -34,6 +40,7 @@ class InstapaperClient:
     PAGINATE_OLDER_CLASS = "paginate_older"
     ARTICLE_TITLE_CLASS = "article_title"
     TITLE_META_CLASS = "title_meta"
+    ARTICLE_PREVIEW_CLASS = "article_preview"
 
     # URL paths
     URL_PATH_USER = "/u/"
@@ -102,12 +109,14 @@
         self,
         page: int = DEFAULT_PAGE_START,
         folder_info: Optional[Dict[str, str]] = None,
+        add_article_preview: bool = False,
     ) -> Tuple[List[Dict[str, str]], bool]:
         """
         Fetches a single page of articles and determines if there are more pages.
         Args:
             page: The page number to fetch.
             folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
+            add_article_preview: Whether to include the article preview.
         Returns:
             A tuple containing:
             - A list of article data (dictionaries with id, title, url).
@@ -147,7 +156,9 @@
                 article_id_val.replace(self.ARTICLE_ID_PREFIX, "")
             )
 
-        data = self._parse_article_data(soup, article_ids, page)
+        data = self._parse_article_data(
+            soup, article_ids, page, add_article_preview
+        )
         has_more = soup.find(class_=self.PAGINATE_OLDER_CLASS) is not None
 
         return data, has_more
@@ -185,13 +196,17 @@
             raise Exception(self.MSG_SCRAPING_FAILED_UNKNOWN)
 
     def get_all_articles(
-        self, limit: Optional[int] = None, folder_info: Optional[Dict[str, str]] = None
+        self,
+        limit: Optional[int] = None,
+        folder_info: Optional[Dict[str, str]] = None,
+        add_article_preview: bool = False,
     ) -> List[Dict[str, str]]:
         """
         Iterates through pages and fetches articles up to a specified limit.
         Args:
             limit: The maximum number of pages to scrape. If None, scrapes all pages.
            folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
+            add_article_preview: Whether to include the article preview.
         """
         all_articles = []
         page = self.DEFAULT_PAGE_START
@@ -202,7 +217,11 @@
                 break
 
             logging.info(self.MSG_SCRAPING_PAGE.format(page=page))
-            data, has_more = self.get_articles(page=page, folder_info=folder_info)
+            data, has_more = self.get_articles(
+                page=page,
+                folder_info=folder_info,
+                add_article_preview=add_article_preview,
+            )
             if data:
                 all_articles.extend(data)
             page += 1
@@ -217,7 +236,11 @@
         return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_USER}{page}"
 
     def _parse_article_data(
-        self, soup: BeautifulSoup, article_ids: List[str], page: int
+        self,
+        soup: BeautifulSoup,
+        article_ids: List[str],
+        page: int,
+        add_article_preview: bool = False,
     ) -> List[Dict[str, Any]]:
         """Parses the raw HTML to extract structured data for each article."""
         data = []
@@ -249,7 +272,19 @@
                     raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
                 link = link_element["href"]
 
-                data.append({KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link})
+                article_data = {KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link}
+
+                if add_article_preview:
+                    preview_element = article_element.find(
+                        class_=self.ARTICLE_PREVIEW_CLASS
+                    )
+                    article_data[KEY_ARTICLE_PREVIEW] = (
+                        preview_element.get_text().strip()
+                        if isinstance(preview_element, Tag)
+                        else ""
+                    )
+
+                data.append(article_data)
             except AttributeError as e:
                 logging.warning(
                     self.MSG_PARSE_ARTICLE_WARNING.format(
```
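
The net effect of the `_parse_article_data` change is easier to see in isolation. Below is a minimal sketch of the same pattern, assuming hypothetical markup (the sample HTML and the `article_item` class are invented for illustration; only the `article_preview` class name and the `isinstance` guard come from the diff):

```python
# Standalone sketch of the preview-extraction pattern added above.
from bs4 import BeautifulSoup
from bs4.element import Tag

# Hypothetical Instapaper-like markup; the real page structure may differ.
HTML = """
<div id="article_999901234" class="article_item">
  <a class="article_title" href="https://www.example.com/page-1/">Article 1</a>
  <div class="article_preview">This is a preview of article 1.</div>
</div>
"""

soup = BeautifulSoup(HTML, "html.parser")
article_element = soup.find(class_="article_item")
assert isinstance(article_element, Tag)

# .find() returns None when the element is missing (and can return a
# NavigableString), so the isinstance guard keeps the field a plain str.
preview_element = article_element.find(class_="article_preview")
preview = preview_element.get_text().strip() if isinstance(preview_element, Tag) else ""
print(preview)  # -> This is a preview of article 1.
```
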
instapaper_scraper/cli.py
CHANGED

```diff
@@ -102,9 +102,18 @@ def main() -> None:
     parser.add_argument("--username", help="Instapaper username.")
     parser.add_argument("--password", help="Instapaper password.")
     parser.add_argument(
-        "--add-instapaper-url",
-        action="store_true",
-        help="Include the Instapaper read URL in the output.",
+        "--read-url",  # New, preferred flag
+        "--add-instapaper-url",  # Old, for backward compatibility
+        dest="add_instapaper_url",
+        action=argparse.BooleanOptionalAction,
+        help="Include the Instapaper read URL. Overrides config.",
+    )
+    parser.add_argument(
+        "--article-preview",  # New, preferred flag
+        "--add-article-preview",  # Old, for backward compatibility
+        dest="add_article_preview",
+        action=argparse.BooleanOptionalAction,
+        help="Include the article preview text. Overrides config.",
     )
     parser.add_argument(
         "--limit",
@@ -120,8 +129,21 @@
 
     config = load_config(args.config_path)
     folders = config.get("folders", []) if config else []
+    fields_config = config.get("fields", {}) if config else {}
     selected_folder = None
 
+    # Resolve boolean flags, giving CLI priority over config
+    final_add_instapaper_url = (
+        args.add_instapaper_url
+        if args.add_instapaper_url is not None
+        else fields_config.get("read_url", False)
+    )
+    final_add_article_preview = (
+        args.add_article_preview
+        if args.add_article_preview is not None
+        else fields_config.get("article_preview", False)
+    )
+
     if args.folder:
         if args.folder.lower() == "none":
             selected_folder = None
@@ -196,7 +218,9 @@
     try:
         folder_info = selected_folder if selected_folder else None
         all_articles = client.get_all_articles(
-            limit=args.limit, folder_info=folder_info
+            limit=args.limit,
+            folder_info=folder_info,
+            add_article_preview=final_add_article_preview,
         )
     except ScraperStructureChanged as e:
         logging.error(f"Stopping scraper due to an unrecoverable error: {e}")
@@ -214,7 +238,8 @@
             all_articles,
             args.format,
             output_filename,
-            add_instapaper_url=args.add_instapaper_url,
+            add_instapaper_url=final_add_instapaper_url,
+            add_article_preview=final_add_article_preview,
         )
         logging.info("Articles scraped and saved successfully.")
     except Exception as e:
```
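
One detail of the new flag handling deserves a note: `argparse.BooleanOptionalAction` (Python 3.9+, matching the package's `Requires-Python`) auto-generates `--no-*` negations and leaves the destination at `None` when neither form is given, which is exactly what lets the `is not None` checks fall through to `config.toml`. A standalone sketch (the config dict is invented for illustration):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--read-url",
    "--add-instapaper-url",  # old spelling kept as an alias
    dest="add_instapaper_url",
    action=argparse.BooleanOptionalAction,
)

fields_config = {"read_url": True}  # hypothetical [fields] loaded from config.toml

for argv in ([], ["--read-url"], ["--no-read-url"]):
    args = parser.parse_args(argv)
    # CLI wins when a flag was given; otherwise fall back to the config value.
    final = (
        args.add_instapaper_url
        if args.add_instapaper_url is not None
        else fields_config.get("read_url", False)
    )
    print(argv, "->", final)
# [] -> True (from config), ['--read-url'] -> True, ['--no-read-url'] -> False
```
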
instapaper_scraper/constants.py
CHANGED
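
The hunk body for this file isn't rendered in the diff view, but the summary (+1 −0), the new `KEY_ARTICLE_PREVIEW` imports in `api.py` and `output.py`, and the `article_preview` column in the example output all point to a single added line, presumably:

```python
KEY_ARTICLE_PREVIEW = "article_preview"  # inferred; the actual hunk is not shown in this diff
```
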
instapaper_scraper/output.py
CHANGED

```diff
@@ -2,7 +2,13 @@ import os
 import logging
 from typing import List, Dict, Any, TYPE_CHECKING
 
-from .constants import INSTAPAPER_READ_URL, KEY_ID, KEY_TITLE, KEY_URL
+from .constants import (
+    INSTAPAPER_READ_URL,
+    KEY_ID,
+    KEY_TITLE,
+    KEY_URL,
+    KEY_ARTICLE_PREVIEW,
+)
 
 # Constants for file operations
 JSON_INDENT = 4
@@ -24,7 +30,9 @@ if TYPE_CHECKING:
     __all__ = ["sqlite3"]
 
 
-def get_sqlite_create_table_sql(add_instapaper_url: bool = False) -> str:
+def get_sqlite_create_table_sql(
+    add_instapaper_url: bool = False, add_article_preview: bool = False
+) -> str:
     """Returns the SQL statement to create the articles table."""
     columns = [
         f"{KEY_ID} TEXT PRIMARY KEY",
@@ -42,10 +50,15 @@ def get_sqlite_create_table_sql(add_instapaper_url: bool = False) -> str:
     else:
         columns.append(f"{SQLITE_INSTAPAPER_URL_COL} TEXT")
 
+    if add_article_preview:
+        columns.append(f"{KEY_ARTICLE_PREVIEW} TEXT")
+
     return f"CREATE TABLE IF NOT EXISTS {SQLITE_TABLE_NAME} ({', '.join(columns)})"
 
 
-def get_sqlite_insert_sql(add_instapaper_url_manually: bool = False) -> str:
+def get_sqlite_insert_sql(
+    add_instapaper_url_manually: bool = False, add_article_preview: bool = False
+) -> str:
     """Returns the SQL statement to insert an article."""
     cols = [KEY_ID, KEY_TITLE, KEY_URL]
     placeholders = [f":{KEY_ID}", f":{KEY_TITLE}", f":{KEY_URL}"]
@@ -54,11 +67,18 @@ def get_sqlite_insert_sql(add_instapaper_url_manually: bool = False) -> str:
         cols.append(SQLITE_INSTAPAPER_URL_COL)
         placeholders.append(f":{SQLITE_INSTAPAPER_URL_COL}")
 
+    if add_article_preview:
+        cols.append(KEY_ARTICLE_PREVIEW)
+        placeholders.append(f":{KEY_ARTICLE_PREVIEW}")
+
     return f"INSERT OR REPLACE INTO {SQLITE_TABLE_NAME} ({', '.join(cols)}) VALUES ({', '.join(placeholders)})"
 
 
 def save_to_csv(
-    data: List[Dict[str, Any]], filename: str, add_instapaper_url: bool = False
+    data: List[Dict[str, Any]],
+    filename: str,
+    add_instapaper_url: bool = False,
+    add_article_preview: bool = False,
 ) -> None:
     """Saves a list of articles to a CSV file."""
     import csv
@@ -69,6 +89,8 @@ def save_to_csv(
         if add_instapaper_url:
             # Insert instapaper_url after the id column
             fieldnames.insert(1, SQLITE_INSTAPAPER_URL_COL)
+        if add_article_preview:
+            fieldnames.append(KEY_ARTICLE_PREVIEW)
 
         writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
         writer.writeheader()
@@ -77,7 +99,10 @@ def save_to_csv(
     logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=filename))
 
 
-def save_to_json(data: List[Dict[str, Any]], filename: str) -> None:
+def save_to_json(
+    data: List[Dict[str, Any]],
+    filename: str,
+) -> None:
     """Saves a list of articles to a JSON file."""
     import json
 
@@ -88,7 +113,10 @@ def save_to_json(data: List[Dict[str, Any]], filename: str) -> None:
 
 
 def save_to_sqlite(
-    data: List[Dict[str, Any]], db_name: str, add_instapaper_url: bool = False
+    data: List[Dict[str, Any]],
+    db_name: str,
+    add_instapaper_url: bool = False,
+    add_article_preview: bool = False,
 ) -> None:
     """Saves a list of articles to a SQLite database."""
     import sqlite3
@@ -96,7 +124,7 @@ def save_to_sqlite(
     os.makedirs(os.path.dirname(db_name), exist_ok=True)
     conn = sqlite3.connect(db_name)
     cursor = conn.cursor()
-    cursor.execute(get_sqlite_create_table_sql(add_instapaper_url))
+    cursor.execute(get_sqlite_create_table_sql(add_instapaper_url, add_article_preview))
 
     # For older SQLite versions, we need to manually add the URL
     manual_insert_required = add_instapaper_url and sqlite3.sqlite_version_info < (
@@ -116,7 +144,8 @@ def save_to_sqlite(
         data_to_insert = data
 
     insert_sql = get_sqlite_insert_sql(
-        add_instapaper_url_manually=manual_insert_required
+        add_instapaper_url_manually=manual_insert_required,
+        add_article_preview=add_article_preview,
     )
     cursor.executemany(insert_sql, data_to_insert)
 
@@ -143,6 +172,7 @@ def save_articles(
     format: str,
     filename: str,
     add_instapaper_url: bool = False,
+    add_article_preview: bool = False,
 ) -> None:
     """
     Dispatches to the correct save function based on the format.
@@ -164,10 +194,20 @@ def save_articles(
     ]
 
     if format == "csv":
-        save_to_csv(data, filename=filename, add_instapaper_url=add_instapaper_url)
+        save_to_csv(
+            data,
+            filename=filename,
+            add_instapaper_url=add_instapaper_url,
+            add_article_preview=add_article_preview,
+        )
     elif format == "json":
         save_to_json(data, filename=filename)
     elif format == "sqlite":
-        save_to_sqlite(data, db_name=filename, add_instapaper_url=add_instapaper_url)
+        save_to_sqlite(
+            data,
+            db_name=filename,
+            add_instapaper_url=add_instapaper_url,
+            add_article_preview=add_article_preview,
+        )
    else:
         logging.error(LOG_UNKNOWN_FORMAT.format(format=format))
```
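
To make the generated SQL concrete, here is a simplified sketch of what the two helpers produce once `add_article_preview` is threaded through. The constant values are assumptions for illustration: the diff confirms the `article_preview` column name, the `id`/`title`/`url` keys match the example output, but the table name and the `instapaper_url` branch are omitted or invented here.

```python
# Simplified re-implementation of the two SQL builders, with assumed constants.
KEY_ID, KEY_TITLE, KEY_URL = "id", "title", "url"
KEY_ARTICLE_PREVIEW = "article_preview"
TABLE = "articles"  # hypothetical; SQLITE_TABLE_NAME is not shown in the diff

def create_table_sql(add_article_preview: bool = False) -> str:
    columns = [f"{KEY_ID} TEXT PRIMARY KEY", f"{KEY_TITLE} TEXT", f"{KEY_URL} TEXT"]
    if add_article_preview:
        columns.append(f"{KEY_ARTICLE_PREVIEW} TEXT")
    return f"CREATE TABLE IF NOT EXISTS {TABLE} ({', '.join(columns)})"

def insert_sql(add_article_preview: bool = False) -> str:
    cols = [KEY_ID, KEY_TITLE, KEY_URL]
    if add_article_preview:
        cols.append(KEY_ARTICLE_PREVIEW)
    # Named placeholders let executemany() consume the article dicts directly.
    placeholders = [f":{c}" for c in cols]
    return f"INSERT OR REPLACE INTO {TABLE} ({', '.join(cols)}) VALUES ({', '.join(placeholders)})"

print(create_table_sql(add_article_preview=True))
# CREATE TABLE IF NOT EXISTS articles (id TEXT PRIMARY KEY, title TEXT, url TEXT, article_preview TEXT)
print(insert_sql(add_article_preview=True))
# INSERT OR REPLACE INTO articles (id, title, url, article_preview) VALUES (:id, :title, :url, :article_preview)
```
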
{instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0rc1.dist-info}/METADATA
CHANGED

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: instapaper-scraper
-Version: 1.1.1
+Version: 1.2.0rc1
 Summary: A tool to scrape articles from Instapaper.
 Project-URL: Homepage, https://github.com/chriskyfung/InstapaperScraper
 Project-URL: Source, https://github.com/chriskyfung/InstapaperScraper
@@ -21,7 +21,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: beautifulsoup4~=4.14.2
-Requires-Dist: certifi
+Requires-Dist: certifi<2026.2.0,>=2025.11.12
 Requires-Dist: charset-normalizer~=3.4.3
 Requires-Dist: cryptography~=46.0.3
 Requires-Dist: guara~=0.0.14
@@ -141,9 +141,9 @@ The script authenticates using one of the following methods, in order of priorit
 
 > **Note on Security:** Your session file (`.instapaper_session`) and the encryption key (`.session_key`) are stored with secure permissions (read/write for the owner only) to protect your credentials.
 
-### 📁 Folder Configuration
+### 📁 Folder and Field Configuration
 
-You can define and quickly access your Instapaper folders using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):
+You can define and quickly access your Instapaper folders and set default output fields using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):
 
 1. The path specified by the `--config-path` argument.
 2. `config.toml` in the current working directory.
@@ -155,6 +155,12 @@ Here is an example of `config.toml`:
 # Default output filename for non-folder mode
 output_filename = "home-articles.csv"
 
+# Optional fields to include in the output.
+# These can be overridden by command-line flags.
+[fields]
+read_url = false
+article_preview = false
+
 [[folders]]
 key = "ml"
 id = "1234567"
@@ -169,10 +175,14 @@ output_filename = "python-articles.db"
 ```
 
 - **output_filename (top-level)**: The default output filename to use when not in folder mode.
-- **key**: A short alias for the folder.
-- **id**: The folder ID from the Instapaper URL.
-- **slug**: The human-readable part of the folder URL.
-- **output_filename (folder-specific)**: A preset output filename for scraped articles from this specific folder.
+- **[fields]**: A section to control which optional data fields are included in the output.
+  - `read_url`: Set to `true` to include the Instapaper read URL for each article.
+  - `article_preview`: Set to `true` to include the article's text preview.
+- **[[folders]]**: Each `[[folders]]` block defines a specific folder.
+  - **key**: A short alias for the folder.
+  - **id**: The folder ID from the Instapaper URL.
+  - **slug**: The human-readable part of the folder URL.
+  - **output_filename (folder-specific)**: A preset output filename for scraped articles from this specific folder.
 
 When a `config.toml` file is present and no `--folder` argument is provided, the scraper will prompt you to select a folder. You can also specify a folder directly using the `--folder` argument with its key, ID, or slug. Use `--folder=none` to explicitly disable folder mode and scrape all articles.
 
@@ -186,7 +196,8 @@ When a `config.toml` file is present and no `--folder` argument is provided, the
 | `--output <filename>` | Specify a custom output filename. The file extension will be automatically corrected to match the selected format. |
 | `--username <user>` | Your Instapaper account username. |
 | `--password <pass>` | Your Instapaper account password. |
-| `--add-instapaper-url` | Includes the Instapaper read URL for each article in the output. |
+| `--[no-]read-url` | Includes the Instapaper read URL. (Old flag `--add-instapaper-url` is deprecated but supported). Can be set in `config.toml`. Overrides config. |
+| `--[no-]article-preview` | Includes the article preview text. (Old flag `--add-article-preview` is deprecated but supported). Can be set in `config.toml`. Overrides config. |
 
 ### 📄 Output Formats
 
@@ -204,10 +215,10 @@ When using `--output <filename>`, the file extension is automatically corrected
 
 The output data includes a unique `id` for each article. You can use this ID to construct a URL to the article's reader view: `https://www.instapaper.com/read/<article_id>`.
 
-For convenience, you can use the `--add-instapaper-url` flag to have the script include a full, clickable URL in the output.
+For convenience, you can use the `--read-url` flag to have the script include a full, clickable URL in the output.
 
 ```sh
-instapaper-scraper --add-instapaper-url
+instapaper-scraper --read-url
 ```
 
 This adds a `instapaper_url` field to each article in the JSON output and a `instapaper_url` column in the CSV and SQLite outputs. The original `id` field is preserved.
@@ -223,15 +234,15 @@ The tool is designed with a modular architecture for reliability and maintainabi
 
 ## 📊 Example Output
 
-### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url)
+### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url and --add-article-preview)
 
 ```csv
-"id","instapaper_url","title","url"
-"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/"
-"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/"
+"id","instapaper_url","title","url","article_preview"
+"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/","This is a preview of article 1."
+"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/","This is a preview of article 2."
 ```
 
-### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url)
+### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url and --add-article-preview)
 
 ```json
 [
@@ -239,13 +250,15 @@
         "id": "999901234",
         "title": "Article 1",
         "url": "https://www.example.com/page-1/",
-        "instapaper_url": "https://www.instapaper.com/read/999901234"
+        "instapaper_url": "https://www.instapaper.com/read/999901234",
+        "article_preview": "This is a preview of article 1."
     },
     {
         "id": "999002345",
         "title": "Article 2",
         "url": "https://www.example.com/page-2/",
-        "instapaper_url": "https://www.instapaper.com/read/999002345"
+        "instapaper_url": "https://www.instapaper.com/read/999002345",
+        "article_preview": "This is a preview of article 2."
     }
 ]
 ```
@@ -274,7 +287,18 @@ Please read the **[Contribution Guidelines](CONTRIBUTING.md)** before you start.
 
 ## 🧑‍💻 Development & Testing
 
-This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking.
+This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking. A `Makefile` is provided to simplify common development tasks.
+
+### 🚀 Using the Makefile
+
+The most common commands are:
+- `make install`: Installs development dependencies.
+- `make format`: Formats the entire codebase.
+- `make check`: Runs the linter, type checker, and test suite.
+- `make test`: Runs the test suite.
+- `make build`: Builds the distributable packages.
+
+Run `make help` to see all available commands.
 
 ### 🔧 Setup
 
@@ -300,13 +324,13 @@ python -m src.instapaper_scraper.cli
 
 ### ✅ Testing
 
-To run the tests, execute the following command from the project root:
+To run the tests, execute the following command from the project root (or use `make test`):
 
 ```sh
 pytest
 ```
 
-To check test coverage:
+To check test coverage (or use `make test-cov`):
 
 ```sh
 pytest --cov=src/instapaper_scraper --cov-report=term-missing
@@ -314,6 +338,8 @@ pytest --cov=src/instapaper_scraper --cov-report=term-missing
 
 ### ✨ Code Quality
 
+You can use the `Makefile` for convenience (e.g., `make format`, `make lint`).
+
 To format the code with `ruff`:
 
 ```sh
@@ -326,12 +352,6 @@ To check for linting errors with `ruff`:
 ruff check .
 ```
 
-To automatically fix linting errors:
-
-```sh
-ruff check . --fix
-```
-
 To run static type checking with `mypy`:
 
 ```sh
@@ -341,7 +361,7 @@ mypy src
 To run license checks:
 
 ```sh
-licensecheck --
+licensecheck --zero
 ```
 
 
````
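
The new `[fields]` table shown in the README above is plain TOML, so reading it takes nothing beyond a standard parser. A hedged sketch (`tomllib` is standard-library from Python 3.11; on this package's 3.9 floor a backport such as `tomli` would stand in, and the scraper's actual `load_config` is not shown in this diff):

```python
import tomllib  # Python 3.11+; use the tomli backport on older interpreters

doc = tomllib.loads("""
output_filename = "home-articles.csv"

[fields]
read_url = false
article_preview = true
""")

# A missing table or key falls back to False, mirroring the cli.py resolution above.
fields = doc.get("fields", {})
print(fields.get("read_url", False), fields.get("article_preview", False))
# -> False True
```
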
instapaper_scraper-1.2.0rc1.dist-info/RECORD
ADDED

```diff
@@ -0,0 +1,13 @@
+instapaper_scraper/__init__.py,sha256=qdcT3tp4KLufWH1u6tOuPVUQaXwakQD0gdjkwY4ljfg,206
+instapaper_scraper/api.py,sha256=q5cxikx3bmRfGUcgLbYjPtpMkrAE-A6vWjZ_KKwOmAU,13701
+instapaper_scraper/auth.py,sha256=OpgjbdI697FitumiyznWjey5-R2ZuxAEATaMz9NNnTc,7092
+instapaper_scraper/cli.py,sha256=MklUuxCVzoOGdT4jtMH0unY7D50qqJvU3XKatdfvGbg,8588
+instapaper_scraper/constants.py,sha256=hiWriGWAQjDlx_Jn14dTkJIg4I--5ltzOOwD0ywFmwg,443
+instapaper_scraper/exceptions.py,sha256=CptHoZe4NOhdjOoyXkZEMFgQC6oKtzjRljywwDEtsTg,134
+instapaper_scraper/output.py,sha256=6UdeKUubG_Yn-lCX0Pk8vG1zzc00xWg_5uNRWedOA30,6454
+instapaper_scraper-1.2.0rc1.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+instapaper_scraper-1.2.0rc1.dist-info/METADATA,sha256=RQXqIpq42MTtOgaIl46ouVORpaOCRHqdxq70HljH_mU,15807
+instapaper_scraper-1.2.0rc1.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+instapaper_scraper-1.2.0rc1.dist-info/entry_points.txt,sha256=7AvRgN5fvtas_Duxdz-JPbDN6A1Lq2GaTfTSv54afxA,67
+instapaper_scraper-1.2.0rc1.dist-info/top_level.txt,sha256=kiU9nLkqPOVPLsP4QMHuBFjAmoIKfftYmGV05daLrcc,19
+instapaper_scraper-1.2.0rc1.dist-info/RECORD,,
```
instapaper_scraper-1.1.1.dist-info/RECORD
DELETED

```diff
@@ -1,13 +0,0 @@
-instapaper_scraper/__init__.py,sha256=qdcT3tp4KLufWH1u6tOuPVUQaXwakQD0gdjkwY4ljfg,206
-instapaper_scraper/api.py,sha256=67ZeiVjsZpGspB8S3ni8FS6LBAOHXBc_oz3vEDWDNms,12672
-instapaper_scraper/auth.py,sha256=OpgjbdI697FitumiyznWjey5-R2ZuxAEATaMz9NNnTc,7092
-instapaper_scraper/cli.py,sha256=YL9c7kksmj5iGKRvVqG0KO4rBbhTg5c9Lgvsf_brRPA,7579
-instapaper_scraper/constants.py,sha256=ubFWa47985lIz58qokMC0xQzTmCB6NOa17KFgWLn65E,403
-instapaper_scraper/exceptions.py,sha256=CptHoZe4NOhdjOoyXkZEMFgQC6oKtzjRljywwDEtsTg,134
-instapaper_scraper/output.py,sha256=cadyUOaGQ5Ct5iLiEkHDvN2cqYc1WmJTvAa7OxFjg0w,5618
-instapaper_scraper-1.1.1.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
-instapaper_scraper-1.1.1.dist-info/METADATA,sha256=CDiUTjY5eu1OTlFhhBNA1irP6gTNTLw6Ra-RIbkJeKY,14320
-instapaper_scraper-1.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-instapaper_scraper-1.1.1.dist-info/entry_points.txt,sha256=7AvRgN5fvtas_Duxdz-JPbDN6A1Lq2GaTfTSv54afxA,67
-instapaper_scraper-1.1.1.dist-info/top_level.txt,sha256=kiU9nLkqPOVPLsP4QMHuBFjAmoIKfftYmGV05daLrcc,19
-instapaper_scraper-1.1.1.dist-info/RECORD,,
```
{instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0rc1.dist-info}/entry_points.txt
RENAMED
File without changes

{instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0rc1.dist-info}/licenses/LICENSE
RENAMED
File without changes

{instapaper_scraper-1.1.1.dist-info → instapaper_scraper-1.2.0rc1.dist-info}/top_level.txt
RENAMED
File without changes