instapaper-scraper 1.1.1__tar.gz → 1.2.0rc1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {instapaper_scraper-1.1.1/src/instapaper_scraper.egg-info → instapaper_scraper-1.2.0rc1}/PKG-INFO +48 -28
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/README.md +46 -26
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/pyproject.toml +2 -2
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/api.py +41 -6
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/cli.py +30 -5
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/constants.py +1 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/output.py +50 -10
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1/src/instapaper_scraper.egg-info}/PKG-INFO +48 -28
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper.egg-info/SOURCES.txt +1 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper.egg-info/requires.txt +1 -1
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/tests/test_api.py +108 -9
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/tests/test_cli.py +69 -20
- instapaper_scraper-1.2.0rc1/tests/test_cli_config_flags.py +367 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/tests/test_output.py +37 -3
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/LICENSE +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/setup.cfg +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/__init__.py +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/auth.py +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/exceptions.py +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper.egg-info/dependency_links.txt +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper.egg-info/entry_points.txt +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper.egg-info/top_level.txt +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/tests/test_auth.py +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/tests/test_cli_priority.py +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/tests/test_init.py +0 -0
{instapaper_scraper-1.1.1/src/instapaper_scraper.egg-info → instapaper_scraper-1.2.0rc1}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: instapaper-scraper
-Version: 1.1.1
+Version: 1.2.0rc1
 Summary: A tool to scrape articles from Instapaper.
 Project-URL: Homepage, https://github.com/chriskyfung/InstapaperScraper
 Project-URL: Source, https://github.com/chriskyfung/InstapaperScraper
@@ -21,7 +21,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: beautifulsoup4~=4.14.2
-Requires-Dist: certifi
+Requires-Dist: certifi<2026.2.0,>=2025.11.12
 Requires-Dist: charset-normalizer~=3.4.3
 Requires-Dist: cryptography~=46.0.3
 Requires-Dist: guara~=0.0.14
@@ -141,9 +141,9 @@ The script authenticates using one of the following methods, in order of priorit

 > **Note on Security:** Your session file (`.instapaper_session`) and the encryption key (`.session_key`) are stored with secure permissions (read/write for the owner only) to protect your credentials.

-### 📁 Folder Configuration
+### 📁 Folder and Field Configuration

-You can define and quickly access your Instapaper folders using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):
+You can define and quickly access your Instapaper folders and set default output fields using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):

 1. The path specified by the `--config-path` argument.
 2. `config.toml` in the current working directory.
@@ -155,6 +155,12 @@ Here is an example of `config.toml`:
 # Default output filename for non-folder mode
 output_filename = "home-articles.csv"

+# Optional fields to include in the output.
+# These can be overridden by command-line flags.
+[fields]
+read_url = false
+article_preview = false
+
 [[folders]]
 key = "ml"
 id = "1234567"
@@ -169,10 +175,14 @@ output_filename = "python-articles.db"
 ```

 - **output_filename (top-level)**: The default output filename to use when not in folder mode.
-- **
--
--
-- **
+- **[fields]**: A section to control which optional data fields are included in the output.
+  - `read_url`: Set to `true` to include the Instapaper read URL for each article.
+  - `article_preview`: Set to `true` to include the article's text preview.
+- **[[folders]]**: Each `[[folders]]` block defines a specific folder.
+  - **key**: A short alias for the folder.
+  - **id**: The folder ID from the Instapaper URL.
+  - **slug**: The human-readable part of the folder URL.
+  - **output_filename (folder-specific)**: A preset output filename for scraped articles from this specific folder.

 When a `config.toml` file is present and no `--folder` argument is provided, the scraper will prompt you to select a folder. You can also specify a folder directly using the `--folder` argument with its key, ID, or slug. Use `--folder=none` to explicitly disable folder mode and scrape all articles.

@@ -186,7 +196,8 @@ When a `config.toml` file is present and no `--folder` argument is provided, the
 | `--output <filename>` | Specify a custom output filename. The file extension will be automatically corrected to match the selected format. |
 | `--username <user>` | Your Instapaper account username. |
 | `--password <pass>` | Your Instapaper account password. |
-| `--
+| `--[no-]read-url` | Includes the Instapaper read URL. (Old flag `--add-instapaper-url` is deprecated but supported). Can be set in `config.toml`. Overrides config. |
+| `--[no-]article-preview` | Includes the article preview text. (Old flag `--add-article-preview` is deprecated but supported). Can be set in `config.toml`. Overrides config. |

 ### 📄 Output Formats

@@ -204,10 +215,10 @@ When using `--output <filename>`, the file extension is automatically corrected

 The output data includes a unique `id` for each article. You can use this ID to construct a URL to the article's reader view: `https://www.instapaper.com/read/<article_id>`.

-For convenience, you can use the `--
+For convenience, you can use the `--read-url` flag to have the script include a full, clickable URL in the output.

 ```sh
-instapaper-scraper --
+instapaper-scraper --read-url
 ```

 This adds a `instapaper_url` field to each article in the JSON output and a `instapaper_url` column in the CSV and SQLite outputs. The original `id` field is preserved.
@@ -223,15 +234,15 @@ The tool is designed with a modular architecture for reliability and maintainabi

 ## 📊 Example Output

-### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url)
+### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url and --add-article-preview)

 ```csv
-"id","instapaper_url","title","url"
-"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/"
-"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/"
+"id","instapaper_url","title","url","article_preview"
+"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/","This is a preview of article 1."
+"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/","This is a preview of article 2."
 ```

-### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url)
+### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url and --add-article-preview)

 ```json
 [
@@ -239,13 +250,15 @@ The tool is designed with a modular architecture for reliability and maintainabi
 "id": "999901234",
 "title": "Article 1",
 "url": "https://www.example.com/page-1/",
-"instapaper_url": "https://www.instapaper.com/read/999901234"
+"instapaper_url": "https://www.instapaper.com/read/999901234",
+"article_preview": "This is a preview of article 1."
 },
 {
 "id": "999002345",
 "title": "Article 2",
 "url": "https://www.example.com/page-2/",
-"instapaper_url": "https://www.instapaper.com/read/999002345"
+"instapaper_url": "https://www.instapaper.com/read/999002345",
+"article_preview": "This is a preview of article 2."
 }
 ]
 ```
@@ -274,7 +287,18 @@ Please read the **[Contribution Guidelines](CONTRIBUTING.md)** before you start.

 ## 🧑‍💻 Development & Testing

-This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking.
+This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking. A `Makefile` is provided to simplify common development tasks.
+
+### 🚀 Using the Makefile
+
+The most common commands are:
+- `make install`: Installs development dependencies.
+- `make format`: Formats the entire codebase.
+- `make check`: Runs the linter, type checker, and test suite.
+- `make test`: Runs the test suite.
+- `make build`: Builds the distributable packages.
+
+Run `make help` to see all available commands.

 ### 🔧 Setup

@@ -300,13 +324,13 @@ python -m src.instapaper_scraper.cli

 ### ✅ Testing

-To run the tests, execute the following command from the project root:
+To run the tests, execute the following command from the project root (or use `make test`):

 ```sh
 pytest
 ```

-To check test coverage:
+To check test coverage (or use `make test-cov`):

 ```sh
 pytest --cov=src/instapaper_scraper --cov-report=term-missing
@@ -314,6 +338,8 @@ pytest --cov=src/instapaper_scraper --cov-report=term-missing

 ### ✨ Code Quality

+You can use the `Makefile` for convenience (e.g., `make format`, `make lint`).
+
 To format the code with `ruff`:

 ```sh
@@ -326,12 +352,6 @@ To check for linting errors with `ruff`:
 ruff check .
 ```

-To automatically fix linting errors:
-
-```sh
-ruff check . --fix
-```
-
 To run static type checking with `mypy`:

 ```sh
@@ -341,7 +361,7 @@ mypy src
 To run license checks:

 ```sh
-licensecheck --
+licensecheck --zero
 ```

{instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/README.md
RENAMED
@@ -93,9 +93,9 @@ The script authenticates using one of the following methods, in order of priorit

 > **Note on Security:** Your session file (`.instapaper_session`) and the encryption key (`.session_key`) are stored with secure permissions (read/write for the owner only) to protect your credentials.

-### 📁 Folder Configuration
+### 📁 Folder and Field Configuration

-You can define and quickly access your Instapaper folders using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):
+You can define and quickly access your Instapaper folders and set default output fields using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):

 1. The path specified by the `--config-path` argument.
 2. `config.toml` in the current working directory.
@@ -107,6 +107,12 @@ Here is an example of `config.toml`:
 # Default output filename for non-folder mode
 output_filename = "home-articles.csv"

+# Optional fields to include in the output.
+# These can be overridden by command-line flags.
+[fields]
+read_url = false
+article_preview = false
+
 [[folders]]
 key = "ml"
 id = "1234567"
@@ -121,10 +127,14 @@ output_filename = "python-articles.db"
 ```

 - **output_filename (top-level)**: The default output filename to use when not in folder mode.
-- **
--
--
-- **
+- **[fields]**: A section to control which optional data fields are included in the output.
+  - `read_url`: Set to `true` to include the Instapaper read URL for each article.
+  - `article_preview`: Set to `true` to include the article's text preview.
+- **[[folders]]**: Each `[[folders]]` block defines a specific folder.
+  - **key**: A short alias for the folder.
+  - **id**: The folder ID from the Instapaper URL.
+  - **slug**: The human-readable part of the folder URL.
+  - **output_filename (folder-specific)**: A preset output filename for scraped articles from this specific folder.

 When a `config.toml` file is present and no `--folder` argument is provided, the scraper will prompt you to select a folder. You can also specify a folder directly using the `--folder` argument with its key, ID, or slug. Use `--folder=none` to explicitly disable folder mode and scrape all articles.

@@ -138,7 +148,8 @@ When a `config.toml` file is present and no `--folder` argument is provided, the
 | `--output <filename>` | Specify a custom output filename. The file extension will be automatically corrected to match the selected format. |
 | `--username <user>` | Your Instapaper account username. |
 | `--password <pass>` | Your Instapaper account password. |
-| `--
+| `--[no-]read-url` | Includes the Instapaper read URL. (Old flag `--add-instapaper-url` is deprecated but supported). Can be set in `config.toml`. Overrides config. |
+| `--[no-]article-preview` | Includes the article preview text. (Old flag `--add-article-preview` is deprecated but supported). Can be set in `config.toml`. Overrides config. |

 ### 📄 Output Formats

@@ -156,10 +167,10 @@ When using `--output <filename>`, the file extension is automatically corrected

 The output data includes a unique `id` for each article. You can use this ID to construct a URL to the article's reader view: `https://www.instapaper.com/read/<article_id>`.

-For convenience, you can use the `--
+For convenience, you can use the `--read-url` flag to have the script include a full, clickable URL in the output.

 ```sh
-instapaper-scraper --
+instapaper-scraper --read-url
 ```

 This adds a `instapaper_url` field to each article in the JSON output and a `instapaper_url` column in the CSV and SQLite outputs. The original `id` field is preserved.
@@ -175,15 +186,15 @@ The tool is designed with a modular architecture for reliability and maintainabi

 ## 📊 Example Output

-### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url)
+### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url and --add-article-preview)

 ```csv
-"id","instapaper_url","title","url"
-"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/"
-"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/"
+"id","instapaper_url","title","url","article_preview"
+"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/","This is a preview of article 1."
+"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/","This is a preview of article 2."
 ```

-### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url)
+### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url and --add-article-preview)

 ```json
 [
@@ -191,13 +202,15 @@ The tool is designed with a modular architecture for reliability and maintainabi
 "id": "999901234",
 "title": "Article 1",
 "url": "https://www.example.com/page-1/",
-"instapaper_url": "https://www.instapaper.com/read/999901234"
+"instapaper_url": "https://www.instapaper.com/read/999901234",
+"article_preview": "This is a preview of article 1."
 },
 {
 "id": "999002345",
 "title": "Article 2",
 "url": "https://www.example.com/page-2/",
-"instapaper_url": "https://www.instapaper.com/read/999002345"
+"instapaper_url": "https://www.instapaper.com/read/999002345",
+"article_preview": "This is a preview of article 2."
 }
 ]
 ```
@@ -226,7 +239,18 @@ Please read the **[Contribution Guidelines](CONTRIBUTING.md)** before you start.

 ## 🧑‍💻 Development & Testing

-This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking.
+This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking. A `Makefile` is provided to simplify common development tasks.
+
+### 🚀 Using the Makefile
+
+The most common commands are:
+- `make install`: Installs development dependencies.
+- `make format`: Formats the entire codebase.
+- `make check`: Runs the linter, type checker, and test suite.
+- `make test`: Runs the test suite.
+- `make build`: Builds the distributable packages.
+
+Run `make help` to see all available commands.

 ### 🔧 Setup

@@ -252,13 +276,13 @@ python -m src.instapaper_scraper.cli

 ### ✅ Testing

-To run the tests, execute the following command from the project root:
+To run the tests, execute the following command from the project root (or use `make test`):

 ```sh
 pytest
 ```

-To check test coverage:
+To check test coverage (or use `make test-cov`):

 ```sh
 pytest --cov=src/instapaper_scraper --cov-report=term-missing
@@ -266,6 +290,8 @@ pytest --cov=src/instapaper_scraper --cov-report=term-missing

 ### ✨ Code Quality

+You can use the `Makefile` for convenience (e.g., `make format`, `make lint`).
+
 To format the code with `ruff`:

 ```sh
@@ -278,12 +304,6 @@ To check for linting errors with `ruff`:
 ruff check .
 ```

-To automatically fix linting errors:
-
-```sh
-ruff check . --fix
-```
-
 To run static type checking with `mypy`:

 ```sh
@@ -293,7 +313,7 @@ mypy src
 To run license checks:

 ```sh
-licensecheck --
+licensecheck --zero
 ```

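The `[fields]` table documented in the README changes above is ordinary TOML. For reference, a minimal sketch of loading it and applying the documented defaults with the standard-library `tomllib` is shown below; the helper name `read_fields_config` is illustrative only, and the package itself (which supports Python 3.9) may load its configuration differently.

```python
try:
    import tomllib  # standard library on Python 3.11+
except ModuleNotFoundError:
    import tomli as tomllib  # assumed backport for older interpreters; not a package dependency

from pathlib import Path
from typing import Dict


def read_fields_config(path: str = "config.toml") -> Dict[str, bool]:
    """Return the optional output-field toggles, defaulting both to False."""
    config_file = Path(path)
    if not config_file.is_file():
        return {"read_url": False, "article_preview": False}
    with config_file.open("rb") as fh:
        config = tomllib.load(fh)
    fields = config.get("fields", {})
    return {
        "read_url": bool(fields.get("read_url", False)),
        "article_preview": bool(fields.get("article_preview", False)),
    }


if __name__ == "__main__":
    print(read_fields_config())
```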
{instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/pyproject.toml
RENAMED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "instapaper-scraper"
-version = "1.1.1"
+version = "1.2.0rc1"
 description = "A tool to scrape articles from Instapaper."
 readme = "README.md"
 requires-python = ">=3.9"
@@ -25,7 +25,7 @@ classifiers = [
 license-files = ["LICEN[CS]E*"]
 dependencies = [
     "beautifulsoup4~=4.14.2",
-    "certifi
+    "certifi>=2025.11.12,<2026.2.0",
     "charset-normalizer~=3.4.3",
     "cryptography~=46.0.3",
     "guara~=0.0.14",
{instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/api.py
RENAMED
@@ -8,7 +8,13 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag

 from .exceptions import ScraperStructureChanged
-from .constants import
+from .constants import (
+    INSTAPAPER_BASE_URL,
+    KEY_ID,
+    KEY_TITLE,
+    KEY_URL,
+    KEY_ARTICLE_PREVIEW,
+)


 class InstapaperClient:
@@ -34,6 +40,7 @@ class InstapaperClient:
     PAGINATE_OLDER_CLASS = "paginate_older"
     ARTICLE_TITLE_CLASS = "article_title"
     TITLE_META_CLASS = "title_meta"
+    ARTICLE_PREVIEW_CLASS = "article_preview"

     # URL paths
     URL_PATH_USER = "/u/"
@@ -102,12 +109,14 @@ class InstapaperClient:
         self,
         page: int = DEFAULT_PAGE_START,
         folder_info: Optional[Dict[str, str]] = None,
+        add_article_preview: bool = False,
     ) -> Tuple[List[Dict[str, str]], bool]:
         """
         Fetches a single page of articles and determines if there are more pages.
         Args:
             page: The page number to fetch.
             folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
+            add_article_preview: Whether to include the article preview.
         Returns:
             A tuple containing:
             - A list of article data (dictionaries with id, title, url).
@@ -147,7 +156,9 @@ class InstapaperClient:
                 article_id_val.replace(self.ARTICLE_ID_PREFIX, "")
             )

-        data = self._parse_article_data(
+        data = self._parse_article_data(
+            soup, article_ids, page, add_article_preview
+        )
         has_more = soup.find(class_=self.PAGINATE_OLDER_CLASS) is not None

         return data, has_more
@@ -185,13 +196,17 @@ class InstapaperClient:
         raise Exception(self.MSG_SCRAPING_FAILED_UNKNOWN)

     def get_all_articles(
-        self,
+        self,
+        limit: Optional[int] = None,
+        folder_info: Optional[Dict[str, str]] = None,
+        add_article_preview: bool = False,
     ) -> List[Dict[str, str]]:
         """
         Iterates through pages and fetches articles up to a specified limit.
         Args:
             limit: The maximum number of pages to scrape. If None, scrapes all pages.
             folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
+            add_article_preview: Whether to include the article preview.
         """
         all_articles = []
         page = self.DEFAULT_PAGE_START
@@ -202,7 +217,11 @@ class InstapaperClient:
                 break

             logging.info(self.MSG_SCRAPING_PAGE.format(page=page))
-            data, has_more = self.get_articles(
+            data, has_more = self.get_articles(
+                page=page,
+                folder_info=folder_info,
+                add_article_preview=add_article_preview,
+            )
             if data:
                 all_articles.extend(data)
             page += 1
@@ -217,7 +236,11 @@ class InstapaperClient:
         return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_USER}{page}"

     def _parse_article_data(
-        self,
+        self,
+        soup: BeautifulSoup,
+        article_ids: List[str],
+        page: int,
+        add_article_preview: bool = False,
     ) -> List[Dict[str, Any]]:
         """Parses the raw HTML to extract structured data for each article."""
         data = []
@@ -249,7 +272,19 @@ class InstapaperClient:
                     raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
                 link = link_element["href"]

-
+                article_data = {KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link}
+
+                if add_article_preview:
+                    preview_element = article_element.find(
+                        class_=self.ARTICLE_PREVIEW_CLASS
+                    )
+                    article_data[KEY_ARTICLE_PREVIEW] = (
+                        preview_element.get_text().strip()
+                        if isinstance(preview_element, Tag)
+                        else ""
+                    )
+
+                data.append(article_data)
             except AttributeError as e:
                 logging.warning(
                     self.MSG_PARSE_ARTICLE_WARNING.format(
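The `_parse_article_data` change above reduces to a class-based lookup plus a guarded `get_text()`. A self-contained sketch of that pattern follows; the sample HTML and the standalone `extract_preview` helper are illustrative stand-ins, not the client's actual markup or code path.

```python
from bs4 import BeautifulSoup
from bs4.element import Tag

# Hypothetical markup shaped like the fields the scraper reads (id, title link, preview).
SAMPLE_HTML = """
<article id="article_999901234">
  <a class="article_title" href="https://www.example.com/page-1/">Article 1</a>
  <div class="article_preview">This is a preview of article 1.</div>
</article>
"""


def extract_preview(article_element: Tag) -> str:
    """Return the text of the article_preview element, or "" if it is missing."""
    preview_element = article_element.find(class_="article_preview")
    # Mirror the diff's isinstance guard: find() can return None instead of a Tag.
    return preview_element.get_text().strip() if isinstance(preview_element, Tag) else ""


soup = BeautifulSoup(SAMPLE_HTML, "html.parser")
article = soup.find("article")
assert isinstance(article, Tag)
print(extract_preview(article))  # -> "This is a preview of article 1."
```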
{instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/cli.py
RENAMED
@@ -102,9 +102,18 @@ def main() -> None:
     parser.add_argument("--username", help="Instapaper username.")
     parser.add_argument("--password", help="Instapaper password.")
     parser.add_argument(
-        "--
-
-
+        "--read-url",  # New, preferred flag
+        "--add-instapaper-url",  # Old, for backward compatibility
+        dest="add_instapaper_url",
+        action=argparse.BooleanOptionalAction,
+        help="Include the Instapaper read URL. Overrides config.",
+    )
+    parser.add_argument(
+        "--article-preview",  # New, preferred flag
+        "--add-article-preview",  # Old, for backward compatibility
+        dest="add_article_preview",
+        action=argparse.BooleanOptionalAction,
+        help="Include the article preview text. Overrides config.",
     )
     parser.add_argument(
         "--limit",
@@ -120,8 +129,21 @@ def main() -> None:

     config = load_config(args.config_path)
     folders = config.get("folders", []) if config else []
+    fields_config = config.get("fields", {}) if config else {}
     selected_folder = None

+    # Resolve boolean flags, giving CLI priority over config
+    final_add_instapaper_url = (
+        args.add_instapaper_url
+        if args.add_instapaper_url is not None
+        else fields_config.get("read_url", False)
+    )
+    final_add_article_preview = (
+        args.add_article_preview
+        if args.add_article_preview is not None
+        else fields_config.get("article_preview", False)
+    )
+
     if args.folder:
         if args.folder.lower() == "none":
             selected_folder = None
@@ -196,7 +218,9 @@ def main() -> None:
     try:
         folder_info = selected_folder if selected_folder else None
         all_articles = client.get_all_articles(
-            limit=args.limit,
+            limit=args.limit,
+            folder_info=folder_info,
+            add_article_preview=final_add_article_preview,
         )
     except ScraperStructureChanged as e:
         logging.error(f"Stopping scraper due to an unrecoverable error: {e}")
@@ -214,7 +238,8 @@ def main() -> None:
         all_articles,
         args.format,
         output_filename,
-        add_instapaper_url=
+        add_instapaper_url=final_add_instapaper_url,
+        add_article_preview=final_add_article_preview,
     )
     logging.info("Articles scraped and saved successfully.")
     except Exception as e:
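The cli.py hunks combine `argparse.BooleanOptionalAction` (which auto-generates `--read-url`/`--no-read-url`), a deprecated alias, and a fallback to the `[fields]` config. A trimmed-down, self-contained sketch of that resolution order is below; the `config` dict stands in for the parsed `config.toml`, and `resolve_read_url` is an illustrative helper, not the package's `main()`.

```python
import argparse


def resolve_read_url(argv: list[str], config: dict) -> bool:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--read-url",            # new, preferred flag
        "--add-instapaper-url",  # old alias kept for backward compatibility
        dest="add_instapaper_url",
        action=argparse.BooleanOptionalAction,  # also provides --no-read-url
        help="Include the Instapaper read URL. Overrides config.",
    )
    args = parser.parse_args(argv)
    fields = config.get("fields", {})
    # None means the flag was not given on the command line, so the config value
    # (defaulting to False) is used instead.
    if args.add_instapaper_url is not None:
        return args.add_instapaper_url
    return bool(fields.get("read_url", False))


print(resolve_read_url([], {"fields": {"read_url": True}}))                 # True (from config)
print(resolve_read_url(["--no-read-url"], {"fields": {"read_url": True}}))  # False (CLI overrides)
print(resolve_read_url(["--read-url"], {}))                                 # True
```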