instapaper-scraper 1.1.1__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {instapaper_scraper-1.1.1/src/instapaper_scraper.egg-info → instapaper_scraper-1.2.0}/PKG-INFO +56 -32
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/README.md +54 -30
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/pyproject.toml +2 -2
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/src/instapaper_scraper/api.py +41 -6
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/src/instapaper_scraper/cli.py +30 -5
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/src/instapaper_scraper/constants.py +1 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/src/instapaper_scraper/output.py +50 -10
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0/src/instapaper_scraper.egg-info}/PKG-INFO +56 -32
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/src/instapaper_scraper.egg-info/SOURCES.txt +1 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/src/instapaper_scraper.egg-info/requires.txt +1 -1
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/tests/test_api.py +108 -9
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/tests/test_cli.py +69 -20
- instapaper_scraper-1.2.0/tests/test_cli_config_flags.py +367 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/tests/test_output.py +37 -3
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/LICENSE +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/setup.cfg +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/src/instapaper_scraper/__init__.py +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/src/instapaper_scraper/auth.py +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/src/instapaper_scraper/exceptions.py +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/src/instapaper_scraper.egg-info/dependency_links.txt +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/src/instapaper_scraper.egg-info/entry_points.txt +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/src/instapaper_scraper.egg-info/top_level.txt +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/tests/test_auth.py +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/tests/test_cli_priority.py +0 -0
- {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0}/tests/test_init.py +0 -0
{instapaper_scraper-1.1.1/src/instapaper_scraper.egg-info → instapaper_scraper-1.2.0}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: instapaper-scraper
|
|
3
|
-
Version: 1.1.1
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: A tool to scrape articles from Instapaper.
|
|
5
5
|
Project-URL: Homepage, https://github.com/chriskyfung/InstapaperScraper
|
|
6
6
|
Project-URL: Source, https://github.com/chriskyfung/InstapaperScraper
|
|
@@ -21,7 +21,7 @@ Requires-Python: >=3.9
|
|
|
21
21
|
Description-Content-Type: text/markdown
|
|
22
22
|
License-File: LICENSE
|
|
23
23
|
Requires-Dist: beautifulsoup4~=4.14.2
|
|
24
|
-
Requires-Dist: certifi
|
|
24
|
+
Requires-Dist: certifi<2026.2.0,>=2025.11.12
|
|
25
25
|
Requires-Dist: charset-normalizer~=3.4.3
|
|
26
26
|
Requires-Dist: cryptography~=46.0.3
|
|
27
27
|
Requires-Dist: guara~=0.0.14
|
|
@@ -72,16 +72,20 @@ Dynamic: license-file
|
|
|
72
72
|
<a href="https://www.gnu.org/licenses/gpl-3.0.en.html">
|
|
73
73
|
<img src="https://img.shields.io/github/license/chriskyfung/InstapaperScraper" alt="GitHub License">
|
|
74
74
|
</a>
|
|
75
|
+
</p>
|
|
76
|
+
|
|
77
|
+
A powerful and reliable Python tool to automate the export of all your saved Instapaper bookmarks into various formats, giving you full ownership of your data.
|
|
78
|
+
|
|
79
|
+
<!-- Sponsors -->
|
|
80
|
+
<p align="center">
|
|
75
81
|
<a href="https://github.com/sponsors/chriskyfung" title="Sponsor on GitHub">
|
|
76
|
-
<img src="https://img.shields.io/badge/Sponsor-GitHub-blue?logo=github-sponsors&colorA=263238&colorB=EC407A" alt="GitHub Sponsors Default">
|
|
82
|
+
<img src="https://img.shields.io/badge/Sponsor-GitHub-blue?style=for-the-badge&logo=github-sponsors&colorA=263238&colorB=EC407A" alt="GitHub Sponsors Default">
|
|
77
83
|
</a>
|
|
78
84
|
<a href="https://www.buymeacoffee.com/chriskyfung" title="Support Coffee">
|
|
79
|
-
<img src="https://img.shields.io/badge/Support-Coffee-ffdd00?logo=buy-me-a-coffee&logoColor=ffdd00&colorA=263238" alt="Buy Me A Coffee">
|
|
85
|
+
<img src="https://img.shields.io/badge/Support-Coffee-ffdd00?style=for-the-badge&logo=buy-me-a-coffee&logoColor=ffdd00&colorA=263238" alt="Buy Me A Coffee">
|
|
80
86
|
</a>
|
|
81
87
|
</p>
|
|
82
88
|
|
|
83
|
-
A powerful and reliable Python tool to automate the export of all your saved Instapaper bookmarks into various formats, giving you full ownership of your data.
|
|
84
|
-
|
|
85
89
|
## ✨ Features
|
|
86
90
|
|
|
87
91
|
- Scrapes all bookmarks from your Instapaper account.
|
|
@@ -141,9 +145,9 @@ The script authenticates using one of the following methods, in order of priorit
|
|
|
141
145
|
|
|
142
146
|
> **Note on Security:** Your session file (`.instapaper_session`) and the encryption key (`.session_key`) are stored with secure permissions (read/write for the owner only) to protect your credentials.
|
|
143
147
|
|
|
144
|
-
### 📁 Folder Configuration
|
|
148
|
+
### 📁 Folder and Field Configuration
|
|
145
149
|
|
|
146
|
-
You can define and quickly access your Instapaper folders using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):
|
|
150
|
+
You can define and quickly access your Instapaper folders and set default output fields using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):
|
|
147
151
|
|
|
148
152
|
1. The path specified by the `--config-path` argument.
|
|
149
153
|
2. `config.toml` in the current working directory.
|
|
@@ -155,6 +159,12 @@ Here is an example of `config.toml`:
|
|
|
155
159
|
# Default output filename for non-folder mode
|
|
156
160
|
output_filename = "home-articles.csv"
|
|
157
161
|
|
|
162
|
+
# Optional fields to include in the output.
|
|
163
|
+
# These can be overridden by command-line flags.
|
|
164
|
+
[fields]
|
|
165
|
+
read_url = false
|
|
166
|
+
article_preview = false
|
|
167
|
+
|
|
158
168
|
[[folders]]
|
|
159
169
|
key = "ml"
|
|
160
170
|
id = "1234567"
|
|
@@ -169,10 +179,14 @@ output_filename = "python-articles.db"
|
|
|
169
179
|
```
|
|
170
180
|
|
|
171
181
|
- **output_filename (top-level)**: The default output filename to use when not in folder mode.
|
|
172
|
-
- **
|
|
173
|
-
-
|
|
174
|
-
-
|
|
175
|
-
- **
|
|
182
|
+
- **[fields]**: A section to control which optional data fields are included in the output.
|
|
183
|
+
- `read_url`: Set to `true` to include the Instapaper read URL for each article.
|
|
184
|
+
- `article_preview`: Set to `true` to include the article's text preview.
|
|
185
|
+
- **[[folders]]**: Each `[[folders]]` block defines a specific folder.
|
|
186
|
+
- **key**: A short alias for the folder.
|
|
187
|
+
- **id**: The folder ID from the Instapaper URL.
|
|
188
|
+
- **slug**: The human-readable part of the folder URL.
|
|
189
|
+
- **output_filename (folder-specific)**: A preset output filename for scraped articles from this specific folder.
|
|
176
190
|
|
|
177
191
|
When a `config.toml` file is present and no `--folder` argument is provided, the scraper will prompt you to select a folder. You can also specify a folder directly using the `--folder` argument with its key, ID, or slug. Use `--folder=none` to explicitly disable folder mode and scrape all articles.
|
|
178
192
|
|
|
@@ -186,7 +200,8 @@ When a `config.toml` file is present and no `--folder` argument is provided, the
|
|
|
186
200
|
| `--output <filename>` | Specify a custom output filename. The file extension will be automatically corrected to match the selected format. |
|
|
187
201
|
| `--username <user>` | Your Instapaper account username. |
|
|
188
202
|
| `--password <pass>` | Your Instapaper account password. |
|
|
189
|
-
| `--
|
|
203
|
+
| `--[no-]read-url` | Includes the Instapaper read URL. (Old flag `--add-instapaper-url` is deprecated but supported). Can be set in `config.toml`. Overrides config. |
|
|
204
|
+
| `--[no-]article-preview` | Includes the article preview text. (Old flag `--add-article-preview` is deprecated but supported). Can be set in `config.toml`. Overrides config. |
|
|
190
205
|
|
|
191
206
|
### 📄 Output Formats
|
|
192
207
|
|
|
@@ -204,10 +219,10 @@ When using `--output <filename>`, the file extension is automatically corrected
|
|
|
204
219
|
|
|
205
220
|
The output data includes a unique `id` for each article. You can use this ID to construct a URL to the article's reader view: `https://www.instapaper.com/read/<article_id>`.
|
|
206
221
|
|
|
207
|
-
For convenience, you can use the `--add-instapaper-url` flag to have the script include a full, clickable URL in the output.
|
|
222
|
+
For convenience, you can use the `--read-url` flag to have the script include a full, clickable URL in the output.
|
|
208
223
|
|
|
209
224
|
```sh
|
|
210
|
-
instapaper-scraper --add-instapaper-url
|
|
225
|
+
instapaper-scraper --read-url
|
|
211
226
|
```
|
|
212
227
|
|
|
213
228
|
This adds a `instapaper_url` field to each article in the JSON output and a `instapaper_url` column in the CSV and SQLite outputs. The original `id` field is preserved.
|
|
@@ -223,15 +238,15 @@ The tool is designed with a modular architecture for reliability and maintainabi
|
|
|
223
238
|
|
|
224
239
|
## 📊 Example Output
|
|
225
240
|
|
|
226
|
-
### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url)
|
|
241
|
+
### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url and --add-article-preview)
|
|
227
242
|
|
|
228
243
|
```csv
|
|
229
|
-
"id","instapaper_url","title","url"
|
|
230
|
-
"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/"
|
|
231
|
-
"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/"
|
|
244
|
+
"id","instapaper_url","title","url","article_preview"
|
|
245
|
+
"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/","This is a preview of article 1."
|
|
246
|
+
"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/","This is a preview of article 2."
|
|
232
247
|
```
|
|
233
248
|
|
|
234
|
-
### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url)
|
|
249
|
+
### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url and --add-article-preview)
|
|
235
250
|
|
|
236
251
|
```json
|
|
237
252
|
[
|
|
@@ -239,13 +254,15 @@ The tool is designed with a modular architecture for reliability and maintainabi
|
|
|
239
254
|
"id": "999901234",
|
|
240
255
|
"title": "Article 1",
|
|
241
256
|
"url": "https://www.example.com/page-1/",
|
|
242
|
-
"instapaper_url": "https://www.instapaper.com/read/999901234"
|
|
257
|
+
"instapaper_url": "https://www.instapaper.com/read/999901234",
|
|
258
|
+
"article_preview": "This is a preview of article 1."
|
|
243
259
|
},
|
|
244
260
|
{
|
|
245
261
|
"id": "999002345",
|
|
246
262
|
"title": "Article 2",
|
|
247
263
|
"url": "https://www.example.com/page-2/",
|
|
248
|
-
"instapaper_url": "https://www.instapaper.com/read/999002345"
|
|
264
|
+
"instapaper_url": "https://www.instapaper.com/read/999002345",
|
|
265
|
+
"article_preview": "This is a preview of article 2."
|
|
249
266
|
}
|
|
250
267
|
]
|
|
251
268
|
```
|
|
@@ -274,7 +291,18 @@ Please read the **[Contribution Guidelines](CONTRIBUTING.md)** before you start.
|
|
|
274
291
|
|
|
275
292
|
## 🧑‍💻 Development & Testing
|
|
276
293
|
|
|
277
|
-
This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking.
|
|
294
|
+
This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking. A `Makefile` is provided to simplify common development tasks.
|
|
295
|
+
|
|
296
|
+
### 🚀 Using the Makefile
|
|
297
|
+
|
|
298
|
+
The most common commands are:
|
|
299
|
+
- `make install`: Installs development dependencies.
|
|
300
|
+
- `make format`: Formats the entire codebase.
|
|
301
|
+
- `make check`: Runs the linter, type checker, and test suite.
|
|
302
|
+
- `make test`: Runs the test suite.
|
|
303
|
+
- `make build`: Builds the distributable packages.
|
|
304
|
+
|
|
305
|
+
Run `make help` to see all available commands.
|
|
278
306
|
|
|
279
307
|
### 🔧 Setup
|
|
280
308
|
|
|
@@ -300,13 +328,13 @@ python -m src.instapaper_scraper.cli
|
|
|
300
328
|
|
|
301
329
|
### ✅ Testing
|
|
302
330
|
|
|
303
|
-
To run the tests, execute the following command from the project root:
|
|
331
|
+
To run the tests, execute the following command from the project root (or use `make test`):
|
|
304
332
|
|
|
305
333
|
```sh
|
|
306
334
|
pytest
|
|
307
335
|
```
|
|
308
336
|
|
|
309
|
-
To check test coverage:
|
|
337
|
+
To check test coverage (or use `make test-cov`):
|
|
310
338
|
|
|
311
339
|
```sh
|
|
312
340
|
pytest --cov=src/instapaper_scraper --cov-report=term-missing
|
|
@@ -314,6 +342,8 @@ pytest --cov=src/instapaper_scraper --cov-report=term-missing
|
|
|
314
342
|
|
|
315
343
|
### ✨ Code Quality
|
|
316
344
|
|
|
345
|
+
You can use the `Makefile` for convenience (e.g., `make format`, `make lint`).
|
|
346
|
+
|
|
317
347
|
To format the code with `ruff`:
|
|
318
348
|
|
|
319
349
|
```sh
|
|
@@ -326,12 +356,6 @@ To check for linting errors with `ruff`:
|
|
|
326
356
|
ruff check .
|
|
327
357
|
```
|
|
328
358
|
|
|
329
|
-
To automatically fix linting errors:
|
|
330
|
-
|
|
331
|
-
```sh
|
|
332
|
-
ruff check . --fix
|
|
333
|
-
```
|
|
334
|
-
|
|
335
359
|
To run static type checking with `mypy`:
|
|
336
360
|
|
|
337
361
|
```sh
|
|
@@ -341,7 +365,7 @@ mypy src
|
|
|
341
365
|
To run license checks:
|
|
342
366
|
|
|
343
367
|
```sh
|
|
344
|
-
licensecheck --
|
|
368
|
+
licensecheck --zero
|
|
345
369
|
```
|
|
346
370
|
|
|
347
371
|
|
|
@@ -24,16 +24,20 @@
|
|
|
24
24
|
<a href="https://www.gnu.org/licenses/gpl-3.0.en.html">
|
|
25
25
|
<img src="https://img.shields.io/github/license/chriskyfung/InstapaperScraper" alt="GitHub License">
|
|
26
26
|
</a>
|
|
27
|
+
</p>
|
|
28
|
+
|
|
29
|
+
A powerful and reliable Python tool to automate the export of all your saved Instapaper bookmarks into various formats, giving you full ownership of your data.
|
|
30
|
+
|
|
31
|
+
<!-- Sponsors -->
|
|
32
|
+
<p align="center">
|
|
27
33
|
<a href="https://github.com/sponsors/chriskyfung" title="Sponsor on GitHub">
|
|
28
|
-
<img src="https://img.shields.io/badge/Sponsor-GitHub-blue?logo=github-sponsors&colorA=263238&colorB=EC407A" alt="GitHub Sponsors Default">
|
|
34
|
+
<img src="https://img.shields.io/badge/Sponsor-GitHub-blue?style=for-the-badge&logo=github-sponsors&colorA=263238&colorB=EC407A" alt="GitHub Sponsors Default">
|
|
29
35
|
</a>
|
|
30
36
|
<a href="https://www.buymeacoffee.com/chriskyfung" title="Support Coffee">
|
|
31
|
-
<img src="https://img.shields.io/badge/Support-Coffee-ffdd00?logo=buy-me-a-coffee&logoColor=ffdd00&colorA=263238" alt="Buy Me A Coffee">
|
|
37
|
+
<img src="https://img.shields.io/badge/Support-Coffee-ffdd00?style=for-the-badge&logo=buy-me-a-coffee&logoColor=ffdd00&colorA=263238" alt="Buy Me A Coffee">
|
|
32
38
|
</a>
|
|
33
39
|
</p>
|
|
34
40
|
|
|
35
|
-
A powerful and reliable Python tool to automate the export of all your saved Instapaper bookmarks into various formats, giving you full ownership of your data.
|
|
36
|
-
|
|
37
41
|
## ✨ Features
|
|
38
42
|
|
|
39
43
|
- Scrapes all bookmarks from your Instapaper account.
|
|
@@ -93,9 +97,9 @@ The script authenticates using one of the following methods, in order of priorit
|
|
|
93
97
|
|
|
94
98
|
> **Note on Security:** Your session file (`.instapaper_session`) and the encryption key (`.session_key`) are stored with secure permissions (read/write for the owner only) to protect your credentials.
|
|
95
99
|
|
|
96
|
-
### 📁 Folder Configuration
|
|
100
|
+
### 📁 Folder and Field Configuration
|
|
97
101
|
|
|
98
|
-
You can define and quickly access your Instapaper folders using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):
|
|
102
|
+
You can define and quickly access your Instapaper folders and set default output fields using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):
|
|
99
103
|
|
|
100
104
|
1. The path specified by the `--config-path` argument.
|
|
101
105
|
2. `config.toml` in the current working directory.
|
|
@@ -107,6 +111,12 @@ Here is an example of `config.toml`:
|
|
|
107
111
|
# Default output filename for non-folder mode
|
|
108
112
|
output_filename = "home-articles.csv"
|
|
109
113
|
|
|
114
|
+
# Optional fields to include in the output.
|
|
115
|
+
# These can be overridden by command-line flags.
|
|
116
|
+
[fields]
|
|
117
|
+
read_url = false
|
|
118
|
+
article_preview = false
|
|
119
|
+
|
|
110
120
|
[[folders]]
|
|
111
121
|
key = "ml"
|
|
112
122
|
id = "1234567"
|
|
@@ -121,10 +131,14 @@ output_filename = "python-articles.db"
|
|
|
121
131
|
```
|
|
122
132
|
|
|
123
133
|
- **output_filename (top-level)**: The default output filename to use when not in folder mode.
|
|
124
|
-
- **
|
|
125
|
-
-
|
|
126
|
-
-
|
|
127
|
-
- **
|
|
134
|
+
- **[fields]**: A section to control which optional data fields are included in the output.
|
|
135
|
+
- `read_url`: Set to `true` to include the Instapaper read URL for each article.
|
|
136
|
+
- `article_preview`: Set to `true` to include the article's text preview.
|
|
137
|
+
- **[[folders]]**: Each `[[folders]]` block defines a specific folder.
|
|
138
|
+
- **key**: A short alias for the folder.
|
|
139
|
+
- **id**: The folder ID from the Instapaper URL.
|
|
140
|
+
- **slug**: The human-readable part of the folder URL.
|
|
141
|
+
- **output_filename (folder-specific)**: A preset output filename for scraped articles from this specific folder.
|
|
128
142
|
|
|
129
143
|
When a `config.toml` file is present and no `--folder` argument is provided, the scraper will prompt you to select a folder. You can also specify a folder directly using the `--folder` argument with its key, ID, or slug. Use `--folder=none` to explicitly disable folder mode and scrape all articles.
|
|
130
144
|
|
|
@@ -138,7 +152,8 @@ When a `config.toml` file is present and no `--folder` argument is provided, the
|
|
|
138
152
|
| `--output <filename>` | Specify a custom output filename. The file extension will be automatically corrected to match the selected format. |
|
|
139
153
|
| `--username <user>` | Your Instapaper account username. |
|
|
140
154
|
| `--password <pass>` | Your Instapaper account password. |
|
|
141
|
-
| `--
|
|
155
|
+
| `--[no-]read-url` | Includes the Instapaper read URL. (Old flag `--add-instapaper-url` is deprecated but supported). Can be set in `config.toml`. Overrides config. |
|
|
156
|
+
| `--[no-]article-preview` | Includes the article preview text. (Old flag `--add-article-preview` is deprecated but supported). Can be set in `config.toml`. Overrides config. |
|
|
142
157
|
|
|
143
158
|
### 📄 Output Formats
|
|
144
159
|
|
|
@@ -156,10 +171,10 @@ When using `--output <filename>`, the file extension is automatically corrected
|
|
|
156
171
|
|
|
157
172
|
The output data includes a unique `id` for each article. You can use this ID to construct a URL to the article's reader view: `https://www.instapaper.com/read/<article_id>`.
|
|
158
173
|
|
|
159
|
-
For convenience, you can use the `--add-instapaper-url` flag to have the script include a full, clickable URL in the output.
|
|
174
|
+
For convenience, you can use the `--read-url` flag to have the script include a full, clickable URL in the output.
|
|
160
175
|
|
|
161
176
|
```sh
|
|
162
|
-
instapaper-scraper --add-instapaper-url
|
|
177
|
+
instapaper-scraper --read-url
|
|
163
178
|
```
|
|
164
179
|
|
|
165
180
|
This adds a `instapaper_url` field to each article in the JSON output and a `instapaper_url` column in the CSV and SQLite outputs. The original `id` field is preserved.
|
|
@@ -175,15 +190,15 @@ The tool is designed with a modular architecture for reliability and maintainabi
|
|
|
175
190
|
|
|
176
191
|
## 📊 Example Output
|
|
177
192
|
|
|
178
|
-
### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url)
|
|
193
|
+
### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url and --add-article-preview)
|
|
179
194
|
|
|
180
195
|
```csv
|
|
181
|
-
"id","instapaper_url","title","url"
|
|
182
|
-
"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/"
|
|
183
|
-
"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/"
|
|
196
|
+
"id","instapaper_url","title","url","article_preview"
|
|
197
|
+
"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/","This is a preview of article 1."
|
|
198
|
+
"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/","This is a preview of article 2."
|
|
184
199
|
```
|
|
185
200
|
|
|
186
|
-
### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url)
|
|
201
|
+
### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url and --add-article-preview)
|
|
187
202
|
|
|
188
203
|
```json
|
|
189
204
|
[
|
|
@@ -191,13 +206,15 @@ The tool is designed with a modular architecture for reliability and maintainabi
|
|
|
191
206
|
"id": "999901234",
|
|
192
207
|
"title": "Article 1",
|
|
193
208
|
"url": "https://www.example.com/page-1/",
|
|
194
|
-
"instapaper_url": "https://www.instapaper.com/read/999901234"
|
|
209
|
+
"instapaper_url": "https://www.instapaper.com/read/999901234",
|
|
210
|
+
"article_preview": "This is a preview of article 1."
|
|
195
211
|
},
|
|
196
212
|
{
|
|
197
213
|
"id": "999002345",
|
|
198
214
|
"title": "Article 2",
|
|
199
215
|
"url": "https://www.example.com/page-2/",
|
|
200
|
-
"instapaper_url": "https://www.instapaper.com/read/999002345"
|
|
216
|
+
"instapaper_url": "https://www.instapaper.com/read/999002345",
|
|
217
|
+
"article_preview": "This is a preview of article 2."
|
|
201
218
|
}
|
|
202
219
|
]
|
|
203
220
|
```
|
|
@@ -226,7 +243,18 @@ Please read the **[Contribution Guidelines](CONTRIBUTING.md)** before you start.
|
|
|
226
243
|
|
|
227
244
|
## 🧑‍💻 Development & Testing
|
|
228
245
|
|
|
229
|
-
This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking.
|
|
246
|
+
This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking. A `Makefile` is provided to simplify common development tasks.
|
|
247
|
+
|
|
248
|
+
### 🚀 Using the Makefile
|
|
249
|
+
|
|
250
|
+
The most common commands are:
|
|
251
|
+
- `make install`: Installs development dependencies.
|
|
252
|
+
- `make format`: Formats the entire codebase.
|
|
253
|
+
- `make check`: Runs the linter, type checker, and test suite.
|
|
254
|
+
- `make test`: Runs the test suite.
|
|
255
|
+
- `make build`: Builds the distributable packages.
|
|
256
|
+
|
|
257
|
+
Run `make help` to see all available commands.
|
|
230
258
|
|
|
231
259
|
### 🔧 Setup
|
|
232
260
|
|
|
@@ -252,13 +280,13 @@ python -m src.instapaper_scraper.cli
|
|
|
252
280
|
|
|
253
281
|
### ✅ Testing
|
|
254
282
|
|
|
255
|
-
To run the tests, execute the following command from the project root:
|
|
283
|
+
To run the tests, execute the following command from the project root (or use `make test`):
|
|
256
284
|
|
|
257
285
|
```sh
|
|
258
286
|
pytest
|
|
259
287
|
```
|
|
260
288
|
|
|
261
|
-
To check test coverage:
|
|
289
|
+
To check test coverage (or use `make test-cov`):
|
|
262
290
|
|
|
263
291
|
```sh
|
|
264
292
|
pytest --cov=src/instapaper_scraper --cov-report=term-missing
|
|
@@ -266,6 +294,8 @@ pytest --cov=src/instapaper_scraper --cov-report=term-missing
|
|
|
266
294
|
|
|
267
295
|
### ✨ Code Quality
|
|
268
296
|
|
|
297
|
+
You can use the `Makefile` for convenience (e.g., `make format`, `make lint`).
|
|
298
|
+
|
|
269
299
|
To format the code with `ruff`:
|
|
270
300
|
|
|
271
301
|
```sh
|
|
@@ -278,12 +308,6 @@ To check for linting errors with `ruff`:
|
|
|
278
308
|
ruff check .
|
|
279
309
|
```
|
|
280
310
|
|
|
281
|
-
To automatically fix linting errors:
|
|
282
|
-
|
|
283
|
-
```sh
|
|
284
|
-
ruff check . --fix
|
|
285
|
-
```
|
|
286
|
-
|
|
287
311
|
To run static type checking with `mypy`:
|
|
288
312
|
|
|
289
313
|
```sh
|
|
@@ -293,7 +317,7 @@ mypy src
|
|
|
293
317
|
To run license checks:
|
|
294
318
|
|
|
295
319
|
```sh
|
|
296
|
-
licensecheck --
|
|
320
|
+
licensecheck --zero
|
|
297
321
|
```
|
|
298
322
|
|
|
299
323
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "instapaper-scraper"
|
|
7
|
-
version = "1.1.1"
|
|
7
|
+
version = "1.2.0"
|
|
8
8
|
description = "A tool to scrape articles from Instapaper."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -25,7 +25,7 @@ classifiers = [
|
|
|
25
25
|
license-files = ["LICEN[CS]E*"]
|
|
26
26
|
dependencies = [
|
|
27
27
|
"beautifulsoup4~=4.14.2",
|
|
28
|
-
"certifi
|
|
28
|
+
"certifi>=2025.11.12,<2026.2.0",
|
|
29
29
|
"charset-normalizer~=3.4.3",
|
|
30
30
|
"cryptography~=46.0.3",
|
|
31
31
|
"guara~=0.0.14",
|
|
@@ -8,7 +8,13 @@ from bs4 import BeautifulSoup
|
|
|
8
8
|
from bs4.element import Tag
|
|
9
9
|
|
|
10
10
|
from .exceptions import ScraperStructureChanged
|
|
11
|
-
from .constants import
|
|
11
|
+
from .constants import (
|
|
12
|
+
INSTAPAPER_BASE_URL,
|
|
13
|
+
KEY_ID,
|
|
14
|
+
KEY_TITLE,
|
|
15
|
+
KEY_URL,
|
|
16
|
+
KEY_ARTICLE_PREVIEW,
|
|
17
|
+
)
|
|
12
18
|
|
|
13
19
|
|
|
14
20
|
class InstapaperClient:
|
|
@@ -34,6 +40,7 @@ class InstapaperClient:
|
|
|
34
40
|
PAGINATE_OLDER_CLASS = "paginate_older"
|
|
35
41
|
ARTICLE_TITLE_CLASS = "article_title"
|
|
36
42
|
TITLE_META_CLASS = "title_meta"
|
|
43
|
+
ARTICLE_PREVIEW_CLASS = "article_preview"
|
|
37
44
|
|
|
38
45
|
# URL paths
|
|
39
46
|
URL_PATH_USER = "/u/"
|
|
@@ -102,12 +109,14 @@ class InstapaperClient:
|
|
|
102
109
|
self,
|
|
103
110
|
page: int = DEFAULT_PAGE_START,
|
|
104
111
|
folder_info: Optional[Dict[str, str]] = None,
|
|
112
|
+
add_article_preview: bool = False,
|
|
105
113
|
) -> Tuple[List[Dict[str, str]], bool]:
|
|
106
114
|
"""
|
|
107
115
|
Fetches a single page of articles and determines if there are more pages.
|
|
108
116
|
Args:
|
|
109
117
|
page: The page number to fetch.
|
|
110
118
|
folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
|
|
119
|
+
add_article_preview: Whether to include the article preview.
|
|
111
120
|
Returns:
|
|
112
121
|
A tuple containing:
|
|
113
122
|
- A list of article data (dictionaries with id, title, url).
|
|
@@ -147,7 +156,9 @@ class InstapaperClient:
|
|
|
147
156
|
article_id_val.replace(self.ARTICLE_ID_PREFIX, "")
|
|
148
157
|
)
|
|
149
158
|
|
|
150
|
-
data = self._parse_article_data(soup, article_ids, page)
|
|
159
|
+
data = self._parse_article_data(
|
|
160
|
+
soup, article_ids, page, add_article_preview
|
|
161
|
+
)
|
|
151
162
|
has_more = soup.find(class_=self.PAGINATE_OLDER_CLASS) is not None
|
|
152
163
|
|
|
153
164
|
return data, has_more
|
|
@@ -185,13 +196,17 @@ class InstapaperClient:
|
|
|
185
196
|
raise Exception(self.MSG_SCRAPING_FAILED_UNKNOWN)
|
|
186
197
|
|
|
187
198
|
def get_all_articles(
|
|
188
|
-
self, limit: Optional[int] = None, folder_info: Optional[Dict[str, str]] = None
|
|
199
|
+
self,
|
|
200
|
+
limit: Optional[int] = None,
|
|
201
|
+
folder_info: Optional[Dict[str, str]] = None,
|
|
202
|
+
add_article_preview: bool = False,
|
|
189
203
|
) -> List[Dict[str, str]]:
|
|
190
204
|
"""
|
|
191
205
|
Iterates through pages and fetches articles up to a specified limit.
|
|
192
206
|
Args:
|
|
193
207
|
limit: The maximum number of pages to scrape. If None, scrapes all pages.
|
|
194
208
|
folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
|
|
209
|
+
add_article_preview: Whether to include the article preview.
|
|
195
210
|
"""
|
|
196
211
|
all_articles = []
|
|
197
212
|
page = self.DEFAULT_PAGE_START
|
|
@@ -202,7 +217,11 @@ class InstapaperClient:
|
|
|
202
217
|
break
|
|
203
218
|
|
|
204
219
|
logging.info(self.MSG_SCRAPING_PAGE.format(page=page))
|
|
205
|
-
data, has_more = self.get_articles(page=page, folder_info=folder_info)
|
|
220
|
+
data, has_more = self.get_articles(
|
|
221
|
+
page=page,
|
|
222
|
+
folder_info=folder_info,
|
|
223
|
+
add_article_preview=add_article_preview,
|
|
224
|
+
)
|
|
206
225
|
if data:
|
|
207
226
|
all_articles.extend(data)
|
|
208
227
|
page += 1
|
|
@@ -217,7 +236,11 @@ class InstapaperClient:
|
|
|
217
236
|
return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_USER}{page}"
|
|
218
237
|
|
|
219
238
|
def _parse_article_data(
|
|
220
|
-
self, soup: BeautifulSoup, article_ids: List[str], page: int
|
|
239
|
+
self,
|
|
240
|
+
soup: BeautifulSoup,
|
|
241
|
+
article_ids: List[str],
|
|
242
|
+
page: int,
|
|
243
|
+
add_article_preview: bool = False,
|
|
221
244
|
) -> List[Dict[str, Any]]:
|
|
222
245
|
"""Parses the raw HTML to extract structured data for each article."""
|
|
223
246
|
data = []
|
|
@@ -249,7 +272,19 @@ class InstapaperClient:
|
|
|
249
272
|
raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
|
|
250
273
|
link = link_element["href"]
|
|
251
274
|
|
|
252
|
-
|
|
275
|
+
article_data = {KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link}
|
|
276
|
+
|
|
277
|
+
if add_article_preview:
|
|
278
|
+
preview_element = article_element.find(
|
|
279
|
+
class_=self.ARTICLE_PREVIEW_CLASS
|
|
280
|
+
)
|
|
281
|
+
article_data[KEY_ARTICLE_PREVIEW] = (
|
|
282
|
+
preview_element.get_text().strip()
|
|
283
|
+
if isinstance(preview_element, Tag)
|
|
284
|
+
else ""
|
|
285
|
+
)
|
|
286
|
+
|
|
287
|
+
data.append(article_data)
|
|
253
288
|
except AttributeError as e:
|
|
254
289
|
logging.warning(
|
|
255
290
|
self.MSG_PARSE_ARTICLE_WARNING.format(
|
|
@@ -102,9 +102,18 @@ def main() -> None:
|
|
|
102
102
|
parser.add_argument("--username", help="Instapaper username.")
|
|
103
103
|
parser.add_argument("--password", help="Instapaper password.")
|
|
104
104
|
parser.add_argument(
|
|
105
|
-
"--
|
|
106
|
-
|
|
107
|
-
|
|
105
|
+
"--read-url", # New, preferred flag
|
|
106
|
+
"--add-instapaper-url", # Old, for backward compatibility
|
|
107
|
+
dest="add_instapaper_url",
|
|
108
|
+
action=argparse.BooleanOptionalAction,
|
|
109
|
+
help="Include the Instapaper read URL. Overrides config.",
|
|
110
|
+
)
|
|
111
|
+
parser.add_argument(
|
|
112
|
+
"--article-preview", # New, preferred flag
|
|
113
|
+
"--add-article-preview", # Old, for backward compatibility
|
|
114
|
+
dest="add_article_preview",
|
|
115
|
+
action=argparse.BooleanOptionalAction,
|
|
116
|
+
help="Include the article preview text. Overrides config.",
|
|
108
117
|
)
|
|
109
118
|
parser.add_argument(
|
|
110
119
|
"--limit",
|
|
@@ -120,8 +129,21 @@ def main() -> None:
|
|
|
120
129
|
|
|
121
130
|
config = load_config(args.config_path)
|
|
122
131
|
folders = config.get("folders", []) if config else []
|
|
132
|
+
fields_config = config.get("fields", {}) if config else {}
|
|
123
133
|
selected_folder = None
|
|
124
134
|
|
|
135
|
+
# Resolve boolean flags, giving CLI priority over config
|
|
136
|
+
final_add_instapaper_url = (
|
|
137
|
+
args.add_instapaper_url
|
|
138
|
+
if args.add_instapaper_url is not None
|
|
139
|
+
else fields_config.get("read_url", False)
|
|
140
|
+
)
|
|
141
|
+
final_add_article_preview = (
|
|
142
|
+
args.add_article_preview
|
|
143
|
+
if args.add_article_preview is not None
|
|
144
|
+
else fields_config.get("article_preview", False)
|
|
145
|
+
)
|
|
146
|
+
|
|
125
147
|
if args.folder:
|
|
126
148
|
if args.folder.lower() == "none":
|
|
127
149
|
selected_folder = None
|
|
@@ -196,7 +218,9 @@ def main() -> None:
|
|
|
196
218
|
try:
|
|
197
219
|
folder_info = selected_folder if selected_folder else None
|
|
198
220
|
all_articles = client.get_all_articles(
|
|
199
|
-
limit=args.limit, folder_info=folder_info
|
|
221
|
+
limit=args.limit,
|
|
222
|
+
folder_info=folder_info,
|
|
223
|
+
add_article_preview=final_add_article_preview,
|
|
200
224
|
)
|
|
201
225
|
except ScraperStructureChanged as e:
|
|
202
226
|
logging.error(f"Stopping scraper due to an unrecoverable error: {e}")
|
|
@@ -214,7 +238,8 @@ def main() -> None:
|
|
|
214
238
|
all_articles,
|
|
215
239
|
args.format,
|
|
216
240
|
output_filename,
|
|
217
|
-
add_instapaper_url=
|
|
241
|
+
add_instapaper_url=final_add_instapaper_url,
|
|
242
|
+
add_article_preview=final_add_article_preview,
|
|
218
243
|
)
|
|
219
244
|
logging.info("Articles scraped and saved successfully.")
|
|
220
245
|
except Exception as e:
|