instapaper-scraper 1.1.1__tar.gz → 1.2.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {instapaper_scraper-1.1.1/src/instapaper_scraper.egg-info → instapaper_scraper-1.2.0rc1}/PKG-INFO +48 -28
  2. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/README.md +46 -26
  3. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/pyproject.toml +2 -2
  4. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/api.py +41 -6
  5. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/cli.py +30 -5
  6. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/constants.py +1 -0
  7. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/output.py +50 -10
  8. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1/src/instapaper_scraper.egg-info}/PKG-INFO +48 -28
  9. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper.egg-info/SOURCES.txt +1 -0
  10. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper.egg-info/requires.txt +1 -1
  11. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/tests/test_api.py +108 -9
  12. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/tests/test_cli.py +69 -20
  13. instapaper_scraper-1.2.0rc1/tests/test_cli_config_flags.py +367 -0
  14. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/tests/test_output.py +37 -3
  15. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/LICENSE +0 -0
  16. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/setup.cfg +0 -0
  17. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/__init__.py +0 -0
  18. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/auth.py +0 -0
  19. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper/exceptions.py +0 -0
  20. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper.egg-info/dependency_links.txt +0 -0
  21. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper.egg-info/entry_points.txt +0 -0
  22. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/src/instapaper_scraper.egg-info/top_level.txt +0 -0
  23. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/tests/test_auth.py +0 -0
  24. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/tests/test_cli_priority.py +0 -0
  25. {instapaper_scraper-1.1.1 → instapaper_scraper-1.2.0rc1}/tests/test_init.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: instapaper-scraper
-Version: 1.1.1
+Version: 1.2.0rc1
 Summary: A tool to scrape articles from Instapaper.
 Project-URL: Homepage, https://github.com/chriskyfung/InstapaperScraper
 Project-URL: Source, https://github.com/chriskyfung/InstapaperScraper
@@ -21,7 +21,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: beautifulsoup4~=4.14.2
-Requires-Dist: certifi~=2025.11.12
+Requires-Dist: certifi<2026.2.0,>=2025.11.12
 Requires-Dist: charset-normalizer~=3.4.3
 Requires-Dist: cryptography~=46.0.3
 Requires-Dist: guara~=0.0.14
@@ -141,9 +141,9 @@ The script authenticates using one of the following methods, in order of priorit
 
 > **Note on Security:** Your session file (`.instapaper_session`) and the encryption key (`.session_key`) are stored with secure permissions (read/write for the owner only) to protect your credentials.
 
-### 📁 Folder Configuration
+### 📁 Folder and Field Configuration
 
-You can define and quickly access your Instapaper folders using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):
+You can define and quickly access your Instapaper folders and set default output fields using a `config.toml` file. The scraper will look for this file in the following locations (in order of precedence):
 
 1. The path specified by the `--config-path` argument.
 2. `config.toml` in the current working directory.
@@ -155,6 +155,12 @@ Here is an example of `config.toml`:
 # Default output filename for non-folder mode
 output_filename = "home-articles.csv"
 
+# Optional fields to include in the output.
+# These can be overridden by command-line flags.
+[fields]
+read_url = false
+article_preview = false
+
 [[folders]]
 key = "ml"
 id = "1234567"
@@ -169,10 +175,14 @@ output_filename = "python-articles.db"
 ```
 
 - **output_filename (top-level)**: The default output filename to use when not in folder mode.
-- **key**: A short alias for the folder.
-- **id**: The folder ID from the Instapaper URL.
-- **slug**: The human-readable part of the folder URL.
-- **output_filename (folder-specific)**: A preset output filename for scraped articles from this specific folder.
+- **[fields]**: A section to control which optional data fields are included in the output.
+  - `read_url`: Set to `true` to include the Instapaper read URL for each article.
+  - `article_preview`: Set to `true` to include the article's text preview.
+- **[[folders]]**: Each `[[folders]]` block defines a specific folder.
+  - **key**: A short alias for the folder.
+  - **id**: The folder ID from the Instapaper URL.
+  - **slug**: The human-readable part of the folder URL.
+  - **output_filename (folder-specific)**: A preset output filename for scraped articles from this specific folder.
 
 When a `config.toml` file is present and no `--folder` argument is provided, the scraper will prompt you to select a folder. You can also specify a folder directly using the `--folder` argument with its key, ID, or slug. Use `--folder=none` to explicitly disable folder mode and scrape all articles.
 
@@ -186,7 +196,8 @@ When a `config.toml` file is present and no `--folder` argument is provided, the
 | `--output <filename>` | Specify a custom output filename. The file extension will be automatically corrected to match the selected format. |
 | `--username <user>` | Your Instapaper account username. |
 | `--password <pass>` | Your Instapaper account password. |
-| `--add-instapaper-url` | Adds a `instapaper_url` column to the output, containing a full, clickable URL for each article. |
+| `--[no-]read-url` | Includes the Instapaper read URL. (Old flag `--add-instapaper-url` is deprecated but supported). Can be set in `config.toml`. Overrides config. |
+| `--[no-]article-preview` | Includes the article preview text. (Old flag `--add-article-preview` is deprecated but supported). Can be set in `config.toml`. Overrides config. |
 
 ### 📄 Output Formats
 
@@ -204,10 +215,10 @@ When using `--output <filename>`, the file extension is automatically corrected
 
 The output data includes a unique `id` for each article. You can use this ID to construct a URL to the article's reader view: `https://www.instapaper.com/read/<article_id>`.
 
-For convenience, you can use the `--add-instapaper-url` flag to have the script include a full, clickable URL in the output.
+For convenience, you can use the `--read-url` flag to have the script include a full, clickable URL in the output.
 
 ```sh
-instapaper-scraper --add-instapaper-url
+instapaper-scraper --read-url
 ```
 
 This adds a `instapaper_url` field to each article in the JSON output and a `instapaper_url` column in the CSV and SQLite outputs. The original `id` field is preserved.
@@ -223,15 +234,15 @@ The tool is designed with a modular architecture for reliability and maintainabi
 
 ## 📊 Example Output
 
-### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url)
+### 📄 CSV (`output/bookmarks.csv`) (with --add-instapaper-url and --add-article-preview)
 
 ```csv
-"id","instapaper_url","title","url"
-"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/"
-"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/"
+"id","instapaper_url","title","url","article_preview"
+"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/","This is a preview of article 1."
+"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/","This is a preview of article 2."
 ```
 
-### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url)
+### 📄 JSON (`output/bookmarks.json`) (with --add-instapaper-url and --add-article-preview)
 
 ```json
 [
@@ -239,13 +250,15 @@ The tool is designed with a modular architecture for reliability and maintainabi
     "id": "999901234",
     "title": "Article 1",
     "url": "https://www.example.com/page-1/",
-    "instapaper_url": "https://www.instapaper.com/read/999901234"
+    "instapaper_url": "https://www.instapaper.com/read/999901234",
+    "article_preview": "This is a preview of article 1."
   },
   {
     "id": "999002345",
     "title": "Article 2",
     "url": "https://www.example.com/page-2/",
-    "instapaper_url": "https://www.instapaper.com/read/999002345"
+    "instapaper_url": "https://www.instapaper.com/read/999002345",
+    "article_preview": "This is a preview of article 2."
   }
 ]
 ```
@@ -274,7 +287,18 @@ Please read the **[Contribution Guidelines](CONTRIBUTING.md)** before you start.
 
 ## 🧑‍💻 Development & Testing
 
-This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking.
+This project uses `pytest` for testing, `ruff` for code formatting and linting, and `mypy` for static type checking. A `Makefile` is provided to simplify common development tasks.
+
+### 🚀 Using the Makefile
+
+The most common commands are:
+- `make install`: Installs development dependencies.
+- `make format`: Formats the entire codebase.
+- `make check`: Runs the linter, type checker, and test suite.
+- `make test`: Runs the test suite.
+- `make build`: Builds the distributable packages.
+
+Run `make help` to see all available commands.
 
 ### 🔧 Setup
 
@@ -300,13 +324,13 @@ python -m src.instapaper_scraper.cli
 
 ### ✅ Testing
 
-To run the tests, execute the following command from the project root:
+To run the tests, execute the following command from the project root (or use `make test`):
 
 ```sh
pytest
 ```
 
-To check test coverage:
+To check test coverage (or use `make test-cov`):
 
 ```sh
 pytest --cov=src/instapaper_scraper --cov-report=term-missing
@@ -314,6 +338,8 @@ pytest --cov=src/instapaper_scraper --cov-report=term-missing
 
 ### ✨ Code Quality
 
+You can use the `Makefile` for convenience (e.g., `make format`, `make lint`).
+
 To format the code with `ruff`:
 
 ```sh
@@ -326,12 +352,6 @@ To check for linting errors with `ruff`:
 ruff check .
 ```
 
-To automatically fix linting errors:
-
-```sh
-ruff check . --fix
-```
-
 To run static type checking with `mypy`:
 
 ```sh
@@ -341,7 +361,7 @@ mypy src
 To run license checks:
 
 ```sh
-licensecheck --show-only-failing
+licensecheck --zero
 ```
 
 
README.md
The README.md hunks are identical to the description portion of the PKG-INFO diff above (PKG-INFO embeds the README as the package's long description); only the hunk offsets differ, by the 48-line metadata header.
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "instapaper-scraper"
-version = "1.1.1"
+version = "1.2.0rc1"
 description = "A tool to scrape articles from Instapaper."
 readme = "README.md"
 requires-python = ">=3.9"
@@ -25,7 +25,7 @@ classifiers = [
 license-files = ["LICEN[CS]E*"]
 dependencies = [
     "beautifulsoup4~=4.14.2",
-    "certifi~=2025.11.12",
+    "certifi>=2025.11.12,<2026.2.0",
     "charset-normalizer~=3.4.3",
     "cryptography~=46.0.3",
     "guara~=0.0.14",
src/instapaper_scraper/api.py
@@ -8,7 +8,13 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag
 
 from .exceptions import ScraperStructureChanged
-from .constants import INSTAPAPER_BASE_URL, KEY_ID, KEY_TITLE, KEY_URL
+from .constants import (
+    INSTAPAPER_BASE_URL,
+    KEY_ID,
+    KEY_TITLE,
+    KEY_URL,
+    KEY_ARTICLE_PREVIEW,
+)
 
 
 class InstapaperClient:
@@ -34,6 +40,7 @@ class InstapaperClient:
     PAGINATE_OLDER_CLASS = "paginate_older"
     ARTICLE_TITLE_CLASS = "article_title"
     TITLE_META_CLASS = "title_meta"
+    ARTICLE_PREVIEW_CLASS = "article_preview"
 
     # URL paths
     URL_PATH_USER = "/u/"
@@ -102,12 +109,14 @@ class InstapaperClient:
         self,
         page: int = DEFAULT_PAGE_START,
         folder_info: Optional[Dict[str, str]] = None,
+        add_article_preview: bool = False,
     ) -> Tuple[List[Dict[str, str]], bool]:
         """
         Fetches a single page of articles and determines if there are more pages.
         Args:
             page: The page number to fetch.
             folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
+            add_article_preview: Whether to include the article preview.
         Returns:
             A tuple containing:
             - A list of article data (dictionaries with id, title, url).
@@ -147,7 +156,9 @@ class InstapaperClient:
                 article_id_val.replace(self.ARTICLE_ID_PREFIX, "")
             )
 
-        data = self._parse_article_data(soup, article_ids, page)
+        data = self._parse_article_data(
+            soup, article_ids, page, add_article_preview
+        )
         has_more = soup.find(class_=self.PAGINATE_OLDER_CLASS) is not None
 
         return data, has_more
@@ -185,13 +196,17 @@ class InstapaperClient:
         raise Exception(self.MSG_SCRAPING_FAILED_UNKNOWN)
 
     def get_all_articles(
-        self, limit: Optional[int] = None, folder_info: Optional[Dict[str, str]] = None
+        self,
+        limit: Optional[int] = None,
+        folder_info: Optional[Dict[str, str]] = None,
+        add_article_preview: bool = False,
     ) -> List[Dict[str, str]]:
         """
         Iterates through pages and fetches articles up to a specified limit.
         Args:
             limit: The maximum number of pages to scrape. If None, scrapes all pages.
             folder_info: A dictionary containing 'id' and 'slug' of the folder to fetch articles from.
+            add_article_preview: Whether to include the article preview.
         """
         all_articles = []
         page = self.DEFAULT_PAGE_START
@@ -202,7 +217,11 @@ class InstapaperClient:
                 break
 
             logging.info(self.MSG_SCRAPING_PAGE.format(page=page))
-            data, has_more = self.get_articles(page=page, folder_info=folder_info)
+            data, has_more = self.get_articles(
+                page=page,
+                folder_info=folder_info,
+                add_article_preview=add_article_preview,
+            )
             if data:
                 all_articles.extend(data)
                 page += 1
@@ -217,7 +236,11 @@ class InstapaperClient:
         return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_USER}{page}"
 
     def _parse_article_data(
-        self, soup: BeautifulSoup, article_ids: List[str], page: int
+        self,
+        soup: BeautifulSoup,
+        article_ids: List[str],
+        page: int,
+        add_article_preview: bool = False,
     ) -> List[Dict[str, Any]]:
         """Parses the raw HTML to extract structured data for each article."""
         data = []
@@ -249,7 +272,19 @@ class InstapaperClient:
                     raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
                 link = link_element["href"]
 
-                data.append({KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link})
+                article_data = {KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link}
+
+                if add_article_preview:
+                    preview_element = article_element.find(
+                        class_=self.ARTICLE_PREVIEW_CLASS
+                    )
+                    article_data[KEY_ARTICLE_PREVIEW] = (
+                        preview_element.get_text().strip()
+                        if isinstance(preview_element, Tag)
+                        else ""
+                    )
+
+                data.append(article_data)
             except AttributeError as e:
                 logging.warning(
                     self.MSG_PARSE_ARTICLE_WARNING.format(
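The preview lookup deliberately degrades to an empty string when the `article_preview` element is missing instead of raising. A self-contained sketch of the same BeautifulSoup pattern; the surrounding markup here is hypothetical, and only the `article_title` and `article_preview` class names are taken from the diff above:

```python
from bs4 import BeautifulSoup
from bs4.element import Tag

# Hypothetical article markup for illustration only.
html = """
<div id="article_999901234">
  <a class="article_title" href="https://www.example.com/page-1/">Article 1</a>
  <div class="article_preview">This is a preview of article 1.</div>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
article_element = soup.find(id="article_999901234")
assert isinstance(article_element, Tag)

preview_element = article_element.find(class_="article_preview")
# Same guard as in _parse_article_data: empty string when the element is absent.
preview = preview_element.get_text().strip() if isinstance(preview_element, Tag) else ""
print(preview)  # This is a preview of article 1.
```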
src/instapaper_scraper/cli.py
@@ -102,9 +102,18 @@ def main() -> None:
     parser.add_argument("--username", help="Instapaper username.")
     parser.add_argument("--password", help="Instapaper password.")
     parser.add_argument(
-        "--add-instapaper-url",
-        action="store_true",
-        help="Add an 'instapaper_url' column to the output with the full Instapaper read URL.",
+        "--read-url",  # New, preferred flag
+        "--add-instapaper-url",  # Old, for backward compatibility
+        dest="add_instapaper_url",
+        action=argparse.BooleanOptionalAction,
+        help="Include the Instapaper read URL. Overrides config.",
+    )
+    parser.add_argument(
+        "--article-preview",  # New, preferred flag
+        "--add-article-preview",  # Old, for backward compatibility
+        dest="add_article_preview",
+        action=argparse.BooleanOptionalAction,
+        help="Include the article preview text. Overrides config.",
     )
     parser.add_argument(
         "--limit",
@@ -120,8 +129,21 @@ def main() -> None:
 
     config = load_config(args.config_path)
     folders = config.get("folders", []) if config else []
+    fields_config = config.get("fields", {}) if config else {}
     selected_folder = None
 
+    # Resolve boolean flags, giving CLI priority over config
+    final_add_instapaper_url = (
+        args.add_instapaper_url
+        if args.add_instapaper_url is not None
+        else fields_config.get("read_url", False)
+    )
+    final_add_article_preview = (
+        args.add_article_preview
+        if args.add_article_preview is not None
+        else fields_config.get("article_preview", False)
+    )
+
     if args.folder:
         if args.folder.lower() == "none":
             selected_folder = None
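The two conditional expressions implement a simple precedence chain: explicit CLI flag (whether `True` or `False`), then the `[fields]` table from `config.toml`, then off by default. The same rule as a tiny hypothetical helper (not in the package) with the edge cases spelled out:

```python
from typing import Optional

def resolve_field(cli_value: Optional[bool], config_value: Optional[bool]) -> bool:
    """CLI flag (if given) beats config.toml, which beats the off-by-default."""
    if cli_value is not None:
        return cli_value
    return bool(config_value) if config_value is not None else False

assert resolve_field(None, None) is False   # neither set -> off
assert resolve_field(None, True) is True    # config only
assert resolve_field(False, True) is False  # --no-read-url overrides config
assert resolve_field(True, None) is True    # --read-url alone
```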
@@ -196,7 +218,9 @@ def main() -> None:
     try:
         folder_info = selected_folder if selected_folder else None
         all_articles = client.get_all_articles(
-            limit=args.limit, folder_info=folder_info
+            limit=args.limit,
+            folder_info=folder_info,
+            add_article_preview=final_add_article_preview,
         )
     except ScraperStructureChanged as e:
         logging.error(f"Stopping scraper due to an unrecoverable error: {e}")
@@ -214,7 +238,8 @@ def main() -> None:
             all_articles,
             args.format,
             output_filename,
-            add_instapaper_url=args.add_instapaper_url,
+            add_instapaper_url=final_add_instapaper_url,
+            add_article_preview=final_add_article_preview,
         )
         logging.info("Articles scraped and saved successfully.")
     except Exception as e:
src/instapaper_scraper/constants.py
@@ -15,3 +15,4 @@ CONFIG_DIR = Path.home() / ".config" / APP_NAME
 KEY_ID = "id"
 KEY_TITLE = "title"
 KEY_URL = "url"
+KEY_ARTICLE_PREVIEW = "article_preview"