instapaper_scraper-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- instapaper_scraper/__init__.py +7 -0
- instapaper_scraper/api.py +303 -0
- instapaper_scraper/auth.py +211 -0
- instapaper_scraper/cli.py +202 -0
- instapaper_scraper/exceptions.py +4 -0
- instapaper_scraper/output.py +88 -0
- instapaper_scraper-1.0.0.dist-info/METADATA +280 -0
- instapaper_scraper-1.0.0.dist-info/RECORD +12 -0
- instapaper_scraper-1.0.0.dist-info/WHEEL +5 -0
- instapaper_scraper-1.0.0.dist-info/entry_points.txt +2 -0
- instapaper_scraper-1.0.0.dist-info/licenses/LICENSE +674 -0
- instapaper_scraper-1.0.0.dist-info/top_level.txt +1 -0
instapaper_scraper/cli.py

@@ -0,0 +1,202 @@
import sys
import logging
import argparse
import requests
from pathlib import Path
from typing import Union

if sys.version_info >= (3, 11):
    import tomllib
else:
    import tomli as tomllib

from . import __version__
from .auth import InstapaperAuthenticator
from .api import InstapaperClient
from .output import save_articles
from .exceptions import ScraperStructureChanged


def _resolve_path(
    arg_path: str, working_dir_filename: str, user_dir_filename: Path
) -> Path:
    """Resolves a path based on CLI arg, working dir, and user config dir."""
    if arg_path:
        return Path(arg_path).expanduser()

    working_dir_path = Path(working_dir_filename)
    if working_dir_path.exists():
        logging.info(f"Found {working_dir_filename} in working directory.")
        return working_dir_path

    return user_dir_filename


def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
    """
    Loads configuration from a TOML file.
    It checks the provided path, then config.toml in the project root,
    and finally ~/.config/instapaper-scraper/config.toml.
    """
    app_name = "instapaper-scraper"
    default_paths = [
        Path("config.toml"),
        Path.home() / ".config" / app_name / "config.toml",
    ]

    paths_to_check = []
    if config_path_str:
        paths_to_check.insert(0, Path(config_path_str).expanduser())
    paths_to_check.extend(default_paths)

    for path in paths_to_check:
        if path.is_file():
            try:
                with open(path, "rb") as f:
                    logging.info(f"Loading configuration from {path}")
                    return tomllib.load(f)
            except tomllib.TOMLDecodeError as e:
                logging.error(f"Error decoding TOML file at {path}: {e}")
                return None
    logging.info("No configuration file found at any default location.")
    return None


def main():
    """
    Main entry point for the Instapaper scraper CLI.
    """
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(description="Scrape Instapaper articles.")
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
        help="Show program's version number and exit.",
    )
    parser.add_argument(
        "--config-path",
        help="Path to the configuration file.",
    )
    parser.add_argument(
        "--format",
        choices=["csv", "json", "sqlite"],
        default="csv",
        help="Output format (default: csv)",
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Output filename. If not provided, defaults to output/bookmarks.{format}",
    )
    parser.add_argument("--session-file", help="Path to the encrypted session file.")
    parser.add_argument("--key-file", help="Path to the session key file.")
    parser.add_argument("--username", help="Instapaper username.")
    parser.add_argument("--password", help="Instapaper password.")
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Maximum number of pages to scrape (default: unlimited)",
    )
    parser.add_argument(
        "--folder",
        help="Folder key, ID, or slug to scrape. Use 'none' to disable folder mode.",
    )
    args = parser.parse_args()

    config = load_config(args.config_path)
    folders = config.get("folders", []) if config else []
    selected_folder = None

    if args.folder:
        if args.folder.lower() == "none":
            selected_folder = None
        else:
            if not config:
                logging.error(
                    "Configuration file not found or failed to load. The --folder option requires a configuration file."
                )
                sys.exit(1)
            else:
                for f in folders:
                    if args.folder in (f.get("key"), str(f.get("id")), f.get("slug")):
                        selected_folder = f
                        break
                if not selected_folder:
                    # If folder is not in config, treat it as a folder ID
                    selected_folder = {"id": args.folder}
    elif folders:
        print("Available folders:")
        print(" 0: none (non-folder mode)")
        for i, folder in enumerate(folders):
            display_name = folder.get("key") or folder.get("slug") or folder.get("id")
            print(f" {i+1}: {display_name}")

        try:
            choice = int(input("Select a folder (enter a number): "))
            if 0 < choice <= len(folders):
                selected_folder = folders[choice - 1]
            elif choice != 0:
                print("Invalid selection. Continuing in non-folder mode.")
        except (ValueError, IndexError):
            print("Invalid input. Continuing in non-folder mode.")

    # Determine output filename
    output_filename = args.output
    if not output_filename:
        if selected_folder and selected_folder.get("output_filename"):
            output_filename = selected_folder["output_filename"]
        elif not selected_folder and config and config.get("output_filename"):
            output_filename = config["output_filename"]
        else:
            ext = "db" if args.format == "sqlite" else args.format
            output_filename = f"output/bookmarks.{ext}"

    session = requests.Session()

    # Resolve session and key file paths
    app_name = "instapaper-scraper"
    user_config_dir = Path.home() / ".config" / app_name

    session_file = _resolve_path(
        args.session_file, ".instapaper_session", user_config_dir / ".instapaper_session"
    )
    key_file = _resolve_path(args.key_file, ".session_key", user_config_dir / ".session_key")

    # 1. Authenticate
    authenticator = InstapaperAuthenticator(
        session,
        session_file=session_file,
        key_file=key_file,
        username=args.username,
        password=args.password,
    )
    if not authenticator.login():
        sys.exit(1)  # Exit if login fails

    # 2. Scrape Articles
    client = InstapaperClient(session)
    try:
        folder_info = selected_folder if selected_folder else None
        all_articles = client.get_all_articles(
            limit=args.limit, folder_info=folder_info
        )
    except ScraperStructureChanged as e:
        logging.error(f"Stopping scraper due to an unrecoverable error: {e}")
        sys.exit(1)
    except requests.exceptions.RequestException as e:
        logging.error(f"An HTTP error occurred: {e}")
        sys.exit(1)
    except Exception as e:
        logging.error(f"An unexpected error occurred during scraping: {e}")
        sys.exit(1)

    # 3. Save Articles
    save_articles(all_articles, args.format, output_filename)


if __name__ == "__main__":
    main()
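
As a quick illustration of the search order `load_config` implements above — explicit `--config-path`, then `./config.toml`, then the user config directory — here is a minimal sketch. It is not part of the package and assumes the wheel is installed and importable:

```python
# Illustrative sketch, not part of the package: exercise load_config's
# search order. Assumes `pip install instapaper-scraper` has been run.
import tempfile
from pathlib import Path

from instapaper_scraper.cli import load_config

with tempfile.TemporaryDirectory() as tmp:
    cfg = Path(tmp) / "config.toml"
    cfg.write_text('output_filename = "home-articles.csv"\n')
    # An explicit path takes precedence over ./config.toml and
    # ~/.config/instapaper-scraper/config.toml.
    print(load_config(str(cfg)))  # -> {'output_filename': 'home-articles.csv'}

# With no argument, the default locations are tried in order; load_config
# returns None (and logs a message) if no config file exists at any of them.
print(load_config())
```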

instapaper_scraper/output.py

@@ -0,0 +1,88 @@
import os
import json
import sqlite3
import logging
from typing import List, Dict, Any

# Constants for file operations
JSON_INDENT = 4

# Constants for CSV output
CSV_HEADER = "id,title,url\n"
CSV_DELIMITER = ","
CSV_ROW_FORMAT = "{id},{title},{url}\n"

# Constants for SQLite output
SQLITE_TABLE_NAME = "articles"
SQLITE_ID_COL = "id"
SQLITE_TITLE_COL = "title"
SQLITE_URL_COL = "url"
SQLITE_CREATE_TABLE_SQL = f"""
CREATE TABLE IF NOT EXISTS {SQLITE_TABLE_NAME} (
    {SQLITE_ID_COL} TEXT PRIMARY KEY,
    {SQLITE_TITLE_COL} TEXT NOT NULL,
    {SQLITE_URL_COL} TEXT NOT NULL
)
"""
SQLITE_INSERT_SQL = f"INSERT OR REPLACE INTO {SQLITE_TABLE_NAME} ({SQLITE_ID_COL}, {SQLITE_TITLE_COL}, {SQLITE_URL_COL}) VALUES (:{SQLITE_ID_COL}, :{SQLITE_TITLE_COL}, :{SQLITE_URL_COL})"

# Constants for logging messages
LOG_NO_ARTICLES = "No articles found to save."
LOG_SAVED_ARTICLES = "Saved {count} articles to {filename}"
LOG_UNKNOWN_FORMAT = "Unknown output format: {format}"


def save_to_csv(data: List[Dict[str, Any]], filename: str):
    """Saves a list of articles to a CSV file."""
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w", newline="", encoding="utf-8") as f:
        f.write(CSV_HEADER)
        for article in data:
            # Basic CSV quoting for titles with commas
            title = article[SQLITE_TITLE_COL]
            if CSV_DELIMITER in title:
                title = f'"{title}"'
            f.write(
                CSV_ROW_FORMAT.format(
                    id=article[SQLITE_ID_COL], title=title, url=article[SQLITE_URL_COL]
                )
            )
    logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=filename))


def save_to_json(data: List[Dict[str, Any]], filename: str):
    """Saves a list of articles to a JSON file."""
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=JSON_INDENT, ensure_ascii=False)
    logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=filename))


def save_to_sqlite(data: List[Dict[str, Any]], db_name: str):
    """Saves a list of articles to a SQLite database."""
    os.makedirs(os.path.dirname(db_name), exist_ok=True)
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    cursor.execute(SQLITE_CREATE_TABLE_SQL)
    cursor.executemany(SQLITE_INSERT_SQL, data)
    conn.commit()
    conn.close()
    logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=db_name))


def save_articles(data: List[Dict[str, Any]], format: str, filename: str):
    """
    Dispatches to the correct save function based on the format.
    """
    if not data:
        logging.info(LOG_NO_ARTICLES)
        return

    if format == "csv":
        save_to_csv(data, filename=filename)
    elif format == "json":
        save_to_json(data, filename=filename)
    elif format == "sqlite":
        save_to_sqlite(data, db_name=filename)
    else:
        logging.error(LOG_UNKNOWN_FORMAT.format(format=format))
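
A minimal usage sketch for the module above (illustrative, not part of the package). Note that each `save_to_*` helper calls `os.makedirs(os.path.dirname(filename), ...)`, so the filename should include a directory component:

```python
# Illustrative sketch, not part of the package: save the same records in
# two formats. Assumes the wheel is installed; paths are arbitrary.
from instapaper_scraper.output import save_articles

articles = [
    {"id": "999901234", "title": "Article 1", "url": "https://www.example.com/page-1/"},
    {"id": "999002345", "title": "Article 2, revised", "url": "https://www.example.com/page-2/"},
]

save_articles(articles, "json", "output/bookmarks.json")
# The sqlite writer creates the articles table if needed and upserts by id.
save_articles(articles, "sqlite", "output/bookmarks.db")
```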

instapaper_scraper-1.0.0.dist-info/METADATA

@@ -0,0 +1,280 @@
Metadata-Version: 2.4
Name: instapaper-scraper
Version: 1.0.0
Summary: A tool to scrape articles from Instapaper.
Project-URL: Homepage, https://github.com/chriskyfung/InstapaperScraper
Project-URL: Source, https://github.com/chriskyfung/InstapaperScraper
Project-URL: Issues, https://github.com/chriskyfung/InstapaperScraper/issues
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: End Users/Desktop
Classifier: Topic :: Internet :: WWW/HTTP :: Browsers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: beautifulsoup4~=4.14.2
Requires-Dist: certifi~=2025.11.12
Requires-Dist: charset-normalizer~=3.4.3
Requires-Dist: cryptography~=46.0.3
Requires-Dist: guara~=0.0.14
Requires-Dist: idna~=3.11
Requires-Dist: python-dotenv~=1.2.1
Requires-Dist: requests~=2.32.5
Requires-Dist: soupsieve~=2.8
Requires-Dist: typing_extensions~=4.15.0
Requires-Dist: urllib3~=2.5.0
Requires-Dist: tomli~=2.0.1; python_version < "3.11"
Provides-Extra: dev
Requires-Dist: pytest; extra == "dev"
Requires-Dist: pytest-cov; extra == "dev"
Requires-Dist: black; extra == "dev"
Requires-Dist: ruff; extra == "dev"
Requires-Dist: types-requests; extra == "dev"
Requires-Dist: types-beautifulsoup4; extra == "dev"
Requires-Dist: requests-mock; extra == "dev"
Requires-Dist: build; extra == "dev"
Requires-Dist: twine; extra == "dev"
Dynamic: license-file

# Instapaper Scraper

[CI](https://github.com/chriskyfung/InstapaperScraper/actions/workflows/ci.yml)
[PyPI](https://pypi.org/project/instapaper-scraper/)
[Code style: black](https://github.com/psf/black)
[Ruff](https://github.com/astral-sh/ruff)
[License: GPL v3](https://www.gnu.org/licenses/gpl-3.0.en.html)
[codecov](https://codecov.io/gh/chriskyfung/InstapaperScraper)

A Python tool to scrape all your saved Instapaper bookmarks and export them to various formats.

## Features

- Scrapes all bookmarks from your Instapaper account.
- Supports scraping from specific folders.
- Exports data to CSV, JSON, or a SQLite database.
- Securely stores your session for future runs.
- Modern, modular, and tested architecture.

## Getting Started

### 1. Requirements

- Python 3.9+

### 2. Installation

This package is available on PyPI and can be installed with pip:

```sh
pip install instapaper-scraper
```

### 3. Usage

Run the tool from the command line, specifying your desired output format:

```sh
# Scrape and export to the default CSV format
instapaper-scraper

# Scrape and export to JSON
instapaper-scraper --format json

# Scrape and export to a SQLite database with a custom name
instapaper-scraper --format sqlite --output output/my_articles.db
```

## Configuration

### Authentication

The script authenticates using one of the following methods, in order of priority:

1. **Command-line Arguments**: Provide your username and password directly when running the script:

   ```sh
   instapaper-scraper --username your_username --password your_password
   ```

2. **Session Files (`.session_key`, `.instapaper_session`)**: The script attempts to load these files in the following order (see the example after this list):
   a. The paths specified by the `--session-file` and `--key-file` arguments.
   b. Files in the current working directory (e.g., `./.session_key`).
   c. Files in the user's configuration directory (`~/.config/instapaper-scraper/`).

   After the first successful login, the script creates an encrypted `.instapaper_session` file and a `.session_key` file to reuse your session securely.

3. **Interactive Prompt**: If no other method is available, the script will prompt you for your username and password.

> **Note on Security:** Your session file (`.instapaper_session`) and the encryption key (`.session_key`) are stored with secure permissions (read/write for the owner only) to protect your credentials.
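
For example, to load session files from a non-default location (paths here are illustrative):

```sh
instapaper-scraper --session-file ~/backups/.instapaper_session \
                   --key-file ~/backups/.session_key
```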

### Folder Configuration

You can define and quickly access your Instapaper folders using a `config.toml` file. The scraper looks for this file in the following locations (in order of precedence):

1. The path specified by the `--config-path` argument.
2. `config.toml` in the current working directory.
3. `~/.config/instapaper-scraper/config.toml`

Here is an example `config.toml`:

```toml
# Default output filename for non-folder mode
output_filename = "home-articles.csv"

[[folders]]
key = "ml"
id = "1234567"
slug = "machine-learning"
output_filename = "ml-articles.json"

[[folders]]
key = "python"
id = "7654321"
slug = "python-programming"
output_filename = "python-articles.db"
```

- **output_filename (top-level)**: The default output filename to use when not in folder mode.
- **key**: A short alias for the folder.
- **id**: The folder ID from the Instapaper URL.
- **slug**: The human-readable part of the folder URL.
- **output_filename (folder-specific)**: A preset output filename for articles scraped from this specific folder.

When a `config.toml` file is present and no `--folder` argument is provided, the scraper prompts you to select a folder. You can also specify a folder directly by passing its key, ID, or slug to the `--folder` argument. Use `--folder=none` to explicitly disable folder mode and scrape all articles.
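
For example, with the `config.toml` shown above:

```sh
# Scrape the folder aliased "ml"; its preset output filename is used
instapaper-scraper --folder ml

# A value matching no key, ID, or slug in the config is treated as a raw folder ID
instapaper-scraper --folder 5550001

# Skip the interactive prompt and scrape the main list instead
instapaper-scraper --folder none
```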

### Command-line Arguments

| Argument | Description |
| --------------------- | ------------------------------------------------------------------------ |
| `--config-path <path>`| Path to the configuration file. By default, searches `config.toml` in the current directory, then `~/.config/instapaper-scraper/config.toml`. |
| `--folder <value>` | Specify a folder by key, ID, or slug from your `config.toml`. **Requires a configuration file to be loaded.** Use `none` to explicitly disable folder mode. If no configuration file can be loaded and this option is set to anything other than `none`, the program exits. |
| `--format <format>` | Output format (`csv`, `json`, `sqlite`). Default: `csv`. |
| `-o, --output <filename>` | Specify a custom output filename. |
| `--limit <n>` | Maximum number of pages to scrape. Default: unlimited. |
| `--session-file <path>` | Path to the encrypted session file. |
| `--key-file <path>` | Path to the session key file. |
| `--username <user>` | Your Instapaper account username. |
| `--password <pass>` | Your Instapaper account password. |
| `-v, --version` | Show the program's version number and exit. |

### Output Formats

You can control the output format using the `--format` argument. The supported formats are:

- `csv` (default): Exports data to `output/bookmarks.csv`.
- `json`: Exports data to `output/bookmarks.json`.
- `sqlite`: Exports data to an `articles` table in `output/bookmarks.db`.

Use `--output <filename>` to write to a custom filename instead. If the `--format` flag is omitted, the script defaults to `csv`.

#### Opening Articles in Instapaper

The output data includes a unique `id` for each article. To open an article directly in Instapaper's reader view, append this ID to the base URL:
`https://www.instapaper.com/read/<article_id>`
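
For instance, a short script (illustrative, not part of the package) can turn a JSON export into reader-view links:

```python
# Build Instapaper reader-view URLs from an exported bookmarks.json,
# using the documented https://www.instapaper.com/read/<article_id> pattern.
import json

with open("output/bookmarks.json", encoding="utf-8") as f:
    for article in json.load(f):
        print(f"https://www.instapaper.com/read/{article['id']}")
```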

## How It Works

The tool is designed with a modular architecture for reliability and maintainability (a programmatic sketch follows the steps below).

1. **Authentication**: The `InstapaperAuthenticator` handles secure login and session management.
2. **Scraping**: The `InstapaperClient` iterates through all pages of your bookmarks, fetching the metadata for each article with robust error handling and retries.
3. **Data Collection**: All fetched articles are aggregated into a single list.
4. **Export**: Finally, the collected data is written to a file in your chosen format (`.csv`, `.json`, or `.db`).
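
A minimal sketch of that pipeline, assuming the package is installed and mirroring the calls the CLI makes in `instapaper_scraper/cli.py` (argument values here are illustrative):

```python
# Illustrative sketch: authenticate, scrape one page, and export, using
# the same classes the CLI wires together.
from pathlib import Path

import requests

from instapaper_scraper.auth import InstapaperAuthenticator
from instapaper_scraper.api import InstapaperClient
from instapaper_scraper.output import save_articles

session = requests.Session()
authenticator = InstapaperAuthenticator(
    session,
    session_file=Path(".instapaper_session"),
    key_file=Path(".session_key"),
    username=None,  # None falls back to session files or an interactive prompt
    password=None,
)
if authenticator.login():
    client = InstapaperClient(session)
    # limit caps the number of pages scraped; None means unlimited
    articles = client.get_all_articles(limit=1, folder_info=None)
    save_articles(articles, "csv", "output/bookmarks.csv")
```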

## Example Output

### CSV (`output/bookmarks.csv`)

```csv
id,title,url
999901234,"Article 1",https://www.example.com/page-1/
999002345,"Article 2",https://www.example.com/page-2/
```

### JSON (`output/bookmarks.json`)

```json
[
    {
        "id": "999901234",
        "title": "Article 1",
        "url": "https://www.example.com/page-1/"
    },
    {
        "id": "999002345",
        "title": "Article 2",
        "url": "https://www.example.com/page-2/"
    }
]
```

### SQLite (`output/bookmarks.db`)

A SQLite database file is created with an `articles` table containing `id`, `title`, and `url` columns.
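
To inspect the export, any SQLite client works; for example, with the `sqlite3` command-line shell:

```sh
sqlite3 output/bookmarks.db "SELECT id, title, url FROM articles LIMIT 5;"
```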

## Development & Testing

This project uses `pytest` for testing, `black` for code formatting, and `ruff` for linting.

### Setup

To install the development dependencies:

```sh
pip install -e .[dev]
```

### Running the Scraper

To run the scraper directly without installing the package:

```sh
python -m src.instapaper_scraper.cli
```

### Testing

To run the tests, execute the following command from the project root:

```sh
pytest
```

To check test coverage:

```sh
pytest --cov=src/instapaper_scraper --cov-report=term-missing
```

### Code Quality

To format the code with `black`:

```sh
black .
```

To check for linting errors with `ruff`:

```sh
ruff check .
```

To automatically fix linting errors:

```sh
ruff check . --fix
```

## Disclaimer

This script requires valid Instapaper credentials. Use it responsibly and in accordance with Instapaper’s Terms of Service.

## License

This project is licensed under the terms of the GNU General Public License v3.0. See the [LICENSE](LICENSE) file for the full license text.

instapaper_scraper-1.0.0.dist-info/RECORD

@@ -0,0 +1,12 @@
instapaper_scraper/__init__.py,sha256=qdcT3tp4KLufWH1u6tOuPVUQaXwakQD0gdjkwY4ljfg,206
instapaper_scraper/api.py,sha256=KvGxK2P35-3TsONPWcQTVBZT-q70p7hobeQ7E9PhXwA,11740
instapaper_scraper/auth.py,sha256=DepQKDdVSm1dMFNIkpK_LIlaI0JllAYZb3_LJWhMe-g,7554
instapaper_scraper/cli.py,sha256=Pxf1cAoLW9N-X1BP73HE0i2Qv7rPTaIyrPqG3cgdSTI,6860
instapaper_scraper/exceptions.py,sha256=CptHoZe4NOhdjOoyXkZEMFgQC6oKtzjRljywwDEtsTg,134
instapaper_scraper/output.py,sha256=0vQQ4AHZwFJg3O5O2zzvKUf0cOS1fTjXdivFqEHAun0,3081
instapaper_scraper-1.0.0.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
instapaper_scraper-1.0.0.dist-info/METADATA,sha256=cI3CAWZOY-1R5h6NfNEJHUOlhJQ6D5vS1bQCQ_cu3OI,10376
instapaper_scraper-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
instapaper_scraper-1.0.0.dist-info/entry_points.txt,sha256=7AvRgN5fvtas_Duxdz-JPbDN6A1Lq2GaTfTSv54afxA,67
instapaper_scraper-1.0.0.dist-info/top_level.txt,sha256=kiU9nLkqPOVPLsP4QMHuBFjAmoIKfftYmGV05daLrcc,19
instapaper_scraper-1.0.0.dist-info/RECORD,,