instapaper-scraper 1.0.0.post1__py3-none-any.whl → 1.1.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- instapaper_scraper/api.py +5 -13
- instapaper_scraper/auth.py +46 -57
- instapaper_scraper/cli.py +34 -10
- instapaper_scraper/constants.py +17 -0
- instapaper_scraper/output.py +107 -34
- {instapaper_scraper-1.0.0.post1.dist-info → instapaper_scraper-1.1.0rc1.dist-info}/METADATA +34 -21
- instapaper_scraper-1.1.0rc1.dist-info/RECORD +13 -0
- instapaper_scraper-1.0.0.post1.dist-info/RECORD +0 -12
- {instapaper_scraper-1.0.0.post1.dist-info → instapaper_scraper-1.1.0rc1.dist-info}/WHEEL +0 -0
- {instapaper_scraper-1.0.0.post1.dist-info → instapaper_scraper-1.1.0rc1.dist-info}/entry_points.txt +0 -0
- {instapaper_scraper-1.0.0.post1.dist-info → instapaper_scraper-1.1.0rc1.dist-info}/licenses/LICENSE +0 -0
- {instapaper_scraper-1.0.0.post1.dist-info → instapaper_scraper-1.1.0rc1.dist-info}/top_level.txt +0 -0
instapaper_scraper/api.py
CHANGED
|
@@ -7,6 +7,7 @@ import requests
|
|
|
7
7
|
from bs4 import BeautifulSoup
|
|
8
8
|
|
|
9
9
|
from .exceptions import ScraperStructureChanged
|
|
10
|
+
from .constants import INSTAPAPER_BASE_URL, KEY_ID, KEY_TITLE, KEY_URL
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
class InstapaperClient:
|
|
@@ -14,8 +15,6 @@ class InstapaperClient:
|
|
|
14
15
|
A client for interacting with the Instapaper website to fetch articles.
|
|
15
16
|
"""
|
|
16
17
|
|
|
17
|
-
BASE_URL = "https://www.instapaper.com"
|
|
18
|
-
|
|
19
18
|
# Environment variable names
|
|
20
19
|
ENV_MAX_RETRIES = "MAX_RETRIES"
|
|
21
20
|
ENV_BACKOFF_FACTOR = "BACKOFF_FACTOR"
|
|
@@ -39,11 +38,6 @@ class InstapaperClient:
|
|
|
39
38
|
URL_PATH_USER = "/u/"
|
|
40
39
|
URL_PATH_FOLDER = "/u/folder/"
|
|
41
40
|
|
|
42
|
-
# Dictionary keys for article data
|
|
43
|
-
KEY_ID = "id"
|
|
44
|
-
KEY_TITLE = "title"
|
|
45
|
-
KEY_URL = "url"
|
|
46
|
-
|
|
47
41
|
# HTTP status codes
|
|
48
42
|
HTTP_TOO_MANY_REQUESTS = 429
|
|
49
43
|
HTTP_SERVER_ERROR_START = 500
|
|
@@ -134,7 +128,7 @@ class InstapaperClient:
|
|
|
134
128
|
|
|
135
129
|
articles = article_list.find_all(self.ARTICLE_TAG)
|
|
136
130
|
article_ids = [
|
|
137
|
-
article[
|
|
131
|
+
article[KEY_ID].replace(self.ARTICLE_ID_PREFIX, "")
|
|
138
132
|
for article in articles
|
|
139
133
|
]
|
|
140
134
|
|
|
@@ -204,8 +198,8 @@ class InstapaperClient:
|
|
|
204
198
|
) -> str:
|
|
205
199
|
"""Constructs the URL for the given page, considering folder mode."""
|
|
206
200
|
if folder_info and folder_info.get("id") and folder_info.get("slug"):
|
|
207
|
-
return f"{
|
|
208
|
-
return f"{
|
|
201
|
+
return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_FOLDER}{folder_info['id']}/{folder_info['slug']}/{page}"
|
|
202
|
+
return f"{INSTAPAPER_BASE_URL}{self.URL_PATH_USER}{page}"
|
|
209
203
|
|
|
210
204
|
def _parse_article_data(
|
|
211
205
|
self, soup: BeautifulSoup, article_ids: List[str], page: int
|
|
@@ -235,9 +229,7 @@ class InstapaperClient:
|
|
|
235
229
|
raise AttributeError(self.MSG_LINK_ELEMENT_NOT_FOUND)
|
|
236
230
|
link = link_element["href"]
|
|
237
231
|
|
|
238
|
-
data.append(
|
|
239
|
-
{self.KEY_ID: article_id, self.KEY_TITLE: title, self.KEY_URL: link}
|
|
240
|
-
)
|
|
232
|
+
data.append({KEY_ID: article_id, KEY_TITLE: title, KEY_URL: link})
|
|
241
233
|
except AttributeError as e:
|
|
242
234
|
logging.warning(
|
|
243
235
|
self.MSG_PARSE_ARTICLE_WARNING.format(
|
instapaper_scraper/auth.py
CHANGED
|
@@ -8,11 +8,33 @@ from typing import Union
|
|
|
8
8
|
from cryptography.fernet import Fernet
|
|
9
9
|
import requests
|
|
10
10
|
|
|
11
|
+
from .constants import INSTAPAPER_BASE_URL
|
|
11
12
|
|
|
12
|
-
|
|
13
|
-
|
|
13
|
+
|
|
14
|
+
# --- Encryption Helper ---
|
|
15
|
+
def get_encryption_key(key_file: Union[str, Path]) -> bytes:
|
|
16
|
+
"""
|
|
17
|
+
Loads the encryption key from a file or generates a new one.
|
|
18
|
+
Sets strict file permissions for the key file.
|
|
19
|
+
"""
|
|
20
|
+
key_path = Path(key_file)
|
|
21
|
+
key_path.parent.mkdir(parents=True, exist_ok=True)
|
|
22
|
+
|
|
23
|
+
if key_path.exists():
|
|
24
|
+
with open(key_path, "rb") as f:
|
|
25
|
+
key = f.read()
|
|
26
|
+
else:
|
|
27
|
+
key = Fernet.generate_key()
|
|
28
|
+
with open(key_path, "wb") as f:
|
|
29
|
+
f.write(key)
|
|
30
|
+
# Set file permissions to 0600 (owner read/write only)
|
|
31
|
+
os.chmod(key_path, stat.S_IRUSR | stat.S_IWUSR)
|
|
32
|
+
logging.info(f"Generated new encryption key at {key_path}.")
|
|
33
|
+
return key
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class InstapaperAuthenticator:
|
|
14
37
|
# URLs
|
|
15
|
-
INSTAPAPER_BASE_URL = "https://www.instapaper.com"
|
|
16
38
|
INSTAPAPER_VERIFY_URL = f"{INSTAPAPER_BASE_URL}/u"
|
|
17
39
|
INSTAPAPER_LOGIN_URL = f"{INSTAPAPER_BASE_URL}/user/login"
|
|
18
40
|
|
|
@@ -25,10 +47,6 @@ class InstapaperConstants:
|
|
|
25
47
|
# Request related
|
|
26
48
|
REQUEST_TIMEOUT = 10
|
|
27
49
|
|
|
28
|
-
# App config
|
|
29
|
-
APP_NAME = "instapaper-scraper"
|
|
30
|
-
CONFIG_DIR = Path.home() / ".config" / APP_NAME
|
|
31
|
-
|
|
32
50
|
# Prompts
|
|
33
51
|
PROMPT_USERNAME = "Enter your Instapaper username: "
|
|
34
52
|
PROMPT_PASSWORD = "Enter your Instapaper password: "
|
|
@@ -44,30 +62,6 @@ class InstapaperConstants:
|
|
|
44
62
|
LOG_NO_KNOWN_COOKIE_TO_SAVE = "Could not find a known session cookie to save."
|
|
45
63
|
LOG_SAVED_SESSION = "Saved encrypted session to {session_file}."
|
|
46
64
|
|
|
47
|
-
|
|
48
|
-
# --- Encryption Helper ---
|
|
49
|
-
def get_encryption_key(key_file: Union[str, Path]) -> bytes:
|
|
50
|
-
"""
|
|
51
|
-
Loads the encryption key from a file or generates a new one.
|
|
52
|
-
Sets strict file permissions for the key file.
|
|
53
|
-
"""
|
|
54
|
-
key_path = Path(key_file)
|
|
55
|
-
key_path.parent.mkdir(parents=True, exist_ok=True)
|
|
56
|
-
|
|
57
|
-
if key_path.exists():
|
|
58
|
-
with open(key_path, "rb") as f:
|
|
59
|
-
key = f.read()
|
|
60
|
-
else:
|
|
61
|
-
key = Fernet.generate_key()
|
|
62
|
-
with open(key_path, "wb") as f:
|
|
63
|
-
f.write(key)
|
|
64
|
-
# Set file permissions to 0600 (owner read/write only)
|
|
65
|
-
os.chmod(key_path, stat.S_IRUSR | stat.S_IWUSR)
|
|
66
|
-
logging.info(f"Generated new encryption key at {key_path}.")
|
|
67
|
-
return key
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
class InstapaperAuthenticator:
|
|
71
65
|
def __init__(
|
|
72
66
|
self,
|
|
73
67
|
session: requests.Session,
|
|
@@ -116,24 +110,22 @@ class InstapaperAuthenticator:
|
|
|
116
110
|
if not line:
|
|
117
111
|
continue
|
|
118
112
|
parts = line.split(":", 2)
|
|
119
|
-
if len(parts) ==
|
|
113
|
+
if len(parts) == self.COOKIE_PART_COUNT:
|
|
120
114
|
name, value, domain = parts
|
|
121
115
|
self.session.cookies.set(name, value, domain=domain)
|
|
122
116
|
|
|
123
117
|
if self.session.cookies and self._verify_session():
|
|
124
|
-
logging.info(
|
|
118
|
+
logging.info(self.LOG_SESSION_LOAD_SUCCESS)
|
|
125
119
|
return True
|
|
126
120
|
else:
|
|
127
|
-
logging.warning(
|
|
121
|
+
logging.warning(self.LOG_SESSION_LOAD_FAILED)
|
|
128
122
|
# Clear cookies if verification fails
|
|
129
123
|
self.session.cookies.clear()
|
|
130
124
|
return False
|
|
131
125
|
|
|
132
126
|
except Exception as e:
|
|
133
127
|
logging.warning(
|
|
134
|
-
|
|
135
|
-
session_file=self.session_file, e=e
|
|
136
|
-
)
|
|
128
|
+
self.LOG_SESSION_LOAD_ERROR.format(session_file=self.session_file, e=e)
|
|
137
129
|
)
|
|
138
130
|
self.session_file.unlink(missing_ok=True)
|
|
139
131
|
return False
|
|
@@ -142,57 +134,56 @@ class InstapaperAuthenticator:
|
|
|
142
134
|
"""Checks if the current session is valid by making a request."""
|
|
143
135
|
try:
|
|
144
136
|
verify_response = self.session.get(
|
|
145
|
-
|
|
146
|
-
timeout=
|
|
137
|
+
self.INSTAPAPER_VERIFY_URL,
|
|
138
|
+
timeout=self.REQUEST_TIMEOUT,
|
|
147
139
|
)
|
|
148
140
|
verify_response.raise_for_status()
|
|
149
|
-
return
|
|
141
|
+
return self.LOGIN_FORM_IDENTIFIER not in verify_response.text
|
|
150
142
|
except requests.RequestException as e:
|
|
151
|
-
logging.error(
|
|
143
|
+
logging.error(self.LOG_SESSION_VERIFY_FAILED.format(e=e))
|
|
152
144
|
return False
|
|
153
145
|
|
|
154
146
|
def _login_with_credentials(self) -> bool:
|
|
155
147
|
"""Logs in using username/password from arguments or user prompt."""
|
|
156
|
-
logging.info(
|
|
148
|
+
logging.info(self.LOG_NO_VALID_SESSION)
|
|
157
149
|
username = self.username
|
|
158
150
|
password = self.password
|
|
159
151
|
|
|
160
152
|
if not username or not password:
|
|
161
|
-
username = input(
|
|
162
|
-
password = getpass.getpass(
|
|
153
|
+
username = input(self.PROMPT_USERNAME)
|
|
154
|
+
password = getpass.getpass(self.PROMPT_PASSWORD)
|
|
163
155
|
elif self.username:
|
|
164
156
|
logging.info(
|
|
165
157
|
f"Using username '{self.username}' from command-line arguments."
|
|
166
158
|
)
|
|
167
159
|
|
|
168
160
|
login_response = self.session.post(
|
|
169
|
-
|
|
161
|
+
self.INSTAPAPER_LOGIN_URL,
|
|
170
162
|
data={"username": username, "password": password, "keep_logged_in": "yes"},
|
|
171
|
-
timeout=
|
|
163
|
+
timeout=self.REQUEST_TIMEOUT,
|
|
172
164
|
)
|
|
173
165
|
|
|
174
|
-
required_cookies =
|
|
166
|
+
required_cookies = self.REQUIRED_COOKIES
|
|
175
167
|
found_cookies = {c.name for c in self.session.cookies}
|
|
176
168
|
|
|
177
|
-
if (
|
|
178
|
-
|
|
179
|
-
and required_cookies.issubset(found_cookies)
|
|
169
|
+
if self.LOGIN_SUCCESS_PATH in login_response.url and required_cookies.issubset(
|
|
170
|
+
found_cookies
|
|
180
171
|
):
|
|
181
|
-
logging.info(
|
|
172
|
+
logging.info(self.LOG_LOGIN_SUCCESS)
|
|
182
173
|
return True
|
|
183
174
|
else:
|
|
184
|
-
logging.error(
|
|
175
|
+
logging.error(self.LOG_LOGIN_FAILED)
|
|
185
176
|
return False
|
|
186
177
|
|
|
187
178
|
def _save_session(self):
|
|
188
179
|
"""Saves the current session cookies to an encrypted file."""
|
|
189
|
-
required_cookies =
|
|
180
|
+
required_cookies = self.REQUIRED_COOKIES
|
|
190
181
|
cookies_to_save = [
|
|
191
182
|
c for c in self.session.cookies if c.name in required_cookies
|
|
192
183
|
]
|
|
193
184
|
|
|
194
185
|
if not cookies_to_save:
|
|
195
|
-
logging.warning(
|
|
186
|
+
logging.warning(self.LOG_NO_KNOWN_COOKIE_TO_SAVE)
|
|
196
187
|
return
|
|
197
188
|
|
|
198
189
|
cookie_data = ""
|
|
@@ -206,6 +197,4 @@ class InstapaperAuthenticator:
|
|
|
206
197
|
f.write(encrypted_data)
|
|
207
198
|
|
|
208
199
|
os.chmod(self.session_file, stat.S_IRUSR | stat.S_IWUSR)
|
|
209
|
-
logging.info(
|
|
210
|
-
InstapaperConstants.LOG_SAVED_SESSION.format(session_file=self.session_file)
|
|
211
|
-
)
|
|
200
|
+
logging.info(self.LOG_SAVED_SESSION.format(session_file=self.session_file))
|
instapaper_scraper/cli.py
CHANGED
|
@@ -15,6 +15,13 @@ from .auth import InstapaperAuthenticator
|
|
|
15
15
|
from .api import InstapaperClient
|
|
16
16
|
from .output import save_articles
|
|
17
17
|
from .exceptions import ScraperStructureChanged
|
|
18
|
+
from .constants import CONFIG_DIR
|
|
19
|
+
|
|
20
|
+
# --- Constants ---
|
|
21
|
+
CONFIG_FILENAME = "config.toml"
|
|
22
|
+
DEFAULT_SESSION_FILENAME = ".instapaper_session"
|
|
23
|
+
DEFAULT_KEY_FILENAME = ".session_key"
|
|
24
|
+
DEFAULT_OUTPUT_FILENAME = "output/bookmarks.{ext}"
|
|
18
25
|
|
|
19
26
|
|
|
20
27
|
def _resolve_path(
|
|
@@ -38,10 +45,9 @@ def load_config(config_path_str: Union[str, None] = None) -> Union[dict, None]:
|
|
|
38
45
|
It checks the provided path, then config.toml in the project root,
|
|
39
46
|
and finally ~/.config/instapaper-scraper/config.toml.
|
|
40
47
|
"""
|
|
41
|
-
app_name = "instapaper-scraper"
|
|
42
48
|
default_paths = [
|
|
43
|
-
Path(
|
|
44
|
-
|
|
49
|
+
Path(CONFIG_FILENAME),
|
|
50
|
+
CONFIG_DIR / CONFIG_FILENAME,
|
|
45
51
|
]
|
|
46
52
|
|
|
47
53
|
paths_to_check = []
|
|
@@ -95,6 +101,11 @@ def main():
|
|
|
95
101
|
parser.add_argument("--key-file", help="Path to the session key file.")
|
|
96
102
|
parser.add_argument("--username", help="Instapaper username.")
|
|
97
103
|
parser.add_argument("--password", help="Instapaper password.")
|
|
104
|
+
parser.add_argument(
|
|
105
|
+
"--add-instapaper-url",
|
|
106
|
+
action="store_true",
|
|
107
|
+
help="Add an 'instapaper_url' column to the output with the full Instapaper read URL.",
|
|
108
|
+
)
|
|
98
109
|
parser.add_argument(
|
|
99
110
|
"--limit",
|
|
100
111
|
type=int,
|
|
@@ -153,18 +164,21 @@ def main():
|
|
|
153
164
|
output_filename = config["output_filename"]
|
|
154
165
|
else:
|
|
155
166
|
ext = "db" if args.format == "sqlite" else args.format
|
|
156
|
-
output_filename =
|
|
167
|
+
output_filename = DEFAULT_OUTPUT_FILENAME.format(ext=ext)
|
|
157
168
|
|
|
158
169
|
session = requests.Session()
|
|
159
170
|
|
|
160
171
|
# Resolve session and key file paths
|
|
161
|
-
app_name = "instapaper-scraper"
|
|
162
|
-
user_config_dir = Path.home() / ".config" / app_name
|
|
163
|
-
|
|
164
172
|
session_file = _resolve_path(
|
|
165
|
-
args.session_file,
|
|
173
|
+
args.session_file,
|
|
174
|
+
DEFAULT_SESSION_FILENAME,
|
|
175
|
+
CONFIG_DIR / DEFAULT_SESSION_FILENAME,
|
|
176
|
+
)
|
|
177
|
+
key_file = _resolve_path(
|
|
178
|
+
args.key_file,
|
|
179
|
+
DEFAULT_KEY_FILENAME,
|
|
180
|
+
CONFIG_DIR / DEFAULT_KEY_FILENAME,
|
|
166
181
|
)
|
|
167
|
-
key_file = _resolve_path(args.key_file, ".session_key", user_config_dir / ".session_key")
|
|
168
182
|
|
|
169
183
|
# 1. Authenticate
|
|
170
184
|
authenticator = InstapaperAuthenticator(
|
|
@@ -195,7 +209,17 @@ def main():
|
|
|
195
209
|
sys.exit(1)
|
|
196
210
|
|
|
197
211
|
# 3. Save Articles
|
|
198
|
-
|
|
212
|
+
try:
|
|
213
|
+
save_articles(
|
|
214
|
+
all_articles,
|
|
215
|
+
args.format,
|
|
216
|
+
output_filename,
|
|
217
|
+
add_instapaper_url=args.add_instapaper_url,
|
|
218
|
+
)
|
|
219
|
+
logging.info("Articles scraped and saved successfully.")
|
|
220
|
+
except Exception as e:
|
|
221
|
+
logging.error(f"An unexpected error occurred during saving: {e}")
|
|
222
|
+
sys.exit(1)
|
|
199
223
|
|
|
200
224
|
|
|
201
225
|
if __name__ == "__main__":
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Shared constants used across the instapaper-scraper project.
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
# --- General ---
|
|
5
|
+
APP_NAME = "instapaper-scraper"
|
|
6
|
+
|
|
7
|
+
# --- URLS ---
|
|
8
|
+
INSTAPAPER_BASE_URL = "https://www.instapaper.com"
|
|
9
|
+
INSTAPAPER_READ_URL = f"{INSTAPAPER_BASE_URL}/read/"
|
|
10
|
+
|
|
11
|
+
# --- Paths ---
|
|
12
|
+
CONFIG_DIR = Path.home() / ".config" / APP_NAME
|
|
13
|
+
|
|
14
|
+
# --- Article Data Keys ---
|
|
15
|
+
KEY_ID = "id"
|
|
16
|
+
KEY_TITLE = "title"
|
|
17
|
+
KEY_URL = "url"
|
instapaper_scraper/output.py
CHANGED
|
@@ -2,29 +2,17 @@ import os
|
|
|
2
2
|
import json
|
|
3
3
|
import sqlite3
|
|
4
4
|
import logging
|
|
5
|
+
import csv
|
|
5
6
|
from typing import List, Dict, Any
|
|
6
7
|
|
|
8
|
+
from .constants import INSTAPAPER_READ_URL, KEY_ID, KEY_TITLE, KEY_URL
|
|
9
|
+
|
|
7
10
|
# Constants for file operations
|
|
8
11
|
JSON_INDENT = 4
|
|
9
12
|
|
|
10
|
-
# Constants for CSV output
|
|
11
|
-
CSV_HEADER = "id,title,url\n"
|
|
12
|
-
CSV_DELIMITER = ","
|
|
13
|
-
CSV_ROW_FORMAT = "{id},{title},{url}\n"
|
|
14
|
-
|
|
15
13
|
# Constants for SQLite output
|
|
16
14
|
SQLITE_TABLE_NAME = "articles"
|
|
17
|
-
|
|
18
|
-
SQLITE_TITLE_COL = "title"
|
|
19
|
-
SQLITE_URL_COL = "url"
|
|
20
|
-
SQLITE_CREATE_TABLE_SQL = f"""
|
|
21
|
-
CREATE TABLE IF NOT EXISTS {SQLITE_TABLE_NAME} (
|
|
22
|
-
{SQLITE_ID_COL} TEXT PRIMARY KEY,
|
|
23
|
-
{SQLITE_TITLE_COL} TEXT NOT NULL,
|
|
24
|
-
{SQLITE_URL_COL} TEXT NOT NULL
|
|
25
|
-
)
|
|
26
|
-
"""
|
|
27
|
-
SQLITE_INSERT_SQL = f"INSERT OR REPLACE INTO {SQLITE_TABLE_NAME} ({SQLITE_ID_COL}, {SQLITE_TITLE_COL}, {SQLITE_URL_COL}) VALUES (:{SQLITE_ID_COL}, :{SQLITE_TITLE_COL}, :{SQLITE_URL_COL})"
|
|
15
|
+
SQLITE_INSTAPAPER_URL_COL = "instapaper_url"
|
|
28
16
|
|
|
29
17
|
# Constants for logging messages
|
|
30
18
|
LOG_NO_ARTICLES = "No articles found to save."
|
|
@@ -32,21 +20,52 @@ LOG_SAVED_ARTICLES = "Saved {count} articles to {filename}"
|
|
|
32
20
|
LOG_UNKNOWN_FORMAT = "Unknown output format: {format}"
|
|
33
21
|
|
|
34
22
|
|
|
35
|
-
def
|
|
23
|
+
def get_sqlite_create_table_sql(add_instapaper_url: bool = False) -> str:
|
|
24
|
+
"""Returns the SQL statement to create the articles table."""
|
|
25
|
+
columns = [
|
|
26
|
+
f"{KEY_ID} TEXT PRIMARY KEY",
|
|
27
|
+
f"{KEY_TITLE} TEXT NOT NULL",
|
|
28
|
+
f"{KEY_URL} TEXT NOT NULL",
|
|
29
|
+
]
|
|
30
|
+
if add_instapaper_url:
|
|
31
|
+
# The GENERATED ALWAYS AS syntax was added in SQLite 3.31.0
|
|
32
|
+
if sqlite3.sqlite_version_info >= (3, 31, 0):
|
|
33
|
+
columns.append(
|
|
34
|
+
f"{SQLITE_INSTAPAPER_URL_COL} TEXT GENERATED ALWAYS AS ('{INSTAPAPER_READ_URL}' || {KEY_ID}) VIRTUAL"
|
|
35
|
+
)
|
|
36
|
+
else:
|
|
37
|
+
columns.append(f"{SQLITE_INSTAPAPER_URL_COL} TEXT")
|
|
38
|
+
|
|
39
|
+
return f"CREATE TABLE IF NOT EXISTS {SQLITE_TABLE_NAME} ({', '.join(columns)})"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_sqlite_insert_sql(add_instapaper_url_manually: bool = False) -> str:
|
|
43
|
+
"""Returns the SQL statement to insert an article."""
|
|
44
|
+
cols = [KEY_ID, KEY_TITLE, KEY_URL]
|
|
45
|
+
placeholders = [f":{KEY_ID}", f":{KEY_TITLE}", f":{KEY_URL}"]
|
|
46
|
+
|
|
47
|
+
if add_instapaper_url_manually:
|
|
48
|
+
cols.append(SQLITE_INSTAPAPER_URL_COL)
|
|
49
|
+
placeholders.append(f":{SQLITE_INSTAPAPER_URL_COL}")
|
|
50
|
+
|
|
51
|
+
return f"INSERT OR REPLACE INTO {SQLITE_TABLE_NAME} ({', '.join(cols)}) VALUES ({', '.join(placeholders)})"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def save_to_csv(
|
|
55
|
+
data: List[Dict[str, Any]], filename: str, add_instapaper_url: bool = False
|
|
56
|
+
):
|
|
36
57
|
"""Saves a list of articles to a CSV file."""
|
|
37
58
|
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
|
38
59
|
with open(filename, "w", newline="", encoding="utf-8") as f:
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
#
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
)
|
|
49
|
-
)
|
|
60
|
+
fieldnames = [KEY_ID, KEY_TITLE, KEY_URL]
|
|
61
|
+
if add_instapaper_url:
|
|
62
|
+
# Insert instapaper_url after the id column
|
|
63
|
+
fieldnames.insert(1, SQLITE_INSTAPAPER_URL_COL)
|
|
64
|
+
|
|
65
|
+
writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
|
|
66
|
+
writer.writeheader()
|
|
67
|
+
writer.writerows(data)
|
|
68
|
+
|
|
50
69
|
logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=filename))
|
|
51
70
|
|
|
52
71
|
|
|
@@ -58,19 +77,61 @@ def save_to_json(data: List[Dict[str, Any]], filename: str):
|
|
|
58
77
|
logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=filename))
|
|
59
78
|
|
|
60
79
|
|
|
61
|
-
def save_to_sqlite(
|
|
80
|
+
def save_to_sqlite(
|
|
81
|
+
data: List[Dict[str, Any]], db_name: str, add_instapaper_url: bool = False
|
|
82
|
+
):
|
|
62
83
|
"""Saves a list of articles to a SQLite database."""
|
|
63
84
|
os.makedirs(os.path.dirname(db_name), exist_ok=True)
|
|
64
85
|
conn = sqlite3.connect(db_name)
|
|
65
86
|
cursor = conn.cursor()
|
|
66
|
-
cursor.execute(
|
|
67
|
-
|
|
87
|
+
cursor.execute(get_sqlite_create_table_sql(add_instapaper_url))
|
|
88
|
+
|
|
89
|
+
# For older SQLite versions, we need to manually add the URL
|
|
90
|
+
manual_insert_required = add_instapaper_url and sqlite3.sqlite_version_info < (
|
|
91
|
+
3,
|
|
92
|
+
31,
|
|
93
|
+
0,
|
|
94
|
+
)
|
|
95
|
+
if manual_insert_required:
|
|
96
|
+
data_to_insert = [
|
|
97
|
+
{
|
|
98
|
+
**article,
|
|
99
|
+
SQLITE_INSTAPAPER_URL_COL: f"{INSTAPAPER_READ_URL}{article[KEY_ID]}",
|
|
100
|
+
}
|
|
101
|
+
for article in data
|
|
102
|
+
]
|
|
103
|
+
else:
|
|
104
|
+
data_to_insert = data
|
|
105
|
+
|
|
106
|
+
insert_sql = get_sqlite_insert_sql(
|
|
107
|
+
add_instapaper_url_manually=manual_insert_required
|
|
108
|
+
)
|
|
109
|
+
cursor.executemany(insert_sql, data_to_insert)
|
|
110
|
+
|
|
68
111
|
conn.commit()
|
|
69
112
|
conn.close()
|
|
70
113
|
logging.info(LOG_SAVED_ARTICLES.format(count=len(data), filename=db_name))
|
|
71
114
|
|
|
72
115
|
|
|
73
|
-
def
|
|
116
|
+
def _correct_ext(filename: str, format: str) -> str:
|
|
117
|
+
"""Corrects the filename extension based on the specified format."""
|
|
118
|
+
extension_map = {
|
|
119
|
+
"csv": ".csv",
|
|
120
|
+
"json": ".json",
|
|
121
|
+
"sqlite": ".db",
|
|
122
|
+
}
|
|
123
|
+
if format in extension_map:
|
|
124
|
+
name, _ = os.path.splitext(filename)
|
|
125
|
+
return name + extension_map[format]
|
|
126
|
+
return filename
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def save_articles(
|
|
130
|
+
data: List[Dict[str, Any]],
|
|
131
|
+
format: str,
|
|
132
|
+
filename: str,
|
|
133
|
+
add_instapaper_url: bool = False,
|
|
134
|
+
):
|
|
74
135
|
"""
|
|
75
136
|
Dispatches to the correct save function based on the format.
|
|
76
137
|
"""
|
|
@@ -78,11 +139,23 @@ def save_articles(data: List[Dict[str, Any]], format: str, filename: str):
|
|
|
78
139
|
logging.info(LOG_NO_ARTICLES)
|
|
79
140
|
return
|
|
80
141
|
|
|
142
|
+
filename = _correct_ext(filename, format)
|
|
143
|
+
|
|
144
|
+
# Add the instapaper_url to the data for formats that don't auto-generate it
|
|
145
|
+
if add_instapaper_url and format in ("csv", "json"):
|
|
146
|
+
data = [
|
|
147
|
+
{
|
|
148
|
+
**article,
|
|
149
|
+
SQLITE_INSTAPAPER_URL_COL: f"{INSTAPAPER_READ_URL}{article[KEY_ID]}",
|
|
150
|
+
}
|
|
151
|
+
for article in data
|
|
152
|
+
]
|
|
153
|
+
|
|
81
154
|
if format == "csv":
|
|
82
|
-
save_to_csv(data, filename=filename)
|
|
155
|
+
save_to_csv(data, filename=filename, add_instapaper_url=add_instapaper_url)
|
|
83
156
|
elif format == "json":
|
|
84
157
|
save_to_json(data, filename=filename)
|
|
85
158
|
elif format == "sqlite":
|
|
86
|
-
save_to_sqlite(data, db_name=filename)
|
|
159
|
+
save_to_sqlite(data, db_name=filename, add_instapaper_url=add_instapaper_url)
|
|
87
160
|
else:
|
|
88
161
|
logging.error(LOG_UNKNOWN_FORMAT.format(format=format))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: instapaper-scraper
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.1.0rc1
|
|
4
4
|
Summary: A tool to scrape articles from Instapaper.
|
|
5
5
|
Project-URL: Homepage, https://github.com/chriskyfung/InstapaperScraper
|
|
6
6
|
Project-URL: Source, https://github.com/chriskyfung/InstapaperScraper
|
|
@@ -30,7 +30,7 @@ Requires-Dist: python-dotenv~=1.2.1
|
|
|
30
30
|
Requires-Dist: requests~=2.32.5
|
|
31
31
|
Requires-Dist: soupsieve~=2.8
|
|
32
32
|
Requires-Dist: typing_extensions~=4.15.0
|
|
33
|
-
Requires-Dist: urllib3
|
|
33
|
+
Requires-Dist: urllib3<2.7,>=2.5
|
|
34
34
|
Requires-Dist: tomli~=2.0.1; python_version < "3.11"
|
|
35
35
|
Provides-Extra: dev
|
|
36
36
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -49,6 +49,7 @@ Dynamic: license-file
|
|
|
49
49
|

|
|
50
50
|
[](https://github.com/chriskyfung/InstapaperScraper/actions/workflows/ci.yml)
|
|
51
51
|
[](https://pypi.org/project/instapaper-scraper/)
|
|
52
|
+
[](https://pepy.tech/projects/instapaper-scraper)
|
|
52
53
|
[](https://github.com/psf/black)
|
|
53
54
|
[](https://github.com/astral-sh/ruff)
|
|
54
55
|
[
|
|
@@ -68,6 +69,7 @@ A Python tool to scrape all your saved Instapaper bookmarks and export them to v
|
|
|
68
69
|
## Getting Started
|
|
69
70
|
|
|
70
71
|
### 1. Requirements
|
|
72
|
+
|
|
71
73
|
- Python 3.9+
|
|
72
74
|
|
|
73
75
|
### 2. Installation
|
|
@@ -152,14 +154,15 @@ When a `config.toml` file is present and no `--folder` argument is provided, the
|
|
|
152
154
|
|
|
153
155
|
### Command-line Arguments
|
|
154
156
|
|
|
155
|
-
| Argument
|
|
156
|
-
|
|
|
157
|
+
| Argument | Description |
|
|
158
|
+
| --- | --- |
|
|
157
159
|
| `--config-path <path>`| Path to the configuration file. Searches `~/.config/instapaper-scraper/config.toml` and `config.toml` in the current directory by default. |
|
|
158
|
-
| `--folder <value>`
|
|
159
|
-
| `--format <format>`
|
|
160
|
-
| `--output <filename>` | Specify a custom output filename.
|
|
161
|
-
| `--username <user>`
|
|
162
|
-
| `--password <pass>`
|
|
160
|
+
| `--folder <value>` | Specify a folder by key, ID, or slug from your `config.toml`. **Requires a configuration file to be loaded.** Use `none` to explicitly disable folder mode. If a configuration file is not found or fails to load, and this option is used (not set to `none`), the program will exit. |
|
|
161
|
+
| `--format <format>` | Output format (`csv`, `json`, `sqlite`). Default: `csv`. |
|
|
162
|
+
| `--output <filename>` | Specify a custom output filename. The file extension will be automatically corrected to match the selected format. |
|
|
163
|
+
| `--username <user>` | Your Instapaper account username. |
|
|
164
|
+
| `--password <pass>` | Your Instapaper account password. |
|
|
165
|
+
| `--add-instapaper-url` | Adds an `instapaper_url` column to the output, containing a full, clickable URL for each article. |
|
|
163
166
|
|
|
164
167
|
### Output Formats
|
|
165
168
|
|
|
@@ -168,54 +171,64 @@ You can control the output format using the `--format` argument. The supported f
|
|
|
168
171
|
- `csv` (default): Exports data to `output/bookmarks.csv`.
|
|
169
172
|
- `json`: Exports data to `output/bookmarks.json`.
|
|
170
173
|
- `sqlite`: Exports data to an `articles` table in `output/bookmarks.db`.
|
|
171
|
-
- `--output <filename>`: Specify a custom output filename.
|
|
172
174
|
|
|
173
175
|
If the `--format` flag is omitted, the script will default to `csv`.
|
|
174
176
|
|
|
177
|
+
When using `--output <filename>`, the file extension is automatically corrected to match the chosen format. For example, `instapaper-scraper --format json --output my_articles.txt` will create `my_articles.json`.
|
|
178
|
+
|
|
175
179
|
#### Opening Articles in Instapaper
|
|
176
180
|
|
|
177
|
-
The output data includes a unique `id` for each article.
|
|
178
|
-
|
|
181
|
+
The output data includes a unique `id` for each article. You can use this ID to construct a URL to the article's reader view: `https://www.instapaper.com/read/<article_id>`.
|
|
182
|
+
|
|
183
|
+
For convenience, you can use the `--add-instapaper-url` flag to have the script include a full, clickable URL in the output.
|
|
184
|
+
|
|
185
|
+
```sh
|
|
186
|
+
instapaper-scraper --add-instapaper-url
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
This adds an `instapaper_url` field to each article in the JSON output and an `instapaper_url` column in the CSV and SQLite outputs. The original `id` field is preserved.
|
|
179
190
|
|
|
180
191
|
## How It Works
|
|
181
192
|
|
|
182
193
|
The tool is designed with a modular architecture for reliability and maintainability.
|
|
183
194
|
|
|
184
195
|
1. **Authentication**: The `InstapaperAuthenticator` handles secure login and session management.
|
|
185
|
-
2. **Scraping**: The `InstapaperClient` iterates through all pages of your bookmarks, fetching the metadata for each article with robust error handling and retries.
|
|
196
|
+
2. **Scraping**: The `InstapaperClient` iterates through all pages of your bookmarks, fetching the metadata for each article with robust error handling and retries. Shared constants, like the Instapaper base URL, are managed through `src/instapaper_scraper/constants.py`.
|
|
186
197
|
3. **Data Collection**: All fetched articles are aggregated into a single list.
|
|
187
198
|
4. **Export**: Finally, the collected data is written to a file in your chosen format (`.csv`, `.json`, or `.db`).
|
|
188
199
|
|
|
189
200
|
## Example Output
|
|
190
201
|
|
|
191
|
-
### CSV (`output/bookmarks.csv`)
|
|
202
|
+
### CSV (`output/bookmarks.csv`) (with --add-instapaper-url)
|
|
192
203
|
|
|
193
204
|
```csv
|
|
194
|
-
id,title,url
|
|
195
|
-
999901234,"Article 1",https://www.example.com/page-1/
|
|
196
|
-
999002345,"Article 2",https://www.example.com/page-2/
|
|
205
|
+
"id","instapaper_url","title","url"
|
|
206
|
+
"999901234","https://www.instapaper.com/read/999901234","Article 1","https://www.example.com/page-1/"
|
|
207
|
+
"999002345","https://www.instapaper.com/read/999002345","Article 2","https://www.example.com/page-2/"
|
|
197
208
|
```
|
|
198
209
|
|
|
199
|
-
### JSON (`output/bookmarks.json`)
|
|
210
|
+
### JSON (`output/bookmarks.json`) (with --add-instapaper-url)
|
|
200
211
|
|
|
201
212
|
```json
|
|
202
213
|
[
|
|
203
214
|
{
|
|
204
215
|
"id": "999901234",
|
|
205
216
|
"title": "Article 1",
|
|
206
|
-
"url": "https://www.example.com/page-1/"
|
|
217
|
+
"url": "https://www.example.com/page-1/",
|
|
218
|
+
"instapaper_url": "https://www.instapaper.com/read/999901234"
|
|
207
219
|
},
|
|
208
220
|
{
|
|
209
221
|
"id": "999002345",
|
|
210
222
|
"title": "Article 2",
|
|
211
|
-
"url": "https://www.example.com/page-2/"
|
|
223
|
+
"url": "https://www.example.com/page-2/",
|
|
224
|
+
"instapaper_url": "https://www.instapaper.com/read/999002345"
|
|
212
225
|
}
|
|
213
226
|
]
|
|
214
227
|
```
|
|
215
228
|
|
|
216
229
|
### SQLite (`output/bookmarks.db`)
|
|
217
230
|
|
|
218
|
-
A SQLite database file is created with an `articles` table
|
|
231
|
+
A SQLite database file is created with an `articles` table. The table includes `id`, `title`, and `url` columns. If the `--add-instapaper-url` flag is used, an `instapaper_url` column is also included. This feature is fully backward-compatible and will automatically adapt to the user's installed SQLite version, using an efficient generated column on modern versions (3.31.0+) and a fallback for older versions.
|
|
219
232
|
|
|
220
233
|
## Development & Testing
|
|
221
234
|
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
instapaper_scraper/__init__.py,sha256=qdcT3tp4KLufWH1u6tOuPVUQaXwakQD0gdjkwY4ljfg,206
|
|
2
|
+
instapaper_scraper/api.py,sha256=-Dq5fOAGSGopb-qonIbETd9ZlxWdULKRgl1DCOuVemY,11618
|
|
3
|
+
instapaper_scraper/auth.py,sha256=VTBE9KhGGJm0KbMT5DCTMCbh-N3HiJuJ9wMDb8CyZT4,7015
|
|
4
|
+
instapaper_scraper/cli.py,sha256=wsQxTVFIyJq3EQiAtz7dCjg1vI2_Y9quZv4ifuEPDU8,7495
|
|
5
|
+
instapaper_scraper/constants.py,sha256=ubFWa47985lIz58qokMC0xQzTmCB6NOa17KFgWLn65E,403
|
|
6
|
+
instapaper_scraper/exceptions.py,sha256=CptHoZe4NOhdjOoyXkZEMFgQC6oKtzjRljywwDEtsTg,134
|
|
7
|
+
instapaper_scraper/output.py,sha256=lxJgW71-m1YuMYJHeK6nu479pk_3bQGc0axzNCvxtZQ,5338
|
|
8
|
+
instapaper_scraper-1.1.0rc1.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
|
|
9
|
+
instapaper_scraper-1.1.0rc1.dist-info/METADATA,sha256=O-VJZg1yN3cuPRfBCevmD9_IrOR07NGpzrgZXI2-6hk,11637
|
|
10
|
+
instapaper_scraper-1.1.0rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
11
|
+
instapaper_scraper-1.1.0rc1.dist-info/entry_points.txt,sha256=7AvRgN5fvtas_Duxdz-JPbDN6A1Lq2GaTfTSv54afxA,67
|
|
12
|
+
instapaper_scraper-1.1.0rc1.dist-info/top_level.txt,sha256=kiU9nLkqPOVPLsP4QMHuBFjAmoIKfftYmGV05daLrcc,19
|
|
13
|
+
instapaper_scraper-1.1.0rc1.dist-info/RECORD,,
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
instapaper_scraper/__init__.py,sha256=qdcT3tp4KLufWH1u6tOuPVUQaXwakQD0gdjkwY4ljfg,206
|
|
2
|
-
instapaper_scraper/api.py,sha256=KvGxK2P35-3TsONPWcQTVBZT-q70p7hobeQ7E9PhXwA,11740
|
|
3
|
-
instapaper_scraper/auth.py,sha256=DepQKDdVSm1dMFNIkpK_LIlaI0JllAYZb3_LJWhMe-g,7554
|
|
4
|
-
instapaper_scraper/cli.py,sha256=Pxf1cAoLW9N-X1BP73HE0i2Qv7rPTaIyrPqG3cgdSTI,6860
|
|
5
|
-
instapaper_scraper/exceptions.py,sha256=CptHoZe4NOhdjOoyXkZEMFgQC6oKtzjRljywwDEtsTg,134
|
|
6
|
-
instapaper_scraper/output.py,sha256=0vQQ4AHZwFJg3O5O2zzvKUf0cOS1fTjXdivFqEHAun0,3081
|
|
7
|
-
instapaper_scraper-1.0.0.post1.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
|
|
8
|
-
instapaper_scraper-1.0.0.post1.dist-info/METADATA,sha256=rWkPxBIY-Vo2opYPJ6KiSGiGfmrklMkI-CM9HwOf9to,10353
|
|
9
|
-
instapaper_scraper-1.0.0.post1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
10
|
-
instapaper_scraper-1.0.0.post1.dist-info/entry_points.txt,sha256=7AvRgN5fvtas_Duxdz-JPbDN6A1Lq2GaTfTSv54afxA,67
|
|
11
|
-
instapaper_scraper-1.0.0.post1.dist-info/top_level.txt,sha256=kiU9nLkqPOVPLsP4QMHuBFjAmoIKfftYmGV05daLrcc,19
|
|
12
|
-
instapaper_scraper-1.0.0.post1.dist-info/RECORD,,
|
|
File without changes
|
{instapaper_scraper-1.0.0.post1.dist-info → instapaper_scraper-1.1.0rc1.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{instapaper_scraper-1.0.0.post1.dist-info → instapaper_scraper-1.1.0rc1.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{instapaper_scraper-1.0.0.post1.dist-info → instapaper_scraper-1.1.0rc1.dist-info}/top_level.txt
RENAMED
|
File without changes
|