article-backup 0.3.13__tar.gz → 0.3.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {article_backup-0.3.13 → article_backup-0.3.14}/PKG-INFO +1 -1
- {article_backup-0.3.13 → article_backup-0.3.14}/article_backup.egg-info/PKG-INFO +1 -1
- {article_backup-0.3.13 → article_backup-0.3.14}/backup.py +4 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/pyproject.toml +1 -1
- {article_backup-0.3.13 → article_backup-0.3.14}/src/downloader.py +53 -31
- article_backup-0.3.14/tests/test_asset_dedup.py +352 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/tests/test_config_hardening.py +3 -0
- article_backup-0.3.13/tests/test_asset_dedup.py +0 -180
- {article_backup-0.3.13 → article_backup-0.3.14}/LICENSE +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/README.md +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/article_backup.egg-info/SOURCES.txt +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/article_backup.egg-info/dependency_links.txt +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/article_backup.egg-info/entry_points.txt +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/article_backup.egg-info/requires.txt +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/article_backup.egg-info/top_level.txt +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/setup.cfg +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/src/__init__.py +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/src/boosty.py +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/src/config.py +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/src/database.py +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/src/sponsr.py +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/src/utils.py +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/tests/test_boosty_empty_link.py +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/tests/test_boosty_normalize.py +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/tests/test_incremental_sync.py +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/tests/test_slug_safety.py +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/tests/test_sponsr_formatting_fix.py +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/tests/test_sponsr_normalize.py +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/tests/test_sponsr_tags.py +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/tests/test_sync_policy.py +0 -0
- {article_backup-0.3.13 → article_backup-0.3.14}/tests/test_video_embed.py +0 -0
|
@@ -27,9 +27,13 @@ def generate_hugo_config(config: Config):
|
|
|
27
27
|
|
|
28
28
|
content = f'''baseURL = {toml_str(config.hugo.base_url)}
|
|
29
29
|
locale = {toml_str(config.hugo.language_code)}
|
|
30
|
+
defaultContentLanguage = {toml_str(config.hugo.language_code)}
|
|
30
31
|
title = {toml_str(config.hugo.title)}
|
|
31
32
|
relativeURLs = true
|
|
32
33
|
|
|
34
|
+
[languages.{config.hugo.language_code}]
|
|
35
|
+
locale = {toml_str(config.hugo.language_code)}
|
|
36
|
+
|
|
33
37
|
[params]
|
|
34
38
|
default_theme = {toml_str(config.hugo.default_theme)}
|
|
35
39
|
|
|
@@ -33,6 +33,7 @@ def retry_request(
|
|
|
33
33
|
base_delay: float = 1.0,
|
|
34
34
|
max_delay: float = 30.0,
|
|
35
35
|
backoff_factor: float = 2.0,
|
|
36
|
+
delays: list[float] | None = None,
|
|
36
37
|
):
|
|
37
38
|
"""
|
|
38
39
|
Выполняет функцию с retry и exponential backoff.
|
|
@@ -43,6 +44,7 @@ def retry_request(
|
|
|
43
44
|
base_delay: Начальная задержка в секундах
|
|
44
45
|
max_delay: Максимальная задержка в секундах
|
|
45
46
|
backoff_factor: Множитель для увеличения задержки
|
|
47
|
+
delays: Явная последовательность задержек между попытками
|
|
46
48
|
"""
|
|
47
49
|
last_exception = None
|
|
48
50
|
delay = base_delay
|
|
@@ -58,8 +60,11 @@ def retry_request(
|
|
|
58
60
|
raise
|
|
59
61
|
|
|
60
62
|
if attempt < max_retries - 1:
|
|
61
|
-
|
|
62
|
-
|
|
63
|
+
if delays:
|
|
64
|
+
time.sleep(delays[min(attempt, len(delays) - 1)])
|
|
65
|
+
else:
|
|
66
|
+
time.sleep(delay)
|
|
67
|
+
delay = min(delay * backoff_factor, max_delay)
|
|
63
68
|
|
|
64
69
|
if last_exception:
|
|
65
70
|
raise last_exception
|
|
@@ -326,43 +331,60 @@ class BaseDownloader(ABC):
|
|
|
326
331
|
if ext and not force and not should_download_asset(url, None, self.source.asset_types):
|
|
327
332
|
return url, None
|
|
328
333
|
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
resp.raise_for_status()
|
|
332
|
-
return resp
|
|
334
|
+
filename: str | None = None
|
|
335
|
+
filepath: Path | None = None
|
|
333
336
|
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
+
def download_to_file():
|
|
338
|
+
nonlocal filename, filepath
|
|
339
|
+
resp = self.session.get(request_url, stream=True, timeout=self.TIMEOUT)
|
|
340
|
+
try:
|
|
341
|
+
resp.raise_for_status()
|
|
342
|
+
content_type = resp.headers.get('Content-Type', '')
|
|
337
343
|
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
344
|
+
# Полная проверка после получения Content-Type
|
|
345
|
+
if not force and not should_download_asset(url, content_type, self.source.asset_types):
|
|
346
|
+
return None
|
|
341
347
|
|
|
342
|
-
|
|
348
|
+
if filename is None or filepath is None:
|
|
349
|
+
filename_base = self._make_asset_filename(url, content_type, asset.get('alt'))
|
|
343
350
|
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
351
|
+
with used_lock:
|
|
352
|
+
filename = filename_base
|
|
353
|
+
filepath = assets_dir / filename
|
|
354
|
+
if filename in used_filenames or filepath.exists():
|
|
355
|
+
filename = self._deduplicate_filename(filename, url)
|
|
356
|
+
filepath = assets_dir / filename
|
|
350
357
|
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
358
|
+
# На всякий случай добиваемся уникальности в рамках сессии
|
|
359
|
+
while filename in used_filenames or filepath.exists():
|
|
360
|
+
filename = self._deduplicate_filename(filename, url + filename)
|
|
361
|
+
filepath = assets_dir / filename
|
|
355
362
|
|
|
356
|
-
|
|
363
|
+
used_filenames.add(filename)
|
|
357
364
|
|
|
358
|
-
if not filepath.exists():
|
|
359
365
|
with open(filepath, 'wb') as f:
|
|
360
|
-
for chunk in
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
+
for chunk in resp.iter_content(chunk_size=8192):
|
|
367
|
+
if chunk:
|
|
368
|
+
f.write(chunk)
|
|
369
|
+
return filename
|
|
370
|
+
except Exception as e:
|
|
371
|
+
if filepath and filepath.exists():
|
|
372
|
+
filepath.unlink()
|
|
373
|
+
if isinstance(e, OSError) and not isinstance(e, requests.RequestException):
|
|
374
|
+
raise requests.RequestException(str(e)) from e
|
|
375
|
+
raise
|
|
376
|
+
finally:
|
|
377
|
+
close = getattr(resp, 'close', None)
|
|
378
|
+
if callable(close):
|
|
379
|
+
close()
|
|
380
|
+
|
|
381
|
+
filename = retry_request(
|
|
382
|
+
download_to_file,
|
|
383
|
+
max_retries=10,
|
|
384
|
+
delays=[3, 5, 7, 10, 15, 15, 15, 15, 15],
|
|
385
|
+
)
|
|
386
|
+
if not filename:
|
|
387
|
+
return url, None
|
|
366
388
|
|
|
367
389
|
return url, filename
|
|
368
390
|
except requests.RequestException as e:
|
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
import tempfile
|
|
2
|
+
import unittest
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import cast
|
|
5
|
+
from unittest.mock import patch
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
from src.config import Auth, Config, Source
|
|
10
|
+
from src.database import Database
|
|
11
|
+
from src.downloader import BaseDownloader
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class _FakeResponse:
|
|
15
|
+
def __init__(self, content_type: str, body: bytes):
|
|
16
|
+
self.headers = {"Content-Type": content_type}
|
|
17
|
+
self._body = body
|
|
18
|
+
|
|
19
|
+
def raise_for_status(self):
|
|
20
|
+
return None
|
|
21
|
+
|
|
22
|
+
def iter_content(self, chunk_size: int = 8192):
|
|
23
|
+
# Yield at least one chunk to trigger file write.
|
|
24
|
+
yield self._body
|
|
25
|
+
|
|
26
|
+
def close(self):
|
|
27
|
+
return None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class _FailingStreamResponse(_FakeResponse):
|
|
31
|
+
def iter_content(self, chunk_size: int = 8192):
|
|
32
|
+
yield self._body
|
|
33
|
+
raise requests.exceptions.ChunkedEncodingError("stream interrupted")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class _HttpErrorResponse(_FakeResponse):
|
|
37
|
+
def __init__(self, status_code: int):
|
|
38
|
+
super().__init__("text/plain", b"")
|
|
39
|
+
self.status_code = status_code
|
|
40
|
+
|
|
41
|
+
def raise_for_status(self):
|
|
42
|
+
response = requests.Response()
|
|
43
|
+
response.status_code = self.status_code
|
|
44
|
+
raise requests.HTTPError(f"{self.status_code} error", response=response)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class _DummyDB:
|
|
48
|
+
pass
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class _DummyDownloader(BaseDownloader):
|
|
52
|
+
PLATFORM = "dummy"
|
|
53
|
+
MAX_WORKERS = 2
|
|
54
|
+
|
|
55
|
+
def _setup_session(self):
|
|
56
|
+
# Tests patch session.get directly.
|
|
57
|
+
return None
|
|
58
|
+
|
|
59
|
+
def fetch_posts_list(
|
|
60
|
+
self,
|
|
61
|
+
existing_ids: set[str] | None = None,
|
|
62
|
+
incremental: bool = False,
|
|
63
|
+
safety_chunks: int = 1
|
|
64
|
+
):
|
|
65
|
+
raise NotImplementedError
|
|
66
|
+
|
|
67
|
+
def fetch_post(self, post_id: str):
|
|
68
|
+
raise NotImplementedError
|
|
69
|
+
|
|
70
|
+
def _parse_post(self, raw_data: dict):
|
|
71
|
+
raise NotImplementedError
|
|
72
|
+
|
|
73
|
+
def _to_markdown(self, post, asset_map):
|
|
74
|
+
raise NotImplementedError
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class _FailingWriteFile:
|
|
78
|
+
def __init__(self, wrapped):
|
|
79
|
+
self._wrapped = wrapped
|
|
80
|
+
|
|
81
|
+
def __enter__(self):
|
|
82
|
+
self._wrapped.__enter__()
|
|
83
|
+
return self
|
|
84
|
+
|
|
85
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
86
|
+
return self._wrapped.__exit__(exc_type, exc_val, exc_tb)
|
|
87
|
+
|
|
88
|
+
def write(self, data: bytes):
|
|
89
|
+
self._wrapped.write(b"partial")
|
|
90
|
+
raise OSError("temporary disk write failure")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class AssetDedupTests(unittest.TestCase):
|
|
94
|
+
def test_download_assets_deduplicates_colliding_names(self):
|
|
95
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
96
|
+
tmp_path = Path(tmp)
|
|
97
|
+
assets_dir = tmp_path / "assets"
|
|
98
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
99
|
+
|
|
100
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
101
|
+
source = Source(platform="sponsr", author="author", download_assets=True)
|
|
102
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
103
|
+
|
|
104
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
105
|
+
# URLs intentionally do not contain extensions.
|
|
106
|
+
return _FakeResponse("image/jpeg", body=(url + "\n").encode("ascii"))
|
|
107
|
+
|
|
108
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
109
|
+
|
|
110
|
+
assets = [
|
|
111
|
+
{"url": "https://example.test/media/1", "alt": "same name"},
|
|
112
|
+
{"url": "https://example.test/media/2", "alt": "same name"},
|
|
113
|
+
]
|
|
114
|
+
|
|
115
|
+
asset_map = dl._download_assets(assets, assets_dir)
|
|
116
|
+
|
|
117
|
+
self.assertEqual(set(asset_map.keys()), {a["url"] for a in assets})
|
|
118
|
+
|
|
119
|
+
filenames = list(asset_map.values())
|
|
120
|
+
self.assertEqual(len(filenames), 2)
|
|
121
|
+
self.assertNotEqual(filenames[0], filenames[1])
|
|
122
|
+
|
|
123
|
+
for fn in filenames:
|
|
124
|
+
self.assertTrue((assets_dir / fn).exists(), msg=f"missing file: {fn}")
|
|
125
|
+
|
|
126
|
+
def test_download_assets_deduplicates_when_file_exists(self):
|
|
127
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
128
|
+
tmp_path = Path(tmp)
|
|
129
|
+
assets_dir = tmp_path / "assets"
|
|
130
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
131
|
+
|
|
132
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
133
|
+
source = Source(platform="sponsr", author="author", download_assets=True)
|
|
134
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
135
|
+
|
|
136
|
+
# Pre-create a file with the expected base name.
|
|
137
|
+
base = dl._make_asset_filename(
|
|
138
|
+
"https://example.test/media/1",
|
|
139
|
+
"image/jpeg",
|
|
140
|
+
"same name",
|
|
141
|
+
)
|
|
142
|
+
(assets_dir / base).write_bytes(b"existing")
|
|
143
|
+
|
|
144
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
145
|
+
return _FakeResponse("image/jpeg", body=b"downloaded")
|
|
146
|
+
|
|
147
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
148
|
+
|
|
149
|
+
assets = [{"url": "https://example.test/media/1", "alt": "same name"}]
|
|
150
|
+
asset_map = dl._download_assets(assets, assets_dir)
|
|
151
|
+
|
|
152
|
+
self.assertIn("https://example.test/media/1", asset_map)
|
|
153
|
+
self.assertNotEqual(asset_map["https://example.test/media/1"], base)
|
|
154
|
+
self.assertTrue((assets_dir / asset_map["https://example.test/media/1"]).exists())
|
|
155
|
+
|
|
156
|
+
def test_download_assets_keeps_unique_names_under_parallelism(self):
|
|
157
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
158
|
+
tmp_path = Path(tmp)
|
|
159
|
+
assets_dir = tmp_path / "assets"
|
|
160
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
161
|
+
|
|
162
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
163
|
+
source = Source(platform="sponsr", author="author", download_assets=True)
|
|
164
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
165
|
+
dl.MAX_WORKERS = 5
|
|
166
|
+
|
|
167
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
168
|
+
return _FakeResponse("image/jpeg", body=(url + "\n").encode("ascii"))
|
|
169
|
+
|
|
170
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
171
|
+
|
|
172
|
+
assets = [
|
|
173
|
+
{"url": f"https://example.test/media/{i}", "alt": "same name"}
|
|
174
|
+
for i in range(20)
|
|
175
|
+
]
|
|
176
|
+
|
|
177
|
+
asset_map = dl._download_assets(assets, assets_dir)
|
|
178
|
+
|
|
179
|
+
self.assertEqual(len(asset_map), 20)
|
|
180
|
+
filenames = list(asset_map.values())
|
|
181
|
+
self.assertEqual(len(set(filenames)), 20)
|
|
182
|
+
for fn in filenames:
|
|
183
|
+
self.assertTrue((assets_dir / fn).exists(), msg=f"missing file: {fn}")
|
|
184
|
+
|
|
185
|
+
def test_download_assets_uses_download_url_but_maps_original_url(self):
|
|
186
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
187
|
+
tmp_path = Path(tmp)
|
|
188
|
+
assets_dir = tmp_path / "assets"
|
|
189
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
190
|
+
|
|
191
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
192
|
+
source = Source(platform="boosty", author="author", download_assets=True)
|
|
193
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
194
|
+
|
|
195
|
+
requested_urls = []
|
|
196
|
+
|
|
197
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
198
|
+
requested_urls.append(url)
|
|
199
|
+
return _FakeResponse("audio/mpeg", body=b"audio")
|
|
200
|
+
|
|
201
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
202
|
+
|
|
203
|
+
asset_map = dl._download_assets(
|
|
204
|
+
[
|
|
205
|
+
{
|
|
206
|
+
"url": "https://cdn.boosty.to/audio/audio-id",
|
|
207
|
+
"download_url": "https://cdn.boosty.to/audio/audio-id?sign=abc",
|
|
208
|
+
"alt": "audio.mp3",
|
|
209
|
+
}
|
|
210
|
+
],
|
|
211
|
+
assets_dir,
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
self.assertEqual(requested_urls, ["https://cdn.boosty.to/audio/audio-id?sign=abc"])
|
|
215
|
+
self.assertIn("https://cdn.boosty.to/audio/audio-id", asset_map)
|
|
216
|
+
|
|
217
|
+
def test_download_assets_retries_network_errors_ten_times(self):
|
|
218
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
219
|
+
tmp_path = Path(tmp)
|
|
220
|
+
assets_dir = tmp_path / "assets"
|
|
221
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
222
|
+
|
|
223
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
224
|
+
source = Source(platform="boosty", author="author", download_assets=True)
|
|
225
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
226
|
+
|
|
227
|
+
attempts = 0
|
|
228
|
+
|
|
229
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
230
|
+
nonlocal attempts
|
|
231
|
+
attempts += 1
|
|
232
|
+
if attempts < 10:
|
|
233
|
+
raise requests.ConnectionError("temporary cdn failure")
|
|
234
|
+
return _FakeResponse("audio/mpeg", body=b"audio")
|
|
235
|
+
|
|
236
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
237
|
+
|
|
238
|
+
with patch("src.downloader.time.sleep") as sleep_mock:
|
|
239
|
+
asset_map = dl._download_assets(
|
|
240
|
+
[{"url": "https://cdn.boosty.to/audio/audio-id", "alt": "audio.mp3"}],
|
|
241
|
+
assets_dir,
|
|
242
|
+
)
|
|
243
|
+
|
|
244
|
+
self.assertEqual(attempts, 10)
|
|
245
|
+
self.assertEqual(
|
|
246
|
+
[call.args[0] for call in sleep_mock.call_args_list],
|
|
247
|
+
[3, 5, 7, 10, 15, 15, 15, 15, 15],
|
|
248
|
+
)
|
|
249
|
+
self.assertIn("https://cdn.boosty.to/audio/audio-id", asset_map)
|
|
250
|
+
|
|
251
|
+
def test_download_assets_retries_stream_errors_and_removes_partial_file(self):
|
|
252
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
253
|
+
tmp_path = Path(tmp)
|
|
254
|
+
assets_dir = tmp_path / "assets"
|
|
255
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
256
|
+
|
|
257
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
258
|
+
source = Source(platform="boosty", author="author", download_assets=True)
|
|
259
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
260
|
+
|
|
261
|
+
attempts = 0
|
|
262
|
+
|
|
263
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
264
|
+
nonlocal attempts
|
|
265
|
+
attempts += 1
|
|
266
|
+
if attempts == 1:
|
|
267
|
+
return _FailingStreamResponse("audio/mpeg", body=b"partial")
|
|
268
|
+
return _FakeResponse("audio/mpeg", body=b"complete")
|
|
269
|
+
|
|
270
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
271
|
+
|
|
272
|
+
with patch("src.downloader.time.sleep"):
|
|
273
|
+
asset_map = dl._download_assets(
|
|
274
|
+
[{"url": "https://cdn.boosty.to/audio/audio-id", "alt": "audio.mp3"}],
|
|
275
|
+
assets_dir,
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
self.assertEqual(attempts, 2)
|
|
279
|
+
filename = asset_map["https://cdn.boosty.to/audio/audio-id"]
|
|
280
|
+
self.assertEqual((assets_dir / filename).read_bytes(), b"complete")
|
|
281
|
+
self.assertFalse(any(path.read_bytes() == b"partial" for path in assets_dir.iterdir()))
|
|
282
|
+
|
|
283
|
+
def test_download_assets_does_not_retry_permanent_404(self):
|
|
284
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
285
|
+
tmp_path = Path(tmp)
|
|
286
|
+
assets_dir = tmp_path / "assets"
|
|
287
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
288
|
+
|
|
289
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
290
|
+
source = Source(platform="boosty", author="author", download_assets=True)
|
|
291
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
292
|
+
|
|
293
|
+
attempts = 0
|
|
294
|
+
|
|
295
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
296
|
+
nonlocal attempts
|
|
297
|
+
attempts += 1
|
|
298
|
+
return _HttpErrorResponse(404)
|
|
299
|
+
|
|
300
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
301
|
+
|
|
302
|
+
with patch("src.downloader.time.sleep"):
|
|
303
|
+
asset_map = dl._download_assets(
|
|
304
|
+
[{"url": "https://cdn.boosty.to/audio/missing-id", "alt": "missing.mp3"}],
|
|
305
|
+
assets_dir,
|
|
306
|
+
)
|
|
307
|
+
|
|
308
|
+
self.assertEqual(attempts, 1)
|
|
309
|
+
self.assertEqual(asset_map, {})
|
|
310
|
+
|
|
311
|
+
def test_download_assets_retries_write_errors_and_removes_partial_file(self):
|
|
312
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
313
|
+
tmp_path = Path(tmp)
|
|
314
|
+
assets_dir = tmp_path / "assets"
|
|
315
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
316
|
+
|
|
317
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
318
|
+
source = Source(platform="boosty", author="author", download_assets=True)
|
|
319
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
320
|
+
|
|
321
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
322
|
+
return _FakeResponse("audio/mpeg", body=b"complete")
|
|
323
|
+
|
|
324
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
325
|
+
|
|
326
|
+
real_open = open
|
|
327
|
+
open_attempts = 0
|
|
328
|
+
|
|
329
|
+
def flaky_open(path, mode="r", *args, **kwargs):
|
|
330
|
+
nonlocal open_attempts
|
|
331
|
+
if "wb" in mode:
|
|
332
|
+
open_attempts += 1
|
|
333
|
+
wrapped = real_open(path, mode, *args, **kwargs)
|
|
334
|
+
if open_attempts == 1:
|
|
335
|
+
return _FailingWriteFile(wrapped)
|
|
336
|
+
return wrapped
|
|
337
|
+
return real_open(path, mode, *args, **kwargs)
|
|
338
|
+
|
|
339
|
+
with patch("src.downloader.time.sleep"), patch("builtins.open", flaky_open):
|
|
340
|
+
asset_map = dl._download_assets(
|
|
341
|
+
[{"url": "https://cdn.boosty.to/audio/audio-id", "alt": "audio.mp3"}],
|
|
342
|
+
assets_dir,
|
|
343
|
+
)
|
|
344
|
+
|
|
345
|
+
self.assertEqual(open_attempts, 2)
|
|
346
|
+
filename = asset_map["https://cdn.boosty.to/audio/audio-id"]
|
|
347
|
+
self.assertEqual((assets_dir / filename).read_bytes(), b"complete")
|
|
348
|
+
self.assertFalse(any(path.read_bytes() == b"partial" for path in assets_dir.iterdir()))
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
if __name__ == "__main__":
|
|
352
|
+
unittest.main()
|
|
@@ -61,6 +61,9 @@ class ConfigHardeningTests(unittest.TestCase):
|
|
|
61
61
|
self.assertIn('title = "Bob\'s \\"backup\\""', toml)
|
|
62
62
|
self.assertIn('baseURL = "https://example.com/a\\"b"', toml)
|
|
63
63
|
self.assertIn('locale = "ru"', toml)
|
|
64
|
+
self.assertIn('defaultContentLanguage = "ru"', toml)
|
|
65
|
+
self.assertIn('[languages.ru]', toml)
|
|
66
|
+
self.assertIn(' locale = "ru"', toml)
|
|
64
67
|
self.assertNotIn('languageCode', toml)
|
|
65
68
|
self.assertIn('default_theme = "light\\"mode"', toml)
|
|
66
69
|
finally:
|
|
@@ -1,180 +0,0 @@
|
|
|
1
|
-
import tempfile
|
|
2
|
-
import unittest
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import cast
|
|
5
|
-
|
|
6
|
-
from src.config import Auth, Config, Source
|
|
7
|
-
from src.database import Database
|
|
8
|
-
from src.downloader import BaseDownloader
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class _FakeResponse:
|
|
12
|
-
def __init__(self, content_type: str, body: bytes):
|
|
13
|
-
self.headers = {"Content-Type": content_type}
|
|
14
|
-
self._body = body
|
|
15
|
-
|
|
16
|
-
def raise_for_status(self):
|
|
17
|
-
return None
|
|
18
|
-
|
|
19
|
-
def iter_content(self, chunk_size: int = 8192):
|
|
20
|
-
# Yield at least one chunk to trigger file write.
|
|
21
|
-
yield self._body
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class _DummyDB:
|
|
25
|
-
pass
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class _DummyDownloader(BaseDownloader):
|
|
29
|
-
PLATFORM = "dummy"
|
|
30
|
-
MAX_WORKERS = 2
|
|
31
|
-
|
|
32
|
-
def _setup_session(self):
|
|
33
|
-
# Tests patch session.get directly.
|
|
34
|
-
return None
|
|
35
|
-
|
|
36
|
-
def fetch_posts_list(
|
|
37
|
-
self,
|
|
38
|
-
existing_ids: set[str] | None = None,
|
|
39
|
-
incremental: bool = False,
|
|
40
|
-
safety_chunks: int = 1
|
|
41
|
-
):
|
|
42
|
-
raise NotImplementedError
|
|
43
|
-
|
|
44
|
-
def fetch_post(self, post_id: str):
|
|
45
|
-
raise NotImplementedError
|
|
46
|
-
|
|
47
|
-
def _parse_post(self, raw_data: dict):
|
|
48
|
-
raise NotImplementedError
|
|
49
|
-
|
|
50
|
-
def _to_markdown(self, post, asset_map):
|
|
51
|
-
raise NotImplementedError
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
class AssetDedupTests(unittest.TestCase):
|
|
55
|
-
def test_download_assets_deduplicates_colliding_names(self):
|
|
56
|
-
with tempfile.TemporaryDirectory() as tmp:
|
|
57
|
-
tmp_path = Path(tmp)
|
|
58
|
-
assets_dir = tmp_path / "assets"
|
|
59
|
-
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
60
|
-
|
|
61
|
-
config = Config(output_dir=tmp_path, auth=Auth())
|
|
62
|
-
source = Source(platform="sponsr", author="author", download_assets=True)
|
|
63
|
-
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
64
|
-
|
|
65
|
-
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
66
|
-
# URLs intentionally do not contain extensions.
|
|
67
|
-
return _FakeResponse("image/jpeg", body=(url + "\n").encode("ascii"))
|
|
68
|
-
|
|
69
|
-
dl.session.get = fake_get # type: ignore[method-assign]
|
|
70
|
-
|
|
71
|
-
assets = [
|
|
72
|
-
{"url": "https://example.test/media/1", "alt": "same name"},
|
|
73
|
-
{"url": "https://example.test/media/2", "alt": "same name"},
|
|
74
|
-
]
|
|
75
|
-
|
|
76
|
-
asset_map = dl._download_assets(assets, assets_dir)
|
|
77
|
-
|
|
78
|
-
self.assertEqual(set(asset_map.keys()), {a["url"] for a in assets})
|
|
79
|
-
|
|
80
|
-
filenames = list(asset_map.values())
|
|
81
|
-
self.assertEqual(len(filenames), 2)
|
|
82
|
-
self.assertNotEqual(filenames[0], filenames[1])
|
|
83
|
-
|
|
84
|
-
for fn in filenames:
|
|
85
|
-
self.assertTrue((assets_dir / fn).exists(), msg=f"missing file: {fn}")
|
|
86
|
-
|
|
87
|
-
def test_download_assets_deduplicates_when_file_exists(self):
|
|
88
|
-
with tempfile.TemporaryDirectory() as tmp:
|
|
89
|
-
tmp_path = Path(tmp)
|
|
90
|
-
assets_dir = tmp_path / "assets"
|
|
91
|
-
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
92
|
-
|
|
93
|
-
config = Config(output_dir=tmp_path, auth=Auth())
|
|
94
|
-
source = Source(platform="sponsr", author="author", download_assets=True)
|
|
95
|
-
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
96
|
-
|
|
97
|
-
# Pre-create a file with the expected base name.
|
|
98
|
-
base = dl._make_asset_filename(
|
|
99
|
-
"https://example.test/media/1",
|
|
100
|
-
"image/jpeg",
|
|
101
|
-
"same name",
|
|
102
|
-
)
|
|
103
|
-
(assets_dir / base).write_bytes(b"existing")
|
|
104
|
-
|
|
105
|
-
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
106
|
-
return _FakeResponse("image/jpeg", body=b"downloaded")
|
|
107
|
-
|
|
108
|
-
dl.session.get = fake_get # type: ignore[method-assign]
|
|
109
|
-
|
|
110
|
-
assets = [{"url": "https://example.test/media/1", "alt": "same name"}]
|
|
111
|
-
asset_map = dl._download_assets(assets, assets_dir)
|
|
112
|
-
|
|
113
|
-
self.assertIn("https://example.test/media/1", asset_map)
|
|
114
|
-
self.assertNotEqual(asset_map["https://example.test/media/1"], base)
|
|
115
|
-
self.assertTrue((assets_dir / asset_map["https://example.test/media/1"]).exists())
|
|
116
|
-
|
|
117
|
-
def test_download_assets_keeps_unique_names_under_parallelism(self):
|
|
118
|
-
with tempfile.TemporaryDirectory() as tmp:
|
|
119
|
-
tmp_path = Path(tmp)
|
|
120
|
-
assets_dir = tmp_path / "assets"
|
|
121
|
-
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
122
|
-
|
|
123
|
-
config = Config(output_dir=tmp_path, auth=Auth())
|
|
124
|
-
source = Source(platform="sponsr", author="author", download_assets=True)
|
|
125
|
-
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
126
|
-
dl.MAX_WORKERS = 5
|
|
127
|
-
|
|
128
|
-
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
129
|
-
return _FakeResponse("image/jpeg", body=(url + "\n").encode("ascii"))
|
|
130
|
-
|
|
131
|
-
dl.session.get = fake_get # type: ignore[method-assign]
|
|
132
|
-
|
|
133
|
-
assets = [
|
|
134
|
-
{"url": f"https://example.test/media/{i}", "alt": "same name"}
|
|
135
|
-
for i in range(20)
|
|
136
|
-
]
|
|
137
|
-
|
|
138
|
-
asset_map = dl._download_assets(assets, assets_dir)
|
|
139
|
-
|
|
140
|
-
self.assertEqual(len(asset_map), 20)
|
|
141
|
-
filenames = list(asset_map.values())
|
|
142
|
-
self.assertEqual(len(set(filenames)), 20)
|
|
143
|
-
for fn in filenames:
|
|
144
|
-
self.assertTrue((assets_dir / fn).exists(), msg=f"missing file: {fn}")
|
|
145
|
-
|
|
146
|
-
def test_download_assets_uses_download_url_but_maps_original_url(self):
|
|
147
|
-
with tempfile.TemporaryDirectory() as tmp:
|
|
148
|
-
tmp_path = Path(tmp)
|
|
149
|
-
assets_dir = tmp_path / "assets"
|
|
150
|
-
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
151
|
-
|
|
152
|
-
config = Config(output_dir=tmp_path, auth=Auth())
|
|
153
|
-
source = Source(platform="boosty", author="author", download_assets=True)
|
|
154
|
-
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
155
|
-
|
|
156
|
-
requested_urls = []
|
|
157
|
-
|
|
158
|
-
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
159
|
-
requested_urls.append(url)
|
|
160
|
-
return _FakeResponse("audio/mpeg", body=b"audio")
|
|
161
|
-
|
|
162
|
-
dl.session.get = fake_get # type: ignore[method-assign]
|
|
163
|
-
|
|
164
|
-
asset_map = dl._download_assets(
|
|
165
|
-
[
|
|
166
|
-
{
|
|
167
|
-
"url": "https://cdn.boosty.to/audio/audio-id",
|
|
168
|
-
"download_url": "https://cdn.boosty.to/audio/audio-id?sign=abc",
|
|
169
|
-
"alt": "audio.mp3",
|
|
170
|
-
}
|
|
171
|
-
],
|
|
172
|
-
assets_dir,
|
|
173
|
-
)
|
|
174
|
-
|
|
175
|
-
self.assertEqual(requested_urls, ["https://cdn.boosty.to/audio/audio-id?sign=abc"])
|
|
176
|
-
self.assertIn("https://cdn.boosty.to/audio/audio-id", asset_map)
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
if __name__ == "__main__":
|
|
180
|
-
unittest.main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{article_backup-0.3.13 → article_backup-0.3.14}/article_backup.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|