article-backup 0.3.12__tar.gz → 0.3.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {article_backup-0.3.12 → article_backup-0.3.14}/PKG-INFO +1 -1
- {article_backup-0.3.12 → article_backup-0.3.14}/article_backup.egg-info/PKG-INFO +1 -1
- {article_backup-0.3.12 → article_backup-0.3.14}/backup.py +4 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/pyproject.toml +1 -1
- {article_backup-0.3.12 → article_backup-0.3.14}/src/boosty.py +37 -3
- {article_backup-0.3.12 → article_backup-0.3.14}/src/downloader.py +54 -31
- article_backup-0.3.14/tests/test_asset_dedup.py +352 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_boosty_normalize.py +70 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_config_hardening.py +3 -0
- article_backup-0.3.12/tests/test_asset_dedup.py +0 -148
- {article_backup-0.3.12 → article_backup-0.3.14}/LICENSE +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/README.md +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/article_backup.egg-info/SOURCES.txt +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/article_backup.egg-info/dependency_links.txt +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/article_backup.egg-info/entry_points.txt +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/article_backup.egg-info/requires.txt +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/article_backup.egg-info/top_level.txt +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/setup.cfg +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/src/__init__.py +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/src/config.py +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/src/database.py +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/src/sponsr.py +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/src/utils.py +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_boosty_empty_link.py +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_incremental_sync.py +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_slug_safety.py +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_sponsr_formatting_fix.py +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_sponsr_normalize.py +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_sponsr_tags.py +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_sync_policy.py +0 -0
- {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_video_embed.py +0 -0
|
@@ -27,9 +27,13 @@ def generate_hugo_config(config: Config):
|
|
|
27
27
|
|
|
28
28
|
content = f'''baseURL = {toml_str(config.hugo.base_url)}
|
|
29
29
|
locale = {toml_str(config.hugo.language_code)}
|
|
30
|
+
defaultContentLanguage = {toml_str(config.hugo.language_code)}
|
|
30
31
|
title = {toml_str(config.hugo.title)}
|
|
31
32
|
relativeURLs = true
|
|
32
33
|
|
|
34
|
+
[languages.{config.hugo.language_code}]
|
|
35
|
+
locale = {toml_str(config.hugo.language_code)}
|
|
36
|
+
|
|
33
37
|
[params]
|
|
34
38
|
default_theme = {toml_str(config.hugo.default_theme)}
|
|
35
39
|
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
|
|
4
4
|
import json
|
|
5
5
|
from datetime import datetime, timezone
|
|
6
|
+
from urllib.parse import parse_qsl, urlencode, urlparse
|
|
6
7
|
|
|
7
8
|
import requests
|
|
8
9
|
|
|
@@ -158,7 +159,7 @@ class BoostyDownloader(BaseDownloader):
|
|
|
158
159
|
content_blocks = raw_data.get("data", [])
|
|
159
160
|
|
|
160
161
|
# Извлекаем assets
|
|
161
|
-
assets = self._extract_assets(content_blocks)
|
|
162
|
+
assets = self._extract_assets(content_blocks, raw_data.get("signedQuery", ""))
|
|
162
163
|
|
|
163
164
|
return Post(
|
|
164
165
|
post_id=post_id,
|
|
@@ -170,7 +171,7 @@ class BoostyDownloader(BaseDownloader):
|
|
|
170
171
|
assets=assets,
|
|
171
172
|
)
|
|
172
173
|
|
|
173
|
-
def _extract_assets(self, blocks: list[dict]) -> list[dict]:
|
|
174
|
+
def _extract_assets(self, blocks: list[dict], signed_query: str = "") -> list[dict]:
|
|
174
175
|
"""Извлекает URL медиафайлов из блоков контента."""
|
|
175
176
|
assets = []
|
|
176
177
|
|
|
@@ -190,6 +191,16 @@ class BoostyDownloader(BaseDownloader):
|
|
|
190
191
|
if url:
|
|
191
192
|
assets.append({
|
|
192
193
|
"url": url,
|
|
194
|
+
"download_url": self._sign_media_url(url, signed_query),
|
|
195
|
+
"alt": block.get("title", block.get("id", "")),
|
|
196
|
+
})
|
|
197
|
+
|
|
198
|
+
elif block_type == "file":
|
|
199
|
+
url = block.get("url", "")
|
|
200
|
+
if url:
|
|
201
|
+
assets.append({
|
|
202
|
+
"url": url,
|
|
203
|
+
"download_url": self._sign_media_url(url, signed_query),
|
|
193
204
|
"alt": block.get("title", block.get("id", "")),
|
|
194
205
|
})
|
|
195
206
|
|
|
@@ -244,7 +255,7 @@ class BoostyDownloader(BaseDownloader):
|
|
|
244
255
|
continue
|
|
245
256
|
|
|
246
257
|
# Block-level элементы разрывают параграф
|
|
247
|
-
if block_type in ("image", "audio_file", "ok_video"):
|
|
258
|
+
if block_type in ("image", "audio_file", "file", "ok_video"):
|
|
248
259
|
if current_paragraph:
|
|
249
260
|
lines.append("".join(current_paragraph))
|
|
250
261
|
current_paragraph = []
|
|
@@ -293,6 +304,15 @@ class BoostyDownloader(BaseDownloader):
|
|
|
293
304
|
elif url:
|
|
294
305
|
return f"\n🎵 **{title}**: [слушать]({url})\n"
|
|
295
306
|
|
|
307
|
+
elif block_type == "file":
|
|
308
|
+
url = block.get("url", "")
|
|
309
|
+
title = block.get("title") or block.get("id") or "file"
|
|
310
|
+
local = asset_map.get(url)
|
|
311
|
+
if local:
|
|
312
|
+
return f"\n📎 [{title}](assets/{local})\n"
|
|
313
|
+
elif url:
|
|
314
|
+
return f"\n📎 [{title}]({url})\n"
|
|
315
|
+
|
|
296
316
|
elif block_type == "ok_video":
|
|
297
317
|
# Определяем ссылку на видео (приоритет: локальный файл > ok.ru/video > videoembed)
|
|
298
318
|
video_url = self._extract_ok_video_player_url(block)
|
|
@@ -322,6 +342,20 @@ class BoostyDownloader(BaseDownloader):
|
|
|
322
342
|
|
|
323
343
|
return ""
|
|
324
344
|
|
|
345
|
+
def _sign_media_url(self, url: str, signed_query: str) -> str:
|
|
346
|
+
"""Добавляет signedQuery Boosty к URL медиа, не перезаписывая существующие параметры."""
|
|
347
|
+
if not url or not signed_query:
|
|
348
|
+
return url
|
|
349
|
+
|
|
350
|
+
parsed = urlparse(url)
|
|
351
|
+
params = dict(parse_qsl(parsed.query, keep_blank_values=True))
|
|
352
|
+
query = signed_query[1:] if signed_query.startswith("?") else signed_query
|
|
353
|
+
for key, value in parse_qsl(query, keep_blank_values=True):
|
|
354
|
+
if key not in params:
|
|
355
|
+
params[key] = value
|
|
356
|
+
|
|
357
|
+
return parsed._replace(query=urlencode(params)).geturl()
|
|
358
|
+
|
|
325
359
|
def _extract_ok_video_player_url(self, block: dict) -> str:
|
|
326
360
|
"""Выбирает лучший прямой URL видео из ok_video блока."""
|
|
327
361
|
player_urls = block.get("playerUrls")
|
|
@@ -33,6 +33,7 @@ def retry_request(
|
|
|
33
33
|
base_delay: float = 1.0,
|
|
34
34
|
max_delay: float = 30.0,
|
|
35
35
|
backoff_factor: float = 2.0,
|
|
36
|
+
delays: list[float] | None = None,
|
|
36
37
|
):
|
|
37
38
|
"""
|
|
38
39
|
Выполняет функцию с retry и exponential backoff.
|
|
@@ -43,6 +44,7 @@ def retry_request(
|
|
|
43
44
|
base_delay: Начальная задержка в секундах
|
|
44
45
|
max_delay: Максимальная задержка в секундах
|
|
45
46
|
backoff_factor: Множитель для увеличения задержки
|
|
47
|
+
delays: Явная последовательность задержек между попытками
|
|
46
48
|
"""
|
|
47
49
|
last_exception = None
|
|
48
50
|
delay = base_delay
|
|
@@ -58,8 +60,11 @@ def retry_request(
|
|
|
58
60
|
raise
|
|
59
61
|
|
|
60
62
|
if attempt < max_retries - 1:
|
|
61
|
-
|
|
62
|
-
|
|
63
|
+
if delays:
|
|
64
|
+
time.sleep(delays[min(attempt, len(delays) - 1)])
|
|
65
|
+
else:
|
|
66
|
+
time.sleep(delay)
|
|
67
|
+
delay = min(delay * backoff_factor, max_delay)
|
|
63
68
|
|
|
64
69
|
if last_exception:
|
|
65
70
|
raise last_exception
|
|
@@ -318,6 +323,7 @@ class BaseDownloader(ABC):
|
|
|
318
323
|
|
|
319
324
|
def download_one(asset: dict) -> tuple[str, str | None]:
|
|
320
325
|
url = asset["url"]
|
|
326
|
+
request_url = asset.get("download_url", url)
|
|
321
327
|
force = asset.get("force", False)
|
|
322
328
|
try:
|
|
323
329
|
# Предварительная проверка (если расширение есть)
|
|
@@ -325,43 +331,60 @@ class BaseDownloader(ABC):
|
|
|
325
331
|
if ext and not force and not should_download_asset(url, None, self.source.asset_types):
|
|
326
332
|
return url, None
|
|
327
333
|
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
resp.raise_for_status()
|
|
331
|
-
return resp
|
|
334
|
+
filename: str | None = None
|
|
335
|
+
filepath: Path | None = None
|
|
332
336
|
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
337
|
+
def download_to_file():
|
|
338
|
+
nonlocal filename, filepath
|
|
339
|
+
resp = self.session.get(request_url, stream=True, timeout=self.TIMEOUT)
|
|
340
|
+
try:
|
|
341
|
+
resp.raise_for_status()
|
|
342
|
+
content_type = resp.headers.get('Content-Type', '')
|
|
336
343
|
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
344
|
+
# Полная проверка после получения Content-Type
|
|
345
|
+
if not force and not should_download_asset(url, content_type, self.source.asset_types):
|
|
346
|
+
return None
|
|
340
347
|
|
|
341
|
-
|
|
348
|
+
if filename is None or filepath is None:
|
|
349
|
+
filename_base = self._make_asset_filename(url, content_type, asset.get('alt'))
|
|
342
350
|
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
351
|
+
with used_lock:
|
|
352
|
+
filename = filename_base
|
|
353
|
+
filepath = assets_dir / filename
|
|
354
|
+
if filename in used_filenames or filepath.exists():
|
|
355
|
+
filename = self._deduplicate_filename(filename, url)
|
|
356
|
+
filepath = assets_dir / filename
|
|
349
357
|
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
358
|
+
# На всякий случай добиваемся уникальности в рамках сессии
|
|
359
|
+
while filename in used_filenames or filepath.exists():
|
|
360
|
+
filename = self._deduplicate_filename(filename, url + filename)
|
|
361
|
+
filepath = assets_dir / filename
|
|
354
362
|
|
|
355
|
-
|
|
363
|
+
used_filenames.add(filename)
|
|
356
364
|
|
|
357
|
-
if not filepath.exists():
|
|
358
365
|
with open(filepath, 'wb') as f:
|
|
359
|
-
for chunk in
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
366
|
+
for chunk in resp.iter_content(chunk_size=8192):
|
|
367
|
+
if chunk:
|
|
368
|
+
f.write(chunk)
|
|
369
|
+
return filename
|
|
370
|
+
except Exception as e:
|
|
371
|
+
if filepath and filepath.exists():
|
|
372
|
+
filepath.unlink()
|
|
373
|
+
if isinstance(e, OSError) and not isinstance(e, requests.RequestException):
|
|
374
|
+
raise requests.RequestException(str(e)) from e
|
|
375
|
+
raise
|
|
376
|
+
finally:
|
|
377
|
+
close = getattr(resp, 'close', None)
|
|
378
|
+
if callable(close):
|
|
379
|
+
close()
|
|
380
|
+
|
|
381
|
+
filename = retry_request(
|
|
382
|
+
download_to_file,
|
|
383
|
+
max_retries=10,
|
|
384
|
+
delays=[3, 5, 7, 10, 15, 15, 15, 15, 15],
|
|
385
|
+
)
|
|
386
|
+
if not filename:
|
|
387
|
+
return url, None
|
|
365
388
|
|
|
366
389
|
return url, filename
|
|
367
390
|
except requests.RequestException as e:
|
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
import tempfile
|
|
2
|
+
import unittest
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import cast
|
|
5
|
+
from unittest.mock import patch
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
from src.config import Auth, Config, Source
|
|
10
|
+
from src.database import Database
|
|
11
|
+
from src.downloader import BaseDownloader
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class _FakeResponse:
|
|
15
|
+
def __init__(self, content_type: str, body: bytes):
|
|
16
|
+
self.headers = {"Content-Type": content_type}
|
|
17
|
+
self._body = body
|
|
18
|
+
|
|
19
|
+
def raise_for_status(self):
|
|
20
|
+
return None
|
|
21
|
+
|
|
22
|
+
def iter_content(self, chunk_size: int = 8192):
|
|
23
|
+
# Yield at least one chunk to trigger file write.
|
|
24
|
+
yield self._body
|
|
25
|
+
|
|
26
|
+
def close(self):
|
|
27
|
+
return None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class _FailingStreamResponse(_FakeResponse):
|
|
31
|
+
def iter_content(self, chunk_size: int = 8192):
|
|
32
|
+
yield self._body
|
|
33
|
+
raise requests.exceptions.ChunkedEncodingError("stream interrupted")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class _HttpErrorResponse(_FakeResponse):
|
|
37
|
+
def __init__(self, status_code: int):
|
|
38
|
+
super().__init__("text/plain", b"")
|
|
39
|
+
self.status_code = status_code
|
|
40
|
+
|
|
41
|
+
def raise_for_status(self):
|
|
42
|
+
response = requests.Response()
|
|
43
|
+
response.status_code = self.status_code
|
|
44
|
+
raise requests.HTTPError(f"{self.status_code} error", response=response)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class _DummyDB:
|
|
48
|
+
pass
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class _DummyDownloader(BaseDownloader):
|
|
52
|
+
PLATFORM = "dummy"
|
|
53
|
+
MAX_WORKERS = 2
|
|
54
|
+
|
|
55
|
+
def _setup_session(self):
|
|
56
|
+
# Tests patch session.get directly.
|
|
57
|
+
return None
|
|
58
|
+
|
|
59
|
+
def fetch_posts_list(
|
|
60
|
+
self,
|
|
61
|
+
existing_ids: set[str] | None = None,
|
|
62
|
+
incremental: bool = False,
|
|
63
|
+
safety_chunks: int = 1
|
|
64
|
+
):
|
|
65
|
+
raise NotImplementedError
|
|
66
|
+
|
|
67
|
+
def fetch_post(self, post_id: str):
|
|
68
|
+
raise NotImplementedError
|
|
69
|
+
|
|
70
|
+
def _parse_post(self, raw_data: dict):
|
|
71
|
+
raise NotImplementedError
|
|
72
|
+
|
|
73
|
+
def _to_markdown(self, post, asset_map):
|
|
74
|
+
raise NotImplementedError
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class _FailingWriteFile:
|
|
78
|
+
def __init__(self, wrapped):
|
|
79
|
+
self._wrapped = wrapped
|
|
80
|
+
|
|
81
|
+
def __enter__(self):
|
|
82
|
+
self._wrapped.__enter__()
|
|
83
|
+
return self
|
|
84
|
+
|
|
85
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
86
|
+
return self._wrapped.__exit__(exc_type, exc_val, exc_tb)
|
|
87
|
+
|
|
88
|
+
def write(self, data: bytes):
|
|
89
|
+
self._wrapped.write(b"partial")
|
|
90
|
+
raise OSError("temporary disk write failure")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class AssetDedupTests(unittest.TestCase):
|
|
94
|
+
def test_download_assets_deduplicates_colliding_names(self):
|
|
95
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
96
|
+
tmp_path = Path(tmp)
|
|
97
|
+
assets_dir = tmp_path / "assets"
|
|
98
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
99
|
+
|
|
100
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
101
|
+
source = Source(platform="sponsr", author="author", download_assets=True)
|
|
102
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
103
|
+
|
|
104
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
105
|
+
# URLs intentionally do not contain extensions.
|
|
106
|
+
return _FakeResponse("image/jpeg", body=(url + "\n").encode("ascii"))
|
|
107
|
+
|
|
108
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
109
|
+
|
|
110
|
+
assets = [
|
|
111
|
+
{"url": "https://example.test/media/1", "alt": "same name"},
|
|
112
|
+
{"url": "https://example.test/media/2", "alt": "same name"},
|
|
113
|
+
]
|
|
114
|
+
|
|
115
|
+
asset_map = dl._download_assets(assets, assets_dir)
|
|
116
|
+
|
|
117
|
+
self.assertEqual(set(asset_map.keys()), {a["url"] for a in assets})
|
|
118
|
+
|
|
119
|
+
filenames = list(asset_map.values())
|
|
120
|
+
self.assertEqual(len(filenames), 2)
|
|
121
|
+
self.assertNotEqual(filenames[0], filenames[1])
|
|
122
|
+
|
|
123
|
+
for fn in filenames:
|
|
124
|
+
self.assertTrue((assets_dir / fn).exists(), msg=f"missing file: {fn}")
|
|
125
|
+
|
|
126
|
+
def test_download_assets_deduplicates_when_file_exists(self):
|
|
127
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
128
|
+
tmp_path = Path(tmp)
|
|
129
|
+
assets_dir = tmp_path / "assets"
|
|
130
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
131
|
+
|
|
132
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
133
|
+
source = Source(platform="sponsr", author="author", download_assets=True)
|
|
134
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
135
|
+
|
|
136
|
+
# Pre-create a file with the expected base name.
|
|
137
|
+
base = dl._make_asset_filename(
|
|
138
|
+
"https://example.test/media/1",
|
|
139
|
+
"image/jpeg",
|
|
140
|
+
"same name",
|
|
141
|
+
)
|
|
142
|
+
(assets_dir / base).write_bytes(b"existing")
|
|
143
|
+
|
|
144
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
145
|
+
return _FakeResponse("image/jpeg", body=b"downloaded")
|
|
146
|
+
|
|
147
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
148
|
+
|
|
149
|
+
assets = [{"url": "https://example.test/media/1", "alt": "same name"}]
|
|
150
|
+
asset_map = dl._download_assets(assets, assets_dir)
|
|
151
|
+
|
|
152
|
+
self.assertIn("https://example.test/media/1", asset_map)
|
|
153
|
+
self.assertNotEqual(asset_map["https://example.test/media/1"], base)
|
|
154
|
+
self.assertTrue((assets_dir / asset_map["https://example.test/media/1"]).exists())
|
|
155
|
+
|
|
156
|
+
def test_download_assets_keeps_unique_names_under_parallelism(self):
|
|
157
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
158
|
+
tmp_path = Path(tmp)
|
|
159
|
+
assets_dir = tmp_path / "assets"
|
|
160
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
161
|
+
|
|
162
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
163
|
+
source = Source(platform="sponsr", author="author", download_assets=True)
|
|
164
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
165
|
+
dl.MAX_WORKERS = 5
|
|
166
|
+
|
|
167
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
168
|
+
return _FakeResponse("image/jpeg", body=(url + "\n").encode("ascii"))
|
|
169
|
+
|
|
170
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
171
|
+
|
|
172
|
+
assets = [
|
|
173
|
+
{"url": f"https://example.test/media/{i}", "alt": "same name"}
|
|
174
|
+
for i in range(20)
|
|
175
|
+
]
|
|
176
|
+
|
|
177
|
+
asset_map = dl._download_assets(assets, assets_dir)
|
|
178
|
+
|
|
179
|
+
self.assertEqual(len(asset_map), 20)
|
|
180
|
+
filenames = list(asset_map.values())
|
|
181
|
+
self.assertEqual(len(set(filenames)), 20)
|
|
182
|
+
for fn in filenames:
|
|
183
|
+
self.assertTrue((assets_dir / fn).exists(), msg=f"missing file: {fn}")
|
|
184
|
+
|
|
185
|
+
def test_download_assets_uses_download_url_but_maps_original_url(self):
|
|
186
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
187
|
+
tmp_path = Path(tmp)
|
|
188
|
+
assets_dir = tmp_path / "assets"
|
|
189
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
190
|
+
|
|
191
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
192
|
+
source = Source(platform="boosty", author="author", download_assets=True)
|
|
193
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
194
|
+
|
|
195
|
+
requested_urls = []
|
|
196
|
+
|
|
197
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
198
|
+
requested_urls.append(url)
|
|
199
|
+
return _FakeResponse("audio/mpeg", body=b"audio")
|
|
200
|
+
|
|
201
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
202
|
+
|
|
203
|
+
asset_map = dl._download_assets(
|
|
204
|
+
[
|
|
205
|
+
{
|
|
206
|
+
"url": "https://cdn.boosty.to/audio/audio-id",
|
|
207
|
+
"download_url": "https://cdn.boosty.to/audio/audio-id?sign=abc",
|
|
208
|
+
"alt": "audio.mp3",
|
|
209
|
+
}
|
|
210
|
+
],
|
|
211
|
+
assets_dir,
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
self.assertEqual(requested_urls, ["https://cdn.boosty.to/audio/audio-id?sign=abc"])
|
|
215
|
+
self.assertIn("https://cdn.boosty.to/audio/audio-id", asset_map)
|
|
216
|
+
|
|
217
|
+
def test_download_assets_retries_network_errors_ten_times(self):
|
|
218
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
219
|
+
tmp_path = Path(tmp)
|
|
220
|
+
assets_dir = tmp_path / "assets"
|
|
221
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
222
|
+
|
|
223
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
224
|
+
source = Source(platform="boosty", author="author", download_assets=True)
|
|
225
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
226
|
+
|
|
227
|
+
attempts = 0
|
|
228
|
+
|
|
229
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
230
|
+
nonlocal attempts
|
|
231
|
+
attempts += 1
|
|
232
|
+
if attempts < 10:
|
|
233
|
+
raise requests.ConnectionError("temporary cdn failure")
|
|
234
|
+
return _FakeResponse("audio/mpeg", body=b"audio")
|
|
235
|
+
|
|
236
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
237
|
+
|
|
238
|
+
with patch("src.downloader.time.sleep") as sleep_mock:
|
|
239
|
+
asset_map = dl._download_assets(
|
|
240
|
+
[{"url": "https://cdn.boosty.to/audio/audio-id", "alt": "audio.mp3"}],
|
|
241
|
+
assets_dir,
|
|
242
|
+
)
|
|
243
|
+
|
|
244
|
+
self.assertEqual(attempts, 10)
|
|
245
|
+
self.assertEqual(
|
|
246
|
+
[call.args[0] for call in sleep_mock.call_args_list],
|
|
247
|
+
[3, 5, 7, 10, 15, 15, 15, 15, 15],
|
|
248
|
+
)
|
|
249
|
+
self.assertIn("https://cdn.boosty.to/audio/audio-id", asset_map)
|
|
250
|
+
|
|
251
|
+
def test_download_assets_retries_stream_errors_and_removes_partial_file(self):
|
|
252
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
253
|
+
tmp_path = Path(tmp)
|
|
254
|
+
assets_dir = tmp_path / "assets"
|
|
255
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
256
|
+
|
|
257
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
258
|
+
source = Source(platform="boosty", author="author", download_assets=True)
|
|
259
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
260
|
+
|
|
261
|
+
attempts = 0
|
|
262
|
+
|
|
263
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
264
|
+
nonlocal attempts
|
|
265
|
+
attempts += 1
|
|
266
|
+
if attempts == 1:
|
|
267
|
+
return _FailingStreamResponse("audio/mpeg", body=b"partial")
|
|
268
|
+
return _FakeResponse("audio/mpeg", body=b"complete")
|
|
269
|
+
|
|
270
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
271
|
+
|
|
272
|
+
with patch("src.downloader.time.sleep"):
|
|
273
|
+
asset_map = dl._download_assets(
|
|
274
|
+
[{"url": "https://cdn.boosty.to/audio/audio-id", "alt": "audio.mp3"}],
|
|
275
|
+
assets_dir,
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
self.assertEqual(attempts, 2)
|
|
279
|
+
filename = asset_map["https://cdn.boosty.to/audio/audio-id"]
|
|
280
|
+
self.assertEqual((assets_dir / filename).read_bytes(), b"complete")
|
|
281
|
+
self.assertFalse(any(path.read_bytes() == b"partial" for path in assets_dir.iterdir()))
|
|
282
|
+
|
|
283
|
+
def test_download_assets_does_not_retry_permanent_404(self):
|
|
284
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
285
|
+
tmp_path = Path(tmp)
|
|
286
|
+
assets_dir = tmp_path / "assets"
|
|
287
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
288
|
+
|
|
289
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
290
|
+
source = Source(platform="boosty", author="author", download_assets=True)
|
|
291
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
292
|
+
|
|
293
|
+
attempts = 0
|
|
294
|
+
|
|
295
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
296
|
+
nonlocal attempts
|
|
297
|
+
attempts += 1
|
|
298
|
+
return _HttpErrorResponse(404)
|
|
299
|
+
|
|
300
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
301
|
+
|
|
302
|
+
with patch("src.downloader.time.sleep"):
|
|
303
|
+
asset_map = dl._download_assets(
|
|
304
|
+
[{"url": "https://cdn.boosty.to/audio/missing-id", "alt": "missing.mp3"}],
|
|
305
|
+
assets_dir,
|
|
306
|
+
)
|
|
307
|
+
|
|
308
|
+
self.assertEqual(attempts, 1)
|
|
309
|
+
self.assertEqual(asset_map, {})
|
|
310
|
+
|
|
311
|
+
def test_download_assets_retries_write_errors_and_removes_partial_file(self):
|
|
312
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
313
|
+
tmp_path = Path(tmp)
|
|
314
|
+
assets_dir = tmp_path / "assets"
|
|
315
|
+
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
316
|
+
|
|
317
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
318
|
+
source = Source(platform="boosty", author="author", download_assets=True)
|
|
319
|
+
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
320
|
+
|
|
321
|
+
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
322
|
+
return _FakeResponse("audio/mpeg", body=b"complete")
|
|
323
|
+
|
|
324
|
+
dl.session.get = fake_get # type: ignore[method-assign]
|
|
325
|
+
|
|
326
|
+
real_open = open
|
|
327
|
+
open_attempts = 0
|
|
328
|
+
|
|
329
|
+
def flaky_open(path, mode="r", *args, **kwargs):
|
|
330
|
+
nonlocal open_attempts
|
|
331
|
+
if "wb" in mode:
|
|
332
|
+
open_attempts += 1
|
|
333
|
+
wrapped = real_open(path, mode, *args, **kwargs)
|
|
334
|
+
if open_attempts == 1:
|
|
335
|
+
return _FailingWriteFile(wrapped)
|
|
336
|
+
return wrapped
|
|
337
|
+
return real_open(path, mode, *args, **kwargs)
|
|
338
|
+
|
|
339
|
+
with patch("src.downloader.time.sleep"), patch("builtins.open", flaky_open):
|
|
340
|
+
asset_map = dl._download_assets(
|
|
341
|
+
[{"url": "https://cdn.boosty.to/audio/audio-id", "alt": "audio.mp3"}],
|
|
342
|
+
assets_dir,
|
|
343
|
+
)
|
|
344
|
+
|
|
345
|
+
self.assertEqual(open_attempts, 2)
|
|
346
|
+
filename = asset_map["https://cdn.boosty.to/audio/audio-id"]
|
|
347
|
+
self.assertEqual((assets_dir / filename).read_bytes(), b"complete")
|
|
348
|
+
self.assertFalse(any(path.read_bytes() == b"partial" for path in assets_dir.iterdir()))
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
if __name__ == "__main__":
|
|
352
|
+
unittest.main()
|
|
@@ -143,5 +143,75 @@ class BoostyParagraphTests(unittest.TestCase):
|
|
|
143
143
|
self.assertIn(')\n\nТекст после', md)
|
|
144
144
|
|
|
145
145
|
|
|
146
|
+
class BoostySignedMediaTests(unittest.TestCase):
|
|
147
|
+
def setUp(self):
|
|
148
|
+
self.config = Config(output_dir=Path('/tmp/test'), auth=Auth())
|
|
149
|
+
self.source = Source(platform='boosty', author='test_author')
|
|
150
|
+
self.db = MagicMock(spec=Database)
|
|
151
|
+
with patch('src.boosty.load_cookie', return_value='fake'), \
|
|
152
|
+
patch('src.boosty.load_auth_header', return_value='Bearer fake'):
|
|
153
|
+
self.downloader = BoostyDownloader(self.config, self.source, self.db)
|
|
154
|
+
|
|
155
|
+
def test_parse_post_signs_audio_asset_with_signed_query(self):
|
|
156
|
+
raw = {
|
|
157
|
+
'id': 'post-id',
|
|
158
|
+
'title': 'Post',
|
|
159
|
+
'createdAt': 1735689600,
|
|
160
|
+
'signedQuery': '?sign=abc&expires=123',
|
|
161
|
+
'data': [
|
|
162
|
+
{
|
|
163
|
+
'type': 'audio_file',
|
|
164
|
+
'url': 'https://cdn.boosty.to/audio/audio-id',
|
|
165
|
+
'title': 'Audio title.mp3',
|
|
166
|
+
}
|
|
167
|
+
],
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
post = self.downloader._parse_post(raw)
|
|
171
|
+
|
|
172
|
+
self.assertEqual(post.assets[0]['url'], 'https://cdn.boosty.to/audio/audio-id')
|
|
173
|
+
self.assertEqual(
|
|
174
|
+
post.assets[0]['download_url'],
|
|
175
|
+
'https://cdn.boosty.to/audio/audio-id?sign=abc&expires=123',
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
def test_parse_post_signs_file_asset_with_signed_query(self):
|
|
179
|
+
raw = {
|
|
180
|
+
'id': 'post-id',
|
|
181
|
+
'title': 'Post',
|
|
182
|
+
'createdAt': 1735689600,
|
|
183
|
+
'signedQuery': 'sign=abc&expires=123',
|
|
184
|
+
'data': [
|
|
185
|
+
{
|
|
186
|
+
'type': 'file',
|
|
187
|
+
'url': 'https://cdn.boosty.to/file/file-id?name=doc.pdf',
|
|
188
|
+
'title': 'doc.pdf',
|
|
189
|
+
}
|
|
190
|
+
],
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
post = self.downloader._parse_post(raw)
|
|
194
|
+
|
|
195
|
+
self.assertEqual(post.assets[0]['url'], 'https://cdn.boosty.to/file/file-id?name=doc.pdf')
|
|
196
|
+
self.assertEqual(
|
|
197
|
+
post.assets[0]['download_url'],
|
|
198
|
+
'https://cdn.boosty.to/file/file-id?name=doc.pdf&sign=abc&expires=123',
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
def test_file_block_uses_local_asset_when_downloaded(self):
|
|
202
|
+
block = {
|
|
203
|
+
'type': 'file',
|
|
204
|
+
'url': 'https://cdn.boosty.to/file/file-id',
|
|
205
|
+
'title': 'doc.pdf',
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
md = self.downloader._block_to_markdown(
|
|
209
|
+
block,
|
|
210
|
+
{'https://cdn.boosty.to/file/file-id': 'doc.pdf'},
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
self.assertIn('[doc.pdf](assets/doc.pdf)', md)
|
|
214
|
+
|
|
215
|
+
|
|
146
216
|
if __name__ == '__main__':
|
|
147
217
|
unittest.main()
|
|
@@ -61,6 +61,9 @@ class ConfigHardeningTests(unittest.TestCase):
|
|
|
61
61
|
self.assertIn('title = "Bob\'s \\"backup\\""', toml)
|
|
62
62
|
self.assertIn('baseURL = "https://example.com/a\\"b"', toml)
|
|
63
63
|
self.assertIn('locale = "ru"', toml)
|
|
64
|
+
self.assertIn('defaultContentLanguage = "ru"', toml)
|
|
65
|
+
self.assertIn('[languages.ru]', toml)
|
|
66
|
+
self.assertIn(' locale = "ru"', toml)
|
|
64
67
|
self.assertNotIn('languageCode', toml)
|
|
65
68
|
self.assertIn('default_theme = "light\\"mode"', toml)
|
|
66
69
|
finally:
|
|
@@ -1,148 +0,0 @@
|
|
|
1
|
-
import tempfile
|
|
2
|
-
import unittest
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import cast
|
|
5
|
-
|
|
6
|
-
from src.config import Auth, Config, Source
|
|
7
|
-
from src.database import Database
|
|
8
|
-
from src.downloader import BaseDownloader
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class _FakeResponse:
|
|
12
|
-
def __init__(self, content_type: str, body: bytes):
|
|
13
|
-
self.headers = {"Content-Type": content_type}
|
|
14
|
-
self._body = body
|
|
15
|
-
|
|
16
|
-
def raise_for_status(self):
|
|
17
|
-
return None
|
|
18
|
-
|
|
19
|
-
def iter_content(self, chunk_size: int = 8192):
|
|
20
|
-
# Yield at least one chunk to trigger file write.
|
|
21
|
-
yield self._body
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class _DummyDB:
|
|
25
|
-
pass
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class _DummyDownloader(BaseDownloader):
|
|
29
|
-
PLATFORM = "dummy"
|
|
30
|
-
MAX_WORKERS = 2
|
|
31
|
-
|
|
32
|
-
def _setup_session(self):
|
|
33
|
-
# Tests patch session.get directly.
|
|
34
|
-
return None
|
|
35
|
-
|
|
36
|
-
def fetch_posts_list(
|
|
37
|
-
self,
|
|
38
|
-
existing_ids: set[str] | None = None,
|
|
39
|
-
incremental: bool = False,
|
|
40
|
-
safety_chunks: int = 1
|
|
41
|
-
):
|
|
42
|
-
raise NotImplementedError
|
|
43
|
-
|
|
44
|
-
def fetch_post(self, post_id: str):
|
|
45
|
-
raise NotImplementedError
|
|
46
|
-
|
|
47
|
-
def _parse_post(self, raw_data: dict):
|
|
48
|
-
raise NotImplementedError
|
|
49
|
-
|
|
50
|
-
def _to_markdown(self, post, asset_map):
|
|
51
|
-
raise NotImplementedError
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
class AssetDedupTests(unittest.TestCase):
|
|
55
|
-
def test_download_assets_deduplicates_colliding_names(self):
|
|
56
|
-
with tempfile.TemporaryDirectory() as tmp:
|
|
57
|
-
tmp_path = Path(tmp)
|
|
58
|
-
assets_dir = tmp_path / "assets"
|
|
59
|
-
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
60
|
-
|
|
61
|
-
config = Config(output_dir=tmp_path, auth=Auth())
|
|
62
|
-
source = Source(platform="sponsr", author="author", download_assets=True)
|
|
63
|
-
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
64
|
-
|
|
65
|
-
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
66
|
-
# URLs intentionally do not contain extensions.
|
|
67
|
-
return _FakeResponse("image/jpeg", body=(url + "\n").encode("ascii"))
|
|
68
|
-
|
|
69
|
-
dl.session.get = fake_get # type: ignore[method-assign]
|
|
70
|
-
|
|
71
|
-
assets = [
|
|
72
|
-
{"url": "https://example.test/media/1", "alt": "same name"},
|
|
73
|
-
{"url": "https://example.test/media/2", "alt": "same name"},
|
|
74
|
-
]
|
|
75
|
-
|
|
76
|
-
asset_map = dl._download_assets(assets, assets_dir)
|
|
77
|
-
|
|
78
|
-
self.assertEqual(set(asset_map.keys()), {a["url"] for a in assets})
|
|
79
|
-
|
|
80
|
-
filenames = list(asset_map.values())
|
|
81
|
-
self.assertEqual(len(filenames), 2)
|
|
82
|
-
self.assertNotEqual(filenames[0], filenames[1])
|
|
83
|
-
|
|
84
|
-
for fn in filenames:
|
|
85
|
-
self.assertTrue((assets_dir / fn).exists(), msg=f"missing file: {fn}")
|
|
86
|
-
|
|
87
|
-
def test_download_assets_deduplicates_when_file_exists(self):
|
|
88
|
-
with tempfile.TemporaryDirectory() as tmp:
|
|
89
|
-
tmp_path = Path(tmp)
|
|
90
|
-
assets_dir = tmp_path / "assets"
|
|
91
|
-
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
92
|
-
|
|
93
|
-
config = Config(output_dir=tmp_path, auth=Auth())
|
|
94
|
-
source = Source(platform="sponsr", author="author", download_assets=True)
|
|
95
|
-
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
96
|
-
|
|
97
|
-
# Pre-create a file with the expected base name.
|
|
98
|
-
base = dl._make_asset_filename(
|
|
99
|
-
"https://example.test/media/1",
|
|
100
|
-
"image/jpeg",
|
|
101
|
-
"same name",
|
|
102
|
-
)
|
|
103
|
-
(assets_dir / base).write_bytes(b"existing")
|
|
104
|
-
|
|
105
|
-
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
106
|
-
return _FakeResponse("image/jpeg", body=b"downloaded")
|
|
107
|
-
|
|
108
|
-
dl.session.get = fake_get # type: ignore[method-assign]
|
|
109
|
-
|
|
110
|
-
assets = [{"url": "https://example.test/media/1", "alt": "same name"}]
|
|
111
|
-
asset_map = dl._download_assets(assets, assets_dir)
|
|
112
|
-
|
|
113
|
-
self.assertIn("https://example.test/media/1", asset_map)
|
|
114
|
-
self.assertNotEqual(asset_map["https://example.test/media/1"], base)
|
|
115
|
-
self.assertTrue((assets_dir / asset_map["https://example.test/media/1"]).exists())
|
|
116
|
-
|
|
117
|
-
def test_download_assets_keeps_unique_names_under_parallelism(self):
|
|
118
|
-
with tempfile.TemporaryDirectory() as tmp:
|
|
119
|
-
tmp_path = Path(tmp)
|
|
120
|
-
assets_dir = tmp_path / "assets"
|
|
121
|
-
assets_dir.mkdir(parents=True, exist_ok=True)
|
|
122
|
-
|
|
123
|
-
config = Config(output_dir=tmp_path, auth=Auth())
|
|
124
|
-
source = Source(platform="sponsr", author="author", download_assets=True)
|
|
125
|
-
dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
|
|
126
|
-
dl.MAX_WORKERS = 5
|
|
127
|
-
|
|
128
|
-
def fake_get(url: str, stream: bool = True, timeout=None):
|
|
129
|
-
return _FakeResponse("image/jpeg", body=(url + "\n").encode("ascii"))
|
|
130
|
-
|
|
131
|
-
dl.session.get = fake_get # type: ignore[method-assign]
|
|
132
|
-
|
|
133
|
-
assets = [
|
|
134
|
-
{"url": f"https://example.test/media/{i}", "alt": "same name"}
|
|
135
|
-
for i in range(20)
|
|
136
|
-
]
|
|
137
|
-
|
|
138
|
-
asset_map = dl._download_assets(assets, assets_dir)
|
|
139
|
-
|
|
140
|
-
self.assertEqual(len(asset_map), 20)
|
|
141
|
-
filenames = list(asset_map.values())
|
|
142
|
-
self.assertEqual(len(set(filenames)), 20)
|
|
143
|
-
for fn in filenames:
|
|
144
|
-
self.assertTrue((assets_dir / fn).exists(), msg=f"missing file: {fn}")
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
if __name__ == "__main__":
|
|
148
|
-
unittest.main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{article_backup-0.3.12 → article_backup-0.3.14}/article_backup.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|