article-backup 0.3.12__tar.gz → 0.3.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {article_backup-0.3.12 → article_backup-0.3.14}/PKG-INFO +1 -1
  2. {article_backup-0.3.12 → article_backup-0.3.14}/article_backup.egg-info/PKG-INFO +1 -1
  3. {article_backup-0.3.12 → article_backup-0.3.14}/backup.py +4 -0
  4. {article_backup-0.3.12 → article_backup-0.3.14}/pyproject.toml +1 -1
  5. {article_backup-0.3.12 → article_backup-0.3.14}/src/boosty.py +37 -3
  6. {article_backup-0.3.12 → article_backup-0.3.14}/src/downloader.py +54 -31
  7. article_backup-0.3.14/tests/test_asset_dedup.py +352 -0
  8. {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_boosty_normalize.py +70 -0
  9. {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_config_hardening.py +3 -0
  10. article_backup-0.3.12/tests/test_asset_dedup.py +0 -148
  11. {article_backup-0.3.12 → article_backup-0.3.14}/LICENSE +0 -0
  12. {article_backup-0.3.12 → article_backup-0.3.14}/README.md +0 -0
  13. {article_backup-0.3.12 → article_backup-0.3.14}/article_backup.egg-info/SOURCES.txt +0 -0
  14. {article_backup-0.3.12 → article_backup-0.3.14}/article_backup.egg-info/dependency_links.txt +0 -0
  15. {article_backup-0.3.12 → article_backup-0.3.14}/article_backup.egg-info/entry_points.txt +0 -0
  16. {article_backup-0.3.12 → article_backup-0.3.14}/article_backup.egg-info/requires.txt +0 -0
  17. {article_backup-0.3.12 → article_backup-0.3.14}/article_backup.egg-info/top_level.txt +0 -0
  18. {article_backup-0.3.12 → article_backup-0.3.14}/setup.cfg +0 -0
  19. {article_backup-0.3.12 → article_backup-0.3.14}/src/__init__.py +0 -0
  20. {article_backup-0.3.12 → article_backup-0.3.14}/src/config.py +0 -0
  21. {article_backup-0.3.12 → article_backup-0.3.14}/src/database.py +0 -0
  22. {article_backup-0.3.12 → article_backup-0.3.14}/src/sponsr.py +0 -0
  23. {article_backup-0.3.12 → article_backup-0.3.14}/src/utils.py +0 -0
  24. {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_boosty_empty_link.py +0 -0
  25. {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_incremental_sync.py +0 -0
  26. {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_slug_safety.py +0 -0
  27. {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_sponsr_formatting_fix.py +0 -0
  28. {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_sponsr_normalize.py +0 -0
  29. {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_sponsr_tags.py +0 -0
  30. {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_sync_policy.py +0 -0
  31. {article_backup-0.3.12 → article_backup-0.3.14}/tests/test_video_embed.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: article-backup
3
- Version: 0.3.12
3
+ Version: 0.3.14
4
4
  Summary: Локальный бэкап статей с Sponsr.ru и Boosty.to в Markdown с Hugo-интеграцией
5
5
  Author-email: Eugene Chaykin <eugene@chayk.in>
6
6
  License: Apache-2.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: article-backup
3
- Version: 0.3.12
3
+ Version: 0.3.14
4
4
  Summary: Локальный бэкап статей с Sponsr.ru и Boosty.to в Markdown с Hugo-интеграцией
5
5
  Author-email: Eugene Chaykin <eugene@chayk.in>
6
6
  License: Apache-2.0
@@ -27,9 +27,13 @@ def generate_hugo_config(config: Config):
27
27
 
28
28
  content = f'''baseURL = {toml_str(config.hugo.base_url)}
29
29
  locale = {toml_str(config.hugo.language_code)}
30
+ defaultContentLanguage = {toml_str(config.hugo.language_code)}
30
31
  title = {toml_str(config.hugo.title)}
31
32
  relativeURLs = true
32
33
 
34
+ [languages.{config.hugo.language_code}]
35
+ locale = {toml_str(config.hugo.language_code)}
36
+
33
37
  [params]
34
38
  default_theme = {toml_str(config.hugo.default_theme)}
35
39
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "article-backup"
3
- version = "0.3.12"
3
+ version = "0.3.14"
4
4
  description = "Локальный бэкап статей с Sponsr.ru и Boosty.to в Markdown с Hugo-интеграцией"
5
5
  readme = "README.md"
6
6
  license = {text = "Apache-2.0"}
@@ -3,6 +3,7 @@
3
3
 
4
4
  import json
5
5
  from datetime import datetime, timezone
6
+ from urllib.parse import parse_qsl, urlencode, urlparse
6
7
 
7
8
  import requests
8
9
 
@@ -158,7 +159,7 @@ class BoostyDownloader(BaseDownloader):
158
159
  content_blocks = raw_data.get("data", [])
159
160
 
160
161
  # Извлекаем assets
161
- assets = self._extract_assets(content_blocks)
162
+ assets = self._extract_assets(content_blocks, raw_data.get("signedQuery", ""))
162
163
 
163
164
  return Post(
164
165
  post_id=post_id,
@@ -170,7 +171,7 @@ class BoostyDownloader(BaseDownloader):
170
171
  assets=assets,
171
172
  )
172
173
 
173
- def _extract_assets(self, blocks: list[dict]) -> list[dict]:
174
+ def _extract_assets(self, blocks: list[dict], signed_query: str = "") -> list[dict]:
174
175
  """Извлекает URL медиафайлов из блоков контента."""
175
176
  assets = []
176
177
 
@@ -190,6 +191,16 @@ class BoostyDownloader(BaseDownloader):
190
191
  if url:
191
192
  assets.append({
192
193
  "url": url,
194
+ "download_url": self._sign_media_url(url, signed_query),
195
+ "alt": block.get("title", block.get("id", "")),
196
+ })
197
+
198
+ elif block_type == "file":
199
+ url = block.get("url", "")
200
+ if url:
201
+ assets.append({
202
+ "url": url,
203
+ "download_url": self._sign_media_url(url, signed_query),
193
204
  "alt": block.get("title", block.get("id", "")),
194
205
  })
195
206
 
@@ -244,7 +255,7 @@ class BoostyDownloader(BaseDownloader):
244
255
  continue
245
256
 
246
257
  # Block-level элементы разрывают параграф
247
- if block_type in ("image", "audio_file", "ok_video"):
258
+ if block_type in ("image", "audio_file", "file", "ok_video"):
248
259
  if current_paragraph:
249
260
  lines.append("".join(current_paragraph))
250
261
  current_paragraph = []
@@ -293,6 +304,15 @@ class BoostyDownloader(BaseDownloader):
293
304
  elif url:
294
305
  return f"\n🎵 **{title}**: [слушать]({url})\n"
295
306
 
307
+ elif block_type == "file":
308
+ url = block.get("url", "")
309
+ title = block.get("title") or block.get("id") or "file"
310
+ local = asset_map.get(url)
311
+ if local:
312
+ return f"\n📎 [{title}](assets/{local})\n"
313
+ elif url:
314
+ return f"\n📎 [{title}]({url})\n"
315
+
296
316
  elif block_type == "ok_video":
297
317
  # Определяем ссылку на видео (приоритет: локальный файл > ok.ru/video > videoembed)
298
318
  video_url = self._extract_ok_video_player_url(block)
@@ -322,6 +342,20 @@ class BoostyDownloader(BaseDownloader):
322
342
 
323
343
  return ""
324
344
 
345
+ def _sign_media_url(self, url: str, signed_query: str) -> str:
346
+ """Добавляет signedQuery Boosty к URL медиа, не перезаписывая существующие параметры."""
347
+ if not url or not signed_query:
348
+ return url
349
+
350
+ parsed = urlparse(url)
351
+ params = dict(parse_qsl(parsed.query, keep_blank_values=True))
352
+ query = signed_query[1:] if signed_query.startswith("?") else signed_query
353
+ for key, value in parse_qsl(query, keep_blank_values=True):
354
+ if key not in params:
355
+ params[key] = value
356
+
357
+ return parsed._replace(query=urlencode(params)).geturl()
358
+
325
359
  def _extract_ok_video_player_url(self, block: dict) -> str:
326
360
  """Выбирает лучший прямой URL видео из ok_video блока."""
327
361
  player_urls = block.get("playerUrls")
@@ -33,6 +33,7 @@ def retry_request(
33
33
  base_delay: float = 1.0,
34
34
  max_delay: float = 30.0,
35
35
  backoff_factor: float = 2.0,
36
+ delays: list[float] | None = None,
36
37
  ):
37
38
  """
38
39
  Выполняет функцию с retry и exponential backoff.
@@ -43,6 +44,7 @@ def retry_request(
43
44
  base_delay: Начальная задержка в секундах
44
45
  max_delay: Максимальная задержка в секундах
45
46
  backoff_factor: Множитель для увеличения задержки
47
+ delays: Явная последовательность задержек между попытками
46
48
  """
47
49
  last_exception = None
48
50
  delay = base_delay
@@ -58,8 +60,11 @@ def retry_request(
58
60
  raise
59
61
 
60
62
  if attempt < max_retries - 1:
61
- time.sleep(delay)
62
- delay = min(delay * backoff_factor, max_delay)
63
+ if delays:
64
+ time.sleep(delays[min(attempt, len(delays) - 1)])
65
+ else:
66
+ time.sleep(delay)
67
+ delay = min(delay * backoff_factor, max_delay)
63
68
 
64
69
  if last_exception:
65
70
  raise last_exception
@@ -318,6 +323,7 @@ class BaseDownloader(ABC):
318
323
 
319
324
  def download_one(asset: dict) -> tuple[str, str | None]:
320
325
  url = asset["url"]
326
+ request_url = asset.get("download_url", url)
321
327
  force = asset.get("force", False)
322
328
  try:
323
329
  # Предварительная проверка (если расширение есть)
@@ -325,43 +331,60 @@ class BaseDownloader(ABC):
325
331
  if ext and not force and not should_download_asset(url, None, self.source.asset_types):
326
332
  return url, None
327
333
 
328
- def do_request():
329
- resp = self.session.get(url, stream=True, timeout=self.TIMEOUT)
330
- resp.raise_for_status()
331
- return resp
334
+ filename: str | None = None
335
+ filepath: Path | None = None
332
336
 
333
- response = retry_request(do_request, max_retries=3)
334
- try:
335
- content_type = response.headers.get('Content-Type', '')
337
+ def download_to_file():
338
+ nonlocal filename, filepath
339
+ resp = self.session.get(request_url, stream=True, timeout=self.TIMEOUT)
340
+ try:
341
+ resp.raise_for_status()
342
+ content_type = resp.headers.get('Content-Type', '')
336
343
 
337
- # Полная проверка после получения Content-Type
338
- if not force and not should_download_asset(url, content_type, self.source.asset_types):
339
- return url, None
344
+ # Полная проверка после получения Content-Type
345
+ if not force and not should_download_asset(url, content_type, self.source.asset_types):
346
+ return None
340
347
 
341
- filename_base = self._make_asset_filename(url, content_type, asset.get('alt'))
348
+ if filename is None or filepath is None:
349
+ filename_base = self._make_asset_filename(url, content_type, asset.get('alt'))
342
350
 
343
- with used_lock:
344
- filename = filename_base
345
- filepath = assets_dir / filename
346
- if filename in used_filenames or filepath.exists():
347
- filename = self._deduplicate_filename(filename, url)
348
- filepath = assets_dir / filename
351
+ with used_lock:
352
+ filename = filename_base
353
+ filepath = assets_dir / filename
354
+ if filename in used_filenames or filepath.exists():
355
+ filename = self._deduplicate_filename(filename, url)
356
+ filepath = assets_dir / filename
349
357
 
350
- # На всякий случай добиваемся уникальности в рамках сессии
351
- while filename in used_filenames or filepath.exists():
352
- filename = self._deduplicate_filename(filename, url + filename)
353
- filepath = assets_dir / filename
358
+ # На всякий случай добиваемся уникальности в рамках сессии
359
+ while filename in used_filenames or filepath.exists():
360
+ filename = self._deduplicate_filename(filename, url + filename)
361
+ filepath = assets_dir / filename
354
362
 
355
- used_filenames.add(filename)
363
+ used_filenames.add(filename)
356
364
 
357
- if not filepath.exists():
358
365
  with open(filepath, 'wb') as f:
359
- for chunk in response.iter_content(chunk_size=8192):
360
- f.write(chunk)
361
- finally:
362
- close = getattr(response, 'close', None)
363
- if callable(close):
364
- close()
366
+ for chunk in resp.iter_content(chunk_size=8192):
367
+ if chunk:
368
+ f.write(chunk)
369
+ return filename
370
+ except Exception as e:
371
+ if filepath and filepath.exists():
372
+ filepath.unlink()
373
+ if isinstance(e, OSError) and not isinstance(e, requests.RequestException):
374
+ raise requests.RequestException(str(e)) from e
375
+ raise
376
+ finally:
377
+ close = getattr(resp, 'close', None)
378
+ if callable(close):
379
+ close()
380
+
381
+ filename = retry_request(
382
+ download_to_file,
383
+ max_retries=10,
384
+ delays=[3, 5, 7, 10, 15, 15, 15, 15, 15],
385
+ )
386
+ if not filename:
387
+ return url, None
365
388
 
366
389
  return url, filename
367
390
  except requests.RequestException as e:
@@ -0,0 +1,352 @@
1
+ import tempfile
2
+ import unittest
3
+ from pathlib import Path
4
+ from typing import cast
5
+ from unittest.mock import patch
6
+
7
+ import requests
8
+
9
+ from src.config import Auth, Config, Source
10
+ from src.database import Database
11
+ from src.downloader import BaseDownloader
12
+
13
+
14
+ class _FakeResponse:
15
+ def __init__(self, content_type: str, body: bytes):
16
+ self.headers = {"Content-Type": content_type}
17
+ self._body = body
18
+
19
+ def raise_for_status(self):
20
+ return None
21
+
22
+ def iter_content(self, chunk_size: int = 8192):
23
+ # Yield at least one chunk to trigger file write.
24
+ yield self._body
25
+
26
+ def close(self):
27
+ return None
28
+
29
+
30
class _FailingStreamResponse(_FakeResponse):
    """Fake response whose body stream fails after delivering one chunk."""

    def iter_content(self, chunk_size: int = 8192):
        # Hand out the body once, then raise the error a broken chunked transfer produces.
        yield self._body
        raise requests.exceptions.ChunkedEncodingError("stream interrupted")
34
+
35
+
36
class _HttpErrorResponse(_FakeResponse):
    """Fake response whose ``raise_for_status`` always raises ``requests.HTTPError``."""

    def __init__(self, status_code: int):
        # Content type and body are fixed placeholders; only the status code varies.
        super().__init__("text/plain", b"")
        self.status_code = status_code

    def raise_for_status(self):
        # Attach a ``requests.Response`` carrying the status code to the raised error.
        response = requests.Response()
        response.status_code = self.status_code
        raise requests.HTTPError(f"{self.status_code} error", response=response)
45
+
46
+
47
class _DummyDB:
    """Empty placeholder passed (via ``cast``) where the tests need a ``Database``."""

    pass
49
+
50
+
51
class _DummyDownloader(BaseDownloader):
    """Concrete ``BaseDownloader`` for tests; its platform hooks all raise ``NotImplementedError``."""

    PLATFORM = "dummy"
    MAX_WORKERS = 2

    def _setup_session(self):
        # Tests patch session.get directly.
        return None

    def fetch_posts_list(
        self,
        existing_ids: set[str] | None = None,
        incremental: bool = False,
        safety_chunks: int = 1
    ):
        # Not exercised by the asset tests.
        raise NotImplementedError

    def fetch_post(self, post_id: str):
        # Not exercised by the asset tests.
        raise NotImplementedError

    def _parse_post(self, raw_data: dict):
        # Not exercised by the asset tests.
        raise NotImplementedError

    def _to_markdown(self, post, asset_map):
        # Not exercised by the asset tests.
        raise NotImplementedError
75
+
76
+
77
+ class _FailingWriteFile:
78
+ def __init__(self, wrapped):
79
+ self._wrapped = wrapped
80
+
81
+ def __enter__(self):
82
+ self._wrapped.__enter__()
83
+ return self
84
+
85
+ def __exit__(self, exc_type, exc_val, exc_tb):
86
+ return self._wrapped.__exit__(exc_type, exc_val, exc_tb)
87
+
88
+ def write(self, data: bytes):
89
+ self._wrapped.write(b"partial")
90
+ raise OSError("temporary disk write failure")
91
+
92
+
93
class AssetDedupTests(unittest.TestCase):
    """Tests for ``_download_assets``: file naming, download URLs and retries.

    Every test gets a fresh temporary output directory from ``setUp`` and
    replaces ``session.get`` with a local fake, so no network is involved.
    """

    def setUp(self):
        # One throw-away output tree per test; addCleanup removes it even if the test fails.
        tmp = tempfile.TemporaryDirectory()
        self.addCleanup(tmp.cleanup)
        self.tmp_path = Path(tmp.name)
        self.assets_dir = self.tmp_path / "assets"
        self.assets_dir.mkdir(parents=True, exist_ok=True)

    def _make_downloader(self, platform: str):
        """Return a ``_DummyDownloader`` for ``platform`` writing under the temp directory."""
        config = Config(output_dir=self.tmp_path, auth=Auth())
        source = Source(platform=platform, author="author", download_assets=True)
        return _DummyDownloader(config, source, cast(Database, _DummyDB()))

    def test_download_assets_deduplicates_colliding_names(self):
        """Two assets with the same alt text end up in two different files."""
        dl = self._make_downloader("sponsr")

        def fake_get(url: str, stream: bool = True, timeout=None):
            # URLs intentionally do not contain extensions.
            return _FakeResponse("image/jpeg", body=(url + "\n").encode("ascii"))

        dl.session.get = fake_get  # type: ignore[method-assign]

        assets = [
            {"url": "https://example.test/media/1", "alt": "same name"},
            {"url": "https://example.test/media/2", "alt": "same name"},
        ]

        asset_map = dl._download_assets(assets, self.assets_dir)

        self.assertEqual(set(asset_map.keys()), {a["url"] for a in assets})

        filenames = list(asset_map.values())
        self.assertEqual(len(filenames), 2)
        self.assertNotEqual(filenames[0], filenames[1])

        for fn in filenames:
            self.assertTrue((self.assets_dir / fn).exists(), msg=f"missing file: {fn}")

    def test_download_assets_deduplicates_when_file_exists(self):
        """A file already on disk under the base name is not reused."""
        dl = self._make_downloader("sponsr")

        # Pre-create a file with the expected base name.
        base = dl._make_asset_filename(
            "https://example.test/media/1",
            "image/jpeg",
            "same name",
        )
        (self.assets_dir / base).write_bytes(b"existing")

        def fake_get(url: str, stream: bool = True, timeout=None):
            return _FakeResponse("image/jpeg", body=b"downloaded")

        dl.session.get = fake_get  # type: ignore[method-assign]

        assets = [{"url": "https://example.test/media/1", "alt": "same name"}]
        asset_map = dl._download_assets(assets, self.assets_dir)

        self.assertIn("https://example.test/media/1", asset_map)
        self.assertNotEqual(asset_map["https://example.test/media/1"], base)
        self.assertTrue((self.assets_dir / asset_map["https://example.test/media/1"]).exists())

    def test_download_assets_keeps_unique_names_under_parallelism(self):
        """Twenty same-named assets downloaded with five workers stay distinct."""
        dl = self._make_downloader("sponsr")
        dl.MAX_WORKERS = 5

        def fake_get(url: str, stream: bool = True, timeout=None):
            return _FakeResponse("image/jpeg", body=(url + "\n").encode("ascii"))

        dl.session.get = fake_get  # type: ignore[method-assign]

        assets = [
            {"url": f"https://example.test/media/{i}", "alt": "same name"}
            for i in range(20)
        ]

        asset_map = dl._download_assets(assets, self.assets_dir)

        self.assertEqual(len(asset_map), 20)
        filenames = list(asset_map.values())
        self.assertEqual(len(set(filenames)), 20)
        for fn in filenames:
            self.assertTrue((self.assets_dir / fn).exists(), msg=f"missing file: {fn}")

    def test_download_assets_uses_download_url_but_maps_original_url(self):
        """The request goes to ``download_url``; the result is keyed by ``url``."""
        dl = self._make_downloader("boosty")

        requested_urls = []

        def fake_get(url: str, stream: bool = True, timeout=None):
            requested_urls.append(url)
            return _FakeResponse("audio/mpeg", body=b"audio")

        dl.session.get = fake_get  # type: ignore[method-assign]

        asset_map = dl._download_assets(
            [
                {
                    "url": "https://cdn.boosty.to/audio/audio-id",
                    "download_url": "https://cdn.boosty.to/audio/audio-id?sign=abc",
                    "alt": "audio.mp3",
                }
            ],
            self.assets_dir,
        )

        self.assertEqual(requested_urls, ["https://cdn.boosty.to/audio/audio-id?sign=abc"])
        self.assertIn("https://cdn.boosty.to/audio/audio-id", asset_map)

    def test_download_assets_retries_network_errors_ten_times(self):
        """Nine connection errors are retried with the explicit delay schedule."""
        dl = self._make_downloader("boosty")

        attempts = 0

        def fake_get(url: str, stream: bool = True, timeout=None):
            nonlocal attempts
            attempts += 1
            if attempts < 10:
                raise requests.ConnectionError("temporary cdn failure")
            return _FakeResponse("audio/mpeg", body=b"audio")

        dl.session.get = fake_get  # type: ignore[method-assign]

        with patch("src.downloader.time.sleep") as sleep_mock:
            asset_map = dl._download_assets(
                [{"url": "https://cdn.boosty.to/audio/audio-id", "alt": "audio.mp3"}],
                self.assets_dir,
            )

        self.assertEqual(attempts, 10)
        self.assertEqual(
            [call.args[0] for call in sleep_mock.call_args_list],
            [3, 5, 7, 10, 15, 15, 15, 15, 15],
        )
        self.assertIn("https://cdn.boosty.to/audio/audio-id", asset_map)

    def test_download_assets_retries_stream_errors_and_removes_partial_file(self):
        """A stream that breaks mid-body is retried and leaves no partial file."""
        dl = self._make_downloader("boosty")

        attempts = 0

        def fake_get(url: str, stream: bool = True, timeout=None):
            nonlocal attempts
            attempts += 1
            if attempts == 1:
                return _FailingStreamResponse("audio/mpeg", body=b"partial")
            return _FakeResponse("audio/mpeg", body=b"complete")

        dl.session.get = fake_get  # type: ignore[method-assign]

        with patch("src.downloader.time.sleep"):
            asset_map = dl._download_assets(
                [{"url": "https://cdn.boosty.to/audio/audio-id", "alt": "audio.mp3"}],
                self.assets_dir,
            )

        self.assertEqual(attempts, 2)
        filename = asset_map["https://cdn.boosty.to/audio/audio-id"]
        self.assertEqual((self.assets_dir / filename).read_bytes(), b"complete")
        self.assertFalse(any(path.read_bytes() == b"partial" for path in self.assets_dir.iterdir()))

    def test_download_assets_does_not_retry_permanent_404(self):
        """A 404 response is requested once and produces no asset entry."""
        dl = self._make_downloader("boosty")

        attempts = 0

        def fake_get(url: str, stream: bool = True, timeout=None):
            nonlocal attempts
            attempts += 1
            return _HttpErrorResponse(404)

        dl.session.get = fake_get  # type: ignore[method-assign]

        with patch("src.downloader.time.sleep"):
            asset_map = dl._download_assets(
                [{"url": "https://cdn.boosty.to/audio/missing-id", "alt": "missing.mp3"}],
                self.assets_dir,
            )

        self.assertEqual(attempts, 1)
        self.assertEqual(asset_map, {})

    def test_download_assets_retries_write_errors_and_removes_partial_file(self):
        """A failing disk write is retried and leaves no partial file."""
        dl = self._make_downloader("boosty")

        def fake_get(url: str, stream: bool = True, timeout=None):
            return _FakeResponse("audio/mpeg", body=b"complete")

        dl.session.get = fake_get  # type: ignore[method-assign]

        real_open = open
        open_attempts = 0

        def flaky_open(path, mode="r", *args, **kwargs):
            # Only the first binary-write open gets the failing wrapper.
            nonlocal open_attempts
            if "wb" in mode:
                open_attempts += 1
                wrapped = real_open(path, mode, *args, **kwargs)
                if open_attempts == 1:
                    return _FailingWriteFile(wrapped)
                return wrapped
            return real_open(path, mode, *args, **kwargs)

        with patch("src.downloader.time.sleep"), patch("builtins.open", flaky_open):
            asset_map = dl._download_assets(
                [{"url": "https://cdn.boosty.to/audio/audio-id", "alt": "audio.mp3"}],
                self.assets_dir,
            )

        self.assertEqual(open_attempts, 2)
        filename = asset_map["https://cdn.boosty.to/audio/audio-id"]
        self.assertEqual((self.assets_dir / filename).read_bytes(), b"complete")
        self.assertFalse(any(path.read_bytes() == b"partial" for path in self.assets_dir.iterdir()))
349
+
350
+
351
+ if __name__ == "__main__":
352
+ unittest.main()
@@ -143,5 +143,75 @@ class BoostyParagraphTests(unittest.TestCase):
143
143
  self.assertIn(')\n\nТекст после', md)
144
144
 
145
145
 
146
class BoostySignedMediaTests(unittest.TestCase):
    """Checks signedQuery handling for Boosty assets and rendering of ``file`` blocks."""

    def setUp(self):
        self.config = Config(output_dir=Path('/tmp/test'), auth=Auth())
        self.source = Source(platform='boosty', author='test_author')
        self.db = MagicMock(spec=Database)
        # Credential loaders are patched while the downloader is constructed.
        cookie_patch = patch('src.boosty.load_cookie', return_value='fake')
        header_patch = patch('src.boosty.load_auth_header', return_value='Bearer fake')
        with cookie_patch, header_patch:
            self.downloader = BoostyDownloader(self.config, self.source, self.db)

    def _first_asset(self, signed_query, block):
        """Parse a one-block post carrying ``signed_query`` and return its first asset."""
        raw = {
            'id': 'post-id',
            'title': 'Post',
            'createdAt': 1735689600,
            'signedQuery': signed_query,
            'data': [block],
        }
        return self.downloader._parse_post(raw).assets[0]

    def test_parse_post_signs_audio_asset_with_signed_query(self):
        audio_block = {
            'type': 'audio_file',
            'url': 'https://cdn.boosty.to/audio/audio-id',
            'title': 'Audio title.mp3',
        }

        asset = self._first_asset('?sign=abc&expires=123', audio_block)

        self.assertEqual(asset['url'], 'https://cdn.boosty.to/audio/audio-id')
        self.assertEqual(
            asset['download_url'],
            'https://cdn.boosty.to/audio/audio-id?sign=abc&expires=123',
        )

    def test_parse_post_signs_file_asset_with_signed_query(self):
        file_block = {
            'type': 'file',
            'url': 'https://cdn.boosty.to/file/file-id?name=doc.pdf',
            'title': 'doc.pdf',
        }

        asset = self._first_asset('sign=abc&expires=123', file_block)

        self.assertEqual(asset['url'], 'https://cdn.boosty.to/file/file-id?name=doc.pdf')
        self.assertEqual(
            asset['download_url'],
            'https://cdn.boosty.to/file/file-id?name=doc.pdf&sign=abc&expires=123',
        )

    def test_file_block_uses_local_asset_when_downloaded(self):
        file_url = 'https://cdn.boosty.to/file/file-id'
        block = {'type': 'file', 'url': file_url, 'title': 'doc.pdf'}

        md = self.downloader._block_to_markdown(block, {file_url: 'doc.pdf'})

        self.assertIn('[doc.pdf](assets/doc.pdf)', md)
214
+
215
+
146
216
  if __name__ == '__main__':
147
217
  unittest.main()
@@ -61,6 +61,9 @@ class ConfigHardeningTests(unittest.TestCase):
61
61
  self.assertIn('title = "Bob\'s \\"backup\\""', toml)
62
62
  self.assertIn('baseURL = "https://example.com/a\\"b"', toml)
63
63
  self.assertIn('locale = "ru"', toml)
64
+ self.assertIn('defaultContentLanguage = "ru"', toml)
65
+ self.assertIn('[languages.ru]', toml)
66
+ self.assertIn(' locale = "ru"', toml)
64
67
  self.assertNotIn('languageCode', toml)
65
68
  self.assertIn('default_theme = "light\\"mode"', toml)
66
69
  finally:
@@ -1,148 +0,0 @@
1
- import tempfile
2
- import unittest
3
- from pathlib import Path
4
- from typing import cast
5
-
6
- from src.config import Auth, Config, Source
7
- from src.database import Database
8
- from src.downloader import BaseDownloader
9
-
10
-
11
- class _FakeResponse:
12
- def __init__(self, content_type: str, body: bytes):
13
- self.headers = {"Content-Type": content_type}
14
- self._body = body
15
-
16
- def raise_for_status(self):
17
- return None
18
-
19
- def iter_content(self, chunk_size: int = 8192):
20
- # Yield at least one chunk to trigger file write.
21
- yield self._body
22
-
23
-
24
- class _DummyDB:
25
- pass
26
-
27
-
28
- class _DummyDownloader(BaseDownloader):
29
- PLATFORM = "dummy"
30
- MAX_WORKERS = 2
31
-
32
- def _setup_session(self):
33
- # Tests patch session.get directly.
34
- return None
35
-
36
- def fetch_posts_list(
37
- self,
38
- existing_ids: set[str] | None = None,
39
- incremental: bool = False,
40
- safety_chunks: int = 1
41
- ):
42
- raise NotImplementedError
43
-
44
- def fetch_post(self, post_id: str):
45
- raise NotImplementedError
46
-
47
- def _parse_post(self, raw_data: dict):
48
- raise NotImplementedError
49
-
50
- def _to_markdown(self, post, asset_map):
51
- raise NotImplementedError
52
-
53
-
54
- class AssetDedupTests(unittest.TestCase):
55
- def test_download_assets_deduplicates_colliding_names(self):
56
- with tempfile.TemporaryDirectory() as tmp:
57
- tmp_path = Path(tmp)
58
- assets_dir = tmp_path / "assets"
59
- assets_dir.mkdir(parents=True, exist_ok=True)
60
-
61
- config = Config(output_dir=tmp_path, auth=Auth())
62
- source = Source(platform="sponsr", author="author", download_assets=True)
63
- dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
64
-
65
- def fake_get(url: str, stream: bool = True, timeout=None):
66
- # URLs intentionally do not contain extensions.
67
- return _FakeResponse("image/jpeg", body=(url + "\n").encode("ascii"))
68
-
69
- dl.session.get = fake_get # type: ignore[method-assign]
70
-
71
- assets = [
72
- {"url": "https://example.test/media/1", "alt": "same name"},
73
- {"url": "https://example.test/media/2", "alt": "same name"},
74
- ]
75
-
76
- asset_map = dl._download_assets(assets, assets_dir)
77
-
78
- self.assertEqual(set(asset_map.keys()), {a["url"] for a in assets})
79
-
80
- filenames = list(asset_map.values())
81
- self.assertEqual(len(filenames), 2)
82
- self.assertNotEqual(filenames[0], filenames[1])
83
-
84
- for fn in filenames:
85
- self.assertTrue((assets_dir / fn).exists(), msg=f"missing file: {fn}")
86
-
87
- def test_download_assets_deduplicates_when_file_exists(self):
88
- with tempfile.TemporaryDirectory() as tmp:
89
- tmp_path = Path(tmp)
90
- assets_dir = tmp_path / "assets"
91
- assets_dir.mkdir(parents=True, exist_ok=True)
92
-
93
- config = Config(output_dir=tmp_path, auth=Auth())
94
- source = Source(platform="sponsr", author="author", download_assets=True)
95
- dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
96
-
97
- # Pre-create a file with the expected base name.
98
- base = dl._make_asset_filename(
99
- "https://example.test/media/1",
100
- "image/jpeg",
101
- "same name",
102
- )
103
- (assets_dir / base).write_bytes(b"existing")
104
-
105
- def fake_get(url: str, stream: bool = True, timeout=None):
106
- return _FakeResponse("image/jpeg", body=b"downloaded")
107
-
108
- dl.session.get = fake_get # type: ignore[method-assign]
109
-
110
- assets = [{"url": "https://example.test/media/1", "alt": "same name"}]
111
- asset_map = dl._download_assets(assets, assets_dir)
112
-
113
- self.assertIn("https://example.test/media/1", asset_map)
114
- self.assertNotEqual(asset_map["https://example.test/media/1"], base)
115
- self.assertTrue((assets_dir / asset_map["https://example.test/media/1"]).exists())
116
-
117
- def test_download_assets_keeps_unique_names_under_parallelism(self):
118
- with tempfile.TemporaryDirectory() as tmp:
119
- tmp_path = Path(tmp)
120
- assets_dir = tmp_path / "assets"
121
- assets_dir.mkdir(parents=True, exist_ok=True)
122
-
123
- config = Config(output_dir=tmp_path, auth=Auth())
124
- source = Source(platform="sponsr", author="author", download_assets=True)
125
- dl = _DummyDownloader(config, source, cast(Database, _DummyDB()))
126
- dl.MAX_WORKERS = 5
127
-
128
- def fake_get(url: str, stream: bool = True, timeout=None):
129
- return _FakeResponse("image/jpeg", body=(url + "\n").encode("ascii"))
130
-
131
- dl.session.get = fake_get # type: ignore[method-assign]
132
-
133
- assets = [
134
- {"url": f"https://example.test/media/{i}", "alt": "same name"}
135
- for i in range(20)
136
- ]
137
-
138
- asset_map = dl._download_assets(assets, assets_dir)
139
-
140
- self.assertEqual(len(asset_map), 20)
141
- filenames = list(asset_map.values())
142
- self.assertEqual(len(set(filenames)), 20)
143
- for fn in filenames:
144
- self.assertTrue((assets_dir / fn).exists(), msg=f"missing file: {fn}")
145
-
146
-
147
- if __name__ == "__main__":
148
- unittest.main()
File without changes