article-backup 0.3.4__tar.gz → 0.3.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {article_backup-0.3.4 → article_backup-0.3.6}/PKG-INFO +1 -1
- {article_backup-0.3.4 → article_backup-0.3.6}/article_backup.egg-info/PKG-INFO +1 -1
- {article_backup-0.3.4 → article_backup-0.3.6}/article_backup.egg-info/SOURCES.txt +2 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/backup.py +26 -6
- {article_backup-0.3.4 → article_backup-0.3.6}/pyproject.toml +1 -1
- {article_backup-0.3.4 → article_backup-0.3.6}/src/boosty.py +21 -5
- {article_backup-0.3.4 → article_backup-0.3.6}/src/config.py +25 -2
- {article_backup-0.3.4 → article_backup-0.3.6}/src/downloader.py +37 -26
- {article_backup-0.3.4 → article_backup-0.3.6}/src/sponsr.py +80 -44
- article_backup-0.3.6/tests/test_config_hardening.py +51 -0
- article_backup-0.3.6/tests/test_slug_safety.py +111 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/tests/test_sponsr_normalize.py +94 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/LICENSE +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/README.md +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/article_backup.egg-info/dependency_links.txt +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/article_backup.egg-info/entry_points.txt +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/article_backup.egg-info/requires.txt +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/article_backup.egg-info/top_level.txt +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/setup.cfg +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/src/__init__.py +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/src/database.py +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/src/utils.py +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/tests/test_asset_dedup.py +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/tests/test_boosty_empty_link.py +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/tests/test_boosty_normalize.py +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/tests/test_incremental_sync.py +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/tests/test_sponsr_tags.py +0 -0
- {article_backup-0.3.4 → article_backup-0.3.6}/tests/test_video_embed.py +0 -0
|
@@ -18,7 +18,9 @@ src/utils.py
|
|
|
18
18
|
tests/test_asset_dedup.py
|
|
19
19
|
tests/test_boosty_empty_link.py
|
|
20
20
|
tests/test_boosty_normalize.py
|
|
21
|
+
tests/test_config_hardening.py
|
|
21
22
|
tests/test_incremental_sync.py
|
|
23
|
+
tests/test_slug_safety.py
|
|
22
24
|
tests/test_sponsr_normalize.py
|
|
23
25
|
tests/test_sponsr_tags.py
|
|
24
26
|
tests/test_video_embed.py
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
"""CLI точка входа для бэкапа статей."""
|
|
4
4
|
|
|
5
5
|
import argparse
|
|
6
|
+
import json
|
|
6
7
|
import os
|
|
7
8
|
import sys
|
|
8
9
|
from pathlib import Path
|
|
@@ -21,13 +22,16 @@ def generate_hugo_config(config: Config):
|
|
|
21
22
|
if not hugo_toml.parent.exists():
|
|
22
23
|
return
|
|
23
24
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
25
|
+
def toml_str(value: str) -> str:
|
|
26
|
+
return json.dumps(value, ensure_ascii=False)
|
|
27
|
+
|
|
28
|
+
content = f'''baseURL = {toml_str(config.hugo.base_url)}
|
|
29
|
+
languageCode = {toml_str(config.hugo.language_code)}
|
|
30
|
+
title = {toml_str(config.hugo.title)}
|
|
27
31
|
relativeURLs = true
|
|
28
32
|
|
|
29
33
|
[params]
|
|
30
|
-
default_theme =
|
|
34
|
+
default_theme = {toml_str(config.hugo.default_theme)}
|
|
31
35
|
|
|
32
36
|
[markup.goldmark.renderer]
|
|
33
37
|
unsafe = true
|
|
@@ -89,12 +93,15 @@ def get_downloader(platform: str, config: Config, source: Source, db: Database):
|
|
|
89
93
|
|
|
90
94
|
def sync_all(config: Config, db: Database):
|
|
91
95
|
"""Синхронизирует всех авторов из конфига."""
|
|
96
|
+
errors: list[tuple[Source, Exception]] = []
|
|
92
97
|
for source in config.sources:
|
|
93
98
|
try:
|
|
94
99
|
downloader = get_downloader(source.platform, config, source, db)
|
|
95
100
|
downloader.sync()
|
|
96
101
|
except Exception as e:
|
|
97
102
|
print(f"[{source.platform}] Ошибка при синхронизации {source.author}: {e}")
|
|
103
|
+
errors.append((source, e))
|
|
104
|
+
return errors
|
|
98
105
|
|
|
99
106
|
|
|
100
107
|
def download_single_post(url: str, config: Config, db: Database):
|
|
@@ -157,21 +164,34 @@ def main():
|
|
|
157
164
|
# Создаём директорию и базу
|
|
158
165
|
config.output_dir.mkdir(parents=True, exist_ok=True)
|
|
159
166
|
|
|
167
|
+
sync_errors: list[tuple[Source, Exception]] = []
|
|
168
|
+
|
|
160
169
|
with Database(config.output_dir / 'index.db') as db:
|
|
161
170
|
# Выполняем команду
|
|
162
171
|
if args.url:
|
|
163
172
|
if not is_post_url(args.url):
|
|
164
173
|
print(f"Ошибка: неверный URL поста: {args.url}")
|
|
165
174
|
sys.exit(1)
|
|
166
|
-
|
|
175
|
+
try:
|
|
176
|
+
download_single_post(args.url, config, db)
|
|
177
|
+
except Exception as e:
|
|
178
|
+
print(f"Ошибка при скачивании поста: {e}")
|
|
179
|
+
sys.exit(1)
|
|
167
180
|
else:
|
|
168
181
|
if not config.sources:
|
|
169
182
|
print("Нет источников в конфиге. Добавьте секцию 'sources'.")
|
|
170
183
|
sys.exit(1)
|
|
171
|
-
sync_all(config, db)
|
|
184
|
+
sync_errors = sync_all(config, db)
|
|
172
185
|
|
|
173
186
|
ensure_site_content_link(config)
|
|
174
187
|
generate_hugo_config(config)
|
|
188
|
+
|
|
189
|
+
if sync_errors:
|
|
190
|
+
print(f"\nЗавершено с ошибками: {len(sync_errors)}")
|
|
191
|
+
for source, error in sync_errors:
|
|
192
|
+
print(f" - [{source.platform}] {source.author}: {error}")
|
|
193
|
+
sys.exit(1)
|
|
194
|
+
|
|
175
195
|
print("\nГотово!")
|
|
176
196
|
|
|
177
197
|
|
|
@@ -8,7 +8,7 @@ import requests
|
|
|
8
8
|
|
|
9
9
|
from .config import Config, Source, load_cookie, load_auth_header
|
|
10
10
|
from .database import Database
|
|
11
|
-
from .downloader import BaseDownloader, Post
|
|
11
|
+
from .downloader import BaseDownloader, Post, retry_request
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class BoostyDownloader(BaseDownloader):
|
|
@@ -17,6 +17,10 @@ class BoostyDownloader(BaseDownloader):
|
|
|
17
17
|
PLATFORM = "boosty"
|
|
18
18
|
API_BASE = "https://api.boosty.to/v1"
|
|
19
19
|
|
|
20
|
+
def __init__(self, config: Config, source: Source, db: Database):
|
|
21
|
+
self._warned_unknown_block_types: set[str] = set()
|
|
22
|
+
super().__init__(config, source, db)
|
|
23
|
+
|
|
20
24
|
def _setup_session(self):
|
|
21
25
|
"""Настройка сессии с cookies и authorization."""
|
|
22
26
|
cookie = load_cookie(self.config.auth.boosty_cookie_file)
|
|
@@ -51,8 +55,12 @@ class BoostyDownloader(BaseDownloader):
|
|
|
51
55
|
if offset:
|
|
52
56
|
url += f"&offset={offset}"
|
|
53
57
|
|
|
54
|
-
|
|
55
|
-
|
|
58
|
+
def do_request():
|
|
59
|
+
resp = self.session.get(url, timeout=self.TIMEOUT)
|
|
60
|
+
resp.raise_for_status()
|
|
61
|
+
return resp
|
|
62
|
+
|
|
63
|
+
response = retry_request(do_request, max_retries=3)
|
|
56
64
|
|
|
57
65
|
data = response.json()
|
|
58
66
|
posts_chunk = data.get("data", [])
|
|
@@ -96,8 +104,12 @@ class BoostyDownloader(BaseDownloader):
|
|
|
96
104
|
url = f"{self.API_BASE}/blog/{self.source.author}/post/{post_id}"
|
|
97
105
|
|
|
98
106
|
try:
|
|
99
|
-
|
|
100
|
-
|
|
107
|
+
def do_request():
|
|
108
|
+
resp = self.session.get(url, timeout=self.TIMEOUT)
|
|
109
|
+
resp.raise_for_status()
|
|
110
|
+
return resp
|
|
111
|
+
|
|
112
|
+
response = retry_request(do_request, max_retries=3)
|
|
101
113
|
data = response.json()
|
|
102
114
|
return self._parse_post(data)
|
|
103
115
|
except requests.RequestException as e:
|
|
@@ -256,6 +268,10 @@ class BoostyDownloader(BaseDownloader):
|
|
|
256
268
|
video_id = block.get("id", "")
|
|
257
269
|
return f"\n[\U0001f4f9 Видео](https://ok.ru/videoembed/{video_id})\n"
|
|
258
270
|
|
|
271
|
+
elif block_type and block_type not in self._warned_unknown_block_types:
|
|
272
|
+
print(f" [boosty] Пропущен неподдерживаемый тип блока: {block_type}")
|
|
273
|
+
self._warned_unknown_block_types.add(block_type)
|
|
274
|
+
|
|
259
275
|
return ""
|
|
260
276
|
|
|
261
277
|
def _parse_text_block(self, block: dict, paragraph_offset: int = 0) -> str:
|
|
@@ -47,6 +47,11 @@ def load_config(config_path: Path) -> Config:
|
|
|
47
47
|
with open(config_path, 'r', encoding='utf-8') as f:
|
|
48
48
|
data = yaml.safe_load(f)
|
|
49
49
|
|
|
50
|
+
if data is None:
|
|
51
|
+
data = {}
|
|
52
|
+
if not isinstance(data, dict):
|
|
53
|
+
raise ValueError("Корень config.yaml должен быть объектом (mapping)")
|
|
54
|
+
|
|
50
55
|
# output_dir
|
|
51
56
|
env_output_dir = os.environ.get('BACKUP_OUTPUT_DIR')
|
|
52
57
|
if env_output_dir:
|
|
@@ -56,6 +61,10 @@ def load_config(config_path: Path) -> Config:
|
|
|
56
61
|
|
|
57
62
|
# auth
|
|
58
63
|
auth_data = data.get('auth', {})
|
|
64
|
+
if auth_data is None:
|
|
65
|
+
auth_data = {}
|
|
66
|
+
if not isinstance(auth_data, dict):
|
|
67
|
+
raise ValueError("Секция 'auth' должна быть объектом")
|
|
59
68
|
auth = Auth(
|
|
60
69
|
sponsr_cookie_file=_to_path(auth_data.get('sponsr_cookie_file')),
|
|
61
70
|
boosty_cookie_file=_to_path(auth_data.get('boosty_cookie_file')),
|
|
@@ -64,7 +73,17 @@ def load_config(config_path: Path) -> Config:
|
|
|
64
73
|
|
|
65
74
|
# sources
|
|
66
75
|
sources = []
|
|
67
|
-
|
|
76
|
+
sources_data = data.get('sources', [])
|
|
77
|
+
if sources_data is None:
|
|
78
|
+
sources_data = []
|
|
79
|
+
if not isinstance(sources_data, list):
|
|
80
|
+
raise ValueError("Секция 'sources' должна быть списком")
|
|
81
|
+
|
|
82
|
+
for src in sources_data:
|
|
83
|
+
if not isinstance(src, dict):
|
|
84
|
+
raise ValueError("Каждый элемент в 'sources' должен быть объектом")
|
|
85
|
+
if 'platform' not in src or 'author' not in src:
|
|
86
|
+
raise ValueError("Каждый источник в 'sources' должен содержать 'platform' и 'author'")
|
|
68
87
|
sources.append(Source(
|
|
69
88
|
platform=src['platform'],
|
|
70
89
|
author=src['author'],
|
|
@@ -75,6 +94,10 @@ def load_config(config_path: Path) -> Config:
|
|
|
75
94
|
|
|
76
95
|
# hugo
|
|
77
96
|
hugo_data = data.get('hugo', {})
|
|
97
|
+
if hugo_data is None:
|
|
98
|
+
hugo_data = {}
|
|
99
|
+
if not isinstance(hugo_data, dict):
|
|
100
|
+
raise ValueError("Секция 'hugo' должна быть объектом")
|
|
78
101
|
hugo = HugoConfig(
|
|
79
102
|
base_url=hugo_data.get('base_url', HugoConfig.base_url),
|
|
80
103
|
title=hugo_data.get('title', HugoConfig.title),
|
|
@@ -105,4 +128,4 @@ def load_auth_header(auth_file: Path | None) -> str:
|
|
|
105
128
|
raise FileNotFoundError("Auth file path not specified")
|
|
106
129
|
if not auth_file.exists():
|
|
107
130
|
raise FileNotFoundError(f"Auth file not found: {auth_file}")
|
|
108
|
-
return auth_file.read_text(encoding='utf-8').strip()
|
|
131
|
+
return auth_file.read_text(encoding='utf-8').strip()
|
|
@@ -83,7 +83,7 @@ class BaseDownloader(ABC):
|
|
|
83
83
|
|
|
84
84
|
PLATFORM: str = ""
|
|
85
85
|
MAX_WORKERS: int = 5
|
|
86
|
-
TIMEOUT: tuple = (5,
|
|
86
|
+
TIMEOUT: tuple = (5, 60)
|
|
87
87
|
|
|
88
88
|
def __init__(self, config: Config, source: Source, db: Database):
|
|
89
89
|
self.config = config
|
|
@@ -209,7 +209,11 @@ class BaseDownloader(ABC):
|
|
|
209
209
|
|
|
210
210
|
def _save_post(self, post: Post):
|
|
211
211
|
"""Сохраняет пост на диск."""
|
|
212
|
-
|
|
212
|
+
existing_record = self.db.get_post(self.PLATFORM, self.source.author, post.post_id)
|
|
213
|
+
if existing_record and existing_record.slug:
|
|
214
|
+
slug = existing_record.slug
|
|
215
|
+
else:
|
|
216
|
+
slug = self._make_slug(post)
|
|
213
217
|
post_dir = self._get_post_dir(slug)
|
|
214
218
|
post_dir.mkdir(parents=True, exist_ok=True)
|
|
215
219
|
|
|
@@ -251,7 +255,10 @@ class BaseDownloader(ABC):
|
|
|
251
255
|
"""Создаёт slug для папки поста."""
|
|
252
256
|
date_prefix = post.post_date[:10]
|
|
253
257
|
title_slug = transliterate(post.title)[:60]
|
|
254
|
-
|
|
258
|
+
post_suffix = slugify(post.post_id, lowercase=True, max_length=16)
|
|
259
|
+
if not post_suffix:
|
|
260
|
+
post_suffix = hashlib.md5(post.post_id.encode()).hexdigest()[:8]
|
|
261
|
+
return f"{date_prefix}-{title_slug}-{post_suffix}"
|
|
255
262
|
|
|
256
263
|
def _get_post_dir(self, slug: str) -> Path:
|
|
257
264
|
"""Возвращает путь к папке поста."""
|
|
@@ -306,33 +313,37 @@ class BaseDownloader(ABC):
|
|
|
306
313
|
return resp
|
|
307
314
|
|
|
308
315
|
response = retry_request(do_request, max_retries=3)
|
|
316
|
+
try:
|
|
317
|
+
content_type = response.headers.get('Content-Type', '')
|
|
309
318
|
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
if not should_download_asset(url, content_type, self.source.asset_types):
|
|
314
|
-
return url, None
|
|
319
|
+
# Полная проверка после получения Content-Type
|
|
320
|
+
if not should_download_asset(url, content_type, self.source.asset_types):
|
|
321
|
+
return url, None
|
|
315
322
|
|
|
316
|
-
|
|
323
|
+
filename_base = self._make_asset_filename(url, content_type, asset.get('alt'))
|
|
317
324
|
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
filepath = assets_dir / filename
|
|
321
|
-
if filename in used_filenames or filepath.exists():
|
|
322
|
-
filename = self._deduplicate_filename(filename, url)
|
|
325
|
+
with used_lock:
|
|
326
|
+
filename = filename_base
|
|
323
327
|
filepath = assets_dir / filename
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
328
|
+
if filename in used_filenames or filepath.exists():
|
|
329
|
+
filename = self._deduplicate_filename(filename, url)
|
|
330
|
+
filepath = assets_dir / filename
|
|
331
|
+
|
|
332
|
+
# На всякий случай добиваемся уникальности в рамках сессии
|
|
333
|
+
while filename in used_filenames or filepath.exists():
|
|
334
|
+
filename = self._deduplicate_filename(filename, url + filename)
|
|
335
|
+
filepath = assets_dir / filename
|
|
336
|
+
|
|
337
|
+
used_filenames.add(filename)
|
|
338
|
+
|
|
339
|
+
if not filepath.exists():
|
|
340
|
+
with open(filepath, 'wb') as f:
|
|
341
|
+
for chunk in response.iter_content(chunk_size=8192):
|
|
342
|
+
f.write(chunk)
|
|
343
|
+
finally:
|
|
344
|
+
close = getattr(response, 'close', None)
|
|
345
|
+
if callable(close):
|
|
346
|
+
close()
|
|
336
347
|
|
|
337
348
|
return url, filename
|
|
338
349
|
except requests.RequestException as e:
|
|
@@ -12,7 +12,7 @@ import html2text
|
|
|
12
12
|
|
|
13
13
|
from .config import Config, Source, load_cookie
|
|
14
14
|
from .database import Database
|
|
15
|
-
from .downloader import BaseDownloader, Post
|
|
15
|
+
from .downloader import BaseDownloader, Post, retry_request
|
|
16
16
|
|
|
17
17
|
# Паттерны для распознавания embed URL видеохостингов (whitelist).
|
|
18
18
|
# Если iframe src матчит один из паттернов — это встроенное видео.
|
|
@@ -49,8 +49,12 @@ class SponsorDownloader(BaseDownloader):
|
|
|
49
49
|
return self._project_id
|
|
50
50
|
|
|
51
51
|
url = f"https://sponsr.ru/{self.source.author}/"
|
|
52
|
-
|
|
53
|
-
|
|
52
|
+
def do_request():
|
|
53
|
+
resp = self.session.get(url, timeout=self.TIMEOUT)
|
|
54
|
+
resp.raise_for_status()
|
|
55
|
+
return resp
|
|
56
|
+
|
|
57
|
+
response = retry_request(do_request, max_retries=3)
|
|
54
58
|
|
|
55
59
|
soup = BeautifulSoup(response.text, 'lxml')
|
|
56
60
|
data_tag = soup.find('script', id='__NEXT_DATA__')
|
|
@@ -86,8 +90,12 @@ class SponsorDownloader(BaseDownloader):
|
|
|
86
90
|
|
|
87
91
|
while True:
|
|
88
92
|
api_url = f"https://sponsr.ru/project/{project_id}/more-posts/?offset={offset}"
|
|
89
|
-
|
|
90
|
-
|
|
93
|
+
def do_request():
|
|
94
|
+
resp = self.session.get(api_url, timeout=self.TIMEOUT)
|
|
95
|
+
resp.raise_for_status()
|
|
96
|
+
return resp
|
|
97
|
+
|
|
98
|
+
response = retry_request(do_request, max_retries=3)
|
|
91
99
|
|
|
92
100
|
data = response.json().get("response", {})
|
|
93
101
|
posts_chunk = data.get("rows", [])
|
|
@@ -135,8 +143,12 @@ class SponsorDownloader(BaseDownloader):
|
|
|
135
143
|
# URL формат: https://sponsr.ru/{author}/{post_id}/...
|
|
136
144
|
url = f"https://sponsr.ru/{self.source.author}/{post_id}/"
|
|
137
145
|
try:
|
|
138
|
-
|
|
139
|
-
|
|
146
|
+
def do_request():
|
|
147
|
+
resp = self.session.get(url, timeout=self.TIMEOUT)
|
|
148
|
+
resp.raise_for_status()
|
|
149
|
+
return resp
|
|
150
|
+
|
|
151
|
+
response = retry_request(do_request, max_retries=3)
|
|
140
152
|
|
|
141
153
|
soup = BeautifulSoup(response.text, 'lxml')
|
|
142
154
|
data_tag = soup.find('script', id='__NEXT_DATA__')
|
|
@@ -160,8 +172,12 @@ class SponsorDownloader(BaseDownloader):
|
|
|
160
172
|
while True:
|
|
161
173
|
api_url = f"https://sponsr.ru/project/{project_id}/more-posts/?offset={offset}"
|
|
162
174
|
try:
|
|
163
|
-
|
|
164
|
-
|
|
175
|
+
def do_request():
|
|
176
|
+
resp = self.session.get(api_url, timeout=self.TIMEOUT)
|
|
177
|
+
resp.raise_for_status()
|
|
178
|
+
return resp
|
|
179
|
+
|
|
180
|
+
response = retry_request(do_request, max_retries=3)
|
|
165
181
|
|
|
166
182
|
data = response.json().get("response", {})
|
|
167
183
|
posts_chunk = data.get("rows", [])
|
|
@@ -345,12 +361,9 @@ class SponsorDownloader(BaseDownloader):
|
|
|
345
361
|
tag.insert_after(NavigableString(trailing))
|
|
346
362
|
|
|
347
363
|
# 4. Вынос trailing/leading пробелов из <a> тегов наружу
|
|
348
|
-
# После выноса пробелов из formatting тегов, пробел может остаться
|
|
349
|
-
# внутри <a> (но вне <em>/<b>), что даёт [текст ](url) в markdown
|
|
350
364
|
for tag in list(soup.find_all('a')):
|
|
351
365
|
if tag.parent is None:
|
|
352
366
|
continue
|
|
353
|
-
# Trailing: проверяем последний дочерний узел (может быть голый пробел)
|
|
354
367
|
children = list(tag.children)
|
|
355
368
|
if children:
|
|
356
369
|
last_child = children[-1]
|
|
@@ -359,8 +372,40 @@ class SponsorDownloader(BaseDownloader):
|
|
|
359
372
|
last_child.replace_with(NavigableString(str(last_child).rstrip()))
|
|
360
373
|
tag.insert_after(NavigableString(trailing))
|
|
361
374
|
|
|
375
|
+
# 5. Экранирование markdown-символов в текстовых узлах
|
|
376
|
+
# Чтобы "сырые" _, *, [ ] в тексте не превращались в разметку
|
|
377
|
+
self._escape_text_nodes(soup)
|
|
378
|
+
|
|
362
379
|
return str(soup)
|
|
363
380
|
|
|
381
|
+
@staticmethod
|
|
382
|
+
def _escape_text_nodes(soup):
|
|
383
|
+
"""Экранирует спецсимволы Markdown в текстовых узлах."""
|
|
384
|
+
from bs4 import NavigableString
|
|
385
|
+
|
|
386
|
+
replacements = {
|
|
387
|
+
'_': '@@@US@@@',
|
|
388
|
+
'*': '@@@AST@@@',
|
|
389
|
+
'[': '@@@LBR@@@',
|
|
390
|
+
']': '@@@RBR@@@',
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
for text_node in soup.find_all(string=True):
|
|
394
|
+
if text_node.parent and text_node.parent.name in ['script', 'style', 'title']:
|
|
395
|
+
continue
|
|
396
|
+
|
|
397
|
+
text = str(text_node)
|
|
398
|
+
if not text:
|
|
399
|
+
continue
|
|
400
|
+
|
|
401
|
+
new_text = text
|
|
402
|
+
for char, placeholder in replacements.items():
|
|
403
|
+
if char in new_text:
|
|
404
|
+
new_text = new_text.replace(char, placeholder)
|
|
405
|
+
|
|
406
|
+
if new_text != text:
|
|
407
|
+
text_node.replace_with(NavigableString(new_text))
|
|
408
|
+
|
|
364
409
|
@staticmethod
|
|
365
410
|
def _merge_adjacent_em(soup, em_tags: set, bold_tags: set):
|
|
366
411
|
"""Объединяет соседние <em>/<i> теги внутри одного родителя.
|
|
@@ -519,6 +564,12 @@ class SponsorDownloader(BaseDownloader):
|
|
|
519
564
|
|
|
520
565
|
markdown = h2t.handle(html)
|
|
521
566
|
|
|
567
|
+
# Восстанавливаем экранированные символы (из плейсхолдеров DOM)
|
|
568
|
+
markdown = markdown.replace('@@@US@@@', r'\_')
|
|
569
|
+
markdown = markdown.replace('@@@AST@@@', r'\*')
|
|
570
|
+
markdown = markdown.replace('@@@LBR@@@', r'\[')
|
|
571
|
+
markdown = markdown.replace('@@@RBR@@@', r'\]')
|
|
572
|
+
|
|
522
573
|
# Удаляем bidi-маркеры, которые ломают пробелы рядом с текстом
|
|
523
574
|
markdown = re.sub(r'[\u200e\u200f\u202a-\u202e\u2066-\u2069]', '', markdown)
|
|
524
575
|
|
|
@@ -550,38 +601,23 @@ class SponsorDownloader(BaseDownloader):
|
|
|
550
601
|
# Закрывающие: » " '
|
|
551
602
|
markdown = re.sub(r'\s+([\u00bb\u201d\u2019])', r'\1', markdown)
|
|
552
603
|
|
|
553
|
-
# Восстанавливаем пробелы вокруг
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
# Добавляем пробел справа, если нужно
|
|
572
|
-
if end < len(text) and text[end].isalnum():
|
|
573
|
-
matched_text = matched_text + ' '
|
|
574
|
-
|
|
575
|
-
parts.append(matched_text)
|
|
576
|
-
last = end
|
|
577
|
-
|
|
578
|
-
parts.append(text[last:])
|
|
579
|
-
return ''.join(parts)
|
|
580
|
-
|
|
581
|
-
# Восстанавливаем пробелы вокруг bold-italic, bold, ссылок
|
|
582
|
-
markdown = _fix_spacing(markdown, re.compile(r'\*\*\*.+?\*\*\*'))
|
|
583
|
-
markdown = _fix_spacing(markdown, re.compile(r'(?<!\*)\*\*(?!\*).+?(?<!\*)\*\*(?!\*)'))
|
|
584
|
-
markdown = _fix_spacing(markdown, re.compile(r'\[[^\]]+\]\([^)]+\)'))
|
|
604
|
+
# Восстанавливаем пробелы вокруг **bold**
|
|
605
|
+
# html2text часто склеивает: слово**bold** -> слово **bold**
|
|
606
|
+
# Используем поиск пар **, чтобы не сломать closing tag (bold**word -> bold **word - WRONG)
|
|
607
|
+
# 1. Left side: word**bold** -> word **bold**
|
|
608
|
+
markdown = re.sub(r'(\w)\*\*(.+?)\*\*', r'\1 **\2**', markdown)
|
|
609
|
+
# 2. Right side: **bold**word -> **bold** word
|
|
610
|
+
markdown = re.sub(r'\*\*(.+?)\*\*(\w)', r'**\1** \2', markdown)
|
|
611
|
+
|
|
612
|
+
# Убираем пробел между ссылкой и знаками препинания (даже если они курсивные)
|
|
613
|
+
# [link](url) . -> [link](url).
|
|
614
|
+
# [link](url) _._ -> [link](url)_._
|
|
615
|
+
markdown = re.sub(r'(\)\s+)([.,:;!?])', r')\2', markdown)
|
|
616
|
+
markdown = re.sub(r'(\)\s+)(_[.,:;!?]_)', r')\2', markdown)
|
|
617
|
+
|
|
618
|
+
# Исправляем артефакты html2text внутри ссылок: [ _текст_ ] -> [_текст_]
|
|
619
|
+
markdown = re.sub(r'\[\s+_', r'[_', markdown)
|
|
620
|
+
markdown = re.sub(r'_\s+\]', r'_]', markdown)
|
|
585
621
|
|
|
586
622
|
# Заголовок берётся из frontmatter (Hugo), не дублируем его в body.
|
|
587
623
|
return markdown
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
import unittest
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from backup import generate_hugo_config
|
|
7
|
+
from src.config import Auth, Config, HugoConfig, load_config
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ConfigHardeningTests(unittest.TestCase):
|
|
11
|
+
def test_load_config_accepts_empty_yaml(self):
|
|
12
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
13
|
+
cfg_path = Path(tmp) / "config.yaml"
|
|
14
|
+
cfg_path.write_text("", encoding="utf-8")
|
|
15
|
+
|
|
16
|
+
cfg = load_config(cfg_path)
|
|
17
|
+
|
|
18
|
+
self.assertEqual(cfg.output_dir, Path("./backup"))
|
|
19
|
+
self.assertEqual(cfg.sources, [])
|
|
20
|
+
|
|
21
|
+
def test_generate_hugo_config_escapes_quotes(self):
|
|
22
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
23
|
+
old_cwd = Path.cwd()
|
|
24
|
+
tmp_path = Path(tmp)
|
|
25
|
+
(tmp_path / "site").mkdir(parents=True, exist_ok=True)
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
os.chdir(tmp_path)
|
|
29
|
+
cfg = Config(
|
|
30
|
+
output_dir=tmp_path / "backup",
|
|
31
|
+
auth=Auth(),
|
|
32
|
+
hugo=HugoConfig(
|
|
33
|
+
base_url='https://example.com/a"b',
|
|
34
|
+
title='Bob\'s "backup"',
|
|
35
|
+
language_code="ru",
|
|
36
|
+
default_theme='light"mode',
|
|
37
|
+
),
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
generate_hugo_config(cfg)
|
|
41
|
+
toml = (tmp_path / "site" / "hugo.toml").read_text(encoding="utf-8")
|
|
42
|
+
|
|
43
|
+
self.assertIn('title = "Bob\'s \\"backup\\""', toml)
|
|
44
|
+
self.assertIn('baseURL = "https://example.com/a\\"b"', toml)
|
|
45
|
+
self.assertIn('default_theme = "light\\"mode"', toml)
|
|
46
|
+
finally:
|
|
47
|
+
os.chdir(old_cwd)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
if __name__ == "__main__":
|
|
51
|
+
unittest.main()
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import tempfile
|
|
2
|
+
import unittest
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from src.config import Auth, Config, Source
|
|
6
|
+
from src.database import Database, PostRecord
|
|
7
|
+
from src.downloader import BaseDownloader, Post
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class _SlugDummyDownloader(BaseDownloader):
|
|
11
|
+
PLATFORM = "dummy"
|
|
12
|
+
|
|
13
|
+
def _setup_session(self):
|
|
14
|
+
return None
|
|
15
|
+
|
|
16
|
+
def fetch_posts_list(self, existing_ids=None, incremental=False, safety_chunks=1):
|
|
17
|
+
raise NotImplementedError
|
|
18
|
+
|
|
19
|
+
def fetch_post(self, post_id: str):
|
|
20
|
+
raise NotImplementedError
|
|
21
|
+
|
|
22
|
+
def _parse_post(self, raw_data: dict):
|
|
23
|
+
raise NotImplementedError
|
|
24
|
+
|
|
25
|
+
def _to_markdown(self, post: Post, asset_map: dict[str, str]) -> str:
|
|
26
|
+
return "content\n"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class SlugSafetyTests(unittest.TestCase):
|
|
30
|
+
def test_slug_unique_for_same_title_and_date(self):
|
|
31
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
32
|
+
tmp_path = Path(tmp)
|
|
33
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
34
|
+
source = Source(platform="sponsr", author="author")
|
|
35
|
+
|
|
36
|
+
with Database(tmp_path / "test.db") as db:
|
|
37
|
+
dl = _SlugDummyDownloader(config, source, db)
|
|
38
|
+
|
|
39
|
+
post1 = Post(
|
|
40
|
+
post_id="101",
|
|
41
|
+
title="Одинаковый заголовок",
|
|
42
|
+
content_html="",
|
|
43
|
+
post_date="2025-01-01T00:00:00",
|
|
44
|
+
source_url="https://example.com/101",
|
|
45
|
+
tags=[],
|
|
46
|
+
assets=[],
|
|
47
|
+
)
|
|
48
|
+
post2 = Post(
|
|
49
|
+
post_id="202",
|
|
50
|
+
title="Одинаковый заголовок",
|
|
51
|
+
content_html="",
|
|
52
|
+
post_date="2025-01-01T01:00:00",
|
|
53
|
+
source_url="https://example.com/202",
|
|
54
|
+
tags=[],
|
|
55
|
+
assets=[],
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
dl._save_post(post1)
|
|
59
|
+
dl._save_post(post2)
|
|
60
|
+
|
|
61
|
+
rec1 = db.get_post("dummy", "author", "101")
|
|
62
|
+
rec2 = db.get_post("dummy", "author", "202")
|
|
63
|
+
self.assertIsNotNone(rec1)
|
|
64
|
+
self.assertIsNotNone(rec2)
|
|
65
|
+
self.assertNotEqual(rec1.slug, rec2.slug)
|
|
66
|
+
self.assertTrue((Path(rec1.local_path) / "index.md").exists())
|
|
67
|
+
self.assertTrue((Path(rec2.local_path) / "index.md").exists())
|
|
68
|
+
|
|
69
|
+
def test_existing_slug_is_reused_for_same_post_id(self):
|
|
70
|
+
with tempfile.TemporaryDirectory() as tmp:
|
|
71
|
+
tmp_path = Path(tmp)
|
|
72
|
+
config = Config(output_dir=tmp_path, auth=Auth())
|
|
73
|
+
source = Source(platform="sponsr", author="author")
|
|
74
|
+
|
|
75
|
+
with Database(tmp_path / "test.db") as db:
|
|
76
|
+
old_slug = "2025-01-01-old-style-slug"
|
|
77
|
+
old_path = str(tmp_path / "dummy" / "author" / "posts" / old_slug)
|
|
78
|
+
db.add_post(PostRecord(
|
|
79
|
+
platform="dummy",
|
|
80
|
+
author="author",
|
|
81
|
+
post_id="legacy-id",
|
|
82
|
+
title="Old",
|
|
83
|
+
slug=old_slug,
|
|
84
|
+
post_date="2025-01-01T00:00:00",
|
|
85
|
+
source_url="https://example.com/legacy",
|
|
86
|
+
local_path=old_path,
|
|
87
|
+
tags="[]",
|
|
88
|
+
synced_at="2025-01-01T00:00:00+00:00",
|
|
89
|
+
))
|
|
90
|
+
|
|
91
|
+
dl = _SlugDummyDownloader(config, source, db)
|
|
92
|
+
updated = Post(
|
|
93
|
+
post_id="legacy-id",
|
|
94
|
+
title="Новое имя",
|
|
95
|
+
content_html="",
|
|
96
|
+
post_date="2025-01-01T02:00:00",
|
|
97
|
+
source_url="https://example.com/legacy",
|
|
98
|
+
tags=[],
|
|
99
|
+
assets=[],
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
dl._save_post(updated)
|
|
103
|
+
|
|
104
|
+
rec = db.get_post("dummy", "author", "legacy-id")
|
|
105
|
+
self.assertIsNotNone(rec)
|
|
106
|
+
self.assertEqual(rec.slug, old_slug)
|
|
107
|
+
self.assertTrue((Path(rec.local_path) / "index.md").exists())
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
if __name__ == "__main__":
|
|
111
|
+
unittest.main()
|
|
@@ -412,6 +412,100 @@ class SponsorNormalizeTests(unittest.TestCase):
|
|
|
412
412
|
self.assertIn('_курсив2_', result)
|
|
413
413
|
self.assertIn('обычный', result)
|
|
414
414
|
|
|
415
|
+
def _convert_full(self, html):
|
|
416
|
+
"""Helper to convert HTML to Markdown (full text)."""
|
|
417
|
+
post = Post(
|
|
418
|
+
post_id='1',
|
|
419
|
+
title='Test',
|
|
420
|
+
content_html=html,
|
|
421
|
+
post_date='2025-01-01',
|
|
422
|
+
source_url='https://test.com',
|
|
423
|
+
tags=[],
|
|
424
|
+
assets=[]
|
|
425
|
+
)
|
|
426
|
+
return self.downloader._to_markdown(post, {})
|
|
427
|
+
|
|
428
|
+
def test_case_1_spacing_cleanup(self):
|
|
429
|
+
"""1. Пробелы внутри курсива (_ текст _) и вокруг."""
|
|
430
|
+
html = (
|
|
431
|
+
'<p>фильме.</em></p><p><em>Например, Гор предсказал, что к 2016 году на Килиманджаро не останется снега. '
|
|
432
|
+
'В 2020 году газета The Times сообщила, что снег на горе высотой 19 000 футов (около 5800 метров) остался, '
|
|
433
|
+
'несмотря на предсказания Гора. </em></p><p><em>Гор'
|
|
434
|
+
)
|
|
435
|
+
md = self._convert_full(html)
|
|
436
|
+
|
|
437
|
+
# Expectation: no spaces inside markers, clean paragraphs
|
|
438
|
+
self.assertIn('фильме.', md)
|
|
439
|
+
self.assertIn('_Например, Гор', md)
|
|
440
|
+
self.assertIn('предсказания Гора._', md)
|
|
441
|
+
self.assertIn('_Гор', md)
|
|
442
|
+
|
|
443
|
+
self.assertNotIn('_ Например', md)
|
|
444
|
+
self.assertNotIn('Гора. _', md)
|
|
445
|
+
self.assertNotIn(' _Гор', md)
|
|
446
|
+
|
|
447
|
+
def test_case_2_multiline_italic(self):
|
|
448
|
+
"""2. Курсив через границы абзацев."""
|
|
449
|
+
html = (
|
|
450
|
+
'<p>В.М.).</em></p><p><em>Метеоролог Крис Марц сказал, что климатология полна неопределенности и нюансов, '
|
|
451
|
+
'которые «Неудобная правда» полностью отвергает. </em></p><p><em>Однако'
|
|
452
|
+
)
|
|
453
|
+
md = self._convert_full(html)
|
|
454
|
+
|
|
455
|
+
self.assertIn('В.М.).', md)
|
|
456
|
+
self.assertIn('_Метеоролог Крис', md)
|
|
457
|
+
self.assertIn('отвергает._', md)
|
|
458
|
+
self.assertIn('_Однако', md)
|
|
459
|
+
|
|
460
|
+
self.assertNotIn('_ Метеоролог', md)
|
|
461
|
+
self.assertNotIn('отвергает. _', md)
|
|
462
|
+
|
|
463
|
+
def test_case_3_literal_underscore_in_text(self):
|
|
464
|
+
"""3. Символы _ в обычном тексте не должны становиться разметкой."""
|
|
465
|
+
html = (
|
|
466
|
+
'<p>сформулировал: «_39 лет я никогда не писал этих слов в отзыве на кино, а сейчас пишу: _'
|
|
467
|
+
'<a href="http://example.com" target="_blank"><em>вы <strong>обязаны</strong> это посмотреть</em></a>».</p><p>К тому же'
|
|
468
|
+
)
|
|
469
|
+
md = self._convert_full(html)
|
|
470
|
+
|
|
471
|
+
# Literal underscores should be escaped
|
|
472
|
+
self.assertIn(r'\_39 лет', md)
|
|
473
|
+
self.assertIn(r'пишу: \_', md)
|
|
474
|
+
|
|
475
|
+
# Link formatting should be clean
|
|
476
|
+
self.assertIn('[_вы **обязаны** это посмотреть_](http://example.com)', md)
|
|
477
|
+
|
|
478
|
+
# No extra spaces
|
|
479
|
+
self.assertNotIn('[ _вы', md)
|
|
480
|
+
|
|
481
|
+
def test_case_4_underscore_suffix(self):
|
|
482
|
+
"""4. Пробел перед закрывающим _."""
|
|
483
|
+
html = '<p>читатель данного проекта ощутил себя _не таким как все _(которого не проведёшь)?</p>'
|
|
484
|
+
md = self._convert_full(html)
|
|
485
|
+
|
|
486
|
+
# Literal underscores should be escaped
|
|
487
|
+
self.assertIn(r'\_не таким как все \_', md)
|
|
488
|
+
|
|
489
|
+
# Verify no unescaped underscores (except inside words if any, but here they are spaced)
|
|
490
|
+
# Using regex to ensure underscores are preceded by backslash
|
|
491
|
+
import re
|
|
492
|
+
self.assertFalse(re.search(r'(?<!\\)_', md), "Found unescaped underscore")
|
|
493
|
+
|
|
494
|
+
def test_case_5_link_italic_punctuation(self):
|
|
495
|
+
"""5. Курсив вокруг ссылки и точки."""
|
|
496
|
+
html = (
|
|
497
|
+
'<p>бежать.</em></p><p><em>Из нескольких разговоров ... из </em>'
|
|
498
|
+
'<a href="https://example.com" target="_blank"><em>свежего текста</em></a><em>.</em></p><p><em>Поэтому'
|
|
499
|
+
)
|
|
500
|
+
md = self._convert_full(html)
|
|
501
|
+
|
|
502
|
+
self.assertIn('бежать.', md)
|
|
503
|
+
self.assertIn('_Из нескольких', md)
|
|
504
|
+
# Link inside italic context
|
|
505
|
+
self.assertIn('](https://example.com)', md)
|
|
506
|
+
self.assertNotIn(' _.', md)
|
|
507
|
+
self.assertNotIn('_. _', md)
|
|
508
|
+
|
|
415
509
|
|
|
416
510
|
if __name__ == '__main__':
|
|
417
511
|
unittest.main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|