article-backup 0.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of article-backup has been flagged as possibly problematic.

src/downloader.py ADDED
@@ -0,0 +1,370 @@
+# src/downloader.py
+"""Base downloader class and shared logic."""
+
+import hashlib
+import json
+import re
+import threading
+import time
+from abc import ABC, abstractmethod
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from urllib.parse import urljoin, urlparse
+
+import requests
+from slugify import slugify
+
+from .config import Config, Source
+from .database import Database, PostRecord
+from .utils import (
+    ALLOWED_EXTENSIONS,
+    should_download_asset,
+    get_extension_from_content_type,
+    transliterate,
+    sanitize_filename,
+    extract_internal_links,
+)
+
+
+def retry_request(
+    func,
+    max_retries: int = 3,
+    base_delay: float = 1.0,
+    max_delay: float = 30.0,
+    backoff_factor: float = 2.0,
+):
+    """
+    Calls func with retries and exponential backoff.
+
+    Args:
+        func: Callable to execute (should return a Response or raise an exception)
+        max_retries: Maximum number of attempts
+        base_delay: Initial delay in seconds
+        max_delay: Maximum delay in seconds
+        backoff_factor: Multiplier applied to the delay after each failed attempt
+    """
+    last_exception = None
+    delay = base_delay
+
+    for attempt in range(max_retries):
+        try:
+            return func()
+        except requests.RequestException as e:
+            last_exception = e
+            # Do not retry 4xx errors (except 429 Too Many Requests)
+            if e.response is not None:
+                if 400 <= e.response.status_code < 500 and e.response.status_code != 429:
+                    raise
+
+            if attempt < max_retries - 1:
+                time.sleep(delay)
+                delay = min(delay * backoff_factor, max_delay)
+
+    raise last_exception
+
+
+@dataclass
+class Post:
+    """Platform-independent post structure."""
+    post_id: str
+    title: str
+    content_html: str
+    post_date: str
+    source_url: str
+    tags: list[str]
+    assets: list[dict]
+
+
+class BaseDownloader(ABC):
+    """Base class for downloaders."""
+
+    PLATFORM: str = ""
+    MAX_WORKERS: int = 5
+    TIMEOUT: tuple[int, int] = (5, 30)  # (connect, read) timeouts in seconds
+
+    def __init__(self, config: Config, source: Source, db: Database):
+        self.config = config
+        self.source = source
+        self.db = db
+        self.session = requests.Session()
+        self._setup_session()
+
+    @abstractmethod
+    def _setup_session(self):
+        """Configures the session (cookies, headers)."""
+        pass
+
+    @abstractmethod
+    def fetch_posts_list(self) -> list[dict]:
+        """Fetches the list of posts from the API."""
+        pass
+
+    @abstractmethod
+    def fetch_post(self, post_id: str) -> Post | None:
+        """Fetches a single post by ID."""
+        pass
+
+    @abstractmethod
+    def _parse_post(self, raw_data: dict) -> Post:
+        """Parses raw API data into a Post."""
+        pass
+
+    def sync(self):
+        """Syncs all of the author's new posts."""
+        print(f"[{self.PLATFORM}] Syncing {self.source.author}...")
+
+        self._create_index_files()
+
+        existing_ids = self.db.get_all_post_ids(self.PLATFORM, self.source.author)
+        posts = self.fetch_posts_list()
+
+        new_posts = [p for p in posts if str(p.get('id', p.get('post_id'))) not in existing_ids]
+        print(f"  Posts found: {len(posts)}, new: {len(new_posts)}")
+
+        for raw_post in new_posts:
+            post = self._parse_post(raw_post)
+            if post:
+                self._save_post(post)
+
+        # Fix links only after all posts have been downloaded
+        if new_posts:
+            print("  Fixing internal links...")
+            self.fix_internal_links()
+
+    def download_single(self, post_id: str):
+        """Downloads a single post by ID."""
+        print(f"[{self.PLATFORM}] Downloading post {post_id}...")
+        post = self.fetch_post(post_id)
+        if post:
+            self._save_post(post)
+        else:
+            print(f"  Error: post {post_id} not found")
+
+    def _create_index_files(self):
+        """Creates _index.md files for Hugo navigation."""
+        # Platform level
+        platform_dir = self.config.output_dir / self.PLATFORM
+        platform_dir.mkdir(parents=True, exist_ok=True)
+        platform_index = platform_dir / "_index.md"
+        if not platform_index.exists():
+            platform_index.write_text(f"---\ntitle: {self.PLATFORM.title()}\n---\n", encoding='utf-8')
+
+        # Author level
+        author_dir = platform_dir / self.source.author
+        author_dir.mkdir(parents=True, exist_ok=True)
+        author_index = author_dir / "_index.md"
+        display_name = self.source.display_name or self.source.author
+        safe_display_name = display_name.replace('"', '\\"')
+        author_index.write_text(f'---\ntitle: "{safe_display_name}"\n---\n', encoding='utf-8')
+
+        # Posts level
+        posts_dir = author_dir / "posts"
+        posts_dir.mkdir(parents=True, exist_ok=True)
+        posts_index = posts_dir / "_index.md"
+        posts_index.write_text('---\ntitle: "Posts"\n---\n', encoding='utf-8')
+
+    def _save_post(self, post: Post):
+        """Saves a post to disk."""
+        slug = self._make_slug(post)
+        post_dir = self._get_post_dir(slug)
+        post_dir.mkdir(parents=True, exist_ok=True)
+
+        # Download assets
+        if self.source.download_assets and post.assets:
+            assets_dir = post_dir / "assets"
+            assets_dir.mkdir(exist_ok=True)
+            asset_map = self._download_assets(post.assets, assets_dir)
+        else:
+            asset_map = {}
+
+        # Convert to Markdown
+        content_md = self._to_markdown(post, asset_map)
+
+        # Build the frontmatter
+        frontmatter = self._make_frontmatter(post)
+
+        # Write the file
+        md_path = post_dir / "index.md"
+        md_path.write_text(frontmatter + content_md, encoding='utf-8')
+
+        # Update the index
+        record = PostRecord(
+            platform=self.PLATFORM,
+            author=self.source.author,
+            post_id=post.post_id,
+            title=post.title,
+            slug=slug,
+            post_date=post.post_date,
+            source_url=post.source_url,
+            local_path=str(post_dir),
+            tags=json.dumps(post.tags, ensure_ascii=False),
+            synced_at=datetime.now(timezone.utc).isoformat(),
+        )
+        self.db.add_post(record)
+        print(f"  ✓ {post.title}")
+
+    def _make_slug(self, post: Post) -> str:
+        """Builds the slug for the post directory."""
+        date_prefix = post.post_date[:10]
+        title_slug = transliterate(post.title)[:60]
+        return f"{date_prefix}-{title_slug}"
+
+    def _get_post_dir(self, slug: str) -> Path:
+        """Returns the path to the post directory."""
+        return (
+            self.config.output_dir
+            / self.PLATFORM
+            / self.source.author
+            / "posts"
+            / slug
+        )
+
+    def _make_frontmatter(self, post: Post) -> str:
+        """Builds the YAML frontmatter."""
+        # Escape double quotes in the title
+        safe_title = post.title.replace('"', '\\"')
+
+        lines = [
+            "---",
+            f'title: "{safe_title}"',
+            f"date: {post.post_date}",
+            f"source: {post.source_url}",
+            f"author: {self.source.author}",
+            f"platform: {self.PLATFORM}",
+            f"post_id: {post.post_id}",
+        ]
+        if post.tags:
+            tags_str = json.dumps(post.tags, ensure_ascii=False)
+            lines.append(f"tags: {tags_str}")
+        lines.append("---\n\n")
+        return "\n".join(lines)
+
+    def _download_assets(self, assets: list[dict], assets_dir: Path) -> dict[str, str]:
+        """
+        Downloads assets in parallel.
+        Returns a mapping {original_url: local_filename}.
+        """
+        asset_map = {}
+        used_filenames: set[str] = set()
+        names_lock = threading.Lock()
+
+        def download_one(asset: dict) -> tuple[str, str | None]:
+            url = asset["url"]
+            try:
+                # Preliminary check by extension only (when one is present)
+                ext = Path(urlparse(url).path).suffix.lower()
+                if ext and ext not in ALLOWED_EXTENSIONS:
+                    return url, None
+
+                def do_request():
+                    resp = self.session.get(url, stream=True, timeout=self.TIMEOUT)
+                    resp.raise_for_status()
+                    return resp
+
+                response = retry_request(do_request, max_retries=3)
+
+                content_type = response.headers.get('Content-Type', '')
+
+                # Full check once the Content-Type is known
+                if not should_download_asset(url, content_type):
+                    return url, None
+
+                filename = self._make_asset_filename(url, content_type, asset.get('alt'))
+
+                # Reserve a unique filename before writing, so concurrent
+                # downloads with colliding names never skip or clobber each other
+                with names_lock:
+                    if filename in used_filenames:
+                        filename = self._deduplicate_filename(filename, url)
+                    used_filenames.add(filename)
+
+                filepath = assets_dir / filename
+                if not filepath.exists():
+                    with open(filepath, 'wb') as f:
+                        for chunk in response.iter_content(chunk_size=8192):
+                            f.write(chunk)
+
+                return url, filename
+            except requests.RequestException as e:
+                print(f"  Failed to download {url}: {e}")
+                return url, None
+
+        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            futures = {executor.submit(download_one, a): a for a in assets}
+            for future in as_completed(futures):
+                url, filename = future.result()
+                if filename:
+                    asset_map[url] = filename
+
+        return asset_map
+
+    def _deduplicate_filename(self, filename: str, url: str) -> str:
+        """Builds a unique filename by appending a hash of the URL."""
+        path = Path(filename)
+        url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
+        return f"{path.stem}-{url_hash}{path.suffix}"
+
+    def _make_asset_filename(self, url: str, content_type: str, alt: str | None) -> str:
+        """Builds the filename for an asset."""
+        path = urlparse(url).path
+        ext = Path(path).suffix.lower()
+
+        if not ext or ext not in ALLOWED_EXTENSIONS:
+            ext = get_extension_from_content_type(content_type) or '.bin'
+
+        if alt:
+            name = transliterate(alt)[:50]
+        else:
+            name = slugify(Path(path).stem or 'asset', max_length=50)
+
+        return f"{name}{ext}"
+
+    def fix_internal_links(self):
+        """Fixes internal links across all of the author's posts."""
+        posts = self.db.get_all_posts(self.PLATFORM, self.source.author)
+        if not posts:
+            return
+
+        # Build the post_id -> slug mapping
+        id_to_slug = {p.post_id: p.slug for p in posts}
+
+        fixed_files = 0
+
+        for post in posts:
+            md_path = Path(post.local_path) / "index.md"
+            if not md_path.exists():
+                continue
+
+            content = md_path.read_text(encoding='utf-8')
+
+            # Split frontmatter and body
+            if content.startswith('---'):
+                parts = content.split('---', 2)
+                if len(parts) >= 3:
+                    frontmatter = parts[1]
+                    body = parts[2]
+                else:
+                    continue
+            else:
+                continue
+
+            original_body = body
+
+            for full_url, platform, post_id in extract_internal_links(body):
+                if post_id in id_to_slug:
+                    body = body.replace(full_url, f"../{id_to_slug[post_id]}/")
+
+            if body != original_body:
+                new_content = f"---{frontmatter}---{body}"
+                md_path.write_text(new_content, encoding='utf-8')
+                fixed_files += 1
+
+        if fixed_files:
+            print(f"  Fixed links in {fixed_files} file(s)")
+
+    @abstractmethod
+    def _to_markdown(self, post: Post, asset_map: dict[str, str]) -> str:
+        """Converts the post content to Markdown."""
+        pass
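
retry_request above is self-contained, so it can be exercised outside the downloader classes. A minimal sketch, assuming the wheel installs the src package exactly as laid out in this diff (the endpoint URL is a placeholder, not something the package calls):

import requests

from src.downloader import retry_request

def fetch():
    # Placeholder endpoint; any callable that returns a Response
    # (or raises requests.RequestException) fits the contract.
    resp = requests.get("https://example.com/api", timeout=(5, 30))
    resp.raise_for_status()
    return resp

# Network errors, 5xx responses and 429 are retried up to 3 times,
# sleeping 1s and then 2s between attempts; any other 4xx raises at once.
response = retry_request(fetch, max_retries=3)
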
src/sponsr.py ADDED
@@ -0,0 +1,257 @@
+# src/sponsr.py
+"""Downloader for Sponsr.ru."""
+
+import json
+import re
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
+import html2text
+
+from .config import Config, Source, load_cookie
+from .database import Database
+from .downloader import BaseDownloader, Post
+
+# Patterns for converting embed URLs into watch URLs
+VIDEO_EMBED_PATTERNS = [
+    (r'rutube\.ru/play/embed/([a-f0-9]+)', lambda m: f'https://rutube.ru/video/{m.group(1)}/'),
+    (r'youtube\.com/embed/([^/?]+)', lambda m: f'https://youtube.com/watch?v={m.group(1)}'),
+    (r'youtu\.be/([^/?]+)', lambda m: f'https://youtube.com/watch?v={m.group(1)}'),
+    (r'player\.vimeo\.com/video/(\d+)', lambda m: f'https://vimeo.com/{m.group(1)}'),
+    (r'ok\.ru/videoembed/(\d+)', lambda m: f'https://ok.ru/video/{m.group(1)}'),
+    (r'vk\.com/video_ext\.php\?.*?oid=(-?\d+).*?id=(\d+)', lambda m: f'https://vk.com/video{m.group(1)}_{m.group(2)}'),
+]
+
+
+class SponsorDownloader(BaseDownloader):
+    """Downloads articles from Sponsr.ru."""
+
+    PLATFORM = "sponsr"
+
+    def __init__(self, config: Config, source: Source, db: Database):
+        self._project_id: str | None = None
+        super().__init__(config, source, db)
+
+    def _setup_session(self):
+        """Configures the session with cookies."""
+        cookie = load_cookie(self.config.auth.sponsr_cookie_file)
+        self.session.headers.update({
+            'Cookie': cookie,
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+            'X-Requested-With': 'XMLHttpRequest',
+        })
+
+    def _get_project_id(self) -> str:
+        """Reads the project_id from the project page."""
+        if self._project_id:
+            return self._project_id
+
+        url = f"https://sponsr.ru/{self.source.author}/"
+        response = self.session.get(url, timeout=self.TIMEOUT)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'lxml')
+        data_tag = soup.find('script', id='__NEXT_DATA__')
+        if not data_tag:
+            raise ValueError(f"__NEXT_DATA__ not found on page {url}")
+
+        data = json.loads(data_tag.string)
+        project_id = data.get('props', {}).get('pageProps', {}).get('project', {}).get('id')
+        if not project_id:
+            raise ValueError("project.id not found in __NEXT_DATA__")
+
+        self._project_id = str(project_id)
+        return self._project_id
+
+    def fetch_posts_list(self) -> list[dict]:
+        """Fetches the full list of posts via the API."""
+        project_id = self._get_project_id()
+        all_posts = []
+        offset = 0
+
+        while True:
+            api_url = f"https://sponsr.ru/project/{project_id}/more-posts/?offset={offset}"
+            response = self.session.get(api_url, timeout=self.TIMEOUT)
+            response.raise_for_status()
+
+            data = response.json().get("response", {})
+            posts_chunk = data.get("rows", [])
+
+            if not posts_chunk:
+                break
+
+            all_posts.extend(posts_chunk)
+            offset = len(all_posts)
+
+            total = data.get("rows_count", 0)
+            print(f"  Fetched {offset}/{total} posts...")
+
+        return all_posts
+
+    def fetch_post(self, post_id: str) -> Post | None:
+        """Fetches a single post by ID."""
+        # First, try reading it directly from the post page
+        post = self._fetch_post_from_page(post_id)
+        if post:
+            return post
+
+        # Fallback: search the API page by page (without loading the whole list)
+        return self._find_post_in_api(post_id)
+
+    def _fetch_post_from_page(self, post_id: str) -> Post | None:
+        """Fetches a post directly from its page."""
+        # URL format: https://sponsr.ru/{author}/{post_id}/...
+        url = f"https://sponsr.ru/{self.source.author}/{post_id}/"
+        try:
+            response = self.session.get(url, timeout=self.TIMEOUT)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.text, 'lxml')
+            data_tag = soup.find('script', id='__NEXT_DATA__')
+            if not data_tag:
+                return None
+
+            data = json.loads(data_tag.string)
+            post_data = data.get('props', {}).get('pageProps', {}).get('post')
+            if not post_data:
+                return None
+
+            return self._parse_post(post_data)
+        except requests.RequestException:
+            return None
+
+    def _find_post_in_api(self, post_id: str) -> Post | None:
+        """Searches the API page by page (stops as soon as the post is found)."""
+        project_id = self._get_project_id()
+        offset = 0
+
+        while True:
+            api_url = f"https://sponsr.ru/project/{project_id}/more-posts/?offset={offset}"
+            try:
+                response = self.session.get(api_url, timeout=self.TIMEOUT)
+                response.raise_for_status()
+
+                data = response.json().get("response", {})
+                posts_chunk = data.get("rows", [])
+
+                if not posts_chunk:
+                    break
+
+                for raw_post in posts_chunk:
+                    if str(raw_post.get('post_id')) == post_id:
+                        return self._parse_post(raw_post)
+
+                offset += len(posts_chunk)
+            except requests.RequestException:
+                break
+
+        return None
+
+    def _parse_post(self, raw_data: dict) -> Post:
+        """Parses raw API data into a Post."""
+        post_id = str(raw_data['post_id'])
+        title = raw_data.get('post_title', 'Untitled')
+        post_date = raw_data.get('post_date', '')
+
+        # Post URL
+        post_url = raw_data.get('post_url', '')
+        if post_url and not post_url.startswith('http'):
+            post_url = f"https://sponsr.ru{post_url}"
+
+        # HTML content
+        content_html = raw_data.get('post_text', '')
+
+        # Tags
+        tags = raw_data.get('tags', [])
+
+        # Extract assets from the HTML
+        assets = self._extract_assets(content_html)
+
+        return Post(
+            post_id=post_id,
+            title=title,
+            content_html=content_html,
+            post_date=post_date,
+            source_url=post_url,
+            tags=tags,
+            assets=assets,
+        )
+
+    def _extract_assets(self, html_content: str) -> list[dict]:
+        """Extracts image URLs from the HTML."""
+        if not html_content:
+            return []
+
+        assets = []
+        soup = BeautifulSoup(html_content, 'lxml')
+
+        for img in soup.find_all('img'):
+            src = img.get('src') or img.get('data-src')
+            if not src:
+                continue
+
+            # Make the URL absolute
+            if not src.startswith('http'):
+                src = urljoin('https://sponsr.ru', src)
+
+            # Alt text
+            alt = img.get('alt', '')
+            if not alt:
+                parent = img.find_parent('div', class_='post-image')
+                if parent and parent.get('data-alt'):
+                    alt = parent.get('data-alt')
+
+            assets.append({'url': src, 'alt': alt})
+
+        return assets
+
+    def _parse_video_url(self, embed_src: str) -> str | None:
+        """Converts an embed URL into a watch URL."""
+        for pattern, converter in VIDEO_EMBED_PATTERNS:
+            match = re.search(pattern, embed_src)
+            if match:
+                return converter(match)
+        # Fallback: return the original URL if it was not recognized
+        if embed_src and ('video' in embed_src or 'embed' in embed_src):
+            return embed_src
+        return None
+
+    def _replace_video_embeds(self, html_content: str) -> str:
+        """Replaces video iframes/embeds with plain-text links."""
+        soup = BeautifulSoup(html_content, 'lxml')
+
+        for iframe in soup.find_all(['iframe', 'embed']):
+            src = iframe.get('src', '')
+            video_url = self._parse_video_url(src)
+            if video_url:
+                placeholder = soup.new_tag('p')
+                placeholder.string = f'📹 Video: {video_url}'
+                iframe.replace_with(placeholder)
+
+        return str(soup)
+
+    def _to_markdown(self, post: Post, asset_map: dict[str, str]) -> str:
+        """Converts HTML to Markdown."""
+        if not post.content_html:
+            return f"# {post.title}\n\n"
+
+        # Point image URLs at the local copies
+        html = post.content_html
+        for original_url, local_filename in asset_map.items():
+            html = html.replace(original_url, f"assets/{local_filename}")
+
+        # Replace video iframes/embeds with plain links
+        html = self._replace_video_embeds(html)
+
+        # Convert HTML to Markdown
+        h2t = html2text.HTML2Text()
+        h2t.ignore_links = False
+        h2t.ignore_images = False
+        h2t.body_width = 0  # no line wrapping
+        h2t.unicode_snob = True
+
+        markdown = h2t.handle(html)
+
+        # Prepend the title as a heading
+        return f"# {post.title}\n\n{markdown}"