article-backup 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
src/database.py ADDED
@@ -0,0 +1,169 @@
1
+ # src/database.py
2
+ """SQLite операции для индекса постов."""
3
+
4
+ import sqlite3
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+
9
@dataclass
class PostRecord:
    """One entry of the ``posts`` index table (all columns except the row ``id``)."""

    # platform, author and post_id together form the table's UNIQUE key.
    platform: str
    author: str
    post_id: str
    title: str
    # Folder name of the post on disk.
    slug: str
    post_date: str
    source_url: str
    # Directory the post was written to, stored as a string.
    local_path: str
    # Tags serialized to a single string (the downloader stores a JSON list here).
    tags: str
    # Timestamp of the last sync (the downloader stores a UTC ISO-8601 string).
    synced_at: str
21
+
22
+
23
class Database:
    """SQLite-backed index of downloaded posts.

    The connection is opened lazily on first use and reused afterwards.
    Instances work as context managers: leaving the ``with`` block closes
    the connection.
    """

    def __init__(self, db_path: Path):
        """Store the database location and make sure the schema exists.

        Args:
            db_path: Path of the SQLite file; missing parent directories are
                created when the connection is first opened.
        """
        self.db_path = db_path
        self._conn: sqlite3.Connection | None = None
        self._init_db()

    def _get_conn(self) -> sqlite3.Connection:
        """Return the shared connection, opening and configuring it on first use."""
        if self._conn is None:
            self.db_path.parent.mkdir(parents=True, exist_ok=True)
            conn = sqlite3.connect(self.db_path, check_same_thread=False, timeout=30)
            try:
                conn.execute('PRAGMA journal_mode=WAL')
                conn.row_factory = sqlite3.Row
            except sqlite3.Error:
                # Never cache a half-configured connection: later reads rely
                # on sqlite3.Row for access by column name.
                conn.close()
                raise
            self._conn = conn
        return self._conn

    def _init_db(self):
        """Create the ``posts`` table and its lookup index if they do not exist."""
        conn = self._get_conn()
        # ``with conn`` commits on success and rolls back on error.
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS posts (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    platform TEXT NOT NULL,
                    author TEXT NOT NULL,
                    post_id TEXT NOT NULL,
                    title TEXT,
                    slug TEXT,
                    post_date TEXT,
                    source_url TEXT,
                    local_path TEXT,
                    tags TEXT,
                    synced_at TEXT,
                    UNIQUE(platform, author, post_id)
                )
            ''')
            conn.execute('''
                CREATE INDEX IF NOT EXISTS idx_platform_author
                ON posts(platform, author)
            ''')

    def close(self):
        """Close the connection; a later call reopens it transparently."""
        if self._conn:
            self._conn.close()
            self._conn = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def post_exists(self, platform: str, author: str, post_id: str) -> bool:
        """Return True when the post is already present in the index."""
        cursor = self._get_conn().execute(
            'SELECT 1 FROM posts WHERE platform = ? AND author = ? AND post_id = ?',
            (platform, author, post_id),
        )
        return cursor.fetchone() is not None

    def add_post(self, record: PostRecord):
        """Insert the post, replacing any row with the same platform/author/post_id.

        Raises:
            sqlite3.Error: if the row cannot be written; the transaction is
                rolled back so the shared connection stays usable.
        """
        conn = self._get_conn()
        # Roll back on failure instead of leaving the implicit write
        # transaction open on the shared connection.
        with conn:
            conn.execute('''
                INSERT OR REPLACE INTO posts
                (platform, author, post_id, title, slug, post_date, source_url, local_path, tags, synced_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                record.platform,
                record.author,
                record.post_id,
                record.title,
                record.slug,
                record.post_date,
                record.source_url,
                record.local_path,
                record.tags,
                record.synced_at,
            ))

    def get_post(self, platform: str, author: str, post_id: str) -> PostRecord | None:
        """Return the indexed post, or None when it is not in the index."""
        cursor = self._get_conn().execute(
            'SELECT * FROM posts WHERE platform = ? AND author = ? AND post_id = ?',
            (platform, author, post_id),
        )
        row = cursor.fetchone()
        return self._row_to_record(row) if row else None

    def get_all_post_ids(self, platform: str, author: str) -> set[str]:
        """Return the set of all post_id values stored for the author."""
        cursor = self._get_conn().execute(
            'SELECT post_id FROM posts WHERE platform = ? AND author = ?',
            (platform, author),
        )
        return {row[0] for row in cursor.fetchall()}

    def get_post_count(self, platform: str, author: str) -> int:
        """Return how many posts are stored for the author."""
        cursor = self._get_conn().execute(
            'SELECT COUNT(*) FROM posts WHERE platform = ? AND author = ?',
            (platform, author),
        )
        return cursor.fetchone()[0]

    def get_post_by_source_url(self, url: str) -> PostRecord | None:
        """Return the first post whose source_url equals *url*, or None."""
        cursor = self._get_conn().execute(
            'SELECT * FROM posts WHERE source_url = ?',
            (url,),
        )
        row = cursor.fetchone()
        return self._row_to_record(row) if row else None

    def get_all_posts(self, platform: str, author: str) -> list[PostRecord]:
        """Return every indexed post of the author."""
        cursor = self._get_conn().execute(
            'SELECT * FROM posts WHERE platform = ? AND author = ?',
            (platform, author),
        )
        return [self._row_to_record(row) for row in cursor.fetchall()]

    def _row_to_record(self, row: sqlite3.Row) -> PostRecord:
        """Convert a database row into a PostRecord (the ``id`` column is dropped)."""
        return PostRecord(
            platform=row['platform'],
            author=row['author'],
            post_id=row['post_id'],
            title=row['title'],
            slug=row['slug'],
            post_date=row['post_date'],
            source_url=row['source_url'],
            local_path=row['local_path'],
            tags=row['tags'],
            synced_at=row['synced_at'],
        )
src/downloader.py ADDED
@@ -0,0 +1,383 @@
1
+ # src/downloader.py
2
+ """Базовый класс загрузчика и общая логика."""
3
+
4
+ import hashlib
5
+ import json
6
+ import re
7
+ import threading
8
+ import time
9
+ from abc import ABC, abstractmethod
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+ from dataclasses import dataclass
12
+ from datetime import datetime, timezone
13
+ from pathlib import Path
14
+ from urllib.parse import urljoin, urlparse
15
+
16
+ import requests
17
+ from slugify import slugify
18
+
19
+ from .config import Config, Source
20
+ from .database import Database, PostRecord
21
+ from .utils import (
22
+ ALLOWED_EXTENSIONS,
23
+ should_download_asset,
24
+ get_extension_from_content_type,
25
+ transliterate,
26
+ extract_internal_links,
27
+ )
28
+
29
+
30
def retry_request(
    func,
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 30.0,
    backoff_factor: float = 2.0,
):
    """
    Call *func* until it succeeds, sleeping with exponential backoff in between.

    Args:
        func: Callable to run (expected to return a Response or raise)
        max_retries: Maximum number of attempts
        base_delay: Delay before the second attempt, in seconds
        max_delay: Upper bound for the delay, in seconds
        backoff_factor: Multiplier applied to the delay after every failure
    """
    pending_error = None
    wait = base_delay
    final_attempt = max_retries - 1
    attempt = 0

    while attempt < max_retries:
        try:
            return func()
        except requests.RequestException as exc:
            pending_error = exc
            # 4xx responses are not retried, except 429 Too Many Requests
            response = getattr(exc, 'response', None)
            if response is not None:
                if response.status_code != 429 and 400 <= response.status_code < 500:
                    raise

            if attempt < final_attempt:
                time.sleep(wait)
                wait = min(wait * backoff_factor, max_delay)
        attempt += 1

    if pending_error:
        raise pending_error
    raise Exception("Max retries exceeded")
67
+
68
+
69
@dataclass
class Post:
    """Platform-independent representation of a single post."""

    post_id: str
    title: str
    # Post body as HTML.
    content_html: str
    # Date string; its first 10 characters become the date prefix of the slug.
    post_date: str
    source_url: str
    tags: list[str]
    # Asset descriptors; the downloader reads each dict's "url" key and an
    # optional "alt" key.
    assets: list[dict]
79
+
80
+
81
class BaseDownloader(ABC):
    """Base class for platform downloaders.

    Subclasses provide the platform-specific parts (session setup, fetching,
    parsing, Markdown conversion). This class handles saving posts to disk,
    downloading assets, maintaining the index database and rewriting links
    between posts.
    """

    PLATFORM: str = ""
    # Number of worker threads used for asset downloads.
    MAX_WORKERS: int = 5
    # Passed as ``timeout=`` to requests; a tuple means (connect, read) seconds.
    TIMEOUT: tuple = (5, 30)

    def __init__(self, config: Config, source: Source, db: Database):
        """Store collaborators, create the HTTP session and let the subclass configure it."""
        self.config = config
        self.source = source
        self.db = db
        self.session = requests.Session()
        self._setup_session()

    @abstractmethod
    def _setup_session(self):
        """Configure the session (cookies, headers)."""

    @abstractmethod
    def fetch_posts_list(self) -> list[dict]:
        """Fetch the list of posts from the API."""

    @abstractmethod
    def fetch_post(self, post_id: str) -> Post | None:
        """Fetch a single post by its ID."""

    @abstractmethod
    def _parse_post(self, raw_data: dict) -> Post:
        """Parse raw API data into a Post."""

    @abstractmethod
    def _to_markdown(self, post: Post, asset_map: dict[str, str]) -> str:
        """Convert the post content to Markdown."""

    @staticmethod
    def _yaml_quote(value: str) -> str:
        """Return *value* as a double-quoted YAML scalar.

        JSON string escaping is used because it also escapes backslashes and
        control characters, which YAML double-quoted scalars interpret.
        """
        return json.dumps(value, ensure_ascii=False)

    def sync(self):
        """Download every post of the author that is not yet in the index."""
        print(f"[{self.PLATFORM}] Синхронизация {self.source.author}...")

        self._create_index_files()

        existing_ids = self.db.get_all_post_ids(self.PLATFORM, self.source.author)
        posts = self.fetch_posts_list()

        new_posts = [p for p in posts if str(p.get('id', p.get('post_id'))) not in existing_ids]
        print(f" Найдено постов: {len(posts)}, новых: {len(new_posts)}")

        for raw_post in new_posts:
            post = self._parse_post(raw_post)
            if post:
                self._save_post(post)

        # Links can only be fixed once all posts are on disk.
        if new_posts:
            print(f" Фиксим внутренние ссылки...")
            self.fix_internal_links()

    def download_single(self, post_id: str):
        """Download one post by its ID."""
        print(f"[{self.PLATFORM}] Скачивание поста {post_id}...")
        # Make sure the index files exist (original note: avoids a "Boosties" title).
        self._create_index_files()
        post = self.fetch_post(post_id)
        if post:
            self._save_post(post)
        else:
            print(f" Ошибка: пост {post_id} не найден")

    def _create_index_files(self):
        """Create the ``_index.md`` files used for Hugo navigation."""
        # Platform level: written only once, never overwritten.
        platform_dir = self.config.output_dir / self.PLATFORM
        platform_dir.mkdir(parents=True, exist_ok=True)
        platform_index = platform_dir / "_index.md"
        if not platform_index.exists():
            platform_index.write_text(f"---\ntitle: {self.PLATFORM.title()}\n---\n", encoding='utf-8')

        # Author level: rewritten on every run so a changed display name is picked up.
        author_dir = platform_dir / self.source.author
        author_dir.mkdir(parents=True, exist_ok=True)
        author_index = author_dir / "_index.md"
        display_name = self.source.display_name or self.source.author
        author_index.write_text(
            f'---\ntitle: {self._yaml_quote(display_name)}\n---\n',
            encoding='utf-8',
        )

        # Posts section.
        posts_dir = author_dir / "posts"
        posts_dir.mkdir(parents=True, exist_ok=True)
        posts_index = posts_dir / "_index.md"
        posts_index.write_text(f'---\ntitle: "Посты"\n---\n', encoding='utf-8')

    def _save_post(self, post: Post):
        """Write the post (Markdown plus assets) to disk and record it in the index."""
        slug = self._make_slug(post)
        post_dir = self._get_post_dir(slug)
        post_dir.mkdir(parents=True, exist_ok=True)

        # Download assets when enabled for this source.
        if self.source.download_assets and post.assets:
            assets_dir = post_dir / "assets"
            assets_dir.mkdir(exist_ok=True)
            asset_map = self._download_assets(post.assets, assets_dir) or {}
        else:
            asset_map = {}

        content_md = self._to_markdown(post, asset_map)
        frontmatter = self._make_frontmatter(post)

        md_path = post_dir / "index.md"
        md_path.write_text(frontmatter + content_md, encoding='utf-8')

        # Update the index database.
        record = PostRecord(
            platform=self.PLATFORM,
            author=self.source.author,
            post_id=post.post_id,
            title=post.title,
            slug=slug,
            post_date=post.post_date,
            source_url=post.source_url,
            local_path=str(post_dir),
            tags=json.dumps(post.tags, ensure_ascii=False),
            synced_at=datetime.now(timezone.utc).isoformat(),
        )
        self.db.add_post(record)
        print(f" ✓ {post.title}")

    def _make_slug(self, post: Post) -> str:
        """Build the post folder name: date prefix plus transliterated title."""
        date_prefix = post.post_date[:10]
        title_slug = transliterate(post.title)[:60]
        return f"{date_prefix}-{title_slug}"

    def _get_post_dir(self, slug: str) -> Path:
        """Return ``<output_dir>/<platform>/<author>/posts/<slug>``."""
        return (
            self.config.output_dir
            / self.PLATFORM
            / self.source.author
            / "posts"
            / slug
        )

    def _make_frontmatter(self, post: Post) -> str:
        """Build the YAML front matter block, ending with a blank line."""
        lines = [
            "---",
            # Quoted via _yaml_quote so quotes, backslashes and newlines in
            # the title cannot break the front matter.
            f"title: {self._yaml_quote(post.title)}",
            f"date: {post.post_date}",
            f"source: {post.source_url}",
            f"author: {self.source.author}",
            f"platform: {self.PLATFORM}",
            f"post_id: {post.post_id}",
        ]
        if post.tags:
            tags_str = json.dumps(post.tags, ensure_ascii=False)
            lines.append(f"tags: {tags_str}")
        lines.append("---\n\n")
        return "\n".join(lines)

    def _download_assets(self, assets: list[dict], assets_dir: Path) -> dict[str, str]:
        """
        Download assets in parallel.

        Args:
            assets: Asset descriptors; each needs a "url" key, "alt" is optional.
            assets_dir: Existing directory the files are written to.

        Returns:
            Mapping {original_url: local_filename} for the assets that were
            saved. Assets that were filtered out or failed with a request
            error are omitted.
        """
        asset_map: dict[str, str] = {}
        used_filenames: set[str] = set()
        used_lock = threading.Lock()

        def download_one(asset: dict) -> tuple[str, str | None]:
            url = asset["url"]
            try:
                # Cheap pre-check, possible only when the URL has an extension.
                ext = Path(urlparse(url).path).suffix.lower()
                if ext and not should_download_asset(url, None, self.source.asset_types):
                    return url, None

                def do_request():
                    resp = self.session.get(url, stream=True, timeout=self.TIMEOUT)
                    try:
                        resp.raise_for_status()
                    except requests.RequestException:
                        # A streamed response keeps its connection until it
                        # is read or closed; release it before retrying.
                        resp.close()
                        raise
                    return resp

                response = retry_request(do_request, max_retries=3)
                try:
                    content_type = response.headers.get('Content-Type', '')

                    # Full check now that the Content-Type is known.
                    if not should_download_asset(url, content_type, self.source.asset_types):
                        return url, None

                    filename_base = self._make_asset_filename(url, content_type, asset.get('alt'))

                    # Reserve a unique filename under the lock so parallel
                    # workers never pick the same one.
                    with used_lock:
                        filename = filename_base
                        filepath = assets_dir / filename
                        if filename in used_filenames or filepath.exists():
                            filename = self._deduplicate_filename(filename, url)
                            filepath = assets_dir / filename

                        # Keep going until the name is unique for this run.
                        while filename in used_filenames or filepath.exists():
                            filename = self._deduplicate_filename(filename, url + filename)
                            filepath = assets_dir / filename

                        used_filenames.add(filename)

                    if not filepath.exists():
                        try:
                            with open(filepath, 'wb') as f:
                                for chunk in response.iter_content(chunk_size=8192):
                                    f.write(chunk)
                        except Exception:
                            # Do not leave a truncated asset behind.
                            filepath.unlink(missing_ok=True)
                            raise

                    return url, filename
                finally:
                    # Return the connection to the pool on every path.
                    response.close()
            except requests.RequestException as e:
                print(f" Ошибка скачивания {url}: {e}")
                return url, None

        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
            futures = [executor.submit(download_one, a) for a in assets]
            for future in as_completed(futures):
                url, filename = future.result()
                if filename:
                    asset_map[url] = filename

        return asset_map

    def _deduplicate_filename(self, filename: str, url: str) -> str:
        """Return *filename* with an 8-character hash of *url* appended to its stem."""
        path = Path(filename)
        url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
        return f"{path.stem}-{url_hash}{path.suffix}"

    def _make_asset_filename(self, url: str, content_type: str, alt: str | None) -> str:
        """Build a file name for an asset from its alt text or URL path."""
        path = urlparse(url).path
        ext = Path(path).suffix.lower()

        # Fall back to the Content-Type when the URL extension is missing or not allowed.
        if not ext or ext not in ALLOWED_EXTENSIONS:
            ext = get_extension_from_content_type(content_type) or '.bin'

        if alt:
            name = transliterate(alt)[:50]
        else:
            name = slugify(Path(path).stem or 'asset', max_length=50)

        return f"{name}{ext}"

    def fix_internal_links(self):
        """Rewrite links between the author's posts to relative local paths."""
        posts = self.db.get_all_posts(self.PLATFORM, self.source.author)
        if not posts:
            return

        # post_id -> slug mapping
        id_to_slug = {p.post_id: p.slug for p in posts}

        fixed_files = 0

        for post in posts:
            md_path = Path(post.local_path) / "index.md"
            if not md_path.exists():
                continue

            content = md_path.read_text(encoding='utf-8')

            # Split into front matter and body; skip files without front matter.
            if not content.startswith('---'):
                continue
            parts = content.split('---', 2)
            if len(parts) < 3:
                continue
            frontmatter = parts[1]
            body = parts[2]

            original_body = body

            # Replace longer URLs first so a URL that is a prefix of another
            # one cannot corrupt the longer link.
            links = sorted(
                extract_internal_links(body),
                key=lambda link: len(link[0]),
                reverse=True,
            )
            for full_url, platform, author, post_id in links:
                if platform != self.PLATFORM:
                    continue
                if author != self.source.author:
                    continue
                if post_id in id_to_slug:
                    body = body.replace(full_url, f"../{id_to_slug[post_id]}/")

            if body != original_body:
                new_content = f"---{frontmatter}---{body}"
                md_path.write_text(new_content, encoding='utf-8')
                fixed_files += 1

        if fixed_files:
            print(f" Исправлено ссылок в {fixed_files} файлах")