article-backup 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of article-backup might be problematic. Click here for more details.

src/boosty.py ADDED
@@ -0,0 +1,260 @@
1
+ # src/boosty.py
2
+ """Загрузчик для Boosty.to"""
3
+
4
+ import json
5
+ from datetime import datetime, timezone
6
+
7
+ import requests
8
+
9
+ from .config import Config, Source, load_cookie, load_auth_header
10
+ from .database import Database
11
+ from .downloader import BaseDownloader, Post
12
+
13
+
14
class BoostyDownloader(BaseDownloader):
    """Downloader for articles published on Boosty.to.

    Fetches posts through the Boosty HTTP API and converts their block-based
    content into Markdown. ``self.session``, ``self.config``, ``self.source``
    and ``self.TIMEOUT`` are not defined in this class; they are presumably
    provided by ``BaseDownloader`` (TODO confirm against the base class).
    """

    PLATFORM = "boosty"
    API_BASE = "https://api.boosty.to/v1"

    # Markdown delimiters for the inline style codes that are rendered.
    # NOTE(review): the numeric codes (1 = bold, 2 = italic, 4 = link) were
    # inferred from observed API responses, not from official documentation.
    _STYLE_MARKERS = {1: "**", 2: "*"}

    def _setup_session(self):
        """Attach the cookie, Authorization and User-Agent headers to the session.

        Raises:
            FileNotFoundError: propagated from ``load_cookie`` /
                ``load_auth_header`` when a credentials file is missing.
        """
        cookie = load_cookie(self.config.auth.boosty_cookie_file)
        auth = load_auth_header(self.config.auth.boosty_auth_file)

        self.session.headers.update({
            'Cookie': cookie,
            'Authorization': auth,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        })

    def fetch_posts_list(self) -> list[dict]:
        """Return the raw API dicts of all posts of the configured author.

        Pages through the listing endpoint 20 posts at a time, following the
        ``extra.offset`` cursor until the API reports ``extra.isLast``.

        Raises:
            requests.HTTPError: when any page request returns an error status.
        """
        all_posts = []
        offset = None

        while True:
            url = f"{self.API_BASE}/blog/{self.source.author}/post/?limit=20"
            if offset:
                url += f"&offset={offset}"

            response = self.session.get(url, timeout=self.TIMEOUT)
            response.raise_for_status()

            data = response.json()
            posts_chunk = data.get("data", [])
            if not posts_chunk:
                break

            all_posts.extend(posts_chunk)
            print(f" Получено {len(all_posts)} постов...")

            # Check whether more pages follow; a missing or null "extra"
            # section is treated as the last page.
            extra = data.get("extra") or {}
            if extra.get("isLast", True):
                break

            next_offset = extra.get("offset")
            # Stop on a missing cursor, and also on a cursor that did not
            # advance — otherwise the same page would be requested forever.
            if not next_offset or next_offset == offset:
                break
            offset = next_offset

        return all_posts

    def fetch_post(self, post_id: str) -> Post | None:
        """Fetch a single post by ID.

        Returns:
            The parsed ``Post``, or ``None`` when the request fails or the
            response body is not valid JSON (the error is printed).
        """
        url = f"{self.API_BASE}/blog/{self.source.author}/post/{post_id}"

        try:
            response = self.session.get(url, timeout=self.TIMEOUT)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding errors that are not wrapped in
            # a RequestException.
            print(f" Ошибка получения поста {post_id}: {e}")
            return None

        return self._parse_post(data)

    def _parse_post(self, raw_data: dict) -> Post:
        """Convert a raw API post dict into a ``Post``.

        Fields that are missing, ``null`` or empty fall back to defaults so a
        sparse API response does not raise. The content blocks are stored
        JSON-encoded in ``content_html`` and decoded again by ``_to_markdown``.
        """
        post_id = raw_data.get("id", "")
        title = raw_data.get("title") or "Без названия"

        # The date is treated as a Unix timestamp in seconds.
        created_at = raw_data.get("createdAt") or 0
        post_date = datetime.fromtimestamp(created_at, tz=timezone.utc).isoformat()

        # Post URL: prefer the blog URL from the payload, fall back to the
        # configured author when it is absent or empty.
        user = raw_data.get("user") or {}
        author = user.get("blogUrl") or self.source.author
        source_url = f"https://boosty.to/{author}/posts/{post_id}"

        # Tags: keep only entries that carry a non-empty title.
        tags = [t["title"] for t in raw_data.get("tags") or [] if t.get("title")]

        # Content is a list of blocks.
        content_blocks = raw_data.get("data") or []
        assets = self._extract_assets(content_blocks)

        return Post(
            post_id=post_id,
            title=title,
            content_html=json.dumps(content_blocks, ensure_ascii=False),
            post_date=post_date,
            source_url=source_url,
            tags=tags,
            assets=assets,
        )

    def _extract_assets(self, blocks: list[dict]) -> list[dict]:
        """Collect media URLs from content blocks.

        Returns:
            A list of ``{"url": ..., "alt": ...}`` dicts for image blocks,
            audio blocks and video previews, in block order. Blocks without a
            URL are skipped.
        """
        assets = []

        for block in blocks:
            block_type = block.get("type", "")

            if block_type == "image":
                url = block.get("url", "")
                alt = block.get("id", "")
            elif block_type == "audio_file":
                url = block.get("url", "")
                alt = block.get("title", block.get("id", ""))
            elif block_type == "ok_video":
                # ok.ru video needs separate handling; for now only the
                # preview image is saved, if there is one.
                url = block.get("previewUrl", "")
                alt = f"video-preview-{block.get('id', '')}"
            else:
                continue

            if url:
                assets.append({"url": url, "alt": alt})

        return assets

    def _to_markdown(self, post: Post, asset_map: dict[str, str]) -> str:
        """Render a post as Markdown.

        Args:
            post: post whose ``content_html`` holds the JSON-encoded blocks.
            asset_map: maps a remote asset URL to its local file name under
                ``assets/``.

        Returns:
            The title as a level-1 heading followed by the rendered blocks;
            only the heading when the stored content is not valid JSON.
        """
        try:
            blocks = json.loads(post.content_html)
        except json.JSONDecodeError:
            return f"# {post.title}\n\n"

        lines = [f"# {post.title}\n"]

        for block in blocks:
            md = self._block_to_markdown(block, asset_map)
            if md:
                lines.append(md)

        return "\n".join(lines)

    def _block_to_markdown(self, block: dict, asset_map: dict[str, str]) -> str:
        """Render one content block as Markdown.

        Returns an empty string for unknown block types and for blocks that
        lack the data needed to render them.
        """
        block_type = block.get("type", "")

        if block_type == "text":
            return self._parse_text_block(block)

        elif block_type == "image":
            url = block.get("url", "")
            local = asset_map.get(url)
            if local:
                # Prefer the downloaded copy over the remote URL.
                return f"\n![](assets/{local})\n"
            elif url:
                return f"\n![]({url})\n"

        elif block_type == "link":
            url = block.get("url", "")
            text = self._parse_text_block(block)
            if text and url:
                return f"[{text}]({url})"
            elif url:
                return f"<{url}>"

        elif block_type == "audio_file":
            url = block.get("url", "")
            title = block.get("title", "audio")
            local = asset_map.get(url)
            if local:
                return f"\n🎵 **{title}**: [скачать](assets/{local})\n"
            elif url:
                return f"\n🎵 **{title}**: [слушать]({url})\n"

        elif block_type == "ok_video":
            video_id = block.get("id", "")
            return f"\n📹 Видео: https://ok.ru/video/{video_id}\n"

        return ""

    def _parse_text_block(self, block: dict) -> str:
        """Extract the (styled) text of a Boosty text block.

        The ``content`` field is expected to be a JSON string of the form
        ``["text", "style", [[type, start, length], ...]]``.

        Returns:
            ``"\\n"`` for a ``BLOCK_END`` paragraph separator, the styled text
            for a well-formed block, the raw ``content`` when it cannot be
            decoded, and ``""`` when there is no usable content.
        """
        content = block.get("content", "")
        modificator = block.get("modificator", "")

        # BLOCK_END is the paragraph separator.
        if modificator == "BLOCK_END":
            return "\n"

        if not content:
            return ""

        try:
            parsed = json.loads(content)
            if isinstance(parsed, list) and len(parsed) >= 1:
                text = str(parsed[0])

                # Apply inline styles when present.
                if len(parsed) >= 3 and parsed[2]:
                    text = self._apply_styles(text, parsed[2])

                return text
        except (json.JSONDecodeError, IndexError, TypeError):
            return content

        return ""

    def _apply_styles(self, text: str, styles: list) -> str:
        """Wrap styled ranges of ``text`` in Markdown bold/italic markers.

        Args:
            text: the plain text of the block.
            styles: entries of the form ``[type, start, length]``; offsets
                refer to positions in the original ``text``.

        Returns:
            ``text`` with ``**``/``*`` markers inserted. Entries that are
            malformed, empty, out of range or of a non-rendered type (links
            are handled by link blocks) are ignored.

        All markers are positioned against the original text and emitted in a
        single pass, so nested or identical ranges (e.g. bold and italic on
        the same word) produce properly nested markers instead of shifting
        each other's offsets.
        """
        if not styles or not text:
            return text

        spans = set()  # a set drops exact duplicates
        for style in styles:
            if not isinstance(style, (list, tuple)) or len(style) < 3:
                continue
            style_type, start, length = style[0], style[1], style[2]
            if not isinstance(start, int) or not isinstance(length, int):
                continue
            try:
                marker = self._STYLE_MARKERS.get(style_type)
            except TypeError:
                # Unhashable style code: not something that can be rendered.
                continue
            if marker is None:
                continue
            end = start + length
            if start < 0 or length <= 0 or end > len(text):
                continue
            spans.add((start, end, marker))

        if not spans:
            return text

        opening: dict[int, list[str]] = {}
        closing: dict[int, list[str]] = {}
        # Visit outer spans first (earlier start, then later end): they open
        # first and, by prepending to the closing list, close last.
        for start, end, marker in sorted(spans, key=lambda s: (s[0], -s[1], s[2])):
            opening.setdefault(start, []).append(marker)
            closing.setdefault(end, []).insert(0, marker)

        pieces = []
        cursor = 0
        for offset in sorted(opening.keys() | closing.keys()):
            pieces.append(text[cursor:offset])
            # Close spans ending here before opening the ones starting here.
            pieces.extend(closing.get(offset, ()))
            pieces.extend(opening.get(offset, ()))
            cursor = offset
        pieces.append(text[cursor:])

        return "".join(pieces)
src/config.py ADDED
@@ -0,0 +1,99 @@
1
+ # src/config.py
2
+ """Загрузка и валидация конфигурации."""
3
+
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from typing import Literal
7
+
8
+ import yaml
9
+
10
+ Platform = Literal['sponsr', 'boosty']
11
+
12
+
13
+ @dataclass
14
+ class Source:
15
+ platform: Platform
16
+ author: str
17
+ download_assets: bool = True
18
+ display_name: str | None = None
19
+
20
+ @dataclass
21
+ class Auth:
22
+ sponsr_cookie_file: Path | None = None
23
+ boosty_cookie_file: Path | None = None
24
+ boosty_auth_file: Path | None = None # Authorization: Bearer ...
25
+
26
+
27
@dataclass
class HugoConfig:
    """Hugo-related settings with their defaults.

    The defaults stay readable as class attributes (``HugoConfig.title``),
    which ``load_config`` uses as fallbacks.
    """

    # Fallback for the ``base_url`` config key.
    base_url: str = "http://localhost:1313/"
    # Fallback for the ``title`` config key.
    title: str = "Бэкап статей"
    # Fallback for the ``language_code`` config key.
    language_code: str = "ru"
32
+
33
+
34
@dataclass
class Config:
    """Top-level application configuration as produced by ``load_config``."""

    # Value of the ``output_dir`` config key.
    output_dir: Path
    # Credential file locations.
    auth: Auth
    # Configured sources; a fresh empty list per instance by default.
    sources: list[Source] = field(default_factory=list)
    # Hugo settings; a default ``HugoConfig`` per instance by default.
    hugo: HugoConfig = field(default_factory=HugoConfig)
40
+
41
+
42
def load_config(config_path: Path) -> Config:
    """Load the configuration from a YAML file.

    Missing keys, an empty file, and sections that are present but empty
    (``auth:`` with no value, i.e. YAML null) all fall back to defaults
    instead of raising.

    Args:
        config_path: path to the YAML configuration file.

    Returns:
        The populated ``Config``.

    Raises:
        OSError: when the file cannot be opened.
        ValueError: when the top level of the document is not a mapping.
        KeyError: when a source entry lacks ``platform`` or ``author``.
    """
    with open(config_path, 'r', encoding='utf-8') as f:
        # An empty document loads as None; treat it as an empty mapping.
        data = yaml.safe_load(f) or {}

    if not isinstance(data, dict):
        raise ValueError(f"Config root must be a mapping: {config_path}")

    # ``or`` rather than a .get() default: a key present with a null value
    # must fall back too.
    output_dir = Path(data.get('output_dir') or './backup')

    auth_data = data.get('auth') or {}
    auth = Auth(
        sponsr_cookie_file=_to_path(auth_data.get('sponsr_cookie_file')),
        boosty_cookie_file=_to_path(auth_data.get('boosty_cookie_file')),
        boosty_auth_file=_to_path(auth_data.get('boosty_auth_file')),
    )

    sources = [
        Source(
            platform=src['platform'],
            author=src['author'],
            download_assets=src.get('download_assets', True),
            display_name=src.get('display_name'),
        )
        for src in data.get('sources') or []
    ]

    # Dataclass defaults are readable as class attributes and serve as
    # fallbacks for absent keys.
    hugo_data = data.get('hugo') or {}
    hugo = HugoConfig(
        base_url=hugo_data.get('base_url', HugoConfig.base_url),
        title=hugo_data.get('title', HugoConfig.title),
        language_code=hugo_data.get('language_code', HugoConfig.language_code),
    )

    return Config(output_dir=output_dir, auth=auth, sources=sources, hugo=hugo)
77
+
78
+
79
+ def _to_path(value: str | None) -> Path | None:
80
+ """Конвертирует строку в Path или возвращает None."""
81
+ return Path(value) if value else None
82
+
83
+
84
+ def load_cookie(cookie_file: Path | None) -> str:
85
+ """Загружает cookie из файла."""
86
+ if cookie_file is None:
87
+ raise FileNotFoundError("Cookie file path not specified")
88
+ if not cookie_file.exists():
89
+ raise FileNotFoundError(f"Cookie file not found: {cookie_file}")
90
+ return cookie_file.read_text(encoding='utf-8').strip()
91
+
92
+
93
+ def load_auth_header(auth_file: Path | None) -> str:
94
+ """Загружает Authorization header из файла."""
95
+ if auth_file is None:
96
+ raise FileNotFoundError("Auth file path not specified")
97
+ if not auth_file.exists():
98
+ raise FileNotFoundError(f"Auth file not found: {auth_file}")
99
+ return auth_file.read_text(encoding='utf-8').strip()
src/database.py ADDED
@@ -0,0 +1,169 @@
1
+ # src/database.py
2
+ """SQLite операции для индекса постов."""
3
+
4
+ import sqlite3
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+
9
@dataclass
class PostRecord:
    """One row of the ``posts`` index table, without the surrogate ``id``.

    All fields are stored as text. ``platform``, ``author`` and ``post_id``
    together form the table's unique key.
    """

    # --- unique key ---
    platform: str
    author: str
    post_id: str

    # --- post metadata ---
    title: str
    slug: str
    post_date: str
    source_url: str

    # --- local bookkeeping ---
    local_path: str
    tags: str
    synced_at: str
21
+
22
+
23
class Database:
    """SQLite-backed index of downloaded posts.

    A single connection is opened lazily and reused until ``close()``. The
    class is also a context manager that closes the connection on exit.
    Write operations run inside ``with conn:`` so they are committed on
    success and rolled back on failure.
    """

    def __init__(self, db_path: Path):
        """Remember ``db_path`` and create the schema if it is missing."""
        self.db_path = db_path
        self._conn: sqlite3.Connection | None = None
        self._init_db()

    def _get_conn(self) -> sqlite3.Connection:
        """Return the shared connection, opening it on first use.

        The parent directory of the database file is created when needed.
        The connection is cached only after it has been fully configured, so
        a failure during setup never leaves a half-configured connection
        behind.
        """
        if self._conn is None:
            self.db_path.parent.mkdir(parents=True, exist_ok=True)
            conn = sqlite3.connect(self.db_path, check_same_thread=False, timeout=30)
            try:
                conn.execute('PRAGMA journal_mode=WAL')
                # Rows must be addressable by column name (_row_to_record).
                conn.row_factory = sqlite3.Row
            except sqlite3.Error:
                conn.close()
                raise
            self._conn = conn
        return self._conn

    def _init_db(self):
        """Create the ``posts`` table and its index if they do not exist."""
        conn = self._get_conn()
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS posts (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    platform TEXT NOT NULL,
                    author TEXT NOT NULL,
                    post_id TEXT NOT NULL,
                    title TEXT,
                    slug TEXT,
                    post_date TEXT,
                    source_url TEXT,
                    local_path TEXT,
                    tags TEXT,
                    synced_at TEXT,
                    UNIQUE(platform, author, post_id)
                )
            ''')
            conn.execute('''
                CREATE INDEX IF NOT EXISTS idx_platform_author
                ON posts(platform, author)
            ''')

    def close(self):
        """Close the connection; a later call reopens it on demand."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def post_exists(self, platform: str, author: str, post_id: str) -> bool:
        """Return whether the post is present in the index."""
        cursor = self._get_conn().execute(
            'SELECT 1 FROM posts WHERE platform = ? AND author = ? AND post_id = ?',
            (platform, author, post_id)
        )
        return cursor.fetchone() is not None

    def add_post(self, record: "PostRecord"):
        """Insert the post, replacing any row with the same unique key.

        The statement runs in a transaction that is committed on success and
        rolled back when it raises (e.g. ``sqlite3.IntegrityError`` for a
        NULL key column), so a failed insert leaves no open transaction.
        """
        conn = self._get_conn()
        with conn:
            conn.execute('''
                INSERT OR REPLACE INTO posts
                (platform, author, post_id, title, slug, post_date, source_url, local_path, tags, synced_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                record.platform,
                record.author,
                record.post_id,
                record.title,
                record.slug,
                record.post_date,
                record.source_url,
                record.local_path,
                record.tags,
                record.synced_at,
            ))

    def get_post(self, platform: str, author: str, post_id: str) -> "PostRecord | None":
        """Return the indexed post with the given key, or ``None``."""
        cursor = self._get_conn().execute(
            'SELECT * FROM posts WHERE platform = ? AND author = ? AND post_id = ?',
            (platform, author, post_id)
        )
        row = cursor.fetchone()
        return self._row_to_record(row) if row else None

    def get_all_post_ids(self, platform: str, author: str) -> set[str]:
        """Return the set of all ``post_id`` values indexed for the author."""
        cursor = self._get_conn().execute(
            'SELECT post_id FROM posts WHERE platform = ? AND author = ?',
            (platform, author)
        )
        return {row[0] for row in cursor.fetchall()}

    def get_post_count(self, platform: str, author: str) -> int:
        """Return the number of posts indexed for the author."""
        cursor = self._get_conn().execute(
            'SELECT COUNT(*) FROM posts WHERE platform = ? AND author = ?',
            (platform, author)
        )
        return cursor.fetchone()[0]

    def get_post_by_source_url(self, url: str) -> "PostRecord | None":
        """Return the first indexed post whose ``source_url`` equals ``url``."""
        cursor = self._get_conn().execute(
            'SELECT * FROM posts WHERE source_url = ?',
            (url,)
        )
        row = cursor.fetchone()
        return self._row_to_record(row) if row else None

    def get_all_posts(self, platform: str, author: str) -> "list[PostRecord]":
        """Return every indexed post of the author."""
        cursor = self._get_conn().execute(
            'SELECT * FROM posts WHERE platform = ? AND author = ?',
            (platform, author)
        )
        return [self._row_to_record(row) for row in cursor.fetchall()]

    def _row_to_record(self, row: sqlite3.Row) -> "PostRecord":
        """Convert a database row into a ``PostRecord``."""
        return PostRecord(
            platform=row['platform'],
            author=row['author'],
            post_id=row['post_id'],
            title=row['title'],
            slug=row['slug'],
            post_date=row['post_date'],
            source_url=row['source_url'],
            local_path=row['local_path'],
            tags=row['tags'],
            synced_at=row['synced_at'],
        )