PyPI - article-backup - Versions diffs - 0.2.3__py3-none-any.whl - Mend

article-backup 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

article_backup-0.2.3.dist-info/METADATA +315 -0
article_backup-0.2.3.dist-info/RECORD +14 -0
article_backup-0.2.3.dist-info/WHEEL +5 -0
article_backup-0.2.3.dist-info/entry_points.txt +2 -0
article_backup-0.2.3.dist-info/licenses/LICENSE +177 -0
article_backup-0.2.3.dist-info/top_level.txt +2 -0
backup.py +179 -0
src/__init__.py +1 -0
src/boosty.py +260 -0
src/config.py +108 -0
src/database.py +169 -0
src/downloader.py +383 -0
src/sponsr.py +349 -0
src/utils.py +164 -0

backup.py ADDED Viewed

@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+# backup.py
+"""CLI точка входа для бэкапа статей."""
+import argparse
+import os
+import sys
+from pathlib import Path
+from typing import cast
+from src.config import Config, load_config, Source, Platform
+from src.database import Database
+from src.utils import is_post_url, parse_post_url
+from src.sponsr import SponsorDownloader
+from src.boosty import BoostyDownloader
+def generate_hugo_config(config: Config):
+    """Generate ``site/hugo.toml`` from the ``hugo`` section of the config.
+
+    Does nothing when the ``site`` directory does not exist. Configured
+    values are written as TOML basic (double-quoted) strings with escaping,
+    so a title or URL containing a quote or backslash cannot produce an
+    invalid file.
+
+    Args:
+        config: Loaded configuration; only ``config.hugo`` is read.
+    """
+    import json  # local import: only needed to escape TOML string values
+    hugo_toml = Path('site/hugo.toml')
+    if not hugo_toml.parent.exists():
+        return
+    def toml_str(value) -> str:
+        # json.dumps emits a double-quoted string whose backslash escapes
+        # (\", \\, \n, \t, \uXXXX ...) are also accepted by TOML basic strings.
+        # DEL is the one control character JSON leaves raw but TOML forbids.
+        return json.dumps(str(value), ensure_ascii=False).replace('\x7f', '\\u007f')
+    hugo = config.hugo
+    content = f'''baseURL = {toml_str(hugo.base_url)}
+languageCode = {toml_str(hugo.language_code)}
+title = {toml_str(hugo.title)}
+relativeURLs = true
+[params]
+  default_theme = {toml_str(hugo.default_theme)}
+[markup.goldmark.renderer]
+  unsafe = true
+[taxonomies]
+  tag = 'tags'
+[outputs]
+  home = ["HTML"]
+  section = ["HTML", "RSS"]
+[services.rss]
+  limit = 50
+'''
+    hugo_toml.write_text(content, encoding='utf-8')
+def ensure_site_content_link(config: Config):
+    """Ensure ``site/content`` is a symlink pointing at ``config.output_dir``.
+
+    Behaviour:
+      * skipped entirely when the ``BACKUP_OUTPUT_DIR`` environment variable is set;
+      * an existing symlink with the right target is left untouched;
+      * an existing symlink with another target is replaced;
+      * an existing non-symlink ``site/content`` is left alone with a warning;
+      * nothing is created when the ``site`` directory does not exist.
+
+    Args:
+        config: Loaded configuration; only ``config.output_dir`` is read.
+    """
+    # In a Docker environment (BACKUP_OUTPUT_DIR is set) no symlink is created:
+    # paths inside the container (/app/backup) do not match the host paths.
+    # The launch script (run-docker.sh) is expected to create it on the host.
+    if os.environ.get('BACKUP_OUTPUT_DIR'):
+        return
+    site_content = Path('site/content')
+    if site_content.is_symlink():
+        # Already the right symlink - nothing to do.
+        if site_content.resolve() == config.output_dir.resolve():
+            return
+        # Symlink to some other location - drop it and recreate below.
+        site_content.unlink()
+    elif site_content.exists():
+        # A real directory (or file) - never touch user data.
+        print("Предупреждение: site/content существует и не является симлинком")
+        return
+    site_dir = Path('site')
+    if site_dir.exists():
+        # Relative path from site/ to output_dir keeps the link valid if the
+        # whole project directory is moved.
+        rel_path = os.path.relpath(config.output_dir.resolve(), site_dir.resolve())
+        # Windows needs target_is_directory=True for a directory link;
+        # the flag is ignored on other platforms.
+        site_content.symlink_to(rel_path, target_is_directory=True)
+        print(f"Симлинк: site/content → {rel_path}")
+def get_downloader(platform: str, config: Config, source: Source, db: Database):
+    """Build the downloader object for the given platform.
+
+    Args:
+        platform: ``'sponsr'`` or ``'boosty'``.
+        config: Loaded configuration, passed through to the downloader.
+        source: Source settings, passed through to the downloader.
+        db: Open database, passed through to the downloader.
+
+    Returns:
+        A ``SponsorDownloader`` or ``BoostyDownloader`` instance.
+
+    Raises:
+        ValueError: If ``platform`` is neither of the two known values.
+    """
+    # Guard clause first: reject anything that is not a known platform.
+    if platform not in ('sponsr', 'boosty'):
+        raise ValueError(f"Неизвестная платформа: {platform}")
+    downloader_cls = SponsorDownloader if platform == 'sponsr' else BoostyDownloader
+    return downloader_cls(config, source, db)
+def sync_all(config: Config, db: Database):
+    """Run a sync for every source listed in the config.
+
+    A failure for one source is printed and does not stop the remaining
+    sources from being processed.
+
+    Args:
+        config: Loaded configuration; ``config.sources`` is iterated.
+        db: Open database handed to each downloader.
+    """
+    for entry in config.sources:
+        try:
+            get_downloader(entry.platform, config, entry, db).sync()
+        except Exception as err:
+            # Deliberate best-effort boundary: report and move on to the next source.
+            prefix = f"[{entry.platform}] Ошибка при синхронизации {entry.author}"
+            print(f"{prefix}: {err}")
+def download_single_post(url: str, config: Config, db: Database):
+    """Download a single post identified by its URL.
+
+    The URL is split by ``parse_post_url`` into platform, author and post id.
+    If the config has a source with the same platform and author, its
+    settings are used; otherwise a default source with asset downloading
+    enabled is used.
+
+    Args:
+        url: Post URL.
+        config: Loaded configuration.
+        db: Open database handed to the downloader.
+    """
+    platform_str, author, post_id = parse_post_url(url)
+    platform = cast(Platform, platform_str)
+    # Fallback settings for an author that is not listed in the config.
+    fallback = Source(platform=platform, author=author, download_assets=True)
+    # The first configured source matching platform + author wins.
+    source = next(
+        (s for s in config.sources if s.platform == platform and s.author == author),
+        fallback,
+    )
+    get_downloader(platform, config, source, db).download_single(post_id)
+def _build_arg_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser: optional post URL plus ``-c/--config``."""
+    parser = argparse.ArgumentParser(
+        description='Бэкап статей с Sponsr и Boosty',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog='''
+Примеры:
+  %(prog)s                                    # синхронизация по конфигу
+  %(prog)s "https://sponsr.ru/author/123/..." # скачать один пост
+  %(prog)s "https://boosty.to/author/posts/uuid"
+        '''
+    )
+    parser.add_argument(
+        'url',
+        nargs='?',
+        help='URL поста для скачивания (опционально)'
+    )
+    parser.add_argument(
+        '-c', '--config',
+        type=Path,
+        default=Path('config.yaml'),
+        help='Путь к конфигу (по умолчанию: config.yaml)'
+    )
+    return parser
+def main():
+    """CLI entry point.
+
+    Loads the config, then either downloads the single post given on the
+    command line or syncs every configured source, and finally refreshes the
+    ``site/content`` symlink and ``site/hugo.toml``. Exits with status 1 on a
+    missing or unreadable config, an invalid post URL, or an empty source list.
+    """
+    args = _build_arg_parser().parse_args()
+    config_path = args.config
+    # Load the config
+    if not config_path.exists():
+        print(f"Ошибка: конфиг не найден: {config_path}")
+        print("Создайте config.yaml по образцу.")
+        sys.exit(1)
+    try:
+        config = load_config(config_path)
+    except Exception as e:
+        print(f"Ошибка загрузки конфига: {e}")
+        sys.exit(1)
+    # Create the output directory and open the database inside it
+    config.output_dir.mkdir(parents=True, exist_ok=True)
+    with Database(config.output_dir / 'index.db') as db:
+        post_url = args.url
+        if post_url:
+            # Single-post mode
+            if not is_post_url(post_url):
+                print(f"Ошибка: неверный URL поста: {post_url}")
+                sys.exit(1)
+            download_single_post(post_url, config, db)
+        elif config.sources:
+            # Full sync mode
+            sync_all(config, db)
+        else:
+            print("Нет источников в конфиге. Добавьте секцию 'sources'.")
+            sys.exit(1)
+    ensure_site_content_link(config)
+    generate_hugo_config(config)
+    print("\nГотово!")
+if __name__ == '__main__':
+    main()

src/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+# src package

src/boosty.py ADDED Viewed

@@ -0,0 +1,260 @@
+# src/boosty.py
+"""Загрузчик для Boosty.to"""
+import json
+from datetime import datetime, timezone
+import requests
+from .config import Config, Source, load_cookie, load_auth_header
+from .database import Database
+from .downloader import BaseDownloader, Post
+class BoostyDownloader(BaseDownloader):
+    """Downloader for articles hosted on Boosty.to.
+
+    Talks to the JSON API under ``API_BASE`` and converts the block-based
+    post content to Markdown. ``self.session``, ``self.config``,
+    ``self.source`` and ``self.TIMEOUT`` are not defined in this class;
+    presumably they are provided by ``BaseDownloader`` (confirm there).
+    """
+    PLATFORM = "boosty"
+    API_BASE = "https://api.boosty.to/v1"
+    def _setup_session(self):
+        """Set the Cookie, Authorization and User-Agent headers on the session.
+
+        Raises:
+            FileNotFoundError: If the cookie or auth file is not configured
+                or does not exist (raised by the loader helpers).
+        """
+        cookie = load_cookie(self.config.auth.boosty_cookie_file)
+        auth = load_auth_header(self.config.auth.boosty_auth_file)
+        self.session.headers.update({
+            'Cookie': cookie,
+            'Authorization': auth,
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+        })
+    def fetch_posts_list(self) -> list[dict]:
+        """Fetch the raw list of all posts of the author via the API.
+
+        Pages through the listing 20 posts at a time, following the
+        ``extra.offset`` cursor until the API reports ``extra.isLast`` or
+        returns an empty page.
+
+        Returns:
+            Raw post dicts in the order the API returned them.
+
+        Raises:
+            requests.HTTPError: On a non-success HTTP status.
+        """
+        all_posts = []
+        offset = None
+        while True:
+            url = f"{self.API_BASE}/blog/{self.source.author}/post/?limit=20"
+            if offset:
+                url += f"&offset={offset}"
+            response = self.session.get(url, timeout=self.TIMEOUT)
+            response.raise_for_status()
+            data = response.json()
+            posts_chunk = data.get("data", [])
+            if not posts_chunk:
+                break
+            all_posts.extend(posts_chunk)
+            print(f"  Получено {len(all_posts)} постов...")
+            # Check whether there are more pages; a missing flag counts as "last"
+            extra = data.get("extra", {})
+            if extra.get("isLast", True):
+                break
+            offset = extra.get("offset")
+            if not offset:
+                break
+        return all_posts
+    def fetch_post(self, post_id: str) -> Post | None:
+        """Fetch and parse a single post by its id.
+
+        Returns:
+            The parsed ``Post``, or ``None`` when the request fails (the
+            error is printed).
+        """
+        url = f"{self.API_BASE}/blog/{self.source.author}/post/{post_id}"
+        try:
+            response = self.session.get(url, timeout=self.TIMEOUT)
+            response.raise_for_status()
+            data = response.json()
+            return self._parse_post(data)
+        except requests.RequestException as e:
+            print(f"  Ошибка получения поста {post_id}: {e}")
+            return None
+    def _parse_post(self, raw_data: dict) -> Post:
+        """Convert a raw API post dict into a ``Post``.
+
+        The content blocks are stored JSON-encoded in ``content_html`` and
+        decoded again by ``_to_markdown``.
+        """
+        post_id = raw_data.get("id", "")
+        title = raw_data.get("title", "Без названия")
+        # Date: treated as a Unix timestamp in seconds; a missing or null
+        # value falls back to 0 instead of crashing fromtimestamp().
+        created_at = raw_data.get("createdAt") or 0
+        post_date = datetime.fromtimestamp(created_at, tz=timezone.utc).isoformat()
+        # Post URL; "user" may be absent or null, then the configured author is used
+        user = raw_data.get("user") or {}
+        author = user.get("blogUrl", self.source.author)
+        source_url = f"https://boosty.to/{author}/posts/{post_id}"
+        # Tags: keep only entries with a non-empty title
+        tags = [t.get("title", "") for t in raw_data.get("tags", []) if t.get("title")]
+        # Content is a list of blocks
+        content_blocks = raw_data.get("data", [])
+        # Collect downloadable media referenced by the blocks
+        assets = self._extract_assets(content_blocks)
+        return Post(
+            post_id=post_id,
+            title=title,
+            content_html=json.dumps(content_blocks, ensure_ascii=False),
+            post_date=post_date,
+            source_url=source_url,
+            tags=tags,
+            assets=assets,
+        )
+    def _extract_assets(self, blocks: list[dict]) -> list[dict]:
+        """Collect media URLs from content blocks.
+
+        Returns:
+            A list of ``{"url": ..., "alt": ...}`` dicts for ``image`` and
+            ``audio_file`` blocks and for the preview image of ``ok_video``
+            blocks. Blocks without a URL are skipped.
+        """
+        assets = []
+        for block in blocks:
+            block_type = block.get("type", "")
+            if block_type == "image":
+                url = block.get("url", "")
+                if url:
+                    assets.append({
+                        "url": url,
+                        "alt": block.get("id", ""),
+                    })
+            elif block_type == "audio_file":
+                url = block.get("url", "")
+                if url:
+                    assets.append({
+                        "url": url,
+                        "alt": block.get("title", block.get("id", "")),
+                    })
+            elif block_type == "ok_video":
+                # ok.ru video needs separate handling;
+                # for now only the preview image is saved, if present
+                preview = block.get("previewUrl") or block.get("preview") or ""
+                if preview:
+                    assets.append({
+                        "url": preview,
+                        "alt": f"video-preview-{block.get('id', '')}",
+                    })
+        return assets
+    def _to_markdown(self, post: Post, asset_map: dict[str, str]) -> str:
+        """Render a post as Markdown.
+
+        Args:
+            post: Post whose ``content_html`` holds the JSON-encoded blocks.
+            asset_map: Maps a remote asset URL to the value placed after
+                ``assets/`` in the generated link.
+
+        Returns:
+            Markdown starting with the title as an H1; only the title when
+            the stored content is not valid JSON.
+        """
+        try:
+            blocks = json.loads(post.content_html)
+        except json.JSONDecodeError:
+            return f"# {post.title}\n\n"
+        lines = [f"# {post.title}\n"]
+        for block in blocks:
+            md = self._block_to_markdown(block, asset_map)
+            if md:
+                lines.append(md)
+        return "\n".join(lines)
+    def _block_to_markdown(self, block: dict, asset_map: dict[str, str]) -> str:
+        """Render one content block as Markdown.
+
+        Returns an empty string for unknown block types and for blocks
+        that carry nothing to render.
+        """
+        block_type = block.get("type", "")
+        if block_type == "text":
+            return self._parse_text_block(block)
+        elif block_type == "image":
+            url = block.get("url", "")
+            local = asset_map.get(url)
+            if local:
+                return f"\n![](assets/{local})\n"
+            elif url:
+                return f"\n![]({url})\n"
+        elif block_type == "link":
+            url = block.get("url", "")
+            text = self._parse_text_block(block)
+            if text and url:
+                return f"[{text}]({url})"
+            elif url:
+                return f"<{url}>"
+        elif block_type == "audio_file":
+            url = block.get("url", "")
+            title = block.get("title", "audio")
+            local = asset_map.get(url)
+            if local:
+                return f"\n🎵 **{title}**: [скачать](assets/{local})\n"
+            elif url:
+                return f"\n🎵 **{title}**: [слушать]({url})\n"
+        elif block_type == "ok_video":
+            video_id = block.get("id", "")
+            return f"\n📹 Видео: https://ok.ru/video/{video_id}\n"
+        return ""
+    def _parse_text_block(self, block: dict) -> str:
+        """Extract the (styled) text of a Boosty text block.
+
+        ``content`` is expected to be a JSON string of the form
+        ``["text", "style", [[type, start, length], ...]]``. Content that
+        cannot be decoded or processed is returned verbatim.
+        """
+        content = block.get("content", "")
+        modificator = block.get("modificator", "")
+        # BLOCK_END is the paragraph separator
+        if modificator == "BLOCK_END":
+            return "\n"
+        if not content:
+            return ""
+        try:
+            parsed = json.loads(content)
+            if isinstance(parsed, list) and len(parsed) >= 1:
+                text = str(parsed[0])
+                # Apply inline styles, if any
+                if len(parsed) >= 3 and parsed[2]:
+                    text = self._apply_styles(text, parsed[2])
+                return text
+        except (json.JSONDecodeError, IndexError, TypeError):
+            return content
+        return ""
+    def _apply_styles(self, text: str, styles: list) -> str:
+        """Wrap styled ranges of ``text`` in Markdown bold/italic markers.
+
+        All offsets refer to the original ``text``. Markers are collected per
+        position first and emitted in a single pass, so nested ranges and
+        ranges sharing a start or end cannot corrupt each other, which
+        happened when the string was modified range by range.
+
+        Args:
+            text: Plain text of the block.
+            styles: ``[type, start, length]`` triples. Type 1 is rendered as
+                bold and 2 as italic (the mapping is approximate, derived
+                from observed data); any other type leaves the text unchanged.
+
+        Returns:
+            The text with markers inserted. Entries that are too short, have
+            a non-positive length or fall outside the text are ignored.
+        """
+        if not styles or not text:
+            return text
+        markers = {1: "**", 2: "*"}
+        # position -> [(range length, marker)] to emit at that position
+        opening: dict[int, list[tuple[int, str]]] = {}
+        closing: dict[int, list[tuple[int, str]]] = {}
+        for style in styles:
+            if len(style) < 3:
+                continue
+            style_type, start, length = style[0], style[1], style[2]
+            end = start + length
+            if start < 0 or length <= 0 or end > len(text):
+                continue
+            marker = markers.get(style_type)
+            if marker is None:
+                # e.g. type 4 (link): handled by "link" blocks, no inline markup
+                continue
+            opening.setdefault(start, []).append((length, marker))
+            closing.setdefault(end, []).append((length, marker))
+        pieces = []
+        cursor = 0
+        for pos in sorted(opening.keys() | closing.keys()):
+            pieces.append(text[cursor:pos])
+            # Close shorter (inner) ranges first, then open longer (outer)
+            # ranges first, so markers at the same position stay nested.
+            pieces.extend(m for _, m in sorted(closing.get(pos, ())))
+            pieces.extend(m for _, m in sorted(opening.get(pos, ()), reverse=True))
+            cursor = pos
+        pieces.append(text[cursor:])
+        return "".join(pieces)

src/config.py ADDED Viewed

@@ -0,0 +1,108 @@
+# src/config.py
+"""Загрузка и валидация конфигурации."""
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Literal
+import yaml
+import os
+Platform = Literal['sponsr', 'boosty']
+@dataclass
+class Source:
+    """One author to back up, built from an entry of ``sources`` in the config."""
+    # Platform identifier; the Platform alias restricts it to 'sponsr' or 'boosty'.
+    platform: Platform
+    # Author identifier on the platform (the Boosty downloader puts it into API URLs).
+    author: str
+    # Defaults to True. NOTE(review): presumably toggles media download - confirm in the downloader.
+    download_assets: bool = True
+    # Optional; NOTE(review): usage is not visible here - presumably a human-readable author name.
+    display_name: str | None = None
+    # Optional; NOTE(review): usage is not visible here - presumably limits which asset kinds are fetched.
+    asset_types: list[str] | None = None
+@dataclass
+class Auth:
+    """Paths to the credential files, built from the ``auth`` config section."""
+    # NOTE(review): presumably read by the Sponsr downloader - confirm there.
+    sponsr_cookie_file: Path | None = None
+    # File whose stripped content is sent as the Cookie header to Boosty.
+    boosty_cookie_file: Path | None = None
+    # File whose stripped content is sent as the Authorization header to Boosty.
+    boosty_auth_file: Path | None = None  # Authorization: Bearer ...
+@dataclass
+class HugoConfig:
+    """Hugo site settings, built from the ``hugo`` config section.
+
+    The class-level defaults below are also what ``load_config`` falls back
+    to for keys missing from that section.
+    """
+    # Written as baseURL in site/hugo.toml.
+    base_url: str = "http://localhost:1313/"
+    # Written as title.
+    title: str = "Бэкап статей"
+    # Written as languageCode.
+    language_code: str = "ru"
+    # Written as params.default_theme.
+    default_theme: str = "light"
+@dataclass
+class Config:
+    """Top-level application configuration as returned by ``load_config``."""
+    # Backup directory; load_config lets the BACKUP_OUTPUT_DIR environment
+    # variable override the value from the YAML file.
+    output_dir: Path
+    # Credential file locations.
+    auth: Auth
+    # Authors to sync; empty when the config lists none.
+    sources: list[Source] = field(default_factory=list)
+    # Hugo site settings.
+    hugo: HugoConfig = field(default_factory=HugoConfig)
+def load_config(config_path: Path) -> Config:
+    """Load the configuration from a YAML file.
+
+    An empty file, or a section that is present but empty (``auth:``,
+    ``sources:``, ``hugo:``), is treated the same as a missing one, so
+    defaults apply instead of an ``AttributeError`` on ``None``.
+
+    Args:
+        config_path: Path to the YAML config file.
+
+    Returns:
+        The populated ``Config``.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        yaml.YAMLError: If the file is not valid YAML.
+        ValueError: If the top level of the document is not a mapping.
+        KeyError: If a source entry lacks ``platform`` or ``author``.
+    """
+    with open(config_path, 'r', encoding='utf-8') as f:
+        # safe_load returns None for an empty document
+        data = yaml.safe_load(f) or {}
+    if not isinstance(data, dict):
+        raise ValueError(
+            f"Config root must be a mapping, got {type(data).__name__}"
+        )
+    # output_dir: the environment variable wins over the file
+    env_output_dir = os.environ.get('BACKUP_OUTPUT_DIR')
+    if env_output_dir:
+        output_dir = Path(env_output_dir)
+    else:
+        output_dir = Path(data.get('output_dir') or './backup')
+    # auth ("or {}" also covers a key that is present with a null value)
+    auth_data = data.get('auth') or {}
+    auth = Auth(
+        sponsr_cookie_file=_to_path(auth_data.get('sponsr_cookie_file')),
+        boosty_cookie_file=_to_path(auth_data.get('boosty_cookie_file')),
+        boosty_auth_file=_to_path(auth_data.get('boosty_auth_file')),
+    )
+    # sources
+    sources = [
+        Source(
+            platform=src['platform'],
+            author=src['author'],
+            download_assets=src.get('download_assets', True),
+            display_name=src.get('display_name'),
+            asset_types=src.get('asset_types'),
+        )
+        for src in (data.get('sources') or [])
+    ]
+    # hugo: missing keys fall back to the dataclass defaults
+    hugo_data = data.get('hugo') or {}
+    hugo = HugoConfig(
+        base_url=hugo_data.get('base_url', HugoConfig.base_url),
+        title=hugo_data.get('title', HugoConfig.title),
+        language_code=hugo_data.get('language_code', HugoConfig.language_code),
+        default_theme=hugo_data.get('default_theme', HugoConfig.default_theme),
+    )
+    return Config(output_dir=output_dir, auth=auth, sources=sources, hugo=hugo)
+def _to_path(value: str | None) -> Path | None:
+    """Turn a non-empty string into a ``Path``; ``None`` or ``""`` gives ``None``."""
+    if not value:
+        return None
+    return Path(value)
+def load_cookie(cookie_file: Path | None) -> str:
+    """Read a cookie string from a file.
+
+    Args:
+        cookie_file: Path to the cookie file, or ``None`` when not configured.
+
+    Returns:
+        The UTF-8 file content with surrounding whitespace stripped.
+
+    Raises:
+        FileNotFoundError: If no path was given or the file does not exist.
+    """
+    if cookie_file is None:
+        raise FileNotFoundError("Cookie file path not specified")
+    if cookie_file.exists():
+        raw = cookie_file.read_text(encoding='utf-8')
+        return raw.strip()
+    raise FileNotFoundError(f"Cookie file not found: {cookie_file}")
+def load_auth_header(auth_file: Path | None) -> str:
+    """Read an Authorization header value from a file.
+
+    Args:
+        auth_file: Path to the auth file, or ``None`` when not configured.
+
+    Returns:
+        The UTF-8 file content with surrounding whitespace stripped.
+
+    Raises:
+        FileNotFoundError: If no path was given or the file does not exist.
+    """
+    if auth_file is None:
+        raise FileNotFoundError("Auth file path not specified")
+    if auth_file.exists():
+        raw = auth_file.read_text(encoding='utf-8')
+        return raw.strip()
+    raise FileNotFoundError(f"Auth file not found: {auth_file}")