PyPI - article-backup - Versions diffs - 0.1.0__py3-none-any.whl - Mend

article-backup 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of article-backup might be problematic. Click here for more details.

Files changed (14) hide show

article_backup-0.1.0.dist-info/METADATA +275 -0
article_backup-0.1.0.dist-info/RECORD +14 -0
article_backup-0.1.0.dist-info/WHEEL +5 -0
article_backup-0.1.0.dist-info/entry_points.txt +2 -0
article_backup-0.1.0.dist-info/licenses/LICENSE +177 -0
article_backup-0.1.0.dist-info/top_level.txt +2 -0
backup.py +162 -0
src/__init__.py +1 -0
src/boosty.py +260 -0
src/config.py +99 -0
src/database.py +169 -0
src/downloader.py +366 -0
src/sponsr.py +257 -0
src/utils.py +123 -0

src/boosty.py ADDED Viewed

@@ -0,0 +1,260 @@
+# src/boosty.py
+"""Загрузчик для Boosty.to"""
+import json
+from datetime import datetime, timezone
+import requests
+from .config import Config, Source, load_cookie, load_auth_header
+from .database import Database
+from .downloader import BaseDownloader, Post
+class BoostyDownloader(BaseDownloader):
+    """Загрузчик статей с Boosty.to"""
+    PLATFORM = "boosty"
+    API_BASE = "https://api.boosty.to/v1"
+    def _setup_session(self):
+        """Настройка сессии с cookies и authorization."""
+        cookie = load_cookie(self.config.auth.boosty_cookie_file)
+        auth = load_auth_header(self.config.auth.boosty_auth_file)
+        self.session.headers.update({
+            'Cookie': cookie,
+            'Authorization': auth,
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+        })
+    def fetch_posts_list(self) -> list[dict]:
+        """Получает список всех постов через API."""
+        all_posts = []
+        offset = None
+        while True:
+            url = f"{self.API_BASE}/blog/{self.source.author}/post/?limit=20"
+            if offset:
+                url += f"&offset={offset}"
+            response = self.session.get(url, timeout=self.TIMEOUT)
+            response.raise_for_status()
+            data = response.json()
+            posts_chunk = data.get("data", [])
+            if not posts_chunk:
+                break
+            all_posts.extend(posts_chunk)
+            print(f"  Получено {len(all_posts)} постов...")
+            # Проверяем, есть ли ещё страницы
+            extra = data.get("extra", {})
+            if extra.get("isLast", True):
+                break
+            offset = extra.get("offset")
+            if not offset:
+                break
+        return all_posts
+    def fetch_post(self, post_id: str) -> Post | None:
+        """Получает один пост по ID."""
+        url = f"{self.API_BASE}/blog/{self.source.author}/post/{post_id}"
+        try:
+            response = self.session.get(url, timeout=self.TIMEOUT)
+            response.raise_for_status()
+            data = response.json()
+            return self._parse_post(data)
+        except requests.RequestException as e:
+            print(f"  Ошибка получения поста {post_id}: {e}")
+            return None
+    def _parse_post(self, raw_data: dict) -> Post:
+        """Парсит сырые данные API в Post."""
+        post_id = raw_data.get("id", "")
+        title = raw_data.get("title", "Без названия")
+        # Дата — timestamp в секундах
+        created_at = raw_data.get("createdAt", 0)
+        post_date = datetime.fromtimestamp(created_at, tz=timezone.utc).isoformat()
+        # URL поста
+        author = raw_data.get("user", {}).get("blogUrl", self.source.author)
+        source_url = f"https://boosty.to/{author}/posts/{post_id}"
+        # Теги
+        tags = [t.get("title", "") for t in raw_data.get("tags", []) if t.get("title")]
+        # Контент — массив блоков
+        content_blocks = raw_data.get("data", [])
+        # Извлекаем assets
+        assets = self._extract_assets(content_blocks)
+        return Post(
+            post_id=post_id,
+            title=title,
+            content_html=json.dumps(content_blocks, ensure_ascii=False),
+            post_date=post_date,
+            source_url=source_url,
+            tags=tags,
+            assets=assets,
+        )
+    def _extract_assets(self, blocks: list[dict]) -> list[dict]:
+        """Извлекает URL медиафайлов из блоков контента."""
+        assets = []
+        for block in blocks:
+            block_type = block.get("type", "")
+            if block_type == "image":
+                url = block.get("url", "")
+                if url:
+                    assets.append({
+                        "url": url,
+                        "alt": block.get("id", ""),
+                    })
+            elif block_type == "audio_file":
+                url = block.get("url", "")
+                if url:
+                    assets.append({
+                        "url": url,
+                        "alt": block.get("title", block.get("id", "")),
+                    })
+            elif block_type == "ok_video":
+                # ok.ru видео требует отдельной обработки
+                # Пока сохраняем только превью, если есть
+                preview = block.get("previewUrl", "")
+                if preview:
+                    assets.append({
+                        "url": preview,
+                        "alt": f"video-preview-{block.get('id', '')}",
+                    })
+        return assets
+    def _to_markdown(self, post: Post, asset_map: dict[str, str]) -> str:
+        """Конвертирует блоки контента в Markdown."""
+        try:
+            blocks = json.loads(post.content_html)
+        except json.JSONDecodeError:
+            return f"# {post.title}\n\n"
+        lines = [f"# {post.title}\n"]
+        for block in blocks:
+            md = self._block_to_markdown(block, asset_map)
+            if md:
+                lines.append(md)
+        return "\n".join(lines)
+    def _block_to_markdown(self, block: dict, asset_map: dict[str, str]) -> str:
+        """Конвертирует один блок в Markdown."""
+        block_type = block.get("type", "")
+        if block_type == "text":
+            return self._parse_text_block(block)
+        elif block_type == "image":
+            url = block.get("url", "")
+            local = asset_map.get(url)
+            if local:
+                return f"\n![](assets/{local})\n"
+            elif url:
+                return f"\n![]({url})\n"
+        elif block_type == "link":
+            url = block.get("url", "")
+            text = self._parse_text_block(block)
+            if text and url:
+                return f"[{text}]({url})"
+            elif url:
+                return f"<{url}>"
+        elif block_type == "audio_file":
+            url = block.get("url", "")
+            title = block.get("title", "audio")
+            local = asset_map.get(url)
+            if local:
+                return f"\n🎵 **{title}**: [скачать](assets/{local})\n"
+            elif url:
+                return f"\n🎵 **{title}**: [слушать]({url})\n"
+        elif block_type == "ok_video":
+            video_id = block.get("id", "")
+            return f"\n📹 Видео: https://ok.ru/video/{video_id}\n"
+        return ""
+    def _parse_text_block(self, block: dict) -> str:
+        """Парсит текстовый блок Boosty."""
+        content = block.get("content", "")
+        modificator = block.get("modificator", "")
+        # BLOCK_END — разделитель параграфов
+        if modificator == "BLOCK_END":
+            return "\n"
+        if not content:
+            return ""
+        # Формат: ["текст", "стиль", [[тип, начало, длина], ...]]
+        try:
+            parsed = json.loads(content)
+            if isinstance(parsed, list) and len(parsed) >= 1:
+                text = str(parsed[0])
+                # Применяем стили, если есть
+                if len(parsed) >= 3 and parsed[2]:
+                    text = self._apply_styles(text, parsed[2])
+                return text
+        except (json.JSONDecodeError, IndexError, TypeError):
+            return content
+        return ""
+    def _apply_styles(self, text: str, styles: list) -> str:
+        """Применяет стили к тексту (bold, italic)."""
+        if not styles or not text:
+            return text
+        # Сортируем стили по позиции в обратном порядке
+        # чтобы вставка не сбивала индексы
+        sorted_styles = sorted(styles, key=lambda s: s[1] if len(s) > 1 else 0, reverse=True)
+        result = text
+        for style in sorted_styles:
+            if len(style) < 3:
+                continue
+            style_type, start, length = style[0], style[1], style[2]
+            end = start + length
+            if start < 0 or end > len(result):
+                continue
+            fragment = result[start:end]
+            # Типы стилей (примерные, на основе анализа)
+            if style_type == 1:  # bold
+                styled = f"**{fragment}**"
+            elif style_type == 2:  # italic
+                styled = f"*{fragment}*"
+            elif style_type == 4:  # ссылка (обрабатывается в link блоках)
+                styled = fragment
+            else:
+                styled = fragment
+            result = result[:start] + styled + result[end:]
+        return result

src/config.py ADDED Viewed

@@ -0,0 +1,99 @@
+# src/config.py
+"""Загрузка и валидация конфигурации."""
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Literal
+import yaml
+# Platform identifiers accepted in Source.platform.
+Platform = Literal['sponsr', 'boosty']
+@dataclass
+class Source:
+    """One author feed listed under ``sources`` in the configuration file."""
+    # Which platform the author publishes on ('sponsr' or 'boosty').
+    platform: Platform
+    # Author identifier; BoostyDownloader inserts it into API and post URLs.
+    author: str
+    # Defaults to True; presumably toggles media download -- confirm in downloader.py.
+    download_assets: bool = True
+    # Optional label; where it is shown is not visible in this file.
+    display_name: str | None = None
+@dataclass
+class Auth:
+    """Paths to the credential files; each one is optional."""
+    # Cookie file for Sponsr; its consumer is not visible in this file.
+    sponsr_cookie_file: Path | None = None
+    # File whose content BoostyDownloader sends as the Cookie header.
+    boosty_cookie_file: Path | None = None
+    # File whose content BoostyDownloader sends as the Authorization header.
+    boosty_auth_file: Path | None = None  # Authorization: Bearer ...
+@dataclass
+class HugoConfig:
+    """Values read from the ``hugo`` section of the configuration file.
+    NOTE(review): presumably used to generate a Hugo site from the backup;
+    the consumer is not visible in this file.
+    """
+    # The defaults below are also what load_config uses for missing keys.
+    base_url: str = "http://localhost:1313/"
+    title: str = "Бэкап статей"
+    language_code: str = "ru"
+@dataclass
+class Config:
+    """Complete application configuration as produced by load_config."""
+    # Root directory of the backup; load_config defaults it to './backup'.
+    output_dir: Path
+    # Credential file locations.
+    auth: Auth
+    # Author feeds to process; empty when the config lists none.
+    sources: list[Source] = field(default_factory=list)
+    # Hugo-related settings; a default HugoConfig when not supplied.
+    hugo: HugoConfig = field(default_factory=HugoConfig)
+def load_config(config_path: Path) -> Config:
+    """Загружает конфигурацию из YAML-файла."""
+    with open(config_path, 'r', encoding='utf-8') as f:
+        data = yaml.safe_load(f)
+    # output_dir
+    output_dir = Path(data.get('output_dir', './backup'))
+    # auth
+    auth_data = data.get('auth', {})
+    auth = Auth(
+        sponsr_cookie_file=_to_path(auth_data.get('sponsr_cookie_file')),
+        boosty_cookie_file=_to_path(auth_data.get('boosty_cookie_file')),
+        boosty_auth_file=_to_path(auth_data.get('boosty_auth_file')),
+    )
+    # sources
+    sources = []
+    for src in data.get('sources', []):
+        sources.append(Source(
+            platform=src['platform'],
+            author=src['author'],
+            download_assets=src.get('download_assets', True),
+            display_name=src.get('display_name'),
+        ))
+    # hugo
+    hugo_data = data.get('hugo', {})
+    hugo = HugoConfig(
+        base_url=hugo_data.get('base_url', HugoConfig.base_url),
+        title=hugo_data.get('title', HugoConfig.title),
+        language_code=hugo_data.get('language_code', HugoConfig.language_code),
+    )
+    return Config(output_dir=output_dir, auth=auth, sources=sources, hugo=hugo)
+def _to_path(value: str | None) -> Path | None:
+    """Конвертирует строку в Path или возвращает None."""
+    return Path(value) if value else None
+def load_cookie(cookie_file: Path | None) -> str:
+    """Загружает cookie из файла."""
+    if cookie_file is None:
+        raise FileNotFoundError("Cookie file path not specified")
+    if not cookie_file.exists():
+        raise FileNotFoundError(f"Cookie file not found: {cookie_file}")
+    return cookie_file.read_text(encoding='utf-8').strip()
+def load_auth_header(auth_file: Path | None) -> str:
+    """Загружает Authorization header из файла."""
+    if auth_file is None:
+        raise FileNotFoundError("Auth file path not specified")
+    if not auth_file.exists():
+        raise FileNotFoundError(f"Auth file not found: {auth_file}")
+    return auth_file.read_text(encoding='utf-8').strip()

src/database.py ADDED Viewed

@@ -0,0 +1,169 @@
+# src/database.py
+"""SQLite операции для индекса постов."""
+import sqlite3
+from dataclasses import dataclass
+from pathlib import Path
+@dataclass
+class PostRecord:
+    """One row of the ``posts`` table, without the autoincrement ``id`` column."""
+    # (platform, author, post_id) is the table's UNIQUE key.
+    platform: str
+    author: str
+    post_id: str
+    title: str
+    slug: str
+    # Stored as TEXT; BoostyDownloader produces an ISO-8601 UTC string for it.
+    post_date: str
+    # Original URL of the post; used by Database.get_post_by_source_url.
+    source_url: str
+    # NOTE(review): presumably where the post was written on disk -- confirm in downloader.py.
+    local_path: str
+    # Tags in serialized form; the encoding is not visible in this file.
+    tags: str
+    # NOTE(review): presumably the time of the last sync -- format not visible here.
+    synced_at: str
+class Database:
+    def __init__(self, db_path: Path):
+        self.db_path = db_path
+        self._conn: sqlite3.Connection | None = None
+        self._init_db()
+    def _get_conn(self) -> sqlite3.Connection:
+        """Возвращает соединение, создавая его при необходимости."""
+        if self._conn is None:
+            self.db_path.parent.mkdir(parents=True, exist_ok=True)
+            self._conn = sqlite3.connect(self.db_path, check_same_thread=False, timeout=30)
+            self._conn.execute('PRAGMA journal_mode=WAL')
+            self._conn.row_factory = sqlite3.Row
+        return self._conn
+    def _init_db(self):
+        """Создаёт таблицы, если не существуют."""
+        conn = self._get_conn()
+        conn.execute('''
+            CREATE TABLE IF NOT EXISTS posts (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                platform TEXT NOT NULL,
+                author TEXT NOT NULL,
+                post_id TEXT NOT NULL,
+                title TEXT,
+                slug TEXT,
+                post_date TEXT,
+                source_url TEXT,
+                local_path TEXT,
+                tags TEXT,
+                synced_at TEXT,
+                UNIQUE(platform, author, post_id)
+            )
+        ''')
+        conn.execute('''
+            CREATE INDEX IF NOT EXISTS idx_platform_author
+            ON posts(platform, author)
+        ''')
+        conn.commit()
+    def close(self):
+        """Закрывает соединение с БД."""
+        if self._conn:
+            self._conn.close()
+            self._conn = None
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+    def post_exists(self, platform: str, author: str, post_id: str) -> bool:
+        """Проверяет, существует ли пост в индексе."""
+        conn = self._get_conn()
+        cursor = conn.execute(
+            'SELECT 1 FROM posts WHERE platform = ? AND author = ? AND post_id = ?',
+            (platform, author, post_id)
+        )
+        return cursor.fetchone() is not None
+    def add_post(self, record: PostRecord):
+        """Добавляет пост в индекс."""
+        conn = self._get_conn()
+        conn.execute('''
+            INSERT OR REPLACE INTO posts
+            (platform, author, post_id, title, slug, post_date, source_url, local_path, tags, synced_at)
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        ''', (
+            record.platform,
+            record.author,
+            record.post_id,
+            record.title,
+            record.slug,
+            record.post_date,
+            record.source_url,
+            record.local_path,
+            record.tags,
+            record.synced_at,
+        ))
+        conn.commit()
+    def get_post(self, platform: str, author: str, post_id: str) -> PostRecord | None:
+        """Получает пост из индекса."""
+        conn = self._get_conn()
+        cursor = conn.execute(
+            'SELECT * FROM posts WHERE platform = ? AND author = ? AND post_id = ?',
+            (platform, author, post_id)
+        )
+        row = cursor.fetchone()
+        if row:
+            return self._row_to_record(row)
+        return None
+    def get_all_post_ids(self, platform: str, author: str) -> set[str]:
+        """Возвращает множество всех post_id для автора."""
+        conn = self._get_conn()
+        cursor = conn.execute(
+            'SELECT post_id FROM posts WHERE platform = ? AND author = ?',
+            (platform, author)
+        )
+        return {row[0] for row in cursor.fetchall()}
+    def get_post_count(self, platform: str, author: str) -> int:
+        """Возвращает количество постов автора."""
+        conn = self._get_conn()
+        cursor = conn.execute(
+            'SELECT COUNT(*) FROM posts WHERE platform = ? AND author = ?',
+            (platform, author)
+        )
+        return cursor.fetchone()[0]
+    def get_post_by_source_url(self, url: str) -> PostRecord | None:
+        """Ищет пост по исходному URL."""
+        conn = self._get_conn()
+        cursor = conn.execute(
+            'SELECT * FROM posts WHERE source_url = ?',
+            (url,)
+        )
+        row = cursor.fetchone()
+        if row:
+            return self._row_to_record(row)
+        return None
+    def get_all_posts(self, platform: str, author: str) -> list[PostRecord]:
+        """Возвращает все посты автора."""
+        conn = self._get_conn()
+        cursor = conn.execute(
+            'SELECT * FROM posts WHERE platform = ? AND author = ?',
+            (platform, author)
+        )
+        return [self._row_to_record(row) for row in cursor.fetchall()]
+    def _row_to_record(self, row: sqlite3.Row) -> PostRecord:
+        """Конвертирует строку БД в PostRecord."""
+        return PostRecord(
+            platform=row['platform'],
+            author=row['author'],
+            post_id=row['post_id'],
+            title=row['title'],
+            slug=row['slug'],
+            post_date=row['post_date'],
+            source_url=row['source_url'],
+            local_path=row['local_path'],
+            tags=row['tags'],
+            synced_at=row['synced_at'],
+        )