PyPI - article-backup - Versions diffs - 0.3.4__tar.gz → 0.3.6__tar.gz - Mend

article-backup 0.3.4 (tar.gz) → 0.3.6 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

{article_backup-0.3.4 → article_backup-0.3.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: article-backup
-Version: 0.3.4
+Version: 0.3.6
 Summary: Локальный бэкап статей с Sponsr.ru и Boosty.to в Markdown с Hugo-интеграцией
 Author-email: Eugene Chaykin <eugene@chayk.in>
 License: Apache-2.0

{article_backup-0.3.4 → article_backup-0.3.6}/article_backup.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: article-backup
-Version: 0.3.4
+Version: 0.3.6
 Summary: Локальный бэкап статей с Sponsr.ru и Boosty.to в Markdown с Hugo-интеграцией
 Author-email: Eugene Chaykin <eugene@chayk.in>
 License: Apache-2.0

{article_backup-0.3.4 → article_backup-0.3.6}/article_backup.egg-info/SOURCES.txt RENAMED Viewed

@@ -18,7 +18,9 @@ src/utils.py
 tests/test_asset_dedup.py
 tests/test_boosty_empty_link.py
 tests/test_boosty_normalize.py
+tests/test_config_hardening.py
 tests/test_incremental_sync.py
+tests/test_slug_safety.py
 tests/test_sponsr_normalize.py
 tests/test_sponsr_tags.py
 tests/test_video_embed.py

{article_backup-0.3.4 → article_backup-0.3.6}/backup.py RENAMED Viewed

@@ -3,6 +3,7 @@
 """CLI точка входа для бэкапа статей."""
 import argparse
+import json
 import os
 import sys
 from pathlib import Path
@@ -21,13 +22,16 @@ def generate_hugo_config(config: Config):
     if not hugo_toml.parent.exists():
         return
-    content = f'''baseURL = '{config.hugo.base_url}'
-languageCode = '{config.hugo.language_code}'
-title = '{config.hugo.title}'
+    def toml_str(value: str) -> str:
+        return json.dumps(value, ensure_ascii=False)
+    content = f'''baseURL = {toml_str(config.hugo.base_url)}
+languageCode = {toml_str(config.hugo.language_code)}
+title = {toml_str(config.hugo.title)}
 relativeURLs = true
 [params]
-  default_theme = '{config.hugo.default_theme}'
+  default_theme = {toml_str(config.hugo.default_theme)}
 [markup.goldmark.renderer]
   unsafe = true
@@ -89,12 +93,15 @@ def get_downloader(platform: str, config: Config, source: Source, db: Database):
 def sync_all(config: Config, db: Database):
     """Синхронизирует всех авторов из конфига."""
+    errors: list[tuple[Source, Exception]] = []
     for source in config.sources:
         try:
             downloader = get_downloader(source.platform, config, source, db)
             downloader.sync()
         except Exception as e:
             print(f"[{source.platform}] Ошибка при синхронизации {source.author}: {e}")
+            errors.append((source, e))
+    return errors
 def download_single_post(url: str, config: Config, db: Database):
@@ -157,21 +164,34 @@ def main():
     # Создаём директорию и базу
     config.output_dir.mkdir(parents=True, exist_ok=True)
+    sync_errors: list[tuple[Source, Exception]] = []
     with Database(config.output_dir / 'index.db') as db:
         # Выполняем команду
         if args.url:
             if not is_post_url(args.url):
                 print(f"Ошибка: неверный URL поста: {args.url}")
                 sys.exit(1)
-            download_single_post(args.url, config, db)
+            try:
+                download_single_post(args.url, config, db)
+            except Exception as e:
+                print(f"Ошибка при скачивании поста: {e}")
+                sys.exit(1)
         else:
             if not config.sources:
                 print("Нет источников в конфиге. Добавьте секцию 'sources'.")
                 sys.exit(1)
-            sync_all(config, db)
+            sync_errors = sync_all(config, db)
     ensure_site_content_link(config)
     generate_hugo_config(config)
+    if sync_errors:
+        print(f"\nЗавершено с ошибками: {len(sync_errors)}")
+        for source, error in sync_errors:
+            print(f"  - [{source.platform}] {source.author}: {error}")
+        sys.exit(1)
     print("\nГотово!")

{article_backup-0.3.4 → article_backup-0.3.6}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "article-backup"
-version = "0.3.4"
+version = "0.3.6"
 description = "Локальный бэкап статей с Sponsr.ru и Boosty.to в Markdown с Hugo-интеграцией"
 readme = "README.md"
 license = {text = "Apache-2.0"}

{article_backup-0.3.4 → article_backup-0.3.6}/src/boosty.py RENAMED Viewed

@@ -8,7 +8,7 @@ import requests
 from .config import Config, Source, load_cookie, load_auth_header
 from .database import Database
-from .downloader import BaseDownloader, Post
+from .downloader import BaseDownloader, Post, retry_request
 class BoostyDownloader(BaseDownloader):
@@ -17,6 +17,10 @@ class BoostyDownloader(BaseDownloader):
     PLATFORM = "boosty"
     API_BASE = "https://api.boosty.to/v1"
+    def __init__(self, config: Config, source: Source, db: Database):
+        self._warned_unknown_block_types: set[str] = set()
+        super().__init__(config, source, db)
     def _setup_session(self):
         """Настройка сессии с cookies и authorization."""
         cookie = load_cookie(self.config.auth.boosty_cookie_file)
@@ -51,8 +55,12 @@ class BoostyDownloader(BaseDownloader):
             if offset:
                 url += f"&offset={offset}"
-            response = self.session.get(url, timeout=self.TIMEOUT)
-            response.raise_for_status()
+            def do_request():
+                resp = self.session.get(url, timeout=self.TIMEOUT)
+                resp.raise_for_status()
+                return resp
+            response = retry_request(do_request, max_retries=3)
             data = response.json()
             posts_chunk = data.get("data", [])
@@ -96,8 +104,12 @@ class BoostyDownloader(BaseDownloader):
         url = f"{self.API_BASE}/blog/{self.source.author}/post/{post_id}"
         try:
-            response = self.session.get(url, timeout=self.TIMEOUT)
-            response.raise_for_status()
+            def do_request():
+                resp = self.session.get(url, timeout=self.TIMEOUT)
+                resp.raise_for_status()
+                return resp
+            response = retry_request(do_request, max_retries=3)
             data = response.json()
             return self._parse_post(data)
         except requests.RequestException as e:
@@ -256,6 +268,10 @@ class BoostyDownloader(BaseDownloader):
             video_id = block.get("id", "")
             return f"\n[\U0001f4f9 Видео](https://ok.ru/videoembed/{video_id})\n"
+        elif block_type and block_type not in self._warned_unknown_block_types:
+            print(f"  [boosty] Пропущен неподдерживаемый тип блока: {block_type}")
+            self._warned_unknown_block_types.add(block_type)
         return ""
     def _parse_text_block(self, block: dict, paragraph_offset: int = 0) -> str:

{article_backup-0.3.4 → article_backup-0.3.6}/src/config.py RENAMED Viewed

@@ -47,6 +47,11 @@ def load_config(config_path: Path) -> Config:
     with open(config_path, 'r', encoding='utf-8') as f:
         data = yaml.safe_load(f)
+    if data is None:
+        data = {}
+    if not isinstance(data, dict):
+        raise ValueError("Корень config.yaml должен быть объектом (mapping)")
     # output_dir
     env_output_dir = os.environ.get('BACKUP_OUTPUT_DIR')
     if env_output_dir:
@@ -56,6 +61,10 @@ def load_config(config_path: Path) -> Config:
     # auth
     auth_data = data.get('auth', {})
+    if auth_data is None:
+        auth_data = {}
+    if not isinstance(auth_data, dict):
+        raise ValueError("Секция 'auth' должна быть объектом")
     auth = Auth(
         sponsr_cookie_file=_to_path(auth_data.get('sponsr_cookie_file')),
         boosty_cookie_file=_to_path(auth_data.get('boosty_cookie_file')),
@@ -64,7 +73,17 @@ def load_config(config_path: Path) -> Config:
     # sources
     sources = []
-    for src in data.get('sources', []):
+    sources_data = data.get('sources', [])
+    if sources_data is None:
+        sources_data = []
+    if not isinstance(sources_data, list):
+        raise ValueError("Секция 'sources' должна быть списком")
+    for src in sources_data:
+        if not isinstance(src, dict):
+            raise ValueError("Каждый элемент в 'sources' должен быть объектом")
+        if 'platform' not in src or 'author' not in src:
+            raise ValueError("Каждый источник в 'sources' должен содержать 'platform' и 'author'")
         sources.append(Source(
             platform=src['platform'],
             author=src['author'],
@@ -75,6 +94,10 @@ def load_config(config_path: Path) -> Config:
     # hugo
     hugo_data = data.get('hugo', {})
+    if hugo_data is None:
+        hugo_data = {}
+    if not isinstance(hugo_data, dict):
+        raise ValueError("Секция 'hugo' должна быть объектом")
     hugo = HugoConfig(
         base_url=hugo_data.get('base_url', HugoConfig.base_url),
         title=hugo_data.get('title', HugoConfig.title),
@@ -105,4 +128,4 @@ def load_auth_header(auth_file: Path | None) -> str:
         raise FileNotFoundError("Auth file path not specified")
     if not auth_file.exists():
         raise FileNotFoundError(f"Auth file not found: {auth_file}")
-    return auth_file.read_text(encoding='utf-8').strip()
+    return auth_file.read_text(encoding='utf-8').strip()

{article_backup-0.3.4 → article_backup-0.3.6}/src/downloader.py RENAMED Viewed

@@ -83,7 +83,7 @@ class BaseDownloader(ABC):
     PLATFORM: str = ""
     MAX_WORKERS: int = 5
-    TIMEOUT: tuple = (5, 30)
+    TIMEOUT: tuple = (5, 60)
     def __init__(self, config: Config, source: Source, db: Database):
         self.config = config
@@ -209,7 +209,11 @@ class BaseDownloader(ABC):
     def _save_post(self, post: Post):
         """Сохраняет пост на диск."""
-        slug = self._make_slug(post)
+        existing_record = self.db.get_post(self.PLATFORM, self.source.author, post.post_id)
+        if existing_record and existing_record.slug:
+            slug = existing_record.slug
+        else:
+            slug = self._make_slug(post)
         post_dir = self._get_post_dir(slug)
         post_dir.mkdir(parents=True, exist_ok=True)
@@ -251,7 +255,10 @@ class BaseDownloader(ABC):
         """Создаёт slug для папки поста."""
         date_prefix = post.post_date[:10]
         title_slug = transliterate(post.title)[:60]
-        return f"{date_prefix}-{title_slug}"
+        post_suffix = slugify(post.post_id, lowercase=True, max_length=16)
+        if not post_suffix:
+            post_suffix = hashlib.md5(post.post_id.encode()).hexdigest()[:8]
+        return f"{date_prefix}-{title_slug}-{post_suffix}"
     def _get_post_dir(self, slug: str) -> Path:
         """Возвращает путь к папке поста."""
@@ -306,33 +313,37 @@ class BaseDownloader(ABC):
                     return resp
                 response = retry_request(do_request, max_retries=3)
+                try:
+                    content_type = response.headers.get('Content-Type', '')
-                content_type = response.headers.get('Content-Type', '')
-                # Полная проверка после получения Content-Type
-                if not should_download_asset(url, content_type, self.source.asset_types):
-                    return url, None
+                    # Полная проверка после получения Content-Type
+                    if not should_download_asset(url, content_type, self.source.asset_types):
+                        return url, None
-                filename_base = self._make_asset_filename(url, content_type, asset.get('alt'))
+                    filename_base = self._make_asset_filename(url, content_type, asset.get('alt'))
-                with used_lock:
-                    filename = filename_base
-                    filepath = assets_dir / filename
-                    if filename in used_filenames or filepath.exists():
-                        filename = self._deduplicate_filename(filename, url)
+                    with used_lock:
+                        filename = filename_base
                         filepath = assets_dir / filename
-                    # На всякий случай добиваемся уникальности в рамках сессии
-                    while filename in used_filenames or filepath.exists():
-                        filename = self._deduplicate_filename(filename, url + filename)
-                        filepath = assets_dir / filename
-                    used_filenames.add(filename)
-                if not filepath.exists():
-                    with open(filepath, 'wb') as f:
-                        for chunk in response.iter_content(chunk_size=8192):
-                            f.write(chunk)
+                        if filename in used_filenames or filepath.exists():
+                            filename = self._deduplicate_filename(filename, url)
+                            filepath = assets_dir / filename
+                        # На всякий случай добиваемся уникальности в рамках сессии
+                        while filename in used_filenames or filepath.exists():
+                            filename = self._deduplicate_filename(filename, url + filename)
+                            filepath = assets_dir / filename
+                        used_filenames.add(filename)
+                    if not filepath.exists():
+                        with open(filepath, 'wb') as f:
+                            for chunk in response.iter_content(chunk_size=8192):
+                                f.write(chunk)
+                finally:
+                    close = getattr(response, 'close', None)
+                    if callable(close):
+                        close()
                 return url, filename
             except requests.RequestException as e:

{article_backup-0.3.4 → article_backup-0.3.6}/src/sponsr.py RENAMED Viewed

@@ -12,7 +12,7 @@ import html2text
 from .config import Config, Source, load_cookie
 from .database import Database
-from .downloader import BaseDownloader, Post
+from .downloader import BaseDownloader, Post, retry_request
 # Паттерны для распознавания embed URL видеохостингов (whitelist).
 # Если iframe src матчит один из паттернов — это встроенное видео.
@@ -49,8 +49,12 @@ class SponsorDownloader(BaseDownloader):
             return self._project_id
         url = f"https://sponsr.ru/{self.source.author}/"
-        response = self.session.get(url, timeout=self.TIMEOUT)
-        response.raise_for_status()
+        def do_request():
+            resp = self.session.get(url, timeout=self.TIMEOUT)
+            resp.raise_for_status()
+            return resp
+        response = retry_request(do_request, max_retries=3)
         soup = BeautifulSoup(response.text, 'lxml')
         data_tag = soup.find('script', id='__NEXT_DATA__')
@@ -86,8 +90,12 @@ class SponsorDownloader(BaseDownloader):
         while True:
             api_url = f"https://sponsr.ru/project/{project_id}/more-posts/?offset={offset}"
-            response = self.session.get(api_url, timeout=self.TIMEOUT)
-            response.raise_for_status()
+            def do_request():
+                resp = self.session.get(api_url, timeout=self.TIMEOUT)
+                resp.raise_for_status()
+                return resp
+            response = retry_request(do_request, max_retries=3)
             data = response.json().get("response", {})
             posts_chunk = data.get("rows", [])
@@ -135,8 +143,12 @@ class SponsorDownloader(BaseDownloader):
         # URL формат: https://sponsr.ru/{author}/{post_id}/...
         url = f"https://sponsr.ru/{self.source.author}/{post_id}/"
         try:
-            response = self.session.get(url, timeout=self.TIMEOUT)
-            response.raise_for_status()
+            def do_request():
+                resp = self.session.get(url, timeout=self.TIMEOUT)
+                resp.raise_for_status()
+                return resp
+            response = retry_request(do_request, max_retries=3)
             soup = BeautifulSoup(response.text, 'lxml')
             data_tag = soup.find('script', id='__NEXT_DATA__')
@@ -160,8 +172,12 @@ class SponsorDownloader(BaseDownloader):
         while True:
             api_url = f"https://sponsr.ru/project/{project_id}/more-posts/?offset={offset}"
             try:
-                response = self.session.get(api_url, timeout=self.TIMEOUT)
-                response.raise_for_status()
+                def do_request():
+                    resp = self.session.get(api_url, timeout=self.TIMEOUT)
+                    resp.raise_for_status()
+                    return resp
+                response = retry_request(do_request, max_retries=3)
                 data = response.json().get("response", {})
                 posts_chunk = data.get("rows", [])
@@ -345,12 +361,9 @@ class SponsorDownloader(BaseDownloader):
                     tag.insert_after(NavigableString(trailing))
         # 4. Вынос trailing/leading пробелов из <a> тегов наружу
-        #    После выноса пробелов из formatting тегов, пробел может остаться
-        #    внутри <a> (но вне <em>/<b>), что даёт [текст ](url) в markdown
         for tag in list(soup.find_all('a')):
             if tag.parent is None:
                 continue
-            # Trailing: проверяем последний дочерний узел (может быть голый пробел)
             children = list(tag.children)
             if children:
                 last_child = children[-1]
@@ -359,8 +372,40 @@ class SponsorDownloader(BaseDownloader):
                     last_child.replace_with(NavigableString(str(last_child).rstrip()))
                     tag.insert_after(NavigableString(trailing))
+        # 5. Экранирование markdown-символов в текстовых узлах
+        #    Чтобы "сырые" _, *, [ ] в тексте не превращались в разметку
+        self._escape_text_nodes(soup)
         return str(soup)
+    @staticmethod
+    def _escape_text_nodes(soup):
+        """Экранирует спецсимволы Markdown в текстовых узлах."""
+        from bs4 import NavigableString
+        replacements = {
+            '_': '@@@US@@@',
+            '*': '@@@AST@@@',
+            '[': '@@@LBR@@@',
+            ']': '@@@RBR@@@',
+        }
+        for text_node in soup.find_all(string=True):
+            if text_node.parent and text_node.parent.name in ['script', 'style', 'title']:
+                continue
+            text = str(text_node)
+            if not text:
+                continue
+            new_text = text
+            for char, placeholder in replacements.items():
+                if char in new_text:
+                    new_text = new_text.replace(char, placeholder)
+            if new_text != text:
+                text_node.replace_with(NavigableString(new_text))
     @staticmethod
     def _merge_adjacent_em(soup, em_tags: set, bold_tags: set):
         """Объединяет соседние <em>/<i> теги внутри одного родителя.
@@ -519,6 +564,12 @@ class SponsorDownloader(BaseDownloader):
         markdown = h2t.handle(html)
+        # Восстанавливаем экранированные символы (из плейсхолдеров DOM)
+        markdown = markdown.replace('@@@US@@@', r'\_')
+        markdown = markdown.replace('@@@AST@@@', r'\*')
+        markdown = markdown.replace('@@@LBR@@@', r'\[')
+        markdown = markdown.replace('@@@RBR@@@', r'\]')
         # Удаляем bidi-маркеры, которые ломают пробелы рядом с текстом
         markdown = re.sub(r'[\u200e\u200f\u202a-\u202e\u2066-\u2069]', '', markdown)
@@ -550,38 +601,23 @@ class SponsorDownloader(BaseDownloader):
         # Закрывающие: » " '
         markdown = re.sub(r'\s+([\u00bb\u201d\u2019])', r'\1', markdown)
-        # Восстанавливаем пробелы вокруг форматирования и ссылок
-        def _fix_spacing(text: str, pattern: re.Pattern) -> str:
-            """Добавляет пробелы вокруг элементов, если их нет."""
-            parts = []
-            last = 0
-            for match in pattern.finditer(text):
-                start, end = match.span()
-                before = text[last:start]
-                # Добавляем пробел слева, если нужно
-                if start > 0 and before and before[-1].isalnum():
-                    before = before + ' '
-                parts.append(before)
-                # Добавляем сам матч
-                matched_text = text[start:end]
-                # Добавляем пробел справа, если нужно
-                if end < len(text) and text[end].isalnum():
-                    matched_text = matched_text + ' '
-                parts.append(matched_text)
-                last = end
-            parts.append(text[last:])
-            return ''.join(parts)
-        # Восстанавливаем пробелы вокруг bold-italic, bold, ссылок
-        markdown = _fix_spacing(markdown, re.compile(r'\*\*\*.+?\*\*\*'))
-        markdown = _fix_spacing(markdown, re.compile(r'(?<!\*)\*\*(?!\*).+?(?<!\*)\*\*(?!\*)'))
-        markdown = _fix_spacing(markdown, re.compile(r'\[[^\]]+\]\([^)]+\)'))
+        # Восстанавливаем пробелы вокруг **bold**
+        # html2text часто склеивает: слово**bold** -> слово **bold**
+        # Используем поиск пар **, чтобы не сломать closing tag (bold**word -> bold **word - WRONG)
+        # 1. Left side: word**bold** -> word **bold**
+        markdown = re.sub(r'(\w)\*\*(.+?)\*\*', r'\1 **\2**', markdown)
+        # 2. Right side: **bold**word -> **bold** word
+        markdown = re.sub(r'\*\*(.+?)\*\*(\w)', r'**\1** \2', markdown)
+        # Убираем пробел между ссылкой и знаками препинания (даже если они курсивные)
+        # [link](url) . -> [link](url).
+        # [link](url) _._ -> [link](url)_._
+        markdown = re.sub(r'(\)\s+)([.,:;!?])', r')\2', markdown)
+        markdown = re.sub(r'(\)\s+)(_[.,:;!?]_)', r')\2', markdown)
+        # Исправляем артефакты html2text внутри ссылок: [ _текст_ ] -> [_текст_]
+        markdown = re.sub(r'\[\s+_', r'[_', markdown)
+        markdown = re.sub(r'_\s+\]', r'_]', markdown)
         # Заголовок берётся из frontmatter (Hugo), не дублируем его в body.
         return markdown

article_backup-0.3.6/tests/test_config_hardening.py ADDED Viewed

@@ -0,0 +1,51 @@
+import os
+import tempfile
+import unittest
+from pathlib import Path
+from backup import generate_hugo_config
+from src.config import Auth, Config, HugoConfig, load_config
+class ConfigHardeningTests(unittest.TestCase):
+    def test_load_config_accepts_empty_yaml(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            cfg_path = Path(tmp) / "config.yaml"
+            cfg_path.write_text("", encoding="utf-8")
+            cfg = load_config(cfg_path)
+            self.assertEqual(cfg.output_dir, Path("./backup"))
+            self.assertEqual(cfg.sources, [])
+    def test_generate_hugo_config_escapes_quotes(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            old_cwd = Path.cwd()
+            tmp_path = Path(tmp)
+            (tmp_path / "site").mkdir(parents=True, exist_ok=True)
+            try:
+                os.chdir(tmp_path)
+                cfg = Config(
+                    output_dir=tmp_path / "backup",
+                    auth=Auth(),
+                    hugo=HugoConfig(
+                        base_url='https://example.com/a"b',
+                        title='Bob\'s "backup"',
+                        language_code="ru",
+                        default_theme='light"mode',
+                    ),
+                )
+                generate_hugo_config(cfg)
+                toml = (tmp_path / "site" / "hugo.toml").read_text(encoding="utf-8")
+                self.assertIn('title = "Bob\'s \\"backup\\""', toml)
+                self.assertIn('baseURL = "https://example.com/a\\"b"', toml)
+                self.assertIn('default_theme = "light\\"mode"', toml)
+            finally:
+                os.chdir(old_cwd)
+if __name__ == "__main__":
+    unittest.main()

article_backup-0.3.6/tests/test_slug_safety.py ADDED Viewed

@@ -0,0 +1,111 @@
+import tempfile
+import unittest
+from pathlib import Path
+from src.config import Auth, Config, Source
+from src.database import Database, PostRecord
+from src.downloader import BaseDownloader, Post
+class _SlugDummyDownloader(BaseDownloader):
+    PLATFORM = "dummy"
+    def _setup_session(self):
+        return None
+    def fetch_posts_list(self, existing_ids=None, incremental=False, safety_chunks=1):
+        raise NotImplementedError
+    def fetch_post(self, post_id: str):
+        raise NotImplementedError
+    def _parse_post(self, raw_data: dict):
+        raise NotImplementedError
+    def _to_markdown(self, post: Post, asset_map: dict[str, str]) -> str:
+        return "content\n"
+class SlugSafetyTests(unittest.TestCase):
+    def test_slug_unique_for_same_title_and_date(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            config = Config(output_dir=tmp_path, auth=Auth())
+            source = Source(platform="sponsr", author="author")
+            with Database(tmp_path / "test.db") as db:
+                dl = _SlugDummyDownloader(config, source, db)
+                post1 = Post(
+                    post_id="101",
+                    title="Одинаковый заголовок",
+                    content_html="",
+                    post_date="2025-01-01T00:00:00",
+                    source_url="https://example.com/101",
+                    tags=[],
+                    assets=[],
+                )
+                post2 = Post(
+                    post_id="202",
+                    title="Одинаковый заголовок",
+                    content_html="",
+                    post_date="2025-01-01T01:00:00",
+                    source_url="https://example.com/202",
+                    tags=[],
+                    assets=[],
+                )
+                dl._save_post(post1)
+                dl._save_post(post2)
+                rec1 = db.get_post("dummy", "author", "101")
+                rec2 = db.get_post("dummy", "author", "202")
+                self.assertIsNotNone(rec1)
+                self.assertIsNotNone(rec2)
+                self.assertNotEqual(rec1.slug, rec2.slug)
+                self.assertTrue((Path(rec1.local_path) / "index.md").exists())
+                self.assertTrue((Path(rec2.local_path) / "index.md").exists())
+    def test_existing_slug_is_reused_for_same_post_id(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            config = Config(output_dir=tmp_path, auth=Auth())
+            source = Source(platform="sponsr", author="author")
+            with Database(tmp_path / "test.db") as db:
+                old_slug = "2025-01-01-old-style-slug"
+                old_path = str(tmp_path / "dummy" / "author" / "posts" / old_slug)
+                db.add_post(PostRecord(
+                    platform="dummy",
+                    author="author",
+                    post_id="legacy-id",
+                    title="Old",
+                    slug=old_slug,
+                    post_date="2025-01-01T00:00:00",
+                    source_url="https://example.com/legacy",
+                    local_path=old_path,
+                    tags="[]",
+                    synced_at="2025-01-01T00:00:00+00:00",
+                ))
+                dl = _SlugDummyDownloader(config, source, db)
+                updated = Post(
+                    post_id="legacy-id",
+                    title="Новое имя",
+                    content_html="",
+                    post_date="2025-01-01T02:00:00",
+                    source_url="https://example.com/legacy",
+                    tags=[],
+                    assets=[],
+                )
+                dl._save_post(updated)
+                rec = db.get_post("dummy", "author", "legacy-id")
+                self.assertIsNotNone(rec)
+                self.assertEqual(rec.slug, old_slug)
+                self.assertTrue((Path(rec.local_path) / "index.md").exists())
+if __name__ == "__main__":
+    unittest.main()

{article_backup-0.3.4 → article_backup-0.3.6}/tests/test_sponsr_normalize.py RENAMED Viewed

@@ -412,6 +412,100 @@ class SponsorNormalizeTests(unittest.TestCase):
         self.assertIn('_курсив2_', result)
         self.assertIn('обычный', result)
+    def _convert_full(self, html):
+        """Helper to convert HTML to Markdown (full text)."""
+        post = Post(
+            post_id='1',
+            title='Test',
+            content_html=html,
+            post_date='2025-01-01',
+            source_url='https://test.com',
+            tags=[],
+            assets=[]
+        )
+        return self.downloader._to_markdown(post, {})
+    def test_case_1_spacing_cleanup(self):
+        """1. Пробелы внутри курсива (_ текст _) и вокруг."""
+        html = (
+            '<p>фильме.</em></p><p><em>Например, Гор предсказал, что к 2016 году на Килиманджаро не останется снега. '
+            'В 2020 году газета The Times сообщила, что снег на горе высотой 19 000 футов (около 5800 метров) остался, '
+            'несмотря на предсказания Гора. </em></p><p><em>Гор'
+        )
+        md = self._convert_full(html)
+        # Expectation: no spaces inside markers, clean paragraphs
+        self.assertIn('фильме.', md)
+        self.assertIn('_Например, Гор', md)
+        self.assertIn('предсказания Гора._', md)
+        self.assertIn('_Гор', md)
+        self.assertNotIn('_ Например', md)
+        self.assertNotIn('Гора. _', md)
+        self.assertNotIn(' _Гор', md)
+    def test_case_2_multiline_italic(self):
+        """2. Курсив через границы абзацев."""
+        html = (
+            '<p>В.М.).</em></p><p><em>Метеоролог Крис Марц сказал, что климатология полна неопределенности и нюансов, '
+            'которые «Неудобная правда» полностью отвергает. </em></p><p><em>Однако'
+        )
+        md = self._convert_full(html)
+        self.assertIn('В.М.).', md)
+        self.assertIn('_Метеоролог Крис', md)
+        self.assertIn('отвергает._', md)
+        self.assertIn('_Однако', md)
+        self.assertNotIn('_ Метеоролог', md)
+        self.assertNotIn('отвергает. _', md)
+    def test_case_3_literal_underscore_in_text(self):
+        """3. Символы _ в обычном тексте не должны становиться разметкой."""
+        html = (
+            '<p>сформулировал: «_39 лет я никогда не писал этих слов в отзыве на кино, а сейчас пишу: _'
+            '<a href="http://example.com" target="_blank"><em>вы <strong>обязаны</strong> это посмотреть</em></a>».</p><p>К тому же'
+        )
+        md = self._convert_full(html)
+        # Literal underscores should be escaped
+        self.assertIn(r'\_39 лет', md)
+        self.assertIn(r'пишу: \_', md)
+        # Link formatting should be clean
+        self.assertIn('[_вы **обязаны** это посмотреть_](http://example.com)', md)
+        # No extra spaces
+        self.assertNotIn('[ _вы', md)
+    def test_case_4_underscore_suffix(self):
+        """4. Пробел перед закрывающим _."""
+        html = '<p>читатель данного проекта ощутил себя _не таким как все _(которого не проведёшь)?</p>'
+        md = self._convert_full(html)
+        # Literal underscores should be escaped
+        self.assertIn(r'\_не таким как все \_', md)
+        # Verify no unescaped underscores (except inside words if any, but here they are spaced)
+        # Using regex to ensure underscores are preceded by backslash
+        import re
+        self.assertFalse(re.search(r'(?<!\\)_', md), "Found unescaped underscore")
+    def test_case_5_link_italic_punctuation(self):
+        """5. Курсив вокруг ссылки и точки."""
+        html = (
+            '<p>бежать.</em></p><p><em>Из нескольких разговоров ... из </em>'
+            '<a href="https://example.com" target="_blank"><em>свежего текста</em></a><em>.</em></p><p><em>Поэтому'
+        )
+        md = self._convert_full(html)
+        self.assertIn('бежать.', md)
+        self.assertIn('_Из нескольких', md)
+        # Link inside italic context
+        self.assertIn('](https://example.com)', md)
+        self.assertNotIn(' _.', md)
+        self.assertNotIn('_. _', md)
 if __name__ == '__main__':
     unittest.main()