article-backup 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of article-backup might be problematic. Click here for more details.
- article_backup-0.1.0.dist-info/METADATA +275 -0
- article_backup-0.1.0.dist-info/RECORD +14 -0
- article_backup-0.1.0.dist-info/WHEEL +5 -0
- article_backup-0.1.0.dist-info/entry_points.txt +2 -0
- article_backup-0.1.0.dist-info/licenses/LICENSE +177 -0
- article_backup-0.1.0.dist-info/top_level.txt +2 -0
- backup.py +162 -0
- src/__init__.py +1 -0
- src/boosty.py +260 -0
- src/config.py +99 -0
- src/database.py +169 -0
- src/downloader.py +366 -0
- src/sponsr.py +257 -0
- src/utils.py +123 -0
src/utils.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# src/utils.py
|
|
2
|
+
"""Вспомогательные функции для бэкапа статей."""
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from urllib.parse import urlparse
|
|
7
|
+
from slugify import slugify
|
|
8
|
+
|
|
9
|
+
# Whitelist of file extensions (lower-case, leading dot) that may be downloaded.
ALLOWED_EXTENSIONS = {
    # images
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.webp',
    '.svg',
    # video
    '.mp4',
    '.webm',
    '.mov',
    '.mkv',
    '.avi',
    # audio
    '.mp3',
    '.wav',
    '.flac',
    '.ogg',
    # documents
    '.pdf',
}

# Accepted Content-Type values: either a type prefix ("image/") or a full
# MIME type ("application/pdf").
ALLOWED_CONTENT_TYPES = {
    'image/',
    'video/',
    'audio/',
    'application/pdf',
}

# Patterns for links that point at other posts on the supported platforms.
# In both patterns capture group 2 is the post identifier.
SPONSR_LINK_PATTERN = re.compile(
    r'https?://sponsr\.ru/([^/]+)/(\d+)(?:/[^\s\)\]"\'<>]*)?'
)
BOOSTY_LINK_PATTERN = re.compile(
    r'https?://boosty\.to/([^/]+)/posts/([a-f0-9-]+)(?:[^\s\)\]"\'<>]*)?'
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def transliterate(text: str) -> str:
    """Turn ``text`` into a slug via ``slugify``.

    The result is lower-cased and limited to 80 characters.
    """
    slug = slugify(text, max_length=80, lowercase=True)
    return slug
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def parse_post_url(url: str) -> tuple[str, str, str]:
    """Parse a post URL and return ``(platform, author, post_id)``.

    Examples:
        https://sponsr.ru/pushkin/134833/...   -> ('sponsr', 'pushkin', '134833')
        https://boosty.to/lermontov/posts/uuid -> ('boosty', 'lermontov', 'uuid')

    Args:
        url: Absolute URL of a post, including the scheme.

    Returns:
        A tuple of platform name, first path segment (author) and post id.

    Raises:
        ValueError: If the host is not a supported platform or the path does
            not have the shape expected for that platform.
    """
    parsed = urlparse(url)
    parts = [segment for segment in parsed.path.split('/') if segment]

    # Compare the real host name (no port, no credentials) instead of looking
    # for a substring in netloc: a substring test would also accept hosts such
    # as "sponsr.ru.evil.example" or "notboosty.to".
    host = (parsed.hostname or '').lower()

    def host_is(domain: str) -> bool:
        # Exact domain or any of its subdomains (e.g. "www.").
        return host == domain or host.endswith('.' + domain)

    if host_is('sponsr.ru'):
        if len(parts) < 2:
            raise ValueError(f"Неверный формат URL Sponsr: {url}")
        return ('sponsr', parts[0], parts[1])

    if host_is('boosty.to'):
        if len(parts) < 3 or parts[1] != 'posts':
            raise ValueError(f"Неверный формат URL Boosty: {url}")
        return ('boosty', parts[0], parts[2])

    raise ValueError(f"Неизвестная платформа: {url}")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def is_post_url(text: str) -> bool:
    """Return True when ``text`` is accepted by ``parse_post_url``.

    A ``ValueError`` raised by the parser is treated as "not a post URL".
    """
    try:
        parse_post_url(text)
    except ValueError:
        return False
    else:
        return True
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def should_download_asset(url: str, content_type: str | None = None) -> bool:
    """Decide whether the file behind ``url`` should be downloaded.

    The extension in the URL path takes precedence: when one is present the
    decision is made solely against ``ALLOWED_EXTENSIONS``. Only when the path
    has no extension is the Content-Type consulted.

    Args:
        url: URL of the file.
        content_type: Content-Type header of the response (optional).

    Returns:
        True if the file passes the extension or Content-Type whitelist.
    """
    ext = Path(urlparse(url).path).suffix.lower()
    if ext:
        return ext in ALLOWED_EXTENSIONS

    if not content_type:
        return False

    # Media types are case-insensitive and may carry parameters
    # ("image/png; charset=binary"). Normalise first and match from the start
    # of the type, so that an allowed prefix appearing elsewhere in the header
    # (e.g. inside a parameter) does not count as a match.
    media_type = content_type.split(';', 1)[0].strip().lower()
    return media_type.startswith(tuple(ALLOWED_CONTENT_TYPES))
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def get_extension_from_content_type(content_type: str) -> str:
    """Map a Content-Type header value to a file extension.

    Parameters after ``;`` are ignored and the comparison is case-insensitive.

    Args:
        content_type: Value of the Content-Type header.

    Returns:
        The extension including the leading dot, or an empty string when the
        media type is empty or not known.
    """
    mapping = {
        'image/jpeg': '.jpg',
        'image/png': '.png',
        'image/gif': '.gif',
        'image/webp': '.webp',
        'image/svg+xml': '.svg',
        'video/mp4': '.mp4',
        'video/webm': '.webm',
        # Video containers that the extension whitelist of this module allows
        # but which previously had no Content-Type mapping.
        'video/quicktime': '.mov',
        'video/x-matroska': '.mkv',
        'video/x-msvideo': '.avi',
        'audio/mpeg': '.mp3',
        'audio/wav': '.wav',
        'audio/x-wav': '.wav',
        'audio/flac': '.flac',
        'audio/ogg': '.ogg',
        'application/pdf': '.pdf',
    }

    if not content_type:
        return ''

    media_type = content_type.split(';', 1)[0].strip().lower()
    return mapping.get(media_type, '')
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def sanitize_filename(name: str) -> str:
    """Remove characters that are not allowed in file names.

    Drops ``< > : " / \\ | ? *`` and ASCII control characters, collapses runs
    of whitespace into a single space and trims the result.

    Args:
        name: Proposed file name (not a path).

    Returns:
        The cleaned name, or ``'unnamed'`` when nothing usable is left.
    """
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    # Trailing dots and spaces are not valid on Windows, and a name made only
    # of dots ("." or "..") would refer to the current or parent directory
    # when joined to a path, so strip them before the fallback check.
    name = name.rstrip(' .')
    return name or 'unnamed'
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def extract_internal_links(content: str) -> list[tuple[str, str, str]]:
    """Collect links to Sponsr and Boosty posts found in ``content``.

    Returns:
        A list of ``(full_url, platform, post_id)`` tuples: every Sponsr match
        in order of appearance, followed by every Boosty match.
    """
    sponsr_links = [
        (found.group(0), 'sponsr', found.group(2))
        for found in SPONSR_LINK_PATTERN.finditer(content)
    ]
    boosty_links = [
        (found.group(0), 'boosty', found.group(2))
        for found in BOOSTY_LINK_PATTERN.finditer(content)
    ]
    return sponsr_links + boosty_links
|