tgparser-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tgparser/__init__.py +3 -0
- tgparser/auth/__init__.py +6 -0
- tgparser/auth/mtproto_auth.py +130 -0
- tgparser/auth/web_auth.py +260 -0
- tgparser/cli.py +637 -0
- tgparser/config.py +55 -0
- tgparser/models/__init__.py +1 -0
- tgparser/models/message.py +33 -0
- tgparser/parsers/__init__.py +6 -0
- tgparser/parsers/mtproto_parser.py +244 -0
- tgparser/parsers/web_parser.py +620 -0
- tgparser/storage/__init__.py +15 -0
- tgparser/storage/sqlite.py +118 -0
- tgparser/storage/writer.py +214 -0
- tgparser/utils.py +69 -0
- tgparser_cli-0.1.0.dist-info/METADATA +278 -0
- tgparser_cli-0.1.0.dist-info/RECORD +21 -0
- tgparser_cli-0.1.0.dist-info/WHEEL +5 -0
- tgparser_cli-0.1.0.dist-info/entry_points.txt +2 -0
- tgparser_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- tgparser_cli-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,620 @@
|
|
|
1
|
+
"""Parser for closed/private Telegram channels via Web Telegram (Playwright + BS4).
|
|
2
|
+
|
|
3
|
+
Uses an existing Playwright session (restored by :class:`WebAuth`) to
|
|
4
|
+
navigate to the channel, scroll through the message history, bypass
|
|
5
|
+
copy-protection, and extract message data from the DOM.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import time
|
|
12
|
+
from datetime import UTC, datetime
|
|
13
|
+
|
|
14
|
+
from bs4 import BeautifulSoup, Tag
|
|
15
|
+
from playwright.sync_api import Browser, Page, Playwright, sync_playwright
|
|
16
|
+
|
|
17
|
+
from tgparser.auth.web_auth import WebAuth
|
|
18
|
+
from tgparser.config import get_setting
|
|
19
|
+
from tgparser.models.message import Message
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger("tgparser")
|
|
22
|
+
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
# JS / CSS payloads injected into the page to defeat copy-protection
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
COPY_PROTECTION_CSS = """
|
|
28
|
+
*, *::before, *::after {
|
|
29
|
+
user-select: text !important;
|
|
30
|
+
-webkit-user-select: text !important;
|
|
31
|
+
-moz-user-select: text !important;
|
|
32
|
+
-ms-user-select: text !important;
|
|
33
|
+
}
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
COPY_PROTECTION_JS = """
|
|
37
|
+
;(function () {
|
|
38
|
+
document.querySelectorAll('*').forEach(function (el) {
|
|
39
|
+
el.oncopy = null;
|
|
40
|
+
el.oncut = null;
|
|
41
|
+
el.onpaste = null;
|
|
42
|
+
el.oncontextmenu = null;
|
|
43
|
+
el.ondragstart = null;
|
|
44
|
+
el.onselectstart = null;
|
|
45
|
+
el.onmousedown = null;
|
|
46
|
+
});
|
|
47
|
+
document.body.style.userSelect = 'text';
|
|
48
|
+
document.body.style.webkitUserSelect = 'text';
|
|
49
|
+
Array.from(document.querySelectorAll(
|
|
50
|
+
'.copy-protection-overlay, ' +
|
|
51
|
+
'.tgme-page-extra, ' +
|
|
52
|
+
'[class*="protect"], ' +
|
|
53
|
+
'[class*="nonselectable"], ' +
|
|
54
|
+
'[style*="user-select: none"], ' +
|
|
55
|
+
'[style*="user-select:none"]'
|
|
56
|
+
)).forEach(function (el) { el.remove(); });
|
|
57
|
+
})();
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
# ---------------------------------------------------------------------------
|
|
61
|
+
# Message element selectors — multiple fallbacks for robustness.
|
|
62
|
+
# Web K may change class names; this list is generous.
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
_MESSAGE_CONTAINER_SELECTORS = [
|
|
66
|
+
".bubbles",
|
|
67
|
+
".messages-container",
|
|
68
|
+
"#column-center .messages-container",
|
|
69
|
+
"[data-list-id='chat']",
|
|
70
|
+
]
|
|
71
|
+
|
|
72
|
+
_MESSAGE_ITEM_SELECTORS = [
|
|
73
|
+
".bubble",
|
|
74
|
+
".bubble-content",
|
|
75
|
+
".message",
|
|
76
|
+
"div[class*='message' i]",
|
|
77
|
+
".chat-list .row",
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
_TEXT_SELECTORS = [
|
|
81
|
+
".message-text",
|
|
82
|
+
".text-content",
|
|
83
|
+
".bubble-content .text",
|
|
84
|
+
"[class*='message-text' i]",
|
|
85
|
+
"[class*='text-content' i]",
|
|
86
|
+
]
|
|
87
|
+
|
|
88
|
+
_AUTHOR_SELECTORS = [
|
|
89
|
+
".peer-title",
|
|
90
|
+
".sender-name",
|
|
91
|
+
".name",
|
|
92
|
+
".author",
|
|
93
|
+
"[class*='peer-title' i]",
|
|
94
|
+
"[class*='sender' i]",
|
|
95
|
+
]
|
|
96
|
+
|
|
97
|
+
_DATE_SELECTORS = [
|
|
98
|
+
"time",
|
|
99
|
+
".time",
|
|
100
|
+
".date",
|
|
101
|
+
"[data-timestamp]",
|
|
102
|
+
".message-time",
|
|
103
|
+
]
|
|
104
|
+
|
|
105
|
+
_MEDIA_IMG_SELECTORS = "img:not([class*='emoji']):not([class*='sticker'])"
|
|
106
|
+
_MEDIA_VIDEO_SELECTORS = "video, video source"
|
|
107
|
+
_MEDIA_LINK_SELECTORS = "a.media-link, a[class*='link'], a.preview-link"
|
|
108
|
+
|
|
109
|
+
_FORWARDED_SELECTORS = [
|
|
110
|
+
".forwarded",
|
|
111
|
+
".is-forwarded",
|
|
112
|
+
"[class*='forward' i]",
|
|
113
|
+
".fwd",
|
|
114
|
+
]
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class WebParser:
|
|
118
|
+
"""Parse messages from closed Telegram channels via the web interface.
|
|
119
|
+
|
|
120
|
+
Parameters
|
|
121
|
+
----------
|
|
122
|
+
web_auth : WebAuth
|
|
123
|
+
Initialised auth helper that can restore a Playwright session.
|
|
124
|
+
headless : bool | None
|
|
125
|
+
Override the ``browser.headless`` config value.
|
|
126
|
+
timeout_ms : int
|
|
127
|
+
Default timeout for Playwright operations (milliseconds).
|
|
128
|
+
slow_mo : int
|
|
129
|
+
Artificial delay between Playwright actions (milliseconds).
|
|
130
|
+
"""
|
|
131
|
+
|
|
132
|
+
def __init__(
|
|
133
|
+
self,
|
|
134
|
+
web_auth: WebAuth,
|
|
135
|
+
headless: bool | None = None,
|
|
136
|
+
timeout_ms: int = 30_000,
|
|
137
|
+
slow_mo: int | None = None,
|
|
138
|
+
) -> None:
|
|
139
|
+
self._web_auth = web_auth
|
|
140
|
+
self._headless = (
|
|
141
|
+
headless if headless is not None
|
|
142
|
+
else bool(get_setting("browser", "headless", default=True))
|
|
143
|
+
)
|
|
144
|
+
self._timeout_ms = timeout_ms
|
|
145
|
+
self._slow_mo = (
|
|
146
|
+
slow_mo if slow_mo is not None
|
|
147
|
+
else int(get_setting("browser", "slow_mo", default=0) or 0)
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
# ------------------------------------------------------------------
|
|
151
|
+
# Public API
|
|
152
|
+
# ------------------------------------------------------------------
|
|
153
|
+
|
|
154
|
+
def parse(
|
|
155
|
+
self,
|
|
156
|
+
channel_url: str,
|
|
157
|
+
limit: int = 100,
|
|
158
|
+
*,
|
|
159
|
+
max_scroll_attempts: int | None = None,
|
|
160
|
+
scroll_delay_ms: int | None = None,
|
|
161
|
+
) -> list[Message]:
|
|
162
|
+
"""Synchronous parse of a closed channel.
|
|
163
|
+
|
|
164
|
+
Parameters
|
|
165
|
+
----------
|
|
166
|
+
channel_url : str
|
|
167
|
+
Channel identifier: ``@username``, ``https://t.me/+hash``,
|
|
168
|
+
or a full ``https://web.telegram.org/k/#@…`` URL.
|
|
169
|
+
limit : int
|
|
170
|
+
Maximum number of messages to collect.
|
|
171
|
+
max_scroll_attempts : int | None
|
|
172
|
+
How many times to scroll upward looking for older messages.
|
|
173
|
+
Falls back to ``parsing.max_scroll_attempts`` from config.
|
|
174
|
+
scroll_delay_ms : int | None
|
|
175
|
+
Delay between scroll attempts (ms). Falls back to
|
|
176
|
+
``parsing.scroll_delay_ms`` from config.
|
|
177
|
+
|
|
178
|
+
Returns
|
|
179
|
+
-------
|
|
180
|
+
list[Message]
|
|
181
|
+
Parsed domain models, newest-first.
|
|
182
|
+
"""
|
|
183
|
+
max_scroll = (
|
|
184
|
+
max_scroll_attempts
|
|
185
|
+
if max_scroll_attempts is not None
|
|
186
|
+
else int(get_setting("parsing", "max_scroll_attempts", default=50) or 50)
|
|
187
|
+
)
|
|
188
|
+
scroll_delay = (
|
|
189
|
+
scroll_delay_ms
|
|
190
|
+
if scroll_delay_ms is not None
|
|
191
|
+
else int(get_setting("parsing", "scroll_delay_ms", default=1500) or 1500)
|
|
192
|
+
)
|
|
193
|
+
|
|
194
|
+
if not self._web_auth.is_session_valid():
|
|
195
|
+
raise RuntimeError(
|
|
196
|
+
"No valid web session found. Run `tgparser auth --method web` first."
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
pw: Playwright | None = None
|
|
200
|
+
browser: Browser | None = None
|
|
201
|
+
|
|
202
|
+
try:
|
|
203
|
+
pw = sync_playwright().start()
|
|
204
|
+
browser = pw.chromium.launch(
|
|
205
|
+
headless=self._headless, slow_mo=self._slow_mo
|
|
206
|
+
)
|
|
207
|
+
context = browser.new_context(
|
|
208
|
+
viewport={"width": 1280, "height": 900},
|
|
209
|
+
locale="en-US",
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
if not self._web_auth.restore_session(context):
|
|
213
|
+
raise RuntimeError("Failed to restore web session into browser context.")
|
|
214
|
+
|
|
215
|
+
page = context.new_page()
|
|
216
|
+
page.set_default_timeout(self._timeout_ms)
|
|
217
|
+
|
|
218
|
+
channel_name = self._navigate_to_channel(page, channel_url)
|
|
219
|
+
self._bypass_copy_protection(page)
|
|
220
|
+
|
|
221
|
+
messages = self._scroll_and_collect(
|
|
222
|
+
page, channel_name, limit, max_scroll, scroll_delay
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
logger.info(
|
|
226
|
+
"Parsed %d messages from %s (web).",
|
|
227
|
+
len(messages),
|
|
228
|
+
channel_url,
|
|
229
|
+
)
|
|
230
|
+
return messages
|
|
231
|
+
|
|
232
|
+
except Exception:
|
|
233
|
+
logger.exception("Web parsing failed for %s", channel_url)
|
|
234
|
+
raise
|
|
235
|
+
finally:
|
|
236
|
+
if browser:
|
|
237
|
+
browser.close()
|
|
238
|
+
if pw:
|
|
239
|
+
pw.stop()
|
|
240
|
+
|
|
241
|
+
# ------------------------------------------------------------------
|
|
242
|
+
# Channel navigation
|
|
243
|
+
# ------------------------------------------------------------------
|
|
244
|
+
|
|
245
|
+
def _navigate_to_channel(self, page: Page, channel_url: str) -> str:
|
|
246
|
+
"""Open the channel's page and return its display name."""
|
|
247
|
+
page.goto("https://web.telegram.org/k/", wait_until="domcontentloaded")
|
|
248
|
+
self._wait_for_any_selector(
|
|
249
|
+
page, [".chatlist", ".chat-list", "#LeftColumn"], timeout=15_000
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
logger.info("Navigating to channel: %s", channel_url)
|
|
253
|
+
hash_part = self._extract_hash(channel_url)
|
|
254
|
+
page.evaluate(f"window.location.hash = '{hash_part}'")
|
|
255
|
+
|
|
256
|
+
self._wait_for_any_selector(page, _MESSAGE_CONTAINER_SELECTORS, timeout=15_000)
|
|
257
|
+
time.sleep(1.0)
|
|
258
|
+
|
|
259
|
+
channel_name = self._extract_channel_name(page)
|
|
260
|
+
logger.info("Channel identified as: %s", channel_name)
|
|
261
|
+
return channel_name
|
|
262
|
+
|
|
263
|
+
@staticmethod
|
|
264
|
+
def _extract_hash(channel_url: str) -> str:
|
|
265
|
+
"""Extract the hash part (``@name`` or ``+hash``) from a channel reference."""
|
|
266
|
+
url = channel_url.strip()
|
|
267
|
+
if url.startswith("https://web.telegram.org/"):
|
|
268
|
+
if "#" in url:
|
|
269
|
+
return url.split("#", 1)[1]
|
|
270
|
+
return url.rsplit("/", 1)[-1]
|
|
271
|
+
if url.startswith("https://t.me/"):
|
|
272
|
+
return url.replace("https://t.me/", "").strip("/")
|
|
273
|
+
if url.startswith("@"):
|
|
274
|
+
return url.lstrip("@")
|
|
275
|
+
return url
|
|
276
|
+
|
|
277
|
+
@staticmethod
|
|
278
|
+
def _extract_channel_name(page: Page) -> str:
|
|
279
|
+
"""Extract the channel title from the top bar of the chat view."""
|
|
280
|
+
for sel in _AUTHOR_SELECTORS:
|
|
281
|
+
try:
|
|
282
|
+
el = page.query_selector(sel)
|
|
283
|
+
if el:
|
|
284
|
+
return el.inner_text().strip()
|
|
285
|
+
except Exception:
|
|
286
|
+
pass
|
|
287
|
+
try:
|
|
288
|
+
title = page.title()
|
|
289
|
+
if title and "Telegram" not in title:
|
|
290
|
+
return title
|
|
291
|
+
except Exception:
|
|
292
|
+
pass
|
|
293
|
+
return "unknown"
|
|
294
|
+
|
|
295
|
+
# ------------------------------------------------------------------
|
|
296
|
+
# Copy protection bypass
|
|
297
|
+
# ------------------------------------------------------------------
|
|
298
|
+
|
|
299
|
+
def _bypass_copy_protection(self, page: Page) -> None:
|
|
300
|
+
"""Inject CSS overrides and strip JS event handlers that block selection/copy."""
|
|
301
|
+
try:
|
|
302
|
+
page.add_style_tag(content=COPY_PROTECTION_CSS)
|
|
303
|
+
logger.debug("Injected copy-protection CSS override.")
|
|
304
|
+
except Exception as exc:
|
|
305
|
+
logger.warning("Failed to inject CSS override: %s", exc)
|
|
306
|
+
|
|
307
|
+
try:
|
|
308
|
+
page.evaluate(COPY_PROTECTION_JS)
|
|
309
|
+
logger.debug("Stripped copy-protection JS handlers.")
|
|
310
|
+
except Exception as exc:
|
|
311
|
+
logger.warning("Failed to strip JS handlers: %s", exc)
|
|
312
|
+
|
|
313
|
+
# ------------------------------------------------------------------
|
|
314
|
+
# Scroll & collect loop
|
|
315
|
+
# ------------------------------------------------------------------
|
|
316
|
+
|
|
317
|
+
def _scroll_and_collect(
|
|
318
|
+
self,
|
|
319
|
+
page: Page,
|
|
320
|
+
channel_name: str,
|
|
321
|
+
limit: int,
|
|
322
|
+
max_scroll_attempts: int,
|
|
323
|
+
scroll_delay_ms: int,
|
|
324
|
+
) -> list[Message]:
|
|
325
|
+
"""Scroll upward repeatedly, parsing the DOM after each scroll."""
|
|
326
|
+
seen_ids: set[int] = set()
|
|
327
|
+
all_messages: list[Message] = []
|
|
328
|
+
streak_no_new = 0
|
|
329
|
+
|
|
330
|
+
for attempt in range(max_scroll_attempts):
|
|
331
|
+
batch = self._parse_message_elements(page, channel_name)
|
|
332
|
+
new_messages = [m for m in batch if m.id not in seen_ids]
|
|
333
|
+
|
|
334
|
+
if new_messages:
|
|
335
|
+
seen_ids.update(m.id for m in new_messages)
|
|
336
|
+
all_messages.extend(new_messages)
|
|
337
|
+
streak_no_new = 0
|
|
338
|
+
logger.debug(
|
|
339
|
+
"Scroll %d/%d: +%d messages (total %d/%d).",
|
|
340
|
+
attempt + 1,
|
|
341
|
+
max_scroll_attempts,
|
|
342
|
+
len(new_messages),
|
|
343
|
+
len(all_messages),
|
|
344
|
+
limit,
|
|
345
|
+
)
|
|
346
|
+
else:
|
|
347
|
+
streak_no_new += 1
|
|
348
|
+
logger.debug(
|
|
349
|
+
"Scroll %d/%d: no new messages (streak %d).",
|
|
350
|
+
attempt + 1,
|
|
351
|
+
max_scroll_attempts,
|
|
352
|
+
streak_no_new,
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
if len(all_messages) >= limit:
|
|
356
|
+
logger.info("Reached message limit (%d).", limit)
|
|
357
|
+
break
|
|
358
|
+
|
|
359
|
+
if streak_no_new >= 3:
|
|
360
|
+
logger.info(
|
|
361
|
+
"No new messages for %d scrolls — reached top of channel.",
|
|
362
|
+
streak_no_new,
|
|
363
|
+
)
|
|
364
|
+
break
|
|
365
|
+
|
|
366
|
+
self._scroll_up(page, scroll_delay_ms)
|
|
367
|
+
|
|
368
|
+
return all_messages[:limit]
|
|
369
|
+
|
|
370
|
+
def _scroll_up(self, page: Page, delay_ms: int) -> None:
|
|
371
|
+
"""Scroll the message container to its top to trigger lazy-load."""
|
|
372
|
+
try:
|
|
373
|
+
page.evaluate(
|
|
374
|
+
"""() => {
|
|
375
|
+
const container = document.querySelector(
|
|
376
|
+
'.bubbles, .messages-container, #column-center, ' +
|
|
377
|
+
'[data-list-id="chat"]'
|
|
378
|
+
);
|
|
379
|
+
if (container) {
|
|
380
|
+
container.scrollTop = 0;
|
|
381
|
+
}
|
|
382
|
+
}"""
|
|
383
|
+
)
|
|
384
|
+
time.sleep(max(delay_ms / 1000, 0.5))
|
|
385
|
+
except Exception as exc:
|
|
386
|
+
logger.warning("Scroll failed: %s", exc)
|
|
387
|
+
time.sleep(max(delay_ms / 1000, 0.5))
|
|
388
|
+
|
|
389
|
+
# ------------------------------------------------------------------
|
|
390
|
+
# DOM → Message parsing
|
|
391
|
+
# ------------------------------------------------------------------
|
|
392
|
+
|
|
393
|
+
def _parse_message_elements(
|
|
394
|
+
self, page: Page, channel_name: str
|
|
395
|
+
) -> list[Message]:
|
|
396
|
+
"""Extract all visible message bubbles from the current page DOM."""
|
|
397
|
+
html = page.content()
|
|
398
|
+
soup = BeautifulSoup(html, "html.parser")
|
|
399
|
+
|
|
400
|
+
elements: list[Tag] = []
|
|
401
|
+
for sel in _MESSAGE_ITEM_SELECTORS:
|
|
402
|
+
found = soup.select(sel)
|
|
403
|
+
if found:
|
|
404
|
+
elements = found
|
|
405
|
+
break
|
|
406
|
+
|
|
407
|
+
if not elements:
|
|
408
|
+
logger.debug("No message elements found in current DOM.")
|
|
409
|
+
return []
|
|
410
|
+
|
|
411
|
+
messages: list[Message] = []
|
|
412
|
+
for el in elements:
|
|
413
|
+
try:
|
|
414
|
+
msg = self._parse_message_element(el, channel_name)
|
|
415
|
+
if msg is not None:
|
|
416
|
+
messages.append(msg)
|
|
417
|
+
except Exception as exc:
|
|
418
|
+
logger.debug("Failed to parse a message element: %s", exc)
|
|
419
|
+
continue
|
|
420
|
+
|
|
421
|
+
return messages
|
|
422
|
+
|
|
423
|
+
def _parse_message_element(
|
|
424
|
+
self, el: Tag, channel_name: str
|
|
425
|
+
) -> Message | None:
|
|
426
|
+
"""Parse a single message DOM element into our :class:`Message` model."""
|
|
427
|
+
msg_id = _extract_id(el)
|
|
428
|
+
text = _extract_text(el)
|
|
429
|
+
author = _extract_author(el)
|
|
430
|
+
date = _extract_date(el)
|
|
431
|
+
media_urls = _extract_media_urls(el)
|
|
432
|
+
is_forwarded = _detect_forwarded(el)
|
|
433
|
+
|
|
434
|
+
if not text and not media_urls:
|
|
435
|
+
return None
|
|
436
|
+
|
|
437
|
+
return Message(
|
|
438
|
+
id=msg_id,
|
|
439
|
+
channel=channel_name,
|
|
440
|
+
date=date or datetime.now(UTC),
|
|
441
|
+
author=author,
|
|
442
|
+
text=text or "",
|
|
443
|
+
media_urls=media_urls,
|
|
444
|
+
reactions=None,
|
|
445
|
+
is_forwarded=is_forwarded,
|
|
446
|
+
raw_source="web",
|
|
447
|
+
)
|
|
448
|
+
|
|
449
|
+
# ------------------------------------------------------------------
|
|
450
|
+
# Helpers
|
|
451
|
+
# ------------------------------------------------------------------
|
|
452
|
+
|
|
453
|
+
@staticmethod
|
|
454
|
+
def _wait_for_any_selector(
|
|
455
|
+
page: Page, selectors: list[str], timeout: int = 15_000
|
|
456
|
+
) -> None:
|
|
457
|
+
"""Wait for at least one of the given selectors to appear."""
|
|
458
|
+
for sel in selectors:
|
|
459
|
+
try:
|
|
460
|
+
page.wait_for_selector(sel, timeout=timeout)
|
|
461
|
+
logger.debug("Selector found: %s", sel)
|
|
462
|
+
return
|
|
463
|
+
except Exception:
|
|
464
|
+
continue
|
|
465
|
+
logger.warning(
|
|
466
|
+
"None of the expected selectors appeared within %d ms: %s",
|
|
467
|
+
timeout,
|
|
468
|
+
selectors,
|
|
469
|
+
)
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
# -----------------------------------------------------------------------
|
|
473
|
+
# Standalone extraction helpers (module-level, usable without instance)
|
|
474
|
+
# -----------------------------------------------------------------------
|
|
475
|
+
|
|
476
|
+
def _extract_id(el: Tag) -> int:
|
|
477
|
+
"""Extract a stable message id from DOM attributes or content hash."""
|
|
478
|
+
for attr in ("data-message-id", "data-id", "id"):
|
|
479
|
+
val = el.get(attr)
|
|
480
|
+
if val and val.strip():
|
|
481
|
+
try:
|
|
482
|
+
return abs(hash(val)) % (10**9)
|
|
483
|
+
except (ValueError, TypeError, AttributeError):
|
|
484
|
+
pass
|
|
485
|
+
text = el.get_text(strip=True)[:300]
|
|
486
|
+
return abs(hash(text)) % (10**9)
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
def _extract_text(el: Tag) -> str | None:
    """Extract message text from DOM element.

    Each selector in ``_TEXT_SELECTORS`` is tried in order and the first one
    that yields non-empty text wins.  Otherwise the whole element's text is
    used after removing sender, time, reply-markup and meta nodes.  Returns
    ``None`` when no text is left.

    Only the outer whitespace is trimmed.  ``get_text(strip=True)`` strips
    every text node before joining them with an empty string, which glues
    words together around inline tags ("Hello <b>world</b>" -> "Helloworld").
    """
    for sel in _TEXT_SELECTORS:
        text_el = el.select_one(sel)
        if text_el:
            t = text_el.get_text().strip()
            if t:
                return t

    # Fallback: work on a re-parsed copy so the caller's tree stays intact
    # while the known non-text nodes are removed.
    el_copy = BeautifulSoup(str(el), "html.parser")
    for skip_sel in (".peer-title", ".sender-name", ".time", ".reply-markup", ".bubble-meta"):
        for skip_el in el_copy.select(skip_sel):
            skip_el.decompose()
    text = el_copy.get_text().strip()
    return text or None
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def _extract_author(el: Tag) -> str | None:
    """Return the sender name found inside ``el``, or ``None``.

    Walks ``_AUTHOR_SELECTORS`` in order and returns the stripped text of
    the first matching node whose text is not empty.
    """
    matches = (el.select_one(selector) for selector in _AUTHOR_SELECTORS)
    for node in matches:
        if not node:
            continue
        label = node.get_text(strip=True)
        if label:
            return label
    return None
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
def _extract_date(el: Tag) -> datetime | None:
|
|
518
|
+
"""Extract message date/time from DOM element."""
|
|
519
|
+
time_el = el.select_one("time")
|
|
520
|
+
if time_el:
|
|
521
|
+
dt_str = time_el.get("datetime", "")
|
|
522
|
+
if dt_str:
|
|
523
|
+
try:
|
|
524
|
+
return datetime.fromisoformat(dt_str).replace(tzinfo=UTC)
|
|
525
|
+
except (ValueError, TypeError):
|
|
526
|
+
pass
|
|
527
|
+
|
|
528
|
+
for sel in _DATE_SELECTORS:
|
|
529
|
+
date_el = el.select_one(sel)
|
|
530
|
+
if date_el:
|
|
531
|
+
ts = date_el.get("data-timestamp", "")
|
|
532
|
+
if ts:
|
|
533
|
+
try:
|
|
534
|
+
return datetime.fromtimestamp(int(ts), tz=UTC)
|
|
535
|
+
except (ValueError, TypeError):
|
|
536
|
+
pass
|
|
537
|
+
text = date_el.get_text(strip=True)
|
|
538
|
+
if text:
|
|
539
|
+
try:
|
|
540
|
+
return _parse_human_date(text)
|
|
541
|
+
except (ValueError, TypeError):
|
|
542
|
+
pass
|
|
543
|
+
|
|
544
|
+
return None
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
def _parse_human_date(text: str) -> datetime | None:
|
|
548
|
+
"""Best-effort parse of human-readable dates like '12:34 PM' or 'Jan 1'."""
|
|
549
|
+
|
|
550
|
+
now = datetime.now(UTC)
|
|
551
|
+
text = text.strip()
|
|
552
|
+
|
|
553
|
+
# "12:34" or "12:34 PM" — today
|
|
554
|
+
for fmt in ("%I:%M %p", "%H:%M", "%I:%M%p"):
|
|
555
|
+
try:
|
|
556
|
+
t = datetime.strptime(text, fmt).time()
|
|
557
|
+
return datetime.combine(now.date(), t, tzinfo=UTC)
|
|
558
|
+
except (ValueError, TypeError):
|
|
559
|
+
continue
|
|
560
|
+
|
|
561
|
+
# "Jan 1" or "1 Jan" — this year
|
|
562
|
+
for fmt in ("%b %d", "%d %b", "%B %d", "%d %B"):
|
|
563
|
+
try:
|
|
564
|
+
d = datetime.strptime(f"{text} {now.year}", f"{fmt} %Y").date()
|
|
565
|
+
return datetime.combine(d, datetime.min.time(), tzinfo=UTC)
|
|
566
|
+
except (ValueError, TypeError):
|
|
567
|
+
continue
|
|
568
|
+
|
|
569
|
+
# "Jan 1, 2024"
|
|
570
|
+
for fmt in ("%b %d, %Y", "%B %d, %Y"):
|
|
571
|
+
try:
|
|
572
|
+
return datetime.strptime(text, fmt).replace(tzinfo=UTC)
|
|
573
|
+
except (ValueError, TypeError):
|
|
574
|
+
continue
|
|
575
|
+
|
|
576
|
+
# ISO attempt
|
|
577
|
+
try:
|
|
578
|
+
return datetime.fromisoformat(text).replace(tzinfo=UTC)
|
|
579
|
+
except (ValueError, TypeError):
|
|
580
|
+
pass
|
|
581
|
+
|
|
582
|
+
return None
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
def _extract_media_urls(el: Tag) -> list[str]:
    """Collect media URLs (images, videos, links) found inside ``el``.

    Images contribute ``src`` (or ``data-src``) unless it is a ``data:`` URI
    or mentions "emoji"; videos contribute ``src``; links contribute
    ``href`` values starting with ``http``.  Duplicates are dropped and the
    first-seen order is kept.
    """
    collected: list[str] = []

    for image in el.select(_MEDIA_IMG_SELECTORS):
        source = image.get("src", "") or image.get("data-src", "")
        if not source:
            continue
        if source.startswith("data:") or "emoji" in source.lower():
            continue
        collected.append(source)

    for clip in el.select(_MEDIA_VIDEO_SELECTORS):
        source = clip.get("src", "")
        if source:
            collected.append(source)

    for anchor in el.select(_MEDIA_LINK_SELECTORS):
        target = anchor.get("href", "")
        if target and target.startswith("http"):
            collected.append(target)

    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(collected))
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
def _detect_forwarded(el: Tag) -> bool:
    """Tell whether ``el`` looks like a forwarded message.

    True when any selector in ``_FORWARDED_SELECTORS`` matches inside the
    element, or when its lower-cased text contains "forwarded from".
    """
    if any(el.select_one(marker) for marker in _FORWARDED_SELECTORS):
        return True
    lowered = el.get_text(strip=True).lower()
    return "forwarded from" in lowered
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Storage writers — JSON, CSV, TXT, SQLite."""
|
|
2
|
+
|
|
3
|
+
from tgparser.storage.writer import (
|
|
4
|
+
OutputFormat,
|
|
5
|
+
get_last_message_id,
|
|
6
|
+
save_messages,
|
|
7
|
+
save_messages_incremental,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"save_messages",
|
|
12
|
+
"save_messages_incremental",
|
|
13
|
+
"get_last_message_id",
|
|
14
|
+
"OutputFormat",
|
|
15
|
+
]
|