tgparser-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,620 @@
1
+ """Parser for closed/private Telegram channels via Web Telegram (Playwright + BS4).
2
+
3
+ Uses an existing Playwright session (restored by :class:`WebAuth`) to
4
+ navigate to the channel, scroll through the message history, bypass
5
+ copy-protection, and extract message data from the DOM.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import time
12
+ from datetime import UTC, datetime
13
+
14
+ from bs4 import BeautifulSoup, Tag
15
+ from playwright.sync_api import Browser, Page, Playwright, sync_playwright
16
+
17
+ from tgparser.auth.web_auth import WebAuth
18
+ from tgparser.config import get_setting
19
+ from tgparser.models.message import Message
20
+
21
+ logger = logging.getLogger("tgparser")
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # JS / CSS payloads injected into the page to defeat copy-protection
25
+ # ---------------------------------------------------------------------------
26
+
27
# Stylesheet appended to the page: forces text selection on for every element
# and pseudo-element, overriding any `user-select: none` rule (hence !important).
COPY_PROTECTION_CSS: str = """
*, *::before, *::after {
    user-select: text !important;
    -webkit-user-select: text !important;
    -moz-user-select: text !important;
    -ms-user-select: text !important;
}
"""

# Script evaluated in the page. It does three things, in order:
#   1. clears the inline copy/cut/paste/contextmenu/drag/select/mousedown
#      handlers on every element,
#   2. re-enables selection on <body>,
#   3. removes elements matching a list of overlay / "protect" selectors.
# NOTE(review): step 3 deletes matching nodes outright, including any element
# with an inline `user-select: none`; confirm it cannot match message content.
COPY_PROTECTION_JS: str = """
;(function () {
    document.querySelectorAll('*').forEach(function (el) {
        el.oncopy = null;
        el.oncut = null;
        el.onpaste = null;
        el.oncontextmenu = null;
        el.ondragstart = null;
        el.onselectstart = null;
        el.onmousedown = null;
    });
    document.body.style.userSelect = 'text';
    document.body.style.webkitUserSelect = 'text';
    Array.from(document.querySelectorAll(
        '.copy-protection-overlay, ' +
        '.tgme-page-extra, ' +
        '[class*="protect"], ' +
        '[class*="nonselectable"], ' +
        '[style*="user-select: none"], ' +
        '[style*="user-select:none"]'
    )).forEach(function (el) { el.remove(); });
})();
"""
59
+
60
+ # ---------------------------------------------------------------------------
61
+ # Message element selectors — multiple fallbacks for robustness.
62
+ # Web K may change class names; this list is generous.
63
+ # ---------------------------------------------------------------------------
64
+
65
# Each list below is ordered by preference: consumers try the entries from
# first to last and stop at the first one that yields a usable result.

# Containers whose presence signals that a chat's message list has rendered.
_MESSAGE_CONTAINER_SELECTORS: list[str] = [
    ".bubbles",
    ".messages-container",
    "#column-center .messages-container",
    "[data-list-id='chat']",
]

# Individual message nodes inside the chat view.
_MESSAGE_ITEM_SELECTORS: list[str] = [
    ".bubble",
    ".bubble-content",
    ".message",
    "div[class*='message' i]",
    ".chat-list .row",
]

# Node holding the message body text.
_TEXT_SELECTORS: list[str] = [
    ".message-text",
    ".text-content",
    ".bubble-content .text",
    "[class*='message-text' i]",
    "[class*='text-content' i]",
]

# Node holding a sender / peer display name.
_AUTHOR_SELECTORS: list[str] = [
    ".peer-title",
    ".sender-name",
    ".name",
    ".author",
    "[class*='peer-title' i]",
    "[class*='sender' i]",
]

# Node carrying the message timestamp (attribute or visible text).
_DATE_SELECTORS: list[str] = [
    "time",
    ".time",
    ".date",
    "[data-timestamp]",
    ".message-time",
]

# Media lookups are single CSS selector lists rather than fallback chains.
_MEDIA_IMG_SELECTORS: str = "img:not([class*='emoji']):not([class*='sticker'])"
_MEDIA_VIDEO_SELECTORS: str = "video, video source"
_MEDIA_LINK_SELECTORS: str = ", ".join(
    ("a.media-link", "a[class*='link']", "a.preview-link")
)

# Markers that flag a message as forwarded.
_FORWARDED_SELECTORS: list[str] = [
    ".forwarded",
    ".is-forwarded",
    "[class*='forward' i]",
    ".fwd",
]
115
+
116
+
117
+ class WebParser:
118
+ """Parse messages from closed Telegram channels via the web interface.
119
+
120
+ Parameters
121
+ ----------
122
+ web_auth : WebAuth
123
+ Initialised auth helper that can restore a Playwright session.
124
+ headless : bool | None
125
+ Override the ``browser.headless`` config value.
126
+ timeout_ms : int
127
+ Default timeout for Playwright operations (milliseconds).
128
+ slow_mo : int
129
+ Artificial delay between Playwright actions (milliseconds).
130
+ """
131
+
132
+ def __init__(
133
+ self,
134
+ web_auth: WebAuth,
135
+ headless: bool | None = None,
136
+ timeout_ms: int = 30_000,
137
+ slow_mo: int | None = None,
138
+ ) -> None:
139
+ self._web_auth = web_auth
140
+ self._headless = (
141
+ headless if headless is not None
142
+ else bool(get_setting("browser", "headless", default=True))
143
+ )
144
+ self._timeout_ms = timeout_ms
145
+ self._slow_mo = (
146
+ slow_mo if slow_mo is not None
147
+ else int(get_setting("browser", "slow_mo", default=0) or 0)
148
+ )
149
+
150
+ # ------------------------------------------------------------------
151
+ # Public API
152
+ # ------------------------------------------------------------------
153
+
154
+ def parse(
155
+ self,
156
+ channel_url: str,
157
+ limit: int = 100,
158
+ *,
159
+ max_scroll_attempts: int | None = None,
160
+ scroll_delay_ms: int | None = None,
161
+ ) -> list[Message]:
162
+ """Synchronous parse of a closed channel.
163
+
164
+ Parameters
165
+ ----------
166
+ channel_url : str
167
+ Channel identifier: ``@username``, ``https://t.me/+hash``,
168
+ or a full ``https://web.telegram.org/k/#@…`` URL.
169
+ limit : int
170
+ Maximum number of messages to collect.
171
+ max_scroll_attempts : int | None
172
+ How many times to scroll upward looking for older messages.
173
+ Falls back to ``parsing.max_scroll_attempts`` from config.
174
+ scroll_delay_ms : int | None
175
+ Delay between scroll attempts (ms). Falls back to
176
+ ``parsing.scroll_delay_ms`` from config.
177
+
178
+ Returns
179
+ -------
180
+ list[Message]
181
+ Parsed domain models, newest-first.
182
+ """
183
+ max_scroll = (
184
+ max_scroll_attempts
185
+ if max_scroll_attempts is not None
186
+ else int(get_setting("parsing", "max_scroll_attempts", default=50) or 50)
187
+ )
188
+ scroll_delay = (
189
+ scroll_delay_ms
190
+ if scroll_delay_ms is not None
191
+ else int(get_setting("parsing", "scroll_delay_ms", default=1500) or 1500)
192
+ )
193
+
194
+ if not self._web_auth.is_session_valid():
195
+ raise RuntimeError(
196
+ "No valid web session found. Run `tgparser auth --method web` first."
197
+ )
198
+
199
+ pw: Playwright | None = None
200
+ browser: Browser | None = None
201
+
202
+ try:
203
+ pw = sync_playwright().start()
204
+ browser = pw.chromium.launch(
205
+ headless=self._headless, slow_mo=self._slow_mo
206
+ )
207
+ context = browser.new_context(
208
+ viewport={"width": 1280, "height": 900},
209
+ locale="en-US",
210
+ )
211
+
212
+ if not self._web_auth.restore_session(context):
213
+ raise RuntimeError("Failed to restore web session into browser context.")
214
+
215
+ page = context.new_page()
216
+ page.set_default_timeout(self._timeout_ms)
217
+
218
+ channel_name = self._navigate_to_channel(page, channel_url)
219
+ self._bypass_copy_protection(page)
220
+
221
+ messages = self._scroll_and_collect(
222
+ page, channel_name, limit, max_scroll, scroll_delay
223
+ )
224
+
225
+ logger.info(
226
+ "Parsed %d messages from %s (web).",
227
+ len(messages),
228
+ channel_url,
229
+ )
230
+ return messages
231
+
232
+ except Exception:
233
+ logger.exception("Web parsing failed for %s", channel_url)
234
+ raise
235
+ finally:
236
+ if browser:
237
+ browser.close()
238
+ if pw:
239
+ pw.stop()
240
+
241
+ # ------------------------------------------------------------------
242
+ # Channel navigation
243
+ # ------------------------------------------------------------------
244
+
245
+ def _navigate_to_channel(self, page: Page, channel_url: str) -> str:
246
+ """Open the channel's page and return its display name."""
247
+ page.goto("https://web.telegram.org/k/", wait_until="domcontentloaded")
248
+ self._wait_for_any_selector(
249
+ page, [".chatlist", ".chat-list", "#LeftColumn"], timeout=15_000
250
+ )
251
+
252
+ logger.info("Navigating to channel: %s", channel_url)
253
+ hash_part = self._extract_hash(channel_url)
254
+ page.evaluate(f"window.location.hash = '{hash_part}'")
255
+
256
+ self._wait_for_any_selector(page, _MESSAGE_CONTAINER_SELECTORS, timeout=15_000)
257
+ time.sleep(1.0)
258
+
259
+ channel_name = self._extract_channel_name(page)
260
+ logger.info("Channel identified as: %s", channel_name)
261
+ return channel_name
262
+
263
+ @staticmethod
264
+ def _extract_hash(channel_url: str) -> str:
265
+ """Extract the hash part (``@name`` or ``+hash``) from a channel reference."""
266
+ url = channel_url.strip()
267
+ if url.startswith("https://web.telegram.org/"):
268
+ if "#" in url:
269
+ return url.split("#", 1)[1]
270
+ return url.rsplit("/", 1)[-1]
271
+ if url.startswith("https://t.me/"):
272
+ return url.replace("https://t.me/", "").strip("/")
273
+ if url.startswith("@"):
274
+ return url.lstrip("@")
275
+ return url
276
+
277
+ @staticmethod
278
+ def _extract_channel_name(page: Page) -> str:
279
+ """Extract the channel title from the top bar of the chat view."""
280
+ for sel in _AUTHOR_SELECTORS:
281
+ try:
282
+ el = page.query_selector(sel)
283
+ if el:
284
+ return el.inner_text().strip()
285
+ except Exception:
286
+ pass
287
+ try:
288
+ title = page.title()
289
+ if title and "Telegram" not in title:
290
+ return title
291
+ except Exception:
292
+ pass
293
+ return "unknown"
294
+
295
+ # ------------------------------------------------------------------
296
+ # Copy protection bypass
297
+ # ------------------------------------------------------------------
298
+
299
+ def _bypass_copy_protection(self, page: Page) -> None:
300
+ """Inject CSS overrides and strip JS event handlers that block selection/copy."""
301
+ try:
302
+ page.add_style_tag(content=COPY_PROTECTION_CSS)
303
+ logger.debug("Injected copy-protection CSS override.")
304
+ except Exception as exc:
305
+ logger.warning("Failed to inject CSS override: %s", exc)
306
+
307
+ try:
308
+ page.evaluate(COPY_PROTECTION_JS)
309
+ logger.debug("Stripped copy-protection JS handlers.")
310
+ except Exception as exc:
311
+ logger.warning("Failed to strip JS handlers: %s", exc)
312
+
313
+ # ------------------------------------------------------------------
314
+ # Scroll & collect loop
315
+ # ------------------------------------------------------------------
316
+
317
+ def _scroll_and_collect(
318
+ self,
319
+ page: Page,
320
+ channel_name: str,
321
+ limit: int,
322
+ max_scroll_attempts: int,
323
+ scroll_delay_ms: int,
324
+ ) -> list[Message]:
325
+ """Scroll upward repeatedly, parsing the DOM after each scroll."""
326
+ seen_ids: set[int] = set()
327
+ all_messages: list[Message] = []
328
+ streak_no_new = 0
329
+
330
+ for attempt in range(max_scroll_attempts):
331
+ batch = self._parse_message_elements(page, channel_name)
332
+ new_messages = [m for m in batch if m.id not in seen_ids]
333
+
334
+ if new_messages:
335
+ seen_ids.update(m.id for m in new_messages)
336
+ all_messages.extend(new_messages)
337
+ streak_no_new = 0
338
+ logger.debug(
339
+ "Scroll %d/%d: +%d messages (total %d/%d).",
340
+ attempt + 1,
341
+ max_scroll_attempts,
342
+ len(new_messages),
343
+ len(all_messages),
344
+ limit,
345
+ )
346
+ else:
347
+ streak_no_new += 1
348
+ logger.debug(
349
+ "Scroll %d/%d: no new messages (streak %d).",
350
+ attempt + 1,
351
+ max_scroll_attempts,
352
+ streak_no_new,
353
+ )
354
+
355
+ if len(all_messages) >= limit:
356
+ logger.info("Reached message limit (%d).", limit)
357
+ break
358
+
359
+ if streak_no_new >= 3:
360
+ logger.info(
361
+ "No new messages for %d scrolls — reached top of channel.",
362
+ streak_no_new,
363
+ )
364
+ break
365
+
366
+ self._scroll_up(page, scroll_delay_ms)
367
+
368
+ return all_messages[:limit]
369
+
370
+ def _scroll_up(self, page: Page, delay_ms: int) -> None:
371
+ """Scroll the message container to its top to trigger lazy-load."""
372
+ try:
373
+ page.evaluate(
374
+ """() => {
375
+ const container = document.querySelector(
376
+ '.bubbles, .messages-container, #column-center, ' +
377
+ '[data-list-id="chat"]'
378
+ );
379
+ if (container) {
380
+ container.scrollTop = 0;
381
+ }
382
+ }"""
383
+ )
384
+ time.sleep(max(delay_ms / 1000, 0.5))
385
+ except Exception as exc:
386
+ logger.warning("Scroll failed: %s", exc)
387
+ time.sleep(max(delay_ms / 1000, 0.5))
388
+
389
+ # ------------------------------------------------------------------
390
+ # DOM → Message parsing
391
+ # ------------------------------------------------------------------
392
+
393
+ def _parse_message_elements(
394
+ self, page: Page, channel_name: str
395
+ ) -> list[Message]:
396
+ """Extract all visible message bubbles from the current page DOM."""
397
+ html = page.content()
398
+ soup = BeautifulSoup(html, "html.parser")
399
+
400
+ elements: list[Tag] = []
401
+ for sel in _MESSAGE_ITEM_SELECTORS:
402
+ found = soup.select(sel)
403
+ if found:
404
+ elements = found
405
+ break
406
+
407
+ if not elements:
408
+ logger.debug("No message elements found in current DOM.")
409
+ return []
410
+
411
+ messages: list[Message] = []
412
+ for el in elements:
413
+ try:
414
+ msg = self._parse_message_element(el, channel_name)
415
+ if msg is not None:
416
+ messages.append(msg)
417
+ except Exception as exc:
418
+ logger.debug("Failed to parse a message element: %s", exc)
419
+ continue
420
+
421
+ return messages
422
+
423
+ def _parse_message_element(
424
+ self, el: Tag, channel_name: str
425
+ ) -> Message | None:
426
+ """Parse a single message DOM element into our :class:`Message` model."""
427
+ msg_id = _extract_id(el)
428
+ text = _extract_text(el)
429
+ author = _extract_author(el)
430
+ date = _extract_date(el)
431
+ media_urls = _extract_media_urls(el)
432
+ is_forwarded = _detect_forwarded(el)
433
+
434
+ if not text and not media_urls:
435
+ return None
436
+
437
+ return Message(
438
+ id=msg_id,
439
+ channel=channel_name,
440
+ date=date or datetime.now(UTC),
441
+ author=author,
442
+ text=text or "",
443
+ media_urls=media_urls,
444
+ reactions=None,
445
+ is_forwarded=is_forwarded,
446
+ raw_source="web",
447
+ )
448
+
449
+ # ------------------------------------------------------------------
450
+ # Helpers
451
+ # ------------------------------------------------------------------
452
+
453
+ @staticmethod
454
+ def _wait_for_any_selector(
455
+ page: Page, selectors: list[str], timeout: int = 15_000
456
+ ) -> None:
457
+ """Wait for at least one of the given selectors to appear."""
458
+ for sel in selectors:
459
+ try:
460
+ page.wait_for_selector(sel, timeout=timeout)
461
+ logger.debug("Selector found: %s", sel)
462
+ return
463
+ except Exception:
464
+ continue
465
+ logger.warning(
466
+ "None of the expected selectors appeared within %d ms: %s",
467
+ timeout,
468
+ selectors,
469
+ )
470
+
471
+
472
+ # -----------------------------------------------------------------------
473
+ # Standalone extraction helpers (module-level, usable without instance)
474
+ # -----------------------------------------------------------------------
475
+
476
def _extract_id(el: Tag) -> int:
    """Extract a stable message id from DOM attributes or a content digest.

    The first non-blank id attribute wins. A numeric value is returned as
    that number; any other value, or the element's first 300 characters of
    text when no attribute is present, is reduced to an integer in
    ``[0, 10**9)`` with SHA-256.

    The built-in ``hash()`` is deliberately avoided: string hashes are salted
    per interpreter process, so ids derived from it differ between runs.
    """
    import hashlib  # local import: the only stdlib name this helper adds

    def _digest(value: str) -> int:
        raw = hashlib.sha256(value.encode("utf-8")).digest()
        return int.from_bytes(raw[:8], "big") % (10**9)

    # `data-mid` is presumably the attribute the web client puts on message
    # bubbles - TODO confirm against the live DOM; the rest are fallbacks.
    for attr in ("data-mid", "data-message-id", "data-id", "id"):
        val = el.get(attr)
        if isinstance(val, list):
            # The parser may return multi-valued attributes as a list.
            val = " ".join(val)
        if not val or not val.strip():
            continue
        val = val.strip()
        try:
            return abs(int(val))
        except ValueError:
            return _digest(val)

    return _digest(el.get_text(strip=True)[:300])
487
+
488
+
489
def _extract_text(el: Tag) -> str | None:
    """Return the message body text of *el*, or ``None`` when it has none.

    The dedicated text selectors are tried first. If none yields non-empty
    text, the element is re-parsed into a scratch copy, the sender, time,
    reply-markup and meta nodes are removed from it, and whatever text is
    left is used.
    """
    for selector in _TEXT_SELECTORS:
        node = el.select_one(selector)
        if not node:
            continue
        content = node.get_text(strip=True)
        if content:
            return content

    # Work on a copy so the caller's tree is left untouched.
    scratch = BeautifulSoup(str(el), "html.parser")
    noise_selectors = (
        ".peer-title",
        ".sender-name",
        ".time",
        ".reply-markup",
        ".bubble-meta",
    )
    for selector in noise_selectors:
        for noise in scratch.select(selector):
            noise.decompose()

    remainder = scratch.get_text(strip=True)
    if remainder:
        return remainder
    return None
504
+
505
+
506
def _extract_author(el: Tag) -> str | None:
    """Return the sender name found inside *el*, or ``None``.

    The author selectors are tried in order; the first one whose node holds
    non-empty text wins.
    """
    for selector in _AUTHOR_SELECTORS:
        node = el.select_one(selector)
        label = node.get_text(strip=True) if node else ""
        if label:
            return label
    return None
515
+
516
+
517
+ def _extract_date(el: Tag) -> datetime | None:
518
+ """Extract message date/time from DOM element."""
519
+ time_el = el.select_one("time")
520
+ if time_el:
521
+ dt_str = time_el.get("datetime", "")
522
+ if dt_str:
523
+ try:
524
+ return datetime.fromisoformat(dt_str).replace(tzinfo=UTC)
525
+ except (ValueError, TypeError):
526
+ pass
527
+
528
+ for sel in _DATE_SELECTORS:
529
+ date_el = el.select_one(sel)
530
+ if date_el:
531
+ ts = date_el.get("data-timestamp", "")
532
+ if ts:
533
+ try:
534
+ return datetime.fromtimestamp(int(ts), tz=UTC)
535
+ except (ValueError, TypeError):
536
+ pass
537
+ text = date_el.get_text(strip=True)
538
+ if text:
539
+ try:
540
+ return _parse_human_date(text)
541
+ except (ValueError, TypeError):
542
+ pass
543
+
544
+ return None
545
+
546
+
547
+ def _parse_human_date(text: str) -> datetime | None:
548
+ """Best-effort parse of human-readable dates like '12:34 PM' or 'Jan 1'."""
549
+
550
+ now = datetime.now(UTC)
551
+ text = text.strip()
552
+
553
+ # "12:34" or "12:34 PM" — today
554
+ for fmt in ("%I:%M %p", "%H:%M", "%I:%M%p"):
555
+ try:
556
+ t = datetime.strptime(text, fmt).time()
557
+ return datetime.combine(now.date(), t, tzinfo=UTC)
558
+ except (ValueError, TypeError):
559
+ continue
560
+
561
+ # "Jan 1" or "1 Jan" — this year
562
+ for fmt in ("%b %d", "%d %b", "%B %d", "%d %B"):
563
+ try:
564
+ d = datetime.strptime(f"{text} {now.year}", f"{fmt} %Y").date()
565
+ return datetime.combine(d, datetime.min.time(), tzinfo=UTC)
566
+ except (ValueError, TypeError):
567
+ continue
568
+
569
+ # "Jan 1, 2024"
570
+ for fmt in ("%b %d, %Y", "%B %d, %Y"):
571
+ try:
572
+ return datetime.strptime(text, fmt).replace(tzinfo=UTC)
573
+ except (ValueError, TypeError):
574
+ continue
575
+
576
+ # ISO attempt
577
+ try:
578
+ return datetime.fromisoformat(text).replace(tzinfo=UTC)
579
+ except (ValueError, TypeError):
580
+ pass
581
+
582
+ return None
583
+
584
+
585
def _extract_media_urls(el: Tag) -> list[str]:
    """Collect the media URLs referenced inside *el*, without duplicates.

    Gathered in this order: image sources (``src``, else ``data-src``;
    ``data:`` URIs and anything containing "emoji" are skipped), video
    sources, then link targets that start with ``http``. The first
    occurrence of each URL keeps its position.
    """
    candidates: list[str] = []

    for image in el.select(_MEDIA_IMG_SELECTORS):
        source = image.get("src", "") or image.get("data-src", "")
        if not source:
            continue
        if source.startswith("data:") or "emoji" in source.lower():
            continue
        candidates.append(source)

    candidates.extend(
        source
        for clip in el.select(_MEDIA_VIDEO_SELECTORS)
        if (source := clip.get("src", ""))
    )

    candidates.extend(
        target
        for anchor in el.select(_MEDIA_LINK_SELECTORS)
        if (target := anchor.get("href", "")) and target.startswith("http")
    )

    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(candidates))
612
+
613
+
614
def _detect_forwarded(el: Tag) -> bool:
    """Tell whether *el* looks like a forwarded message.

    True when any forwarded-marker selector matches inside the element, or,
    failing that, when its text contains "forwarded from" (case-insensitive).
    """
    if any(el.select_one(marker) for marker in _FORWARDED_SELECTORS):
        return True
    lowered = el.get_text(strip=True).lower()
    return "forwarded from" in lowered
@@ -0,0 +1,15 @@
1
+ """Storage writers — JSON, CSV, TXT, SQLite."""
2
+
3
+ from tgparser.storage.writer import (
4
+ OutputFormat,
5
+ get_last_message_id,
6
+ save_messages,
7
+ save_messages_incremental,
8
+ )
9
+
10
+ __all__ = [
11
+ "save_messages",
12
+ "save_messages_incremental",
13
+ "get_last_message_id",
14
+ "OutputFormat",
15
+ ]