tgparser-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ """Message data model."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from datetime import datetime
5
+
6
+
7
+ @dataclass
8
+ class Message:
9
+ """Unified message model for both open and closed channel parsing."""
10
+
11
+ id: int
12
+ channel: str
13
+ date: datetime
14
+ text: str
15
+ author: str | None = None
16
+ media_urls: list[str] = field(default_factory=list)
17
+ reactions: dict[str, int] | None = None
18
+ is_forwarded: bool = False
19
+ raw_source: str = "unknown" # "mtproto" | "web"
20
+
21
+ def to_dict(self) -> dict:
22
+ """Serialize to a JSON-compatible dict."""
23
+ return {
24
+ "id": self.id,
25
+ "channel": self.channel,
26
+ "date": self.date.isoformat(),
27
+ "author": self.author,
28
+ "text": self.text,
29
+ "media_urls": self.media_urls,
30
+ "reactions": self.reactions,
31
+ "is_forwarded": self.is_forwarded,
32
+ "raw_source": self.raw_source,
33
+ }
@@ -0,0 +1,6 @@
1
+ """Channel parsers — open (Telethon) and closed (Playwright + BS4)."""
2
+
3
+ from tgparser.parsers.mtproto_parser import MTProtoParser
4
+ from tgparser.parsers.web_parser import WebParser
5
+
6
+ __all__ = ["MTProtoParser", "WebParser"]
@@ -0,0 +1,244 @@
1
+ """Parser for open Telegram channels via MTProto (Telethon)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ from datetime import UTC, datetime
8
+
9
+ from telethon import errors, types
10
+ from telethon.client import TelegramClient
11
+ from telethon.tl.custom import Message as TgMessage
12
+
13
+ from tgparser.models.message import Message
14
+
15
+ logger = logging.getLogger("tgparser")
16
+
17
+
18
+ class MTProtoParser:
19
+ """Extract messages from open Telegram channels using MTProto API.
20
+
21
+ Uses an existing *authenticated* Telethon client. Rate-limit errors
22
+ (FloodWaitError) are handled with a sleep-and-retry inside the public
23
+ ``parse`` method.
24
+ """
25
+
26
+ def __init__(self, client: TelegramClient) -> None:
27
+ self._client = client
28
+
29
+ # ------------------------------------------------------------------
30
+ # Public API
31
+ # ------------------------------------------------------------------
32
+
33
+ async def parse(
34
+ self,
35
+ channel: str,
36
+ limit: int = 100,
37
+ *,
38
+ date_from: datetime | None = None,
39
+ date_to: datetime | None = None,
40
+ offset_id: int = 0,
41
+ max_retries: int = 3,
42
+ ) -> list[Message]:
43
+ """Fetch messages from *channel* and return our domain models.
44
+
45
+ Args:
46
+ channel: ``@username`` or invite hash.
47
+ limit: Maximum number of messages to return.
48
+ date_from: Only messages after this datetime (inclusive).
49
+ date_to: Only messages before this datetime (inclusive).
50
+ offset_id: Message ID to start pagination from (older than this).
51
+ max_retries: How many times to retry on FloodWaitError.
52
+ """
53
+ results: list[Message] = []
54
+ batch_limit = min(limit, 100) # Telethon caps at 100 per call
55
+ remaining = limit
56
+ current_offset = offset_id
57
+
58
+ for attempt in range(1, max_retries + 1):
59
+ try:
60
+ # Resolve channel entity (cached internally by Telethon).
61
+ entity = await self._client.get_entity(channel)
62
+
63
+ while remaining > 0:
64
+ batch = await self._fetch_batch(
65
+ entity=entity,
66
+ channel_name=self._normalize_channel(channel),
67
+ limit=min(remaining, batch_limit),
68
+ offset_id=current_offset,
69
+ date_from=date_from,
70
+ date_to=date_to,
71
+ )
72
+ if not batch:
73
+ break
74
+
75
+ results.extend(batch)
76
+ remaining -= len(batch)
77
+ # Paginate: messages are returned newest-first;
78
+ # next offset is the id of the oldest message in this batch.
79
+ current_offset = batch[-1].id
80
+
81
+ return results
82
+
83
+ except errors.rpcerrorlist.FloodWaitError as exc:
84
+ delay = exc.seconds + 1
85
+ if attempt == max_retries:
86
+ logger.error(
87
+ "FloodWaitError after %d retries: %s", max_retries, exc
88
+ )
89
+ raise
90
+ logger.warning(
91
+ "FloodWait: sleeping %d s (attempt %d/%d)",
92
+ delay,
93
+ attempt,
94
+ max_retries,
95
+ )
96
+ await asyncio.sleep(delay)
97
+
98
+ return results # pragma: no cover – unreachable but keeps type-checker happy
99
+
100
+ # ------------------------------------------------------------------
101
+ # Internal helpers
102
+ # ------------------------------------------------------------------
103
+
104
+ async def _fetch_batch(
105
+ self,
106
+ entity: types.InputPeerChannel | types.InputPeerChat,
107
+ channel_name: str,
108
+ limit: int,
109
+ offset_id: int,
110
+ date_from: datetime | None,
111
+ date_to: datetime | None,
112
+ ) -> list[Message]:
113
+ """Single call to ``client.get_messages`` + conversion."""
114
+ tg_messages = await self._client.get_messages(
115
+ entity,
116
+ limit=limit,
117
+ offset_id=offset_id,
118
+ offset_date=date_from,
119
+ max_id=0,
120
+ min_id=0,
121
+ )
122
+
123
+ # Ensure tg_messages is iterable (can be a single item or None).
124
+ if tg_messages is None:
125
+ return []
126
+ if isinstance(tg_messages, TgMessage):
127
+ tg_messages = [tg_messages]
128
+
129
+ converted: list[Message] = []
130
+ for tg_msg in tg_messages:
131
+ msg = await self._to_message(tg_msg, channel_name)
132
+ # Apply date-range filter client-side (Telethon's offset_date
133
+ # is not perfectly precise for bidirectional filtering).
134
+ if date_from is not None and msg.date < date_from:
135
+ continue
136
+ if date_to is not None and msg.date > date_to:
137
+ continue
138
+ converted.append(msg)
139
+
140
+ return converted
141
+
142
+ async def _to_message(
143
+ self, tg_msg: TgMessage, channel_name: str
144
+ ) -> Message:
145
+ """Convert a Telethon :class:`Message` to our domain model."""
146
+ media_urls: list[str] = []
147
+ if tg_msg.media is not None:
148
+ media_urls = self._extract_media_urls(tg_msg)
149
+
150
+ # Author extraction precedence: post_author (signature) → sender first_name
151
+ author: str | None = None
152
+ if isinstance(tg_msg.post_author, str) and tg_msg.post_author:
153
+ author = tg_msg.post_author
154
+ elif tg_msg.sender_id is not None:
155
+ try:
156
+ sender = await tg_msg.get_sender()
157
+ if sender is not None:
158
+ author = getattr(sender, "first_name", None) or getattr(
159
+ sender, "username", None
160
+ )
161
+ except Exception:
162
+ author = str(tg_msg.sender_id)
163
+
164
+ # Reactions
165
+ reactions: dict[str, int] | None = None
166
+ if tg_msg.reactions is not None:
167
+ reactions = {}
168
+ for r in tg_msg.reactions.results:
169
+ emoticon = (
170
+ r.reaction.emoticon
171
+ if hasattr(r.reaction, "emoticon")
172
+ else str(r.reaction)
173
+ )
174
+ reactions[emoticon] = r.count
175
+
176
+ return Message(
177
+ id=tg_msg.id,
178
+ channel=channel_name,
179
+ date=tg_msg.date.replace(tzinfo=UTC),
180
+ author=author,
181
+ text=tg_msg.text or "",
182
+ media_urls=media_urls,
183
+ reactions=reactions,
184
+ is_forwarded=tg_msg.forward is not None,
185
+ raw_source="mtproto",
186
+ )
187
+
188
+ # ------------------------------------------------------------------
189
+ # Media helpers
190
+ # ------------------------------------------------------------------
191
+
192
+ @staticmethod
193
+ def _extract_media_urls(tg_msg: TgMessage) -> list[str]:
194
+ """Build a list of human-readable media descriptors for *tg_msg*.
195
+
196
+ We do **not** download actual files here; we return file IDs and
197
+ attributes that can be used to construct download URLs later.
198
+ """
199
+ urls: list[str] = []
200
+ media = tg_msg.media
201
+
202
+ if isinstance(media, types.MessageMediaPhoto):
203
+ photo = media.photo
204
+ if isinstance(photo, types.Photo) and photo.sizes:
205
+ # Largest size is usually last.
206
+ largest = photo.sizes[-1]
207
+ urls.append(
208
+ f"photo:{photo.id}:{getattr(largest, 'w', '?')}"
209
+ f"x{getattr(largest, 'h', '?')}"
210
+ )
211
+
212
+ elif isinstance(media, types.MessageMediaDocument):
213
+ doc = media.document
214
+ if isinstance(doc, types.Document):
215
+ name_parts: list[str] = []
216
+ for attr in doc.attributes:
217
+ if isinstance(attr, types.DocumentAttributeFilename):
218
+ name_parts.append(attr.file_name)
219
+ elif isinstance(attr, types.DocumentAttributeVideo):
220
+ name_parts.append(f"video({attr.duration}s)")
221
+ elif isinstance(attr, types.DocumentAttributeAudio):
222
+ name_parts.append(
223
+ f"audio({attr.duration}s)" + (
224
+ f"-{attr.title}" if attr.title else ""
225
+ )
226
+ )
227
+ name = "_".join(name_parts) if name_parts else f"doc:{doc.id}"
228
+ urls.append(f"document:{doc.id}:{name}")
229
+
230
+ elif isinstance(media, types.MessageMediaWebPage):
231
+ wp = media.webpage
232
+ if isinstance(wp, types.WebPage) and wp.url:
233
+ urls.append(wp.url)
234
+
235
+ return urls
236
+
237
+ # ------------------------------------------------------------------
238
+ # Helpers
239
+ # ------------------------------------------------------------------
240
+
241
+ @staticmethod
242
+ def _normalize_channel(raw: str) -> str:
243
+ """Strip leading @ if present, return uniform channel name."""
244
+ return raw.lstrip("@")