tgparser-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tgparser/__init__.py +3 -0
- tgparser/auth/__init__.py +6 -0
- tgparser/auth/mtproto_auth.py +130 -0
- tgparser/auth/web_auth.py +260 -0
- tgparser/cli.py +637 -0
- tgparser/config.py +55 -0
- tgparser/models/__init__.py +1 -0
- tgparser/models/message.py +33 -0
- tgparser/parsers/__init__.py +6 -0
- tgparser/parsers/mtproto_parser.py +244 -0
- tgparser/parsers/web_parser.py +620 -0
- tgparser/storage/__init__.py +15 -0
- tgparser/storage/sqlite.py +118 -0
- tgparser/storage/writer.py +214 -0
- tgparser/utils.py +69 -0
- tgparser_cli-0.1.0.dist-info/METADATA +278 -0
- tgparser_cli-0.1.0.dist-info/RECORD +21 -0
- tgparser_cli-0.1.0.dist-info/WHEEL +5 -0
- tgparser_cli-0.1.0.dist-info/entry_points.txt +2 -0
- tgparser_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- tgparser_cli-0.1.0.dist-info/top_level.txt +1 -0
tgparser/cli.py
ADDED
|
@@ -0,0 +1,637 @@
|
|
|
1
|
+
"""CLI entry point — Click-based commands."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import contextlib
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
from datetime import UTC, datetime
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import click
|
|
13
|
+
|
|
14
|
+
from tgparser import __version__
|
|
15
|
+
from tgparser.auth import MTProtoAuth, WebAuth
|
|
16
|
+
from tgparser.config import get_setting
|
|
17
|
+
from tgparser.models.message import Message
|
|
18
|
+
from tgparser.parsers import MTProtoParser, WebParser
|
|
19
|
+
from tgparser.storage import (
|
|
20
|
+
save_messages,
|
|
21
|
+
save_messages_incremental,
|
|
22
|
+
)
|
|
23
|
+
from tgparser.utils import setup_logging
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger("tgparser")
|
|
26
|
+
|
|
27
|
+
# Shared output-format choices
|
|
28
|
+
_FMT_CHOICES = ["json", "csv", "txt", "sqlite"]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@click.group()
@click.version_option(version=__version__, prog_name="tgparser")
@click.option("--debug", is_flag=True, help="Enable debug logging.")
def main(debug: bool = False) -> None:
    """TgParser — Telegram channel message extractor.

    Parse open channels via MTProto (Telethon) and closed channels
    via web Telegram (Playwright + BeautifulSoup).
    """
    # The docstring above is what Click prints for `--help`, so it is kept verbatim.
    # This root callback only configures logging: DEBUG with --debug, INFO otherwise.
    verbosity = logging.DEBUG if debug else logging.INFO
    setup_logging(level=verbosity)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ------------------------------------------------------------------
|
|
47
|
+
# auth
|
|
48
|
+
# ------------------------------------------------------------------
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@main.command()
@click.option(
    "--type",
    "auth_type",
    type=click.Choice(["web", "mtproto"]),
    default="web",
    help="Authentication method (default: web QR).",
)
@click.option(
    "--force",
    is_flag=True,
    help="Force re-authentication even if a valid session exists.",
)
def auth(auth_type: str, force: bool) -> None:
    """Authorize and save session.

    Opens a browser window with Telegram Web login page.
    Scan the QR code with your phone to authenticate.
    Session is saved for future reuse.
    """
    # The docstring above is rendered by Click as the command help; keep it verbatim.

    # --- Web (QR code) flow -------------------------------------------
    if auth_type == "web":
        qr_flow = WebAuth()
        click.echo("Opening browser for QR authentication...")
        click.echo(
            "Scan the QR code with your phone "
            "(Telegram → Settings → Devices → Link Desktop Device)."
        )

        if not qr_flow.login(force=force):
            click.echo("❌ Authentication failed. Check logs for details.", err=True)
            raise SystemExit(1)

        click.echo("✅ Authentication successful — session saved.")
        return

    # Any value other than "mtproto" is a no-op (click.Choice already restricts input).
    if auth_type != "mtproto":
        return

    # --- MTProto flow -------------------------------------------------
    try:
        api_flow = MTProtoAuth()
    except ValueError as exc:
        # MTProtoAuth() signals a configuration problem with ValueError;
        # show it together with setup instructions.
        click.echo(
            f"❌ {exc}\nCopy .env.example → .env and fill in TG_API_ID, "
            "TG_API_HASH from https://my.telegram.org/apps",
            err=True,
        )
        raise SystemExit(1) from exc

    # Without --force an existing valid session short-circuits the login.
    if not force and api_flow.is_session_valid():
        click.echo("✅ Valid MTProto session already exists — no re-auth needed.")
        return

    try:
        session_client = api_flow.login(force=force)
        click.echo("✅ MTProto authentication successful — session saved.")
        session_client.disconnect()
    except Exception as exc:
        # CLI boundary: report any login/disconnect failure and exit non-zero.
        click.echo(f"❌ MTProto auth failed: {exc}", err=True)
        raise SystemExit(1) from exc
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# ------------------------------------------------------------------
|
|
110
|
+
# parse (group with subcommands)
|
|
111
|
+
# ------------------------------------------------------------------
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@main.group()
def parse() -> None:
    """Parse messages from a Telegram channel."""
    # Pure container group: the `open` and `closed` subcommands do the work.
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _common_output_options(cmd: click.Group) -> click.Group:
    """Decorator adding --format, --output-dir, --db-path, --incremental.

    Each option is attached by calling ``click.option(...)(cmd)`` in turn;
    the object returned by the last call is handed back to the caller.

    NOTE(review): nothing else in this module references this helper — the
    ``parse open`` / ``parse closed`` commands declare the same four options
    inline. The ``click.Group`` annotation also looks narrower than what the
    body requires; confirm the intended argument type before reusing it.
    """
    shared_options = (
        (
            ("--format", "output_fmt"),
            {
                "type": click.Choice(_FMT_CHOICES),
                "default": None,
                "help": "Output format (default: from config.yaml).",
            },
        ),
        (
            ("--output-dir",),
            {
                "default": None,
                "type": click.Path(file_okay=False, writable=True),
                "help": "Directory for output files (default: from config.yaml).",
            },
        ),
        (
            ("--db-path",),
            {
                "default": None,
                "type": click.Path(file_okay=True, writable=True),
                "help": "Path to SQLite database (required for --format sqlite).",
            },
        ),
        (
            ("--incremental",),
            {
                "is_flag": True,
                "help": "Only save messages newer than the last saved ID for this channel.",
            },
        ),
    )

    # Options are applied in the listed order, which is the order the
    # original hand-written chain used.
    for param_decls, attrs in shared_options:
        cmd = click.option(*param_decls, **attrs)(cmd)
    return cmd
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@parse.command("open")
@click.argument("channel")
@click.option(
    "--limit",
    default=None,
    type=int,
    help="Max messages to fetch (default: from config.yaml).",
)
@click.option(
    "--date-from",
    default=None,
    type=str,
    help="Only messages after this ISO date (YYYY-MM-DD).",
)
@click.option(
    "--date-to",
    default=None,
    type=str,
    help="Only messages before this ISO date (YYYY-MM-DD).",
)
@click.option(
    "--offset-id",
    default=0,
    type=int,
    help="Message ID to start fetching from (pagination).",
)
@click.option(
    "--format",
    "output_fmt",
    type=click.Choice(_FMT_CHOICES),
    default=None,
    help="Output format (default: from config.yaml).",
)
@click.option(
    "--output-dir",
    default=None,
    type=click.Path(file_okay=False, writable=True),
    help="Directory for output files (default: from config.yaml).",
)
@click.option(
    "--db-path",
    default=None,
    type=click.Path(file_okay=True, writable=True),
    help="Path to SQLite database (required for --format sqlite).",
)
@click.option(
    "--incremental",
    is_flag=True,
    help="Only save messages newer than the last saved ID for this channel.",
)
def parse_open(
    channel: str,
    limit: int | None,
    output_fmt: str | None,
    output_dir: str | None,
    date_from: str | None,
    date_to: str | None,
    offset_id: int,
    db_path: str | None,
    incremental: bool,
) -> None:
    """Parse an OPEN Telegram channel via MTProto API.

    CHANNEL — channel username (e.g. @durov) or invite hash.
    """
    # The docstring above is rendered by Click as the command help; keep it verbatim.

    # get_setting() treats every positional argument as a step in the key
    # path and takes the fallback through the keyword-only `default`.
    # Passing the fallback positionally made every lookup return None, so
    # int(None) raised whenever --limit was omitted.
    effective_limit = limit or int(get_setting("message_limit", default="100"))
    effective_fmt = output_fmt or get_setting("output_format", default="json")
    effective_dir = output_dir or get_setting("output_dir", default="data/output")

    # Date filters: the parsed value is stamped as UTC (any offset in the
    # input is overwritten by replace()).
    df = _parse_iso_date_option(date_from, "--date-from")
    dt = _parse_iso_date_option(date_to, "--date-to")

    click.echo(
        f"📡 Parsing open channel '{channel}' "
        f"(limit={effective_limit}, format={effective_fmt})"
        + (", incremental" if incremental else "")
        + "..."
    )

    # Click callbacks are synchronous; run the async pipeline to completion here.
    asyncio.run(
        _run_parse_open(
            channel=channel,
            limit=effective_limit,
            fmt=effective_fmt,
            output_dir=effective_dir,
            date_from=df,
            date_to=dt,
            offset_id=offset_id,
            db_path=Path(db_path) if db_path else None,
            incremental=incremental,
        )
    )


def _parse_iso_date_option(value: str | None, option_name: str) -> datetime | None:
    """Turn an ISO date string from the command line into a UTC datetime.

    Returns None when the option was not supplied. Raises click.BadParameter
    (a usage error instead of a traceback) when the text is not valid ISO.
    """
    if not value:
        return None
    try:
        parsed = datetime.fromisoformat(value)
    except ValueError as exc:
        raise click.BadParameter(
            f"expected an ISO date (YYYY-MM-DD), got {value!r}",
            param_hint=option_name,
        ) from exc
    return parsed.replace(tzinfo=UTC)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
@parse.command("closed")
@click.argument("url")
@click.option(
    "--limit",
    default=None,
    type=int,
    help="Max messages to fetch (default: from config.yaml).",
)
@click.option(
    "--format",
    "output_fmt",
    type=click.Choice(_FMT_CHOICES),
    default=None,
    help="Output format (default: from config.yaml).",
)
@click.option(
    "--output-dir",
    default=None,
    type=click.Path(file_okay=False, writable=True),
    help="Directory for output files (default: from config.yaml).",
)
@click.option(
    "--db-path",
    default=None,
    type=click.Path(file_okay=True, writable=True),
    help="Path to SQLite database (required for --format sqlite).",
)
@click.option(
    "--incremental",
    is_flag=True,
    help="Only save messages newer than the last saved ID for this channel.",
)
def parse_closed(
    url: str,
    limit: int | None,
    output_fmt: str | None,
    output_dir: str | None,
    db_path: str | None,
    incremental: bool,
) -> None:
    """Parse a CLOSED Telegram channel via web Telegram.

    URL — channel link, e.g. https://t.me/durov or https://t.me/durov/123.
    """
    # The docstring above is rendered by Click as the command help; keep it verbatim.

    # get_setting() treats every positional argument as a step in the key
    # path and takes the fallback through the keyword-only `default`.
    # Passing the fallback positionally made every lookup return None, so
    # int(None) raised whenever --limit was omitted.
    effective_limit = limit or int(get_setting("message_limit", default="100"))
    effective_fmt = output_fmt or get_setting("output_format", default="json")
    effective_dir = output_dir or get_setting("output_dir", default="data/output")

    click.echo(
        f"🌐 Parsing closed channel '{url}' "
        f"(limit={effective_limit}, format={effective_fmt})"
        + (", incremental" if incremental else "")
        + "..."
    )

    # Click callbacks are synchronous; run the async pipeline to completion here.
    asyncio.run(
        _run_parse_closed(
            url=url,
            limit=effective_limit,
            fmt=effective_fmt,
            output_dir=effective_dir,
            db_path=Path(db_path) if db_path else None,
            incremental=incremental,
        )
    )
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
# ------------------------------------------------------------------
|
|
318
|
+
# export (convert already-parsed data)
|
|
319
|
+
# ------------------------------------------------------------------
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
@main.command()
@click.argument("input_path", type=click.Path(exists=True, dir_okay=False))
@click.option(
    "--format",
    "output_fmt",
    type=click.Choice(_FMT_CHOICES),
    default="csv",
    help="Target format (default: csv).",
)
@click.option(
    "--output-dir",
    default=None,
    type=click.Path(file_okay=False, writable=True),
    help="Output directory (default: from config or same as input).",
)
@click.option(
    "--db-path",
    default=None,
    type=click.Path(file_okay=True, writable=True),
    help="Path to SQLite database (for --format sqlite).",
)
def export(
    input_path: str,
    output_fmt: str,
    output_dir: str | None,
    db_path: str | None,
) -> None:
    """Convert a previously saved JSON / CSV / TXT file into another format.

    Reads messages from INPUT_PATH, detects the source format
    from the file extension, and writes them in the requested --format.
    """
    # The docstring above is rendered by Click as the command help; keep it verbatim.
    import csv

    inp = Path(input_path)
    click.echo(f"📂 Reading messages from {inp} …")

    # The source format is decided purely by the (case-insensitive) extension.
    ext = inp.suffix.lower()
    messages = []

    if ext == ".json":
        with inp.open(encoding="utf-8") as fh:
            raw = json.load(fh)
        # Iterating a non-list (e.g. a dict) would feed bare keys to
        # _dict_to_message and crash there; fail with a clear message instead.
        if not isinstance(raw, list):
            click.echo(f"❌ Expected a JSON array of messages in {inp}", err=True)
            raise SystemExit(1)
        messages = [_dict_to_message(item) for item in raw]
    elif ext == ".csv":
        # newline="" lets the csv module handle line endings itself, which is
        # required for quoted fields that contain embedded newlines.
        with inp.open(encoding="utf-8", newline="") as fh:
            messages = [_dict_to_message(row) for row in csv.DictReader(fh)]
    elif ext == ".txt":
        # naive TXT reading — parse the structured text format
        messages = _parse_txt(inp)
    else:
        click.echo(f"❌ Unsupported input format: {ext}", err=True)
        raise SystemExit(1)

    if not messages:
        click.echo("ℹ️ No messages found in input file.")
        return

    # NOTE(review): the --output-dir help mentions config.yaml, but only the
    # input file's directory is used as the fallback here.
    effective_dir = output_dir or inp.parent

    result = save_messages(
        messages=messages,
        output_dir=effective_dir,
        channel_name=inp.stem.split("_")[0],  # heuristic
        fmt=output_fmt,
        db_path=Path(db_path) if db_path else None,
    )
    if result:
        click.echo(f"✅ Exported {len(messages)} messages → {result}")
    else:
        # A falsy result is reported as a SQLite write.
        click.echo(f"✅ Exported {len(messages)} messages → sqlite:{db_path or 'default.db'}")
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
# ------------------------------------------------------------------
|
|
401
|
+
# Async helpers
|
|
402
|
+
# ------------------------------------------------------------------
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
async def _run_parse_open(
    channel: str,
    limit: int,
    fmt: str,
    output_dir: str,
    date_from: datetime | None,
    date_to: datetime | None,
    offset_id: int,
    db_path: Path | None,
    incremental: bool,
) -> None:
    """Fetch messages from an open channel over MTProto and persist them.

    Exits with status 1 when MTProtoAuth() raises ValueError or when no
    valid session exists. The client is always disconnected once parsing
    and saving have finished, whether or not they succeeded.
    """
    try:
        session = MTProtoAuth()
    except ValueError as exc:
        click.echo(f"❌ {exc}", err=True)
        raise SystemExit(1) from exc

    if not session.is_session_valid():
        click.echo(
            "❌ No valid MTProto session. Run 'tgparser auth --type mtproto' first.",
            err=True,
        )
        raise SystemExit(1)

    # force=False: reuse the stored session instead of logging in again.
    client = session.login(force=False)

    try:
        fetched = await MTProtoParser(client).parse(
            channel=channel,
            limit=limit,
            date_from=date_from,
            date_to=date_to,
            offset_id=offset_id,
        )

        if not fetched:
            click.echo("ℹ️ No messages found (channel may be empty or inaccessible).")
            return

        # Both savers take the same keyword arguments; pick one up front.
        persist = save_messages_incremental if incremental else save_messages
        destination = persist(
            messages=fetched,
            output_dir=output_dir,
            channel_name=channel,
            fmt=fmt,
            db_path=db_path,
        )

        if destination:
            click.echo(f"✅ Parsed {len(fetched)} messages → {destination}")
        else:
            click.echo(f"✅ Parsed {len(fetched)} messages — no new data.")
    finally:
        await client.disconnect()
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
async def _run_parse_closed(
    url: str,
    limit: int,
    fmt: str,
    output_dir: str,
    db_path: Path | None,
    incremental: bool,
) -> None:
    """Use WebParser (Playwright) to parse a closed channel and save the result.

    Any exception raised while creating the parser or parsing is reported
    and turned into exit status 1. A parser that was created is always
    closed before returning.
    """
    # Explicit sentinel instead of inspecting locals(): stays None when
    # WebParser() itself raises, so the cleanup below is skipped.
    web_parser = None
    try:
        web_parser = WebParser()
        messages = await web_parser.parse(url=url, limit=limit)
    except Exception as exc:
        click.echo(f"❌ Web parse failed: {exc}", err=True)
        raise SystemExit(1) from exc
    finally:
        if web_parser is not None:
            await web_parser.close()

    if not messages:
        click.echo("ℹ️ No messages found (channel may be empty or inaccessible).")
        return

    # Channel name = last path segment of the URL.
    # NOTE(review): for a message link such as https://t.me/durov/123 this
    # yields "123" rather than the channel — confirm whether that is intended.
    channel_name = url.rstrip("/").rsplit("/", 1)[-1]

    # Both savers take the same keyword arguments; pick one up front.
    persist = save_messages_incremental if incremental else save_messages
    filepath = persist(
        messages=messages,
        output_dir=output_dir,
        channel_name=channel_name,
        fmt=fmt,
        db_path=db_path,
    )

    if filepath:
        click.echo(f"✅ Parsed {len(messages)} messages → {filepath}")
    else:
        click.echo(f"✅ Parsed {len(messages)} messages — no new data.")
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
# ------------------------------------------------------------------
|
|
515
|
+
# Format conversion helpers
|
|
516
|
+
# ------------------------------------------------------------------
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def _dict_to_message(d: dict) -> Message:
    """Convert a plain dict back to a Message object (for export).

    The dict may come from ``json.load`` (native lists, dicts and booleans)
    or from ``csv.DictReader`` (every value is a string), so each field is
    normalised from either representation.

    Missing ``id`` becomes 0 and a missing ``date`` becomes the current UTC
    time. Raises ValueError if ``id`` or ``date`` is present but unparseable.
    """
    media_raw = d.get("media_urls", "") or ""
    reactions_raw = d.get("reactions", "") or ""

    # media_urls: a list (JSON) or a pipe-separated string (CSV).
    if isinstance(media_raw, list):
        media_urls: list[str] = media_raw
    elif media_raw and media_raw != "[]":
        media_urls = media_raw.split("|")
    else:
        media_urls = []

    # reactions: already a dict (JSON) or a JSON object encoded as a string.
    if isinstance(reactions_raw, dict):
        reactions: dict[str, int] = reactions_raw
    elif reactions_raw and reactions_raw.startswith("{"):
        reactions = json.loads(reactions_raw)
    else:
        reactions = {}

    # is_forwarded: JSON yields a real bool, CSV yields text. Comparing a bool
    # against the string spellings alone always evaluated to False.
    forwarded_raw = d.get("is_forwarded", "")
    if isinstance(forwarded_raw, bool):
        is_forwarded = forwarded_raw
    else:
        is_forwarded = str(forwarded_raw).strip().lower() in ("true", "1", "yes")

    return Message(
        id=int(d["id"]) if d.get("id") else 0,
        channel=d.get("channel", ""),
        date=datetime.fromisoformat(d["date"]) if d.get("date") else datetime.now(UTC),
        author=d.get("author") or None,
        text=d.get("text", ""),
        media_urls=media_urls,
        reactions=reactions,
        is_forwarded=is_forwarded,
        raw_source=d.get("raw_source", ""),
    )
|
|
553
|
+
|
|
554
|
+
|
|
555
|
+
def _parse_txt(path: Path) -> list[Message]:
    """Naive parser for the TXT format produced by _write_txt.

    The file is split into blocks on a blank line followed by
    ``--- Message #``. In each block the first line is the header carrying
    the message id, the following ``Key: value`` lines up to the first blank
    line are metadata, and everything after that blank line is the text.

    An unparseable id becomes 0 and an unparseable date becomes the current
    UTC time.
    """
    header_prefix = "--- Message #"
    header_suffix = " ---"

    content = path.read_text(encoding="utf-8")
    blocks = content.strip().split("\n\n" + header_prefix)

    messages: list[Message] = []
    for block in blocks:
        if not block.strip():
            continue
        lines = block.strip().split("\n")

        # The split above strips the prefix from every block except the
        # first, so the header is either "--- Message #12 ---" or "12 ---".
        # Requiring the prefix here gave id 0 to every message after the first.
        header = lines[0].strip()
        id_text = header.removeprefix(header_prefix).removesuffix(header_suffix)
        try:
            mid = int(id_text.strip())
        except ValueError:
            mid = 0

        # Metadata lines run until the first blank line; the rest is the text.
        meta: dict[str, str] = {}
        text_lines: list[str] = []
        in_text = False
        for line in lines[1:]:
            if in_text:
                text_lines.append(line)
            elif line.strip() == "":
                in_text = True
            elif ":" in line:
                # Split on the first colon only so values (e.g. ISO times) keep theirs.
                key, _, val = line.partition(":")
                meta[key.strip().lower()] = val.strip()

        # "—" is treated as the placeholder for "no author".
        author = meta.get("author", "—")
        if author == "—":
            author = None

        try:
            dt = datetime.fromisoformat(meta.get("date", ""))
        except (ValueError, TypeError):
            dt = datetime.now(UTC)

        # Media: comma-separated URLs, empty entries dropped.
        media_urls = [u.strip() for u in meta.get("media", "").split(",") if u.strip()]

        # Reactions: comma-separated "name: count" pairs; bad counts are skipped.
        reactions: dict[str, int] = {}
        for part in meta.get("reactions", "").split(","):
            part = part.strip()
            if ":" in part:
                k, v = part.split(":", 1)
                with contextlib.suppress(ValueError):
                    reactions[k.strip()] = int(v.strip())

        messages.append(
            Message(
                id=mid,
                channel=meta.get("channel", ""),
                date=dt,
                author=author,
                text="\n".join(text_lines).strip(),
                media_urls=media_urls,
                reactions=reactions,
                is_forwarded="forwarded" in meta.get("forwarded", "").lower(),
                raw_source="txt_export",
            )
        )
    return messages
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
# Run the Click root group when this module is executed directly.
if __name__ == "__main__":
    main()
|
tgparser/config.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Configuration loader — .env secrets + config.yaml settings."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
from dotenv import load_dotenv
|
|
9
|
+
|
|
10
|
+
# Project root (where pyproject.toml lives)
|
|
11
|
+
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
|
|
12
|
+
DEFAULT_CONFIG_PATH = PROJECT_ROOT / "config.yaml"
|
|
13
|
+
DEFAULT_ENV_PATH = PROJECT_ROOT / ".env"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _load_env(env_path: Path | None = None) -> None:
    """Feed a .env file to ``load_dotenv`` if the file exists.

    Falls back to DEFAULT_ENV_PATH when no path is given; a missing file is
    silently ignored.
    """
    dotenv_file = env_path or DEFAULT_ENV_PATH
    if not dotenv_file.exists():
        return
    load_dotenv(dotenv_file)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _load_yaml(config_path: Path | None = None) -> dict[str, Any]:
    """Read the YAML settings file and return its parsed content.

    Falls back to DEFAULT_CONFIG_PATH when no path is given. A missing file,
    or one whose parsed content is falsy (e.g. empty), yields ``{}``.
    """
    yaml_file = config_path or DEFAULT_CONFIG_PATH
    if yaml_file.exists():
        with open(yaml_file, encoding="utf-8") as stream:
            parsed = yaml.safe_load(stream)
        return parsed or {}
    return {}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Load once at import time
|
|
33
|
+
_load_env()
|
|
34
|
+
_yaml_config = _load_yaml()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_secret(key: str, default: str | None = None) -> str | None:
|
|
38
|
+
"""Read a secret from environment (os.environ — loaded from .env)."""
|
|
39
|
+
return os.environ.get(key, default)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_setting(*keys: str, default: Any = None) -> Any:
    """Look up a value in the loaded YAML config by following a key path.

    Every positional argument is one step into the nested mapping; the
    fallback must be passed as the keyword-only ``default``. The fallback is
    returned when a step hits a non-mapping or the final value is None.

    Example: get_setting("parsing", "scroll_delay_ms") -> 1500
    """
    current: Any = _yaml_config
    for step in keys:
        # Cannot descend into a scalar, a list, or an already-missing (None) node.
        if not isinstance(current, dict):
            return default
        current = current.get(step)
    return default if current is None else current
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Data models."""
|