tgparser-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tgparser/cli.py ADDED
@@ -0,0 +1,637 @@
1
+ """CLI entry point — Click-based commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import contextlib
7
+ import json
8
+ import logging
9
+ from datetime import UTC, datetime
10
+ from pathlib import Path
11
+
12
+ import click
13
+
14
+ from tgparser import __version__
15
+ from tgparser.auth import MTProtoAuth, WebAuth
16
+ from tgparser.config import get_setting
17
+ from tgparser.models.message import Message
18
+ from tgparser.parsers import MTProtoParser, WebParser
19
+ from tgparser.storage import (
20
+ save_messages,
21
+ save_messages_incremental,
22
+ )
23
+ from tgparser.utils import setup_logging
24
+
25
+ logger = logging.getLogger("tgparser")
26
+
27
+ # Shared output-format choices
28
+ _FMT_CHOICES = ["json", "csv", "txt", "sqlite"]
29
+
30
+
31
@click.group()
@click.version_option(version=__version__, prog_name="tgparser")
@click.option("--debug", is_flag=True, help="Enable debug logging.")
def main(debug: bool = False) -> None:
    """TgParser — Telegram channel message extractor.

    Parse open channels via MTProto (Telethon) and closed channels
    via web Telegram (Playwright + BeautifulSoup).
    """
    # Root command group: the only work done here is selecting the log
    # level (DEBUG when --debug is given, INFO otherwise) and handing it
    # to the project's logging setup before any subcommand runs.
    setup_logging(level=logging.DEBUG if debug else logging.INFO)
44
+
45
+
46
+ # ------------------------------------------------------------------
47
+ # auth
48
+ # ------------------------------------------------------------------
49
+
50
+
51
@main.command()
@click.option(
    "--type",
    "auth_type",
    type=click.Choice(["web", "mtproto"]),
    default="web",
    help="Authentication method (default: web QR).",
)
@click.option(
    "--force",
    is_flag=True,
    help="Force re-authentication even if a valid session exists.",
)
def auth(auth_type: str, force: bool) -> None:
    """Authorize and save session.

    Opens a browser window with Telegram Web login page.
    Scan the QR code with your phone to authenticate.
    Session is saved for future reuse.
    """
    # --- Web (QR) flow -------------------------------------------------
    if auth_type == "web":
        web_auth = WebAuth()
        click.echo("Opening browser for QR authentication...")
        click.echo(
            "Scan the QR code with your phone "
            "(Telegram → Settings → Devices → Link Desktop Device)."
        )

        if not web_auth.login(force=force):
            click.echo("❌ Authentication failed. Check logs for details.", err=True)
            raise SystemExit(1)

        click.echo("✅ Authentication successful — session saved.")
        return

    # Any value other than "mtproto" has nothing left to do.
    if auth_type != "mtproto":
        return

    # --- MTProto flow --------------------------------------------------
    # Constructing MTProtoAuth may raise ValueError; that case is reported
    # together with a hint on how to provide the API credentials.
    try:
        mtproto = MTProtoAuth()
    except ValueError as exc:
        click.echo(
            f"❌ {exc}\nCopy .env.example → .env and fill in TG_API_ID, "
            "TG_API_HASH from https://my.telegram.org/apps",
            err=True,
        )
        raise SystemExit(1) from exc

    # Skip the login round-trip when a usable session is already stored
    # and the user did not ask to force a new one.
    session_ok = mtproto.is_session_valid() if not force else False
    if session_ok:
        click.echo("✅ Valid MTProto session already exists — no re-auth needed.")
        return

    try:
        client = mtproto.login(force=force)
        click.echo("✅ MTProto authentication successful — session saved.")
        client.disconnect()
    except Exception as exc:
        click.echo(f"❌ MTProto auth failed: {exc}", err=True)
        raise SystemExit(1) from exc
107
+
108
+
109
+ # ------------------------------------------------------------------
110
+ # parse (group with subcommands)
111
+ # ------------------------------------------------------------------
112
+
113
+
114
@main.group()
def parse() -> None:
    """Parse messages from a Telegram channel."""
    # Pure container group: the real work lives in its subcommands.
118
+
119
+
120
def _common_output_options(cmd: click.Group) -> click.Group:
    """Decorator adding --format, --output-dir, --db-path, --incremental."""
    # The option decorators are listed in the order they are applied to
    # ``cmd``; each call wraps the result of the previous one.
    option_decorators = (
        click.option(
            "--format",
            "output_fmt",
            type=click.Choice(_FMT_CHOICES),
            default=None,
            help="Output format (default: from config.yaml).",
        ),
        click.option(
            "--output-dir",
            default=None,
            type=click.Path(file_okay=False, writable=True),
            help="Directory for output files (default: from config.yaml).",
        ),
        click.option(
            "--db-path",
            default=None,
            type=click.Path(file_okay=True, writable=True),
            help="Path to SQLite database (required for --format sqlite).",
        ),
        click.option(
            "--incremental",
            is_flag=True,
            help="Only save messages newer than the last saved ID for this channel.",
        ),
    )
    for apply_option in option_decorators:
        cmd = apply_option(cmd)
    return cmd
148
+
149
+
150
@parse.command("open")
@click.argument("channel")
@click.option(
    "--limit",
    default=None,
    type=int,
    help="Max messages to fetch (default: from config.yaml).",
)
@click.option(
    "--date-from",
    default=None,
    type=str,
    help="Only messages after this ISO date (YYYY-MM-DD).",
)
@click.option(
    "--date-to",
    default=None,
    type=str,
    help="Only messages before this ISO date (YYYY-MM-DD).",
)
@click.option(
    "--offset-id",
    default=0,
    type=int,
    help="Message ID to start fetching from (pagination).",
)
@click.option(
    "--format",
    "output_fmt",
    type=click.Choice(_FMT_CHOICES),
    default=None,
    help="Output format (default: from config.yaml).",
)
@click.option(
    "--output-dir",
    default=None,
    type=click.Path(file_okay=False, writable=True),
    help="Directory for output files (default: from config.yaml).",
)
@click.option(
    "--db-path",
    default=None,
    type=click.Path(file_okay=True, writable=True),
    help="Path to SQLite database (required for --format sqlite).",
)
@click.option(
    "--incremental",
    is_flag=True,
    help="Only save messages newer than the last saved ID for this channel.",
)
def parse_open(
    channel: str,
    limit: int | None,
    output_fmt: str | None,
    output_dir: str | None,
    date_from: str | None,
    date_to: str | None,
    offset_id: int,
    db_path: str | None,
    incremental: bool,
) -> None:
    """Parse an OPEN Telegram channel via MTProto API.

    CHANNEL — channel username (e.g. @durov) or invite hash.
    """
    # NOTE: the docstring above doubles as the CLI help text, so the
    # implementation notes live in comments instead.
    #
    # Command-line values win; anything omitted falls back to the YAML
    # settings. get_setting() takes the key path positionally and the
    # fallback as the keyword-only ``default=``. Passing the fallback
    # positionally would make it a second lookup key and yield None.
    effective_limit = limit or int(get_setting("message_limit", default="100"))
    effective_fmt = output_fmt or get_setting("output_format", default="json")
    effective_dir = output_dir or get_setting("output_dir", default="data/output")

    # Parse date filters. The parsed value is stamped as UTC. A malformed
    # value is reported as a usage error for the offending option rather
    # than surfacing as an unhandled ValueError traceback.
    df: datetime | None = None
    dt: datetime | None = None
    if date_from:
        try:
            df = datetime.fromisoformat(date_from).replace(tzinfo=UTC)
        except ValueError as exc:
            raise click.BadParameter(
                f"not a valid ISO date: {date_from!r}", param_hint="--date-from"
            ) from exc
    if date_to:
        try:
            dt = datetime.fromisoformat(date_to).replace(tzinfo=UTC)
        except ValueError as exc:
            raise click.BadParameter(
                f"not a valid ISO date: {date_to!r}", param_hint="--date-to"
            ) from exc

    click.echo(
        f"📡 Parsing open channel '{channel}' "
        f"(limit={effective_limit}, format={effective_fmt})"
        + (", incremental" if incremental else "")
        + "..."
    )

    # Run async parse in sync entry point
    asyncio.run(
        _run_parse_open(
            channel=channel,
            limit=effective_limit,
            fmt=effective_fmt,
            output_dir=effective_dir,
            date_from=df,
            date_to=dt,
            offset_id=offset_id,
            db_path=Path(db_path) if db_path else None,
            incremental=incremental,
        )
    )
248
+
249
+
250
@parse.command("closed")
@click.argument("url")
@click.option(
    "--limit",
    default=None,
    type=int,
    help="Max messages to fetch (default: from config.yaml).",
)
@click.option(
    "--format",
    "output_fmt",
    type=click.Choice(_FMT_CHOICES),
    default=None,
    help="Output format (default: from config.yaml).",
)
@click.option(
    "--output-dir",
    default=None,
    type=click.Path(file_okay=False, writable=True),
    help="Directory for output files (default: from config.yaml).",
)
@click.option(
    "--db-path",
    default=None,
    type=click.Path(file_okay=True, writable=True),
    help="Path to SQLite database (required for --format sqlite).",
)
@click.option(
    "--incremental",
    is_flag=True,
    help="Only save messages newer than the last saved ID for this channel.",
)
def parse_closed(
    url: str,
    limit: int | None,
    output_fmt: str | None,
    output_dir: str | None,
    db_path: str | None,
    incremental: bool,
) -> None:
    """Parse a CLOSED Telegram channel via web Telegram.

    URL — channel link, e.g. https://t.me/durov or https://t.me/durov/123.
    """
    # NOTE: the docstring above doubles as the CLI help text, so the
    # implementation notes live in comments instead.
    #
    # Command-line values win; anything omitted falls back to the YAML
    # settings. get_setting() takes the key path positionally and the
    # fallback as the keyword-only ``default=``. Passing the fallback
    # positionally would make it a second lookup key and yield None.
    effective_limit = limit or int(get_setting("message_limit", default="100"))
    effective_fmt = output_fmt or get_setting("output_format", default="json")
    effective_dir = output_dir or get_setting("output_dir", default="data/output")

    click.echo(
        f"🌐 Parsing closed channel '{url}' "
        f"(limit={effective_limit}, format={effective_fmt})"
        + (", incremental" if incremental else "")
        + "..."
    )

    # Bridge from the synchronous Click callback into the async worker.
    asyncio.run(
        _run_parse_closed(
            url=url,
            limit=effective_limit,
            fmt=effective_fmt,
            output_dir=effective_dir,
            db_path=Path(db_path) if db_path else None,
            incremental=incremental,
        )
    )
315
+
316
+
317
+ # ------------------------------------------------------------------
318
+ # export (convert already-parsed data)
319
+ # ------------------------------------------------------------------
320
+
321
+
322
@main.command()
@click.argument("input_path", type=click.Path(exists=True, dir_okay=False))
@click.option(
    "--format",
    "output_fmt",
    type=click.Choice(_FMT_CHOICES),
    default="csv",
    help="Target format (default: csv).",
)
@click.option(
    "--output-dir",
    default=None,
    type=click.Path(file_okay=False, writable=True),
    help="Output directory (default: from config or same as input).",
)
@click.option(
    "--db-path",
    default=None,
    type=click.Path(file_okay=True, writable=True),
    help="Path to SQLite database (for --format sqlite).",
)
def export(
    input_path: str,
    output_fmt: str,
    output_dir: str | None,
    db_path: str | None,
) -> None:
    """Convert a previously saved JSON / CSV / TXT file into another format.

    Reads messages from INPUT_PATH, detects the source format
    from the file extension, and writes them in the requested --format.
    """
    # NOTE: the docstring above doubles as the CLI help text, so the
    # implementation notes live in comments instead.
    import csv

    inp = Path(input_path)
    click.echo(f"📂 Reading messages from {inp} …")

    # Detect source format from the (case-insensitive) file extension.
    ext = inp.suffix.lower()
    messages = []

    if ext == ".json":
        with inp.open(encoding="utf-8") as fh:
            raw = json.load(fh)
        # Only an array of message objects can be converted. Any other
        # root (e.g. a bare object) would otherwise be iterated key by
        # key and crash further down with an unrelated error.
        if not isinstance(raw, list):
            click.echo("❌ Expected a JSON array of messages.", err=True)
            raise SystemExit(1)
        messages = [_dict_to_message(item) for item in raw]
    elif ext == ".csv":
        # newline="" lets the csv module handle line endings itself, so
        # newlines embedded in quoted fields are read back correctly.
        with inp.open(encoding="utf-8", newline="") as fh:
            messages = [_dict_to_message(row) for row in csv.DictReader(fh)]
    elif ext == ".txt":
        # naive TXT reading — parse the structured text format
        messages = _parse_txt(inp)
    else:
        click.echo(f"❌ Unsupported input format: {ext}", err=True)
        raise SystemExit(1)

    if not messages:
        click.echo("ℹ️ No messages found in input file.")
        return

    # Without --output-dir the converted file lands next to the input.
    effective_dir = output_dir or inp.parent

    result = save_messages(
        messages=messages,
        output_dir=effective_dir,
        channel_name=inp.stem.split("_")[0],  # heuristic
        fmt=output_fmt,
        db_path=Path(db_path) if db_path else None,
    )
    # A falsy result is reported as a SQLite export (no file path to show).
    if result:
        click.echo(f"✅ Exported {len(messages)} messages → {result}")
    else:
        click.echo(f"✅ Exported {len(messages)} messages → sqlite:{db_path or 'default.db'}")
398
+
399
+
400
+ # ------------------------------------------------------------------
401
+ # Async helpers
402
+ # ------------------------------------------------------------------
403
+
404
+
405
async def _run_parse_open(
    channel: str,
    limit: int,
    fmt: str,
    output_dir: str,
    date_from: datetime | None,
    date_to: datetime | None,
    offset_id: int,
    db_path: Path | None,
    incremental: bool,
) -> None:
    """Fetch messages from an open channel over MTProto and persist them.

    Exits with status 1 when MTProtoAuth cannot be constructed or when no
    valid session is stored. The client is always disconnected once the
    parse/save phase has finished, whether it succeeded or raised.
    """
    try:
        mtproto_auth = MTProtoAuth()
    except ValueError as exc:
        click.echo(f"❌ {exc}", err=True)
        raise SystemExit(1) from exc

    if not mtproto_auth.is_session_valid():
        click.echo(
            "❌ No valid MTProto session. Run 'tgparser auth --type mtproto' first.",
            err=True,
        )
        raise SystemExit(1)

    # Reuse the stored session rather than forcing a fresh login.
    client = mtproto_auth.login(force=False)

    try:
        messages = await MTProtoParser(client).parse(
            channel=channel,
            limit=limit,
            date_from=date_from,
            date_to=date_to,
            offset_id=offset_id,
        )

        if not messages:
            click.echo("ℹ️ No messages found (channel may be empty or inaccessible).")
            return

        # Both savers take the same keyword arguments; pick one up front.
        persist = save_messages_incremental if incremental else save_messages
        filepath = persist(
            messages=messages,
            output_dir=output_dir,
            channel_name=channel,
            fmt=fmt,
            db_path=db_path,
        )

        if not filepath:
            click.echo(f"✅ Parsed {len(messages)} messages — no new data.")
        else:
            click.echo(f"✅ Parsed {len(messages)} messages → {filepath}")
    finally:
        await client.disconnect()
468
+
469
+
470
async def _run_parse_closed(
    url: str,
    limit: int,
    fmt: str,
    output_dir: str,
    db_path: Path | None,
    incremental: bool,
) -> None:
    """Scrape a closed channel with WebParser and persist the result.

    Any exception raised while creating the parser or parsing is reported
    and turned into exit status 1. The parser is closed afterwards if it
    was successfully created.
    """
    # Stays None when WebParser() itself fails, so the cleanup below knows
    # there is nothing to close.
    web_parser = None
    try:
        web_parser = WebParser()
        messages = await web_parser.parse(url=url, limit=limit)
    except Exception as exc:
        click.echo(f"❌ Web parse failed: {exc}", err=True)
        raise SystemExit(1) from exc
    finally:
        if web_parser is not None:
            await web_parser.close()

    if not messages:
        click.echo("ℹ️ No messages found (channel may be empty or inaccessible).")
        return

    # Both savers take the same keyword arguments; pick one up front.
    persist = save_messages_incremental if incremental else save_messages
    filepath = persist(
        messages=messages,
        output_dir=output_dir,
        # Last path segment of the URL serves as the channel name.
        channel_name=url.rstrip("/").rsplit("/", 1)[-1],
        fmt=fmt,
        db_path=db_path,
    )

    if not filepath:
        click.echo(f"✅ Parsed {len(messages)} messages — no new data.")
    else:
        click.echo(f"✅ Parsed {len(messages)} messages → {filepath}")
512
+
513
+
514
+ # ------------------------------------------------------------------
515
+ # Format conversion helpers
516
+ # ------------------------------------------------------------------
517
+
518
+
519
def _dict_to_message(d: dict) -> Message:
    """Convert a plain dict back to a Message object (for export).

    The dict may come from ``json.load`` (native lists, dicts and
    booleans) or from ``csv.DictReader`` (every value is a string), so
    each field accepts both representations.

    Args:
        d: Mapping with the keys ``id``, ``channel``, ``date``, ``author``,
            ``text``, ``media_urls``, ``reactions``, ``is_forwarded`` and
            ``raw_source``. Missing keys fall back to empty values.

    Returns:
        A Message built from the normalised field values.

    Raises:
        ValueError: If ``id`` is not an integer, ``date`` is not an ISO
            timestamp, or a ``reactions`` string is not valid JSON.
    """
    from tgparser.models.message import Message

    media_raw = d.get("media_urls", "") or ""
    reactions_raw = d.get("reactions", "") or ""

    # media_urls could be a pipe-separated string (CSV) or a JSON list (JSON)
    if isinstance(media_raw, list):
        media_urls: list[str] = media_raw
    elif isinstance(media_raw, str) and media_raw not in ("[]", ""):
        media_urls = media_raw.split("|")
    else:
        media_urls = []

    # reactions could be a JSON string or already a dict
    if isinstance(reactions_raw, dict):
        reactions: dict[str, int] = reactions_raw
    elif isinstance(reactions_raw, str) and reactions_raw.startswith("{"):
        reactions = json.loads(reactions_raw)
    else:
        reactions = {}

    # is_forwarded is a real bool in JSON input but text in CSV input.
    # Comparing only against string spellings would turn a JSON ``true``
    # into False, so the two cases are handled separately.
    forwarded_raw = d.get("is_forwarded", "")
    if isinstance(forwarded_raw, str):
        is_forwarded = forwarded_raw.strip().lower() in ("true", "1", "yes")
    else:
        is_forwarded = bool(forwarded_raw)

    return Message(
        id=int(d["id"]) if d.get("id") else 0,
        channel=d.get("channel", ""),
        # A missing date is replaced by the current UTC time.
        date=datetime.fromisoformat(d["date"]) if d.get("date") else datetime.now(UTC),
        author=d.get("author") or None,
        text=d.get("text", ""),
        media_urls=media_urls,
        reactions=reactions,
        is_forwarded=is_forwarded,
        raw_source=d.get("raw_source", ""),
    )
553
+
554
+
555
def _parse_txt(path: Path) -> list[Message]:
    """Naive parser for the TXT format produced by _write_txt.

    Layout this parser expects (inferred from the parsing logic here;
    confirm against the writer): blocks separated by a blank line, each
    starting with a ``--- Message #<id> ---`` header, followed by
    ``Key: value`` metadata lines, a blank line, and then the message text.

    Args:
        path: Text file to read (decoded as UTF-8).

    Returns:
        One Message per non-empty block, in file order. A header with no
        parseable id yields id 0, and an unparseable date falls back to
        the current UTC time.
    """

    from tgparser.models.message import Message

    marker = "--- Message #"
    text = path.read_text(encoding="utf-8")
    blocks = text.strip().split("\n\n" + marker)

    messages: list[Message] = []
    for block in blocks:
        if not block.strip():
            continue
        lines = block.strip().split("\n")

        # Reconstruct the message id from the first line. The split above
        # consumes the marker for every block except the first, so the
        # header is either "--- Message #12 ---" or just "12 ---". Both
        # forms are normalised before converting to int.
        header = lines[0].strip()
        id_text = header.removeprefix(marker).removesuffix("---").strip()
        try:
            mid = int(id_text)
        except ValueError:
            mid = 0

        # Extract metadata lines until blank line, then text
        meta: dict[str, str] = {}
        text_lines: list[str] = []
        in_text = False
        for line in lines[1:]:
            if in_text:
                text_lines.append(line)
                continue
            if line.strip() == "":
                in_text = True
                continue
            if ":" in line:
                # Only the first colon separates key from value, so values
                # such as ISO timestamps keep their own colons.
                key, _, val = line.partition(":")
                meta[key.strip().lower()] = val.strip()

        channel = meta.get("channel", "")
        date_str = meta.get("date", "")
        author = meta.get("author", "—")
        if author == "—":
            # An em dash stands for "no author".
            author = None
        media_raw = meta.get("media", "")
        reactions_raw = meta.get("reactions", "")

        # Parse date
        dt: datetime
        try:
            dt = datetime.fromisoformat(date_str)
        except (ValueError, TypeError):
            dt = datetime.now(UTC)

        # Parse media (comma-separated list of URLs)
        media_urls: list[str] = []
        if media_raw:
            media_urls = [u.strip() for u in media_raw.split(",") if u.strip()]

        # Parse reactions ("name: count" pairs separated by commas).
        # Entries whose count is not an integer are skipped.
        reactions: dict[str, int] = {}
        if reactions_raw:
            for part in reactions_raw.split(","):
                part = part.strip()
                if ":" in part:
                    k, v = part.split(":", 1)
                    with contextlib.suppress(ValueError):
                        reactions[k.strip()] = int(v.strip())

        messages.append(
            Message(
                id=mid,
                channel=channel,
                date=dt,
                author=author,
                text="\n".join(text_lines).strip(),
                media_urls=media_urls,
                reactions=reactions,
                is_forwarded="forwarded" in meta.get("forwarded", "").lower(),
                raw_source="txt_export",
            )
        )
    return messages
634
+
635
+
636
+ if __name__ == "__main__":
637
+ main()
tgparser/config.py ADDED
@@ -0,0 +1,55 @@
1
+ """Configuration loader — .env secrets + config.yaml settings."""
2
+
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import yaml
8
+ from dotenv import load_dotenv
9
+
10
+ # Project root (where pyproject.toml lives)
11
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
12
+ DEFAULT_CONFIG_PATH = PROJECT_ROOT / "config.yaml"
13
+ DEFAULT_ENV_PATH = PROJECT_ROOT / ".env"
14
+
15
+
16
def _load_env(env_path: Path | None = None) -> None:
    """Load variables from a .env file; do nothing when the file is absent.

    Args:
        env_path: File to load. Falls back to DEFAULT_ENV_PATH when omitted.
    """
    target = env_path or DEFAULT_ENV_PATH
    if not target.exists():
        return
    load_dotenv(target)
21
+
22
+
23
def _load_yaml(config_path: Path | None = None) -> dict[str, Any]:
    """Read the YAML settings file.

    Args:
        config_path: File to read. Falls back to DEFAULT_CONFIG_PATH when
            omitted.

    Returns:
        The parsed document, or an empty dict when the file does not exist
        or parses to a falsy value (e.g. an empty file).
    """
    source = config_path or DEFAULT_CONFIG_PATH
    if source.exists():
        with source.open(encoding="utf-8") as handle:
            parsed = yaml.safe_load(handle)
        return parsed or {}
    return {}
30
+
31
+
32
+ # Load once at import time
33
+ _load_env()
34
+ _yaml_config = _load_yaml()
35
+
36
+
37
+ def get_secret(key: str, default: str | None = None) -> str | None:
38
+ """Read a secret from environment (os.environ — loaded from .env)."""
39
+ return os.environ.get(key, default)
40
+
41
+
42
def get_setting(*keys: str, default: Any = None) -> Any:
    """Look up a value in the loaded YAML config by a path of keys.

    Each positional argument descends one level into the nested mapping;
    the fallback must be passed as the keyword argument ``default``.

    Example: get_setting("parsing", "scroll_delay_ms") -> 1500

    Returns:
        The value found at the key path, or *default* when a key is
        missing, the value is None, or the path runs through a
        non-mapping node.
    """
    current = _yaml_config
    for key in keys:
        if not isinstance(current, dict):
            # The path continues but there is no mapping to descend into.
            return default
        current = current.get(key)
    return default if current is None else current
@@ -0,0 +1 @@
1
+ """Data models."""