tgparser-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tgparser/__init__.py +3 -0
- tgparser/auth/__init__.py +6 -0
- tgparser/auth/mtproto_auth.py +130 -0
- tgparser/auth/web_auth.py +260 -0
- tgparser/cli.py +637 -0
- tgparser/config.py +55 -0
- tgparser/models/__init__.py +1 -0
- tgparser/models/message.py +33 -0
- tgparser/parsers/__init__.py +6 -0
- tgparser/parsers/mtproto_parser.py +244 -0
- tgparser/parsers/web_parser.py +620 -0
- tgparser/storage/__init__.py +15 -0
- tgparser/storage/sqlite.py +118 -0
- tgparser/storage/writer.py +214 -0
- tgparser/utils.py +69 -0
- tgparser_cli-0.1.0.dist-info/METADATA +278 -0
- tgparser_cli-0.1.0.dist-info/RECORD +21 -0
- tgparser_cli-0.1.0.dist-info/WHEEL +5 -0
- tgparser_cli-0.1.0.dist-info/entry_points.txt +2 -0
- tgparser_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- tgparser_cli-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""SQLite storage for parsed messages — optional dependency (sqlite3 built-in)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import sqlite3
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from tgparser.models.message import Message
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger("tgparser")
|
|
13
|
+
|
|
14
|
+
# SQLite table schema
#
# ``messages`` holds one row per parsed message. The composite primary key
# (id, channel) is what allows ``INSERT OR IGNORE`` in ``save_messages`` to
# skip rows that were already stored for the same channel.
CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS messages (
    id INTEGER NOT NULL,
    channel TEXT NOT NULL,
    date TEXT NOT NULL,
    author TEXT,
    text TEXT NOT NULL,
    media_urls TEXT, -- JSON array stored as text
    reactions TEXT, -- JSON object stored as text
    is_forwarded INTEGER DEFAULT 0,
    raw_source TEXT DEFAULT 'unknown',
    saved_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (id, channel)
);
"""

# ``metadata`` keeps a single row per channel with the last message id that
# was recorded through ``update_last_message_id``.
CREATE_METADATA_SQL = """
CREATE TABLE IF NOT EXISTS metadata (
    channel TEXT PRIMARY KEY,
    last_message_id INTEGER NOT NULL,
    updated_at TEXT NOT NULL DEFAULT (datetime('now'))
);
"""
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _ensure_tables(db: sqlite3.Connection) -> None:
    """Create the ``messages`` and ``metadata`` tables if absent, then commit."""
    # Both statements use CREATE TABLE IF NOT EXISTS, so running them against
    # an already-initialised database is harmless.
    for ddl in (CREATE_TABLE_SQL, CREATE_METADATA_SQL):
        db.execute(ddl)
    db.commit()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _get_connection(db_path: Path) -> sqlite3.Connection:
    """Open a connection to *db_path* and ensure the schema exists.

    The parent directory of *db_path* is created when missing. Rows are
    returned as ``sqlite3.Row`` so callers can access columns by name.

    Args:
        db_path: Location of the SQLite database file.

    Returns:
        An open connection; the caller is responsible for closing it.

    Raises:
        sqlite3.Error: If the database cannot be opened or the tables
            cannot be created. The connection is closed before re-raising.
    """
    db_path.parent.mkdir(parents=True, exist_ok=True)
    db = sqlite3.connect(str(db_path))
    try:
        db.row_factory = sqlite3.Row
        _ensure_tables(db)
    except BaseException:
        # Cleanup-and-reraise only: without this the half-initialised
        # connection would stay open because the caller never receives it.
        db.close()
        raise
    return db
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def save_messages(db_path: Path, messages: list[Message]) -> None:
    """Store *messages* in the SQLite database at *db_path*.

    Rows whose (id, channel) pair already exists are skipped via
    ``INSERT OR IGNORE``. All inserts are committed together; the
    connection is closed whether or not the insert succeeds.
    """
    insert_sql = """
        INSERT OR IGNORE INTO messages
            (id, channel, date, author, text, media_urls, reactions,
             is_forwarded, raw_source)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """
    conn = _get_connection(db_path)
    try:
        # Lazily build one parameter tuple per message, in column order.
        rows = (
            (
                msg.id,
                msg.channel,
                msg.date.isoformat(),
                msg.author,
                msg.text,
                json.dumps(msg.media_urls, ensure_ascii=False),
                # Empty/missing reactions are stored as NULL, not as "{}".
                json.dumps(msg.reactions, ensure_ascii=False) if msg.reactions else None,
                int(msg.is_forwarded),
                msg.raw_source,
            )
            for msg in messages
        )
        conn.executemany(insert_sql, rows)
        conn.commit()
    finally:
        conn.close()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def get_last_message_id(db_path: Path, channel: str) -> int | None:
    """Look up the newest known message id for *channel*.

    The ``metadata`` table is consulted first. When it has no row for the
    channel, the largest ``id`` in the ``messages`` table is used instead.
    Returns ``None`` when neither source has a value.
    """
    conn = _get_connection(db_path)
    try:
        bookmark = conn.execute(
            "SELECT last_message_id FROM metadata WHERE channel = ?", (channel,)
        ).fetchone()
        if bookmark is None:
            # Fallback: scan messages table
            scanned = conn.execute(
                "SELECT MAX(id) AS max_id FROM messages WHERE channel = ?", (channel,)
            ).fetchone()
            if not scanned:
                return None
            # MAX() yields NULL (None) when the channel has no stored rows.
            return scanned["max_id"]
        return bookmark["last_message_id"]
    finally:
        conn.close()
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def update_last_message_id(db_path: Path, channel: str, last_id: int) -> None:
    """Record *last_id* as the last message id of *channel* in ``metadata``.

    Inserts a new row for the channel or, if one already exists, overwrites
    its ``last_message_id`` and ``updated_at`` values.
    """
    upsert_sql = """
        INSERT INTO metadata (channel, last_message_id, updated_at)
        VALUES (?, ?, datetime('now'))
        ON CONFLICT(channel) DO UPDATE SET
            last_message_id = excluded.last_message_id,
            updated_at = excluded.updated_at
    """
    conn = _get_connection(db_path)
    try:
        conn.execute(upsert_sql, (channel, last_id))
        conn.commit()
    finally:
        conn.close()
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""Serialize Message lists to structured formats (JSON, CSV, TXT, SQLite)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Literal
|
|
11
|
+
|
|
12
|
+
from tgparser.models.message import Message
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger("tgparser")
|
|
15
|
+
|
|
16
|
+
OutputFormat = Literal["json", "csv", "txt", "sqlite"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def save_messages(
    messages: list[Message],
    output_dir: str | Path,
    channel_name: str,
    fmt: OutputFormat = "json",
    db_path: str | Path | None = None,
) -> Path | None:
    """Write *messages* in the requested format and return the output path.

    For file formats the file is named ``<channel>_<timestamp>.<ext>`` and
    placed in *output_dir*, which is created when missing. A leading ``@``
    is dropped from the channel name and ``/`` is replaced by ``_``.

    For ``sqlite`` the messages go into the database at *db_path* and
    ``None`` is returned.

    Args:
        messages: List of parsed messages.
        output_dir: Directory to write the output file.
        channel_name: Channel slug used in the file name.
        fmt: ``"json"``, ``"csv"``, ``"txt"`` or ``"sqlite"``.
        db_path: Path to the SQLite database file (required for ``sqlite``).

    Raises:
        ValueError: If *fmt* is ``"sqlite"`` without *db_path*, or *fmt* is
            not one of the supported formats.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    if fmt == "sqlite":
        if db_path is None:
            raise ValueError("db_path is required for sqlite format")
        _write_sqlite(Path(db_path), messages)
        logger.info("Saved %d messages → sqlite:%s", len(messages), db_path)
        return None

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    slug = channel_name.lstrip("@").replace("/", "_")
    destination = target_dir / f"{slug}_{stamp}.{fmt}"

    if fmt not in ("json", "csv", "txt"):
        raise ValueError(f"Unsupported format: {fmt}")

    # Pick the writer matching the (already validated) file format.
    write = _write_json if fmt == "json" else _write_csv if fmt == "csv" else _write_txt
    write(destination, messages)

    logger.info("Saved %d messages → %s", len(messages), destination)
    return destination
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def save_messages_incremental(
    messages: list[Message],
    output_dir: str | Path,
    channel_name: str,
    fmt: OutputFormat = "json",
    db_path: str | Path | None = None,
) -> Path | None:
    """Incremental variant -- only saves messages newer than the last stored ID.

    Messages whose id is not greater than the last persisted id for the
    channel are dropped; the remainder is handed to ``save_messages``.

    The last message id is persisted in the same place it is read from:
    the sqlite ``metadata`` table when *db_path* is given, otherwise the
    state file ``<channel>_state.json`` inside *output_dir*.

    Args:
        messages: List of parsed messages.
        output_dir: Directory for output and state files.
        channel_name: Channel slug used in file names and as the state key.
        fmt: ``"json"``, ``"csv"``, ``"txt"`` or ``"sqlite"``.
        db_path: Path to the SQLite database file, if any.

    Returns:
        Whatever ``save_messages`` returns, or ``None`` when a previous id
        exists and no message is newer than it.
    """
    last_id = get_last_message_id(output_dir, channel_name, db_path)

    if last_id is None:
        new_messages = messages
    else:
        new_messages = [m for m in messages if m.id > last_id]
        if not new_messages:
            logger.info("No new messages for '%s' (last id = %d)", channel_name, last_id)
            return None
        logger.info("%d new messages (out of %d) for '%s'", len(new_messages), len(messages), channel_name)

    result = save_messages(new_messages, output_dir, channel_name, fmt, db_path)

    # persist the new last id
    if new_messages:
        newest_id = max(m.id for m in new_messages)
        if db_path is not None:
            # get_last_message_id reads from the database whenever db_path is
            # set, so the bookmark has to be written there as well.
            from tgparser.storage.sqlite import update_last_message_id

            update_last_message_id(Path(db_path), channel_name, newest_id)
        else:
            # output_dir may be a plain string; the helper joins paths with "/".
            _save_last_message_id(Path(output_dir), channel_name, newest_id)

    return result
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def get_last_message_id(
    output_dir: str | Path,
    channel_name: str,
    db_path: str | Path | None = None,
) -> int | None:
    """Return the last persisted message id for *channel_name*, or ``None``.

    When *db_path* is given the lookup is delegated to the sqlite storage
    module. Otherwise the id is read from ``<channel>_state.json`` inside
    *output_dir* (leading ``@`` dropped, ``/`` replaced by ``_``).

    A missing state file yields ``None`` silently. An unreadable file,
    invalid JSON, or a stored id that is not an integer yields ``None``
    with a warning, so a damaged state file never aborts a run.
    """
    if db_path is not None:
        from tgparser.storage.sqlite import get_last_message_id as _sqlite_last_id

        return _sqlite_last_id(Path(db_path), channel_name)

    safe_channel = channel_name.lstrip("@").replace("/", "_")
    state_file = Path(output_dir) / f"{safe_channel}_state.json"

    try:
        data = json.loads(state_file.read_text(encoding="utf-8"))
    except FileNotFoundError:
        # No state yet: first run for this channel.
        return None
    except (OSError, ValueError):
        # ValueError covers both JSON syntax errors and bad UTF-8.
        logger.warning("Could not read state file %s", state_file)
        return None

    if not isinstance(data, dict):
        logger.warning("Could not read state file %s", state_file)
        return None

    last_id = data.get("last_message_id")
    if last_id is None:
        return None
    # bool is a subclass of int but is never a valid message id.
    if isinstance(last_id, bool) or not isinstance(last_id, int):
        logger.warning("Ignoring non-integer last_message_id in state file %s", state_file)
        return None
    return last_id
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# ------------------------------------------------------------------
|
|
126
|
+
# Internal writers
|
|
127
|
+
# ------------------------------------------------------------------
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _write_json(filepath: Path, messages: list[Message]) -> None:
|
|
131
|
+
data = []
|
|
132
|
+
for m in messages:
|
|
133
|
+
data.append(
|
|
134
|
+
{
|
|
135
|
+
"id": m.id,
|
|
136
|
+
"channel": m.channel,
|
|
137
|
+
"date": m.date.isoformat(),
|
|
138
|
+
"author": m.author,
|
|
139
|
+
"text": m.text,
|
|
140
|
+
"media_urls": m.media_urls,
|
|
141
|
+
"reactions": m.reactions,
|
|
142
|
+
"is_forwarded": m.is_forwarded,
|
|
143
|
+
"raw_source": m.raw_source,
|
|
144
|
+
}
|
|
145
|
+
)
|
|
146
|
+
filepath.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _write_csv(filepath: Path, messages: list[Message]) -> None:
|
|
150
|
+
fieldnames = [
|
|
151
|
+
"id",
|
|
152
|
+
"channel",
|
|
153
|
+
"date",
|
|
154
|
+
"author",
|
|
155
|
+
"text",
|
|
156
|
+
"media_urls",
|
|
157
|
+
"reactions",
|
|
158
|
+
"is_forwarded",
|
|
159
|
+
"raw_source",
|
|
160
|
+
]
|
|
161
|
+
with filepath.open("w", newline="", encoding="utf-8") as fh:
|
|
162
|
+
writer = csv.DictWriter(fh, fieldnames=fieldnames)
|
|
163
|
+
writer.writeheader()
|
|
164
|
+
for m in messages:
|
|
165
|
+
writer.writerow(
|
|
166
|
+
{
|
|
167
|
+
"id": m.id,
|
|
168
|
+
"channel": m.channel,
|
|
169
|
+
"date": m.date.isoformat(),
|
|
170
|
+
"author": m.author or "",
|
|
171
|
+
"text": m.text,
|
|
172
|
+
"media_urls": "|".join(m.media_urls),
|
|
173
|
+
"reactions": json.dumps(m.reactions, ensure_ascii=False) if m.reactions else "",
|
|
174
|
+
"is_forwarded": m.is_forwarded,
|
|
175
|
+
"raw_source": m.raw_source,
|
|
176
|
+
}
|
|
177
|
+
)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _write_txt(filepath: Path, messages: list[Message]) -> None:
|
|
181
|
+
"""Write messages as a plain-text file separated by blank lines."""
|
|
182
|
+
lines: list[str] = []
|
|
183
|
+
for m in messages:
|
|
184
|
+
lines.append(f"--- Message #{m.id} ---")
|
|
185
|
+
lines.append(f"Channel: {m.channel}")
|
|
186
|
+
lines.append(f"Date: {m.date.isoformat()}")
|
|
187
|
+
lines.append(f"Author: {m.author or '—'}")
|
|
188
|
+
if m.media_urls:
|
|
189
|
+
lines.append(f"Media: {', '.join(m.media_urls)}")
|
|
190
|
+
if m.reactions:
|
|
191
|
+
reactions_str = ", ".join(f"{k}: {v}" for k, v in m.reactions.items())
|
|
192
|
+
lines.append(f"Reactions: {reactions_str}")
|
|
193
|
+
if m.is_forwarded:
|
|
194
|
+
lines.append("Forwarded: yes")
|
|
195
|
+
lines.append("")
|
|
196
|
+
lines.append(m.text)
|
|
197
|
+
lines.append("") # blank line separator
|
|
198
|
+
filepath.write_text("\n".join(lines), encoding="utf-8")
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _write_sqlite(db_path: Path, messages: list[Message]) -> None:
    """Store *messages* in the database at *db_path* via the sqlite storage module."""
    # The import happens at call time, inside the function, so the sqlite
    # module is only loaded when this writer is actually used.
    from tgparser.storage.sqlite import save_messages as store_rows

    store_rows(db_path, messages)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _save_last_message_id(output_dir: Path, channel_name: str, last_id: int) -> None:
|
|
208
|
+
"""Persist the last saved message id for incremental parsing."""
|
|
209
|
+
safe_channel = channel_name.lstrip("@").replace("/", "_")
|
|
210
|
+
state_file = output_dir / f"{safe_channel}_state.json"
|
|
211
|
+
state_file.write_text(
|
|
212
|
+
json.dumps({"last_message_id": last_id}, indent=2),
|
|
213
|
+
encoding="utf-8",
|
|
214
|
+
)
|
tgparser/utils.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Logging setup and retry helpers."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
from functools import wraps
|
|
7
|
+
from typing import Any, TypeVar
|
|
8
|
+
|
|
9
|
+
F = TypeVar("F", bound=Callable[..., Any])
|
|
10
|
+
|
|
11
|
+
# Module-level logger — consumers do `from tgparser.utils import logger`
|
|
12
|
+
logger = logging.getLogger("tgparser")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def setup_logging(level: int = logging.INFO, fmt: str | None = None) -> None:
    """Configure root tgparser logger.

    Installs a single stream handler on the ``"tgparser"`` logger and sets
    the logger level. Intended to be called at the CLI entry point, but safe
    to call repeatedly: a handler installed by an earlier call is replaced
    instead of being stacked, so records are never emitted more than once.

    Args:
        level: Logging level applied to the ``"tgparser"`` logger.
        fmt: Format string for the handler. Defaults to a format with
            timestamp, level, logger name and message.
    """
    if fmt is None:
        fmt = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"

    # getLogger returns the same object as the module-level ``logger``.
    log = logging.getLogger("tgparser")
    handler_name = "tgparser.console"

    # Drop the handler from any previous call. A fresh one is created rather
    # than reused so that it binds to the current sys.stderr.
    for stale in [h for h in log.handlers if h.get_name() == handler_name]:
        log.removeHandler(stale)
        stale.close()

    handler = logging.StreamHandler()
    handler.set_name(handler_name)
    handler.setFormatter(logging.Formatter(fmt))
    log.addHandler(handler)
    log.setLevel(level)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def retry(
    max_attempts: int = 3,
    base_delay: float = 1.0,
    backoff_factor: float = 2.0,
    exceptions: tuple[type[BaseException], ...] = (Exception,),
) -> Callable[[F], F]:
    """Decorator: exponential backoff retry.

    The wrapped function is called up to *max_attempts* times. After a
    failed attempt that is not the last one, a warning is logged and the
    wrapper sleeps ``base_delay * backoff_factor ** (attempt - 1)`` seconds.
    The exception raised by the final attempt propagates to the caller.

    Args:
        max_attempts: Total attempts before giving up; must be at least 1.
        base_delay: Initial wait in seconds.
        backoff_factor: Multiplier for each subsequent attempt.
        exceptions: Exception types to catch and retry.

    Raises:
        ValueError: If *max_attempts* is smaller than 1.
    """
    # Reject a configuration under which the function would never be called.
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    def decorator(func: F) -> F:
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            # All attempts except the last one are retried on failure.
            for attempt in range(1, max_attempts):
                try:
                    return func(*args, **kwargs)
                except exceptions as exc:
                    delay = base_delay * (backoff_factor ** (attempt - 1))
                    logger.warning(
                        "Retry %d/%d after %.1fs: %s",
                        attempt,
                        max_attempts,
                        delay,
                        exc,
                    )
                    time.sleep(delay)
            # Final attempt: any exception goes straight to the caller.
            return func(*args, **kwargs)

        return wrapper  # type: ignore[return-value]

    return decorator
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tgparser-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Telegram channel parser — extract messages from open (MTProto) and closed (web) channels
|
|
5
|
+
Author: borodatych
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/borodatych/tgparser
|
|
8
|
+
Project-URL: Repository, https://github.com/borodatych/tgparser
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/borodatych/tgparser/issues
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Topic :: Communications :: Chat
|
|
15
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
16
|
+
Requires-Python: >=3.11
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: telethon>=1.35
|
|
20
|
+
Requires-Dist: playwright>=1.45
|
|
21
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
22
|
+
Requires-Dist: lxml>=5.2
|
|
23
|
+
Requires-Dist: click>=8.1
|
|
24
|
+
Requires-Dist: python-dotenv>=1.0
|
|
25
|
+
Requires-Dist: pyyaml>=6.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=8.2; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
29
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# TgParser
|
|
33
|
+
|
|
34
|
+
**Telegram-канал парсер** — утилита для извлечения сообщений из открытых (MTProto API) и закрытых (Web HTML) Telegram-каналов.
|
|
35
|
+
|
|
36
|
+
[Python 3.11+](https://www.python.org/)
|
|
37
|
+
[License: MIT](LICENSE)
|
|
38
|
+
[Ruff](https://github.com/astral-sh/ruff)
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Возможности
|
|
43
|
+
|
|
44
|
+
- **Авторизация** через QR-код (Web) или MTProto (Telethon) с сохранением сессии
|
|
45
|
+
- **Парсинг открытых каналов** — прямое чтение через MTProto API (Telethon)
|
|
46
|
+
- **Парсинг закрытых каналов** — чтение через web-версию Telegram (Playwright + BeautifulSoup)
|
|
47
|
+
- **Обход защиты от копирования** — автоматическое снятие CSS `user-select: none`, блокировки контекстного меню
|
|
48
|
+
- **Вывод данных** в JSON, CSV, plain-text или SQLite
|
|
49
|
+
- **Инкрементальный парсинг** — сохранение только новых сообщений
|
|
50
|
+
- **CLI-интерфейс** на базе Click
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Установка
|
|
55
|
+
|
|
56
|
+
### Из исходного кода
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
# Клонировать репозиторий
|
|
60
|
+
git clone https://github.com/borodatych/tgparser.git
|
|
61
|
+
cd tgparser
|
|
62
|
+
|
|
63
|
+
# Создать виртуальное окружение
|
|
64
|
+
python -m venv .venv
|
|
65
|
+
source .venv/bin/activate # Linux/macOS
|
|
66
|
+
.venv\Scripts\activate # Windows
|
|
67
|
+
|
|
68
|
+
# Установить пакет с dev-зависимостями
|
|
69
|
+
pip install -e ".[dev]"
|
|
70
|
+
|
|
71
|
+
# Установить Playwright браузеры (требуется для web-парсера)
|
|
72
|
+
playwright install chromium
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Через pip (после релиза)
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
pip install tgparser-cli
|
|
79
|
+
playwright install chromium
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Настройка
|
|
85
|
+
|
|
86
|
+
### 1. Переменные окружения
|
|
87
|
+
|
|
88
|
+
Скопируйте `.env.example` в `.env` и заполните:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
cp .env.example .env
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Обязательные переменные:
|
|
95
|
+
|
|
96
|
+
| Переменная | Описание |
|
|
97
|
+
|-----------|----------|
|
|
98
|
+
| `API_ID` | API ID из [my.telegram.org](https://my.telegram.org/apps) |
|
|
99
|
+
| `API_HASH` | API Hash оттуда же |
|
|
100
|
+
| `PHONE_NUMBER` | Номер телефона для MTProto-авторизации (в международном формате) |
|
|
101
|
+
|
|
102
|
+
### 2. Конфигурационный файл (опционально)
|
|
103
|
+
|
|
104
|
+
Создайте `config.yaml` в корне проекта:
|
|
105
|
+
|
|
106
|
+
```yaml
|
|
107
|
+
parsing:
|
|
108
|
+
scroll_delay_ms: 1500 # задержка между скроллами (web-парсер)
|
|
109
|
+
max_messages: 1000 # лимит сообщений за один запуск
|
|
110
|
+
rate_limit_sleep: 30 # пауза при FloodWait (сек)
|
|
111
|
+
|
|
112
|
+
storage:
|
|
113
|
+
output_dir: data/output
|
|
114
|
+
session_dir: data/sessions
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Использование
|
|
120
|
+
|
|
121
|
+
### Авторизация
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
# Web-авторизация (QR-код) — для закрытых каналов
|
|
125
|
+
tgparser auth
|
|
126
|
+
|
|
127
|
+
# Принудительная переавторизация
|
|
128
|
+
tgparser auth --force
|
|
129
|
+
|
|
130
|
+
# MTProto-авторизация — для открытых каналов
|
|
131
|
+
tgparser auth --type mtproto
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Парсинг открытого канала (MTProto)
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
tgparser parse open @channel_username
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Опции:
|
|
141
|
+
- `--limit N` — максимум сообщений (по умолчанию 100)
|
|
142
|
+
- `--since YYYY-MM-DD` — фильтр по дате (сообщения не старше указанной)
|
|
143
|
+
- `--until YYYY-MM-DD` — фильтр по дате (сообщения не новее указанной)
|
|
144
|
+
- `--offset N` — смещение от последнего сообщения
|
|
145
|
+
|
|
146
|
+
### Парсинг закрытого канала (Web)
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
tgparser parse closed https://t.me/channel_username
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Опции:
|
|
153
|
+
- `--limit N` — максимум сообщений
|
|
154
|
+
- `--since YYYY-MM-DD` — фильтр по дате
|
|
155
|
+
- `--until YYYY-MM-DD` — фильтр по дате
|
|
156
|
+
|
|
157
|
+
> **Примечание:** Для закрытых каналов требуется предварительная web-авторизация (`tgparser auth`).
|
|
158
|
+
|
|
159
|
+
### Экспорт
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
# Вывод в консоль (plain-text)
|
|
163
|
+
tgparser export --input data/output/messages.json
|
|
164
|
+
|
|
165
|
+
# Сохранение в JSON
|
|
166
|
+
tgparser export --input data/output/messages.json --format json --output data/output/export.json
|
|
167
|
+
|
|
168
|
+
# Сохранение в CSV
|
|
169
|
+
tgparser export --input data/output/messages.json --format csv --output data/output/export.csv
|
|
170
|
+
|
|
171
|
+
# Сохранение в SQLite
|
|
172
|
+
tgparser export --input data/output/messages.json --format sqlite --output data/output/export.db
|
|
173
|
+
|
|
174
|
+
# Инкрементальный экспорт (только новые сообщения)
|
|
175
|
+
tgparser export --input data/output/messages.json --incremental
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Примеры
|
|
181
|
+
|
|
182
|
+
### Сохранить 50 последних сообщений из открытого канала в JSON
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
tgparser parse open @python_news --limit 50 --format json --output data/output/python_news.json
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### Сохранить сообщения из закрытого канала за последнюю неделю
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
tgparser parse closed https://t.me/private_channel --since 2025-01-01
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
### Экспортировать в CSV с инкрементальным режимом
|
|
195
|
+
|
|
196
|
+
```bash
|
|
197
|
+
tgparser parse open @tech_news --format csv --output data/output/tech_news.csv
|
|
198
|
+
tgparser export --input data/output/tech_news.csv --incremental
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
---
|
|
202
|
+
|
|
203
|
+
## Структура проекта
|
|
204
|
+
|
|
205
|
+
```
|
|
206
|
+
tgparser/
|
|
207
|
+
├── src/
|
|
208
|
+
│ └── tgparser/
|
|
209
|
+
│ ├── auth/ # Модули авторизации (web, mtproto)
|
|
210
|
+
│ ├── parsers/ # Парсеры (mtproto_parser, web_parser)
|
|
211
|
+
│ ├── storage/ # Вывод и хранение (JSON, CSV, TXT, SQLite)
|
|
212
|
+
│ ├── models/ # Модели данных (Message)
|
|
213
|
+
│ ├── cli.py # CLI-интерфейс (Click)
|
|
214
|
+
│ ├── config.py # Загрузка конфигурации
|
|
215
|
+
│ └── utils.py # Вспомогательные функции
|
|
216
|
+
├── tests/ # Тесты (pytest)
|
|
217
|
+
├── data/
|
|
218
|
+
│ ├── output/ # Результаты парсинга
|
|
219
|
+
│ └── sessions/ # Сохранённые сессии
|
|
220
|
+
├── docs/ # Документация
|
|
221
|
+
├── config.yaml # Конфигурация (опционально)
|
|
222
|
+
├── .env # Секреты (не в git)
|
|
223
|
+
├── pyproject.toml # Настройки проекта
|
|
224
|
+
└── README.md # Этот файл
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
---
|
|
228
|
+
|
|
229
|
+
## Разработка
|
|
230
|
+
|
|
231
|
+
### Запуск тестов
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
pytest tests/ -v
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
### Линтинг и форматирование
|
|
238
|
+
|
|
239
|
+
```bash
|
|
240
|
+
ruff check src/ tests/
|
|
241
|
+
ruff format src/ tests/
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
### Сборка пакета
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
python -m build
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
---
|
|
251
|
+
|
|
252
|
+
## Совместимость
|
|
253
|
+
|
|
254
|
+
- **Python**: 3.11, 3.12
|
|
255
|
+
- **ОС**: Windows, Linux, macOS
|
|
256
|
+
- **Браузер**: Chromium (устанавливается через `playwright install chromium`)
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
## Планы
|
|
261
|
+
|
|
262
|
+
- [x] Авторизация (Web + MTProto)
|
|
263
|
+
- [x] Парсинг открытых каналов (MTProto)
|
|
264
|
+
- [x] Парсинг закрытых каналов (Web)
|
|
265
|
+
- [x] Обход защиты от копирования
|
|
266
|
+
- [x] Вывод (JSON, CSV, TXT, SQLite)
|
|
267
|
+
- [x] Инкрементальный парсинг
|
|
268
|
+
- [ ] Поддержка Telegram Premium (MTProto)
|
|
269
|
+
- [ ] Парсинг комментариев
|
|
270
|
+
- [ ] GUI-интерфейс
|
|
271
|
+
|
|
272
|
+
Полный roadmap: [docs/roadmap.md](docs/roadmap.md)
|
|
273
|
+
|
|
274
|
+
---
|
|
275
|
+
|
|
276
|
+
## Лицензия
|
|
277
|
+
|
|
278
|
+
Проект распространяется под лицензией MIT. Подробнее — в файле [LICENSE](LICENSE).
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
tgparser/__init__.py,sha256=Vpr2-J37aNd7YiTv5ga5gkVZsiOjAHgdrbV5HKZ74UY,68
|
|
2
|
+
tgparser/cli.py,sha256=TOvql_bjQEs50HWW_-Uq9vQXp9kLaTSGvh6MXB-L21w,19027
|
|
3
|
+
tgparser/config.py,sha256=dQur3V71C9ggqx6H7AmbaZ7XYhOkoQR-ZMAtjmPO9rY,1517
|
|
4
|
+
tgparser/utils.py,sha256=B_Za4zStyIo0RVfCMvdGMjGLcE6fh5FsMxljnQU6738,2242
|
|
5
|
+
tgparser/auth/__init__.py,sha256=vchOklgNfhFazEffESB6JtY6RbeG7ZzN4umv1-2NVuc,192
|
|
6
|
+
tgparser/auth/mtproto_auth.py,sha256=30t83WfxmEXNsH7LJ4jdgbIJlBSH0AXyUG5XKromO5o,4601
|
|
7
|
+
tgparser/auth/web_auth.py,sha256=bGUmONnFP-z4iwps5z5ugIV5HDeOD1_7eKDojohrWKw,9700
|
|
8
|
+
tgparser/models/__init__.py,sha256=GXI-G4Xb-SlTkGdWvDbtPPd1ThR6oAlp3hwjDWr4Hgs,20
|
|
9
|
+
tgparser/models/message.py,sha256=c9vtVEX1E3jNnjPmXhq5B9whiyAwyk7yomU2woT_9yg,935
|
|
10
|
+
tgparser/parsers/__init__.py,sha256=vqT0updFmmGfAbm-6fDx9MidnLPzj7C-3GhGY70bR7Y,230
|
|
11
|
+
tgparser/parsers/mtproto_parser.py,sha256=xVn4_CGiSuTE4w7TMMpVyWIM5aS0GkR4dVw-vRuk6Ak,9274
|
|
12
|
+
tgparser/parsers/web_parser.py,sha256=fxL5cvNWYntcPLVFUP2L7vPr3slwkM3pa5EVtYveQB0,20811
|
|
13
|
+
tgparser/storage/__init__.py,sha256=17ab-FheD4tvlwkLqJH2IJfDLMbX9XmhsD87voe0IlY,315
|
|
14
|
+
tgparser/storage/sqlite.py,sha256=JCSGxzytosgqsfGfKVuemOTwl0z2Yt_jsp6g0QbsPQg,3808
|
|
15
|
+
tgparser/storage/writer.py,sha256=twy0h0FOStNWDNW59f1wwcBw5A8Z9ACZsGVCp3PByDQ,7701
|
|
16
|
+
tgparser_cli-0.1.0.dist-info/licenses/LICENSE,sha256=ZQu7QqFT2Yn7EV-MCiWJMxcZMZVZJTttK1GtKvbWrYI,1088
|
|
17
|
+
tgparser_cli-0.1.0.dist-info/METADATA,sha256=y1Iy7XhZZdbX5CaWEPvyX6P59LvzwNneHxuGisywJ_A,9553
|
|
18
|
+
tgparser_cli-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
19
|
+
tgparser_cli-0.1.0.dist-info/entry_points.txt,sha256=lz_j2icS6b8n1OW-yHSJnHrv1O2gfY7gG6JkzMJXe3E,47
|
|
20
|
+
tgparser_cli-0.1.0.dist-info/top_level.txt,sha256=CrqdcWZYa02HSazR16Jda-jA2q2cmyScRmm6pW4jR14,9
|
|
21
|
+
tgparser_cli-0.1.0.dist-info/RECORD,,
|