telegram-pm 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
telegram_pm/config.py ADDED
@@ -0,0 +1,25 @@
1
+ from os import environ
2
+
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
+
8
def _env_int(name: str, default: int) -> int:
    """Return environment variable *name* as an int, or *default* when unset."""
    return int(environ.get(name, default))


class BaseConfig:
    """Common base class for the configuration namespaces below."""


class HttpClientConfig(BaseConfig):
    """HTTP client settings, resolved from the environment at import time."""

    retries: int = _env_int("HTTP_RETRIES", 3)
    backoff: int = _env_int("HTTP_BACKOFF", 3)
    timeout: int = _env_int("HTTP_TIMEOUT", 30)


class TelegramConfig(BaseConfig):
    """Telegram preview-page settings, resolved from the environment at import time."""

    base_url: str = environ.get("TELEGRAM_BASE_URL", "https://t.me")

    before_param_size: int = _env_int("TELEGRAM_BEFORE_PARAM_SIZE", 20)
    # The attribute name differs from the variable name it is read from.
    iteration_in_preview_count: int = _env_int("TELEGRAM_PARSE_REPEAT_COUNT", 5)
    sleep_time_seconds: int = _env_int("TELEGRAM_SLEEP_TIME_SECONDS", 60)
    sleep_after_error_request: int = _env_int(
        "TELEGRAM_SLEEP_AFTER_ERROR_REQUEST", 30
    )
File without changes
@@ -0,0 +1,143 @@
1
+ import json
2
+ from typing import List
3
+ from dataclasses import asdict
4
+ from contextlib import asynccontextmanager
5
+
6
+ import aiosqlite
7
+
8
+ from telegram_pm.entities import Post
9
+
10
+
11
class DatabaseProcessor:
    """Asynchronous SQLite storage that keeps one table of posts per channel.

    Every public method opens its own short-lived connection through
    ``_get_connection``; no connection is kept open between calls.
    Table names are always emitted as quoted identifiers, so a channel name
    cannot break or alter the SQL statement it is interpolated into.
    """

    # Columns written by insert_posts_batch, in placeholder order.
    _INSERT_COLUMNS = (
        "url",
        "username",
        "id",
        "date",
        "text",
        "replied_post_url",
        "urls",
        "url_preview",
        "photo_urls",
        "video_urls",
        "round_video_url",
        "files",
        "tags",
        "forwarded_from_url",
        "forwarded_from_name",
    )

    # Post fields that are serialised to JSON text before being stored.
    _JSON_FIELDS = ("urls", "photo_urls", "video_urls", "files", "tags")

    def __init__(self, db_path: str):
        """Remember the database path; nothing is opened here."""
        self.db_path = db_path
        self._pool = None

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Return *name* as a double-quoted SQL identifier.

        Embedded double quotes are doubled, which is the SQL escaping rule
        for quoted identifiers.
        """
        escaped = name.replace('"', '""')
        return f'"{escaped}"'

    async def initialize(self):
        """Apply the database PRAGMA settings once."""
        async with self._get_connection() as conn:
            await conn.execute("PRAGMA journal_mode=WAL")
            # NOTE(review): the pragmas below are issued on this one
            # connection only; later operations open new connections.
            # Confirm whether they are meant to apply to every connection.
            await conn.execute("PRAGMA synchronous=NORMAL")
            await conn.execute("PRAGMA cache_size=-10000")  # 10MB cache
            await conn.execute("PRAGMA temp_store=MEMORY")
            await conn.commit()

    @asynccontextmanager
    async def _get_connection(self):
        """Yield a fresh connection and close it on exit."""
        conn = await aiosqlite.connect(self.db_path, timeout=30, isolation_level=None)
        conn.row_factory = aiosqlite.Row
        try:
            yield conn
        finally:
            await conn.close()

    @asynccontextmanager
    async def _get_cursor(self):
        """Yield a cursor; commit on success, roll back and re-raise on error."""
        async with self._get_connection() as conn:
            cursor = await conn.cursor()
            try:
                yield cursor
                await conn.commit()
            except Exception:
                await conn.rollback()
                # Bare raise keeps the original traceback intact.
                raise

    async def table_exists(self, table_name: str) -> bool:
        """Return True when a table named exactly *table_name* exists."""
        async with self._get_cursor() as cursor:
            await cursor.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
                (table_name,),
            )
            return await cursor.fetchone() is not None

    async def create_table_from_post(self, table_name: str):
        """Create the posts table and its date index if they do not exist."""
        columns = [
            "url TEXT PRIMARY KEY",
            "username TEXT",
            "id INTEGER",
            "date TEXT NOT NULL",
            "text TEXT",
            "replied_post_url TEXT",
            "urls TEXT",  # JSON
            "url_preview TEXT",
            "photo_urls TEXT",  # JSON
            "video_urls TEXT",  # JSON
            "round_video_url TEXT",
            "files TEXT",  # JSON
            "tags TEXT",  # JSON
            "created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP",
            "forwarded_from_url TEXT",
            "forwarded_from_name TEXT",
        ]
        table = self._quote_identifier(table_name)
        index = self._quote_identifier(f"idx_{table_name}_date")

        async with self._get_cursor() as cursor:
            await cursor.execute(
                f"""
                CREATE TABLE IF NOT EXISTS {table} (
                    {", ".join(columns)}
                )
                """
            )
            await cursor.execute(
                f"CREATE INDEX IF NOT EXISTS {index} ON {table}(date)"
            )

    async def insert_posts_batch(self, table_name: str, posts: List["Post"]):
        """Insert *posts* into *table_name*, ignoring rows whose url already exists.

        The list-valued fields are stored as JSON text. An empty *posts*
        list is a no-op.
        """
        if not posts:
            return

        columns = self._INSERT_COLUMNS
        placeholders = ", ".join(["?"] * len(columns))
        query = f"""
            INSERT OR IGNORE INTO {self._quote_identifier(table_name)}
            ({", ".join(columns)})
            VALUES ({placeholders})
        """

        rows = []
        for post in posts:
            post_dict = asdict(post)
            for field in self._JSON_FIELDS:
                post_dict[field] = json.dumps(post_dict[field])
            rows.append(tuple(post_dict[col] for col in columns))

        async with self._get_cursor() as cursor:
            await cursor.executemany(query, rows)

    async def is_table_empty(self, table_name: str) -> bool:
        """Return True when *table_name* holds no rows."""
        async with self._get_cursor() as cursor:
            await cursor.execute(
                f"SELECT 1 FROM {self._quote_identifier(table_name)} LIMIT 1"
            )
            return await cursor.fetchone() is None

    async def drop_table_if_empty(self, table_name: str):
        """Drop *table_name* only when it exists and has no rows."""
        if await self.table_exists(table_name) and await self.is_table_empty(
            table_name
        ):
            async with self._get_cursor() as cursor:
                await cursor.execute(
                    f"DROP TABLE {self._quote_identifier(table_name)}"
                )

    async def post_exists(self, table_name: str, url: str) -> bool:
        """Return True when a row with the given *url* is stored in *table_name*."""
        query = (
            f"SELECT 1 FROM {self._quote_identifier(table_name)} "
            "WHERE url = ? LIMIT 1"
        )

        async with self._get_cursor() as cursor:
            await cursor.execute(query, (url,))
            return await cursor.fetchone() is not None

    async def close(self):
        """Close ``self.conn`` if such an attribute has been set.

        Nothing in this class assigns ``conn``, so this is normally a no-op;
        it is kept so existing callers of ``close()`` keep working.
        """
        conn = getattr(self, "conn", None)
        if conn:
            await conn.close()
@@ -0,0 +1,20 @@
1
+ from dataclasses import dataclass, field
2
+
3
+
4
+ @dataclass
5
+ class Post:
6
+ username: str
7
+ url: str
8
+ date: str
9
+ id: int | None
10
+ text: str | None = None
11
+ replied_post_url: str | None = None
12
+ urls: list[str] = field(default_factory=list)
13
+ url_preview: str | None = None
14
+ photo_urls: list[str] = field(default_factory=list[str])
15
+ video_urls: list[str] = field(default_factory=list[str])
16
+ round_video_url: str | None = None
17
+ files: list[dict[str, str]] = field(default_factory=list)
18
+ tags: list[str] = field(default_factory=list[str])
19
+ forwarded_from_url: str | None = None
20
+ forwarded_from_name: str | None = None
File without changes
@@ -0,0 +1,22 @@
1
+ import httpx
2
+ from retry import retry
3
+
4
+ from telegram_pm.utils.logger import logger
5
+ from telegram_pm.config import HttpClientConfig
6
+
7
+
8
class HttpClient:
    """Thin asynchronous HTTP client built on ``httpx.AsyncClient``."""

    def __init__(self):
        """Create the underlying client from ``HttpClientConfig``."""
        # NOTE(review): TLS certificate verification is disabled on both the
        # transport and the client. Kept as-is so behaviour does not change,
        # but confirm this is intended.
        self.client = httpx.AsyncClient(
            transport=httpx.AsyncHTTPTransport(
                verify=False,
                retries=HttpClientConfig.retries,
            ),
            timeout=HttpClientConfig.timeout,
            verify=False,
        )

    async def request(self, url: str, method: str = "GET", **kwargs) -> httpx.Response:
        """Send a request, retrying on transport-level failures.

        The previous synchronous ``retry`` decorator never retried: calling a
        coroutine function only creates the coroutine, and the error surfaces
        later, on ``await``, outside the decorator. The retry loop therefore
        lives inside the coroutine.

        :param url: target URL
        :param method: HTTP method, ``GET`` by default
        :param kwargs: passed through to ``httpx.AsyncClient.request``
        :return: httpx.Response
        :raises httpx.RequestError: when every attempt has failed
        """
        # Local import keeps this block self-contained.
        import asyncio

        # One initial attempt plus the configured number of retries.
        max_attempts = max(0, HttpClientConfig.retries) + 1
        attempt = 0
        while True:
            attempt += 1
            try:
                return await self.client.request(method=method, url=url, **kwargs)
            except httpx.RequestError as error:
                if attempt >= max_attempts:
                    raise
                logger.warning(
                    f"Request failed ({error!r}), attempt {attempt}/{max_attempts}; "
                    f"retrying in {HttpClientConfig.backoff} seconds"
                )
                await asyncio.sleep(HttpClientConfig.backoff)
File without changes
@@ -0,0 +1 @@
1
class BaseParser:
    """Common base class for the parsers in this package; defines no behaviour."""
@@ -0,0 +1,280 @@
1
+ import typer
2
+ from bs4 import BeautifulSoup, PageElement
3
+
4
+ from telegram_pm import utils
5
+ from telegram_pm.entities import Post
6
+ from telegram_pm.utils.logger import logger
7
+ from telegram_pm.parsers.base import BaseParser
8
+ from telegram_pm.config import TelegramConfig
9
+ from telegram_pm.parsers.tag_options import PostParseConfig, TagOptions
10
+
11
+
12
class PostsParser(BaseParser):
    """
    Parser that turns the post blocks of a preview page into ``Post`` objects.
    """

    def __init__(self, verbose: bool = False):
        """
        :param verbose: bool. If True, every parsed post is echoed to the terminal
        """
        self.__verbose: bool = verbose
        self._tag_ops = PostParseConfig

    @staticmethod
    def get_post_attribute(
        post: PageElement,
        tab_ops: TagOptions,
        extract_field: str,
        _warn_log_enable: bool = True,
    ) -> str | None:
        """
        Find the first element matching ``tab_ops`` inside ``post`` and read one value.
        :param post: element to search in
        :param tab_ops: tag name and attributes to look for
        :param extract_field: ``"text"`` for the element text, otherwise an attribute name
        :param _warn_log_enable: log a warning when nothing matches
        :return: str | None. None when no element matches or the attribute is absent
        """
        post_attribute = post.find(name=tab_ops.tag, attrs=tab_ops.attrs)  # type: ignore[attr-defined]
        if post_attribute:
            if extract_field == "text":
                return post_attribute.text
            return post_attribute.get(extract_field)
        if _warn_log_enable:
            logger.warning(f"Not found. '{tab_ops.tag}': '{tab_ops.attrs}'")
        return None

    def get_channel_id(self, post: PageElement) -> int | None:
        """Decode the channel id from the ``data-view`` attribute, if present."""
        channel_base64 = self.get_post_attribute(
            post=post,
            tab_ops=self._tag_ops.channel_id,
            extract_field="data-view",
            _warn_log_enable=False,
        )
        if not channel_base64:
            return None
        return utils.parse.decode_channel_id(channel_base64)

    @staticmethod
    def get_urls_from_styles(post: PageElement, tag_pos: TagOptions) -> list[str]:
        """
        Collect the URLs embedded in the ``style`` attribute of matching elements.
        Elements whose style holds no URL are skipped, so the result never
        contains ``None``.
        """
        urls: list[str] = []
        for element in post.find_all(name=tag_pos.tag, attrs=tag_pos.attrs):  # type: ignore[attr-defined]
            url = utils.parse.extract_url_from_style(
                style_content=element.get("style", "")
            )
            if url is not None:
                urls.append(url)
        return urls

    def get_posts(self, bs_preview_content: BeautifulSoup) -> list[PageElement]:
        """Return every post block found on the preview page."""
        return utils.parse.extract_element(
            bs_content=bs_preview_content,
            tag_ops=self._tag_ops.post_block,
        )

    def get_post_url(self, username: str, post: PageElement) -> str:
        """
        Return the post URL, rebuilt as ``<base_url>/<username>/<post number>``
        when the link starts with the configured base URL.
        :raises ValueError: when the post block holds no link
        """
        post_url = self.get_post_attribute(
            post=post,
            tab_ops=self._tag_ops.post_url,
            extract_field="href",
        )
        if post_url is None:
            # Previously this surfaced as an opaque AttributeError on None.
            raise ValueError("Post URL not found in post block")
        if post_url.startswith(f"{TelegramConfig.base_url}/"):
            post_number = post_url.split("/")[-1]
            post_url = f"{TelegramConfig.base_url}/{username}/{post_number}"
        return post_url

    def get_post_date(self, post: PageElement) -> str:
        """Return the ``datetime`` attribute of the post's time element."""
        return self.get_post_attribute(  # type: ignore[return-value]
            post=post,
            tab_ops=self._tag_ops.date,
            extract_field="datetime",
        )

    def get_replied_url(self, post: PageElement) -> str | None:
        """Return the link to the replied-to post, if any."""
        return self.get_post_attribute(
            post=post,
            tab_ops=self._tag_ops.replied_url,
            extract_field="href",
            _warn_log_enable=False,
        )

    def get_forwarded_from_url(self, post: PageElement) -> str | None:
        """Return the link of the "forwarded from" element, if any."""
        return self.get_post_attribute(
            post=post,
            tab_ops=self._tag_ops.forwarded_from_url,
            extract_field="href",
            _warn_log_enable=False,
        )

    def get_forwarded_from_name(self, post: PageElement) -> str | None:
        """Return the text of the "forwarded from" element, if any."""
        return self.get_post_attribute(
            post=post,
            tab_ops=self._tag_ops.forwarded_from_name,
            extract_field="text",
            _warn_log_enable=False,
        )

    def get_text(self, post: PageElement) -> str | None:
        """Return the message text, if any."""
        return self.get_post_attribute(
            post=post,
            tab_ops=self._tag_ops.text,
            extract_field="text",
            _warn_log_enable=False,
        )

    def get_photo_urls(self, post: PageElement) -> list[str]:
        """Return the photo URLs found in the post."""
        return self.get_urls_from_styles(
            post=post,
            tag_pos=self._tag_ops.photo_url,
        )

    def get_video_urls(self, post: PageElement) -> list[str]:
        """Return the video thumbnail URLs found in the post."""
        return self.get_urls_from_styles(
            post=post,
            tag_pos=self._tag_ops.video_url,
        )

    def get_round_video(self, post: PageElement) -> str | None:
        """Return the ``src`` of the round video element, if any."""
        return self.get_post_attribute(
            post=post,
            tab_ops=self._tag_ops.round_video_url,
            extract_field="src",
            _warn_log_enable=False,
        )

    def get_urls(self, post: PageElement) -> list[str]:
        """Return the distinct link targets matched by the ``url`` tag options."""
        url_elements = post.find_all(  # type: ignore[attr-defined]
            name=self._tag_ops.url.tag,
            attrs=self._tag_ops.url.attrs,
        )
        # A set removes duplicates; the order of the result is not defined.
        return list({element.get("href") for element in url_elements})

    def get_url_preview(self, post: PageElement) -> str | None:
        """Return the text of the link preview element, if any."""
        return self.get_post_attribute(
            post=post,
            tab_ops=self._tag_ops.url_preview,
            extract_field="text",
            _warn_log_enable=False,
        )

    def get_files(self, post: PageElement) -> list[dict[str, str]]:
        """
        Return one ``{"title": ..., "extra": ...}`` dict per attached document.
        ``extra`` is an empty string when the sibling "extra" element is
        missing, where the previous code raised AttributeError.
        """
        files: list[dict[str, str]] = []
        files_elements = post.find_all(  # type: ignore[attr-defined]
            name=self._tag_ops.file.tag,
            attrs=self._tag_ops.file.attrs,
        )
        for file in files_elements:
            extra_element = file.find_next_sibling(
                name=self._tag_ops.file_extra.tag,
                attrs=self._tag_ops.file_extra.attrs,
            )
            extra = extra_element.text if extra_element is not None else ""
            files.append({"title": file.text, "extra": extra})
        return files

    def get_tags(self, post: PageElement) -> list[str]:
        """Return the text of every hashtag link in the post."""
        tags_elements = post.find_all(  # type: ignore[attr-defined]
            name=self._tag_ops.tag.tag,
            attrs=self._tag_ops.tag.attrs,
        )
        return [tag.text for tag in tags_elements]

    def parse(self, username: str, bs_preview_content: BeautifulSoup) -> list[Post]:
        """
        Parse every post block of a preview page.
        :param username: channel username, used to build post URLs
        :param bs_preview_content: parsed preview page
        :return: list[Post] in page order
        """
        parse_results = []
        for post_element in self.get_posts(bs_preview_content=bs_preview_content):
            post = Post(
                username=username,
                id=self.get_channel_id(post_element),
                url=self.get_post_url(username, post_element),
                date=self.get_post_date(post_element),
                replied_post_url=self.get_replied_url(post_element),
                text=self.get_text(post_element),
                photo_urls=self.get_photo_urls(post_element),
                video_urls=self.get_video_urls(post_element),
                round_video_url=self.get_round_video(post_element),
                urls=self.get_urls(post_element),
                url_preview=self.get_url_preview(post_element),
                files=self.get_files(post_element),
                tags=self.get_tags(post_element),
                forwarded_from_url=self.get_forwarded_from_url(post_element),
                forwarded_from_name=self.get_forwarded_from_name(post_element),
            )
            parse_results.append(post)
            if self.__verbose:
                self._print_post(post=post)
        return parse_results

    @staticmethod
    def _print_post(post: Post):
        """Echo a coloured, human-readable summary of ``post`` to the terminal."""
        typer.echo("\n" + typer.style("โ•" * 50, fg=typer.colors.BRIGHT_MAGENTA))
        typer.echo(
            typer.style("๐ŸŽฏ Username: ", fg=typer.colors.BRIGHT_RED)
            + typer.style(post.username, fg=typer.colors.RED)
        )
        typer.echo(
            typer.style("๐Ÿ“… Date: ", fg=typer.colors.BRIGHT_CYAN)
            + typer.style(post.date, fg=typer.colors.WHITE)
        )

        typer.echo(
            typer.style("๐Ÿ”— URL: ", fg=typer.colors.BRIGHT_CYAN)
            + typer.style(post.url, fg=typer.colors.BRIGHT_BLUE, underline=True)
        )

        if post.replied_post_url:
            typer.echo(
                typer.style("โ†ฉ๏ธ Replied: ", fg=typer.colors.BRIGHT_YELLOW)
                + typer.style(post.replied_post_url, fg=typer.colors.BLUE)
            )

        if post.text:
            # Only the first 50 characters of the text are shown.
            typer.echo("\n๐Ÿ’ฌ๐Ÿ’ฌ๐Ÿ’ฌ")
            typer.echo(typer.style(post.text[:50], fg=typer.colors.GREEN))
            typer.echo("๐Ÿ’ฌ๐Ÿ’ฌ๐Ÿ’ฌ")

        if post.photo_urls:
            typer.echo("\n" + typer.style("๐Ÿ“ท Photo: ", fg=typer.colors.BRIGHT_RED))
            for photo in post.photo_urls:
                typer.echo(typer.style(f" โ†’ {photo}", fg=typer.colors.RED))

        if post.video_urls:
            typer.echo("\n" + typer.style("๐ŸŽฅ Video: ", fg=typer.colors.BRIGHT_RED))
            for video in post.video_urls:
                typer.echo(typer.style(f" โ†’ {video}", fg=typer.colors.RED))

        if post.urls:
            typer.echo("\n" + typer.style("๐ŸŒ URLs: ", fg=typer.colors.BRIGHT_MAGENTA))
            for url in post.urls:
                typer.echo(typer.style(f" โ†’ {url}", fg=typer.colors.MAGENTA))

        if post.url_preview:
            typer.echo("\n๐Ÿ‘€๐Ÿ‘€๐Ÿ‘€")
            typer.echo(
                typer.style(
                    f"๐Ÿ” URL preview: {post.url_preview[:50]}", fg=typer.colors.GREEN
                )
            )
            typer.echo("๐Ÿ‘€๐Ÿ‘€๐Ÿ‘€")

        if post.round_video_url:
            typer.echo(
                "\n"
                + typer.style(
                    f"๐Ÿ” Round video: {post.round_video_url}", fg=typer.colors.BLUE
                )
            )

        if post.tags:
            typer.echo(
                "\n"
                + typer.style("โŒ— Tags: ", fg=typer.colors.BRIGHT_GREEN)
                + typer.style(", ".join(post.tags), fg=typer.colors.GREEN)
            )

        if post.files:
            typer.echo("\n" + typer.style("๐Ÿ“‚ Files: ", fg=typer.colors.BRIGHT_YELLOW))
            for file in post.files:
                print_file = file.get("title")
                if print_file:
                    extra = file.get("extra")
                    if extra:
                        print_file = f"{print_file} ({extra})"
                    typer.echo(typer.style(f" โ†’ {print_file}", fg=typer.colors.YELLOW))

        typer.echo(typer.style("โ•" * 50, fg=typer.colors.BRIGHT_MAGENTA) + "\n")
@@ -0,0 +1,165 @@
1
+ import asyncio
2
+
3
+ import httpx
4
+ from bs4 import BeautifulSoup
5
+ from structlog.contextvars import bound_contextvars
6
+
7
+ from telegram_pm import utils
8
+ from telegram_pm.entities import Post
9
+ from telegram_pm.utils.logger import logger
10
+ from telegram_pm.config import TelegramConfig
11
+ from telegram_pm.parsers.base import BaseParser
12
+ from telegram_pm.parsers.post import PostsParser
13
+ from telegram_pm.http_client.client import HttpClient
14
+ from telegram_pm.database.db import DatabaseProcessor
15
+
16
+
17
class PreviewParser(BaseParser):
    """
    Telegram preview page parser
    """

    def __init__(
        self,
        channels: list[str],
        db_path: str,
        verbose: bool = False,
    ):
        """
        :param channels: list[str]. Channel usernames or channel URLs
        :param db_path: str. Path of the SQLite database file
        :param verbose: bool. Passed on to the posts parser
        """
        self.channels: list[str] = channels
        self.http_client = HttpClient()
        self.post_parser = PostsParser(verbose=verbose)
        self.db = DatabaseProcessor(db_path=db_path)
        self._db_initialized = False
        self.verbose = verbose

    @staticmethod
    def __forbidden_parse_preview(response: httpx.Response) -> bool:
        """
        Check parsing availability
        :param response: httpx.Response
        :return: bool. If True, then you can't parse preview page
        """
        return response.status_code in (302,)

    @staticmethod
    def __parse_before_param_value(post_url: str) -> int:
        """
        Extract the numeric post id from the last path segment of a post URL
        :param post_url: str. Post URL
        :return: int
        """
        last_segment = post_url.split("/")[-1]
        # Drop a query string, if any, before converting to int.
        return int(last_segment.split("?")[0])

    async def _get_preview_page(self, preview_url: str) -> httpx.Response:
        """
        Get preview page
        :param preview_url: str. Full preview URL
        :return: httpx.Response
        """
        return await self.http_client.request(url=preview_url)

    def _parse_posts_in_preview(
        self, username: str, response: httpx.Response
    ) -> list[Post]:
        """
        Parse the posts contained in a preview page response
        :param username: str. Channel username
        :param response: httpx.Response
        :return: list[Post]
        """
        bs_content = BeautifulSoup(response.text, "html5lib")
        return self.post_parser.parse(username=username, bs_preview_content=bs_content)

    async def initialize(self):
        """Initialize database"""
        if not self._db_initialized:
            await self.db.initialize()
            self._db_initialized = True

    async def close(self):
        """Clean up resources"""
        # The HTTP client is deliberately left open: parse() calls close()
        # after every cycle and the same parser instance is reused.
        if hasattr(self.db, "close"):
            await self.db.close()

    async def parse_channel(self, channel_username: str):
        """
        Parse single channel
        :param channel_username: str. Channel username or channel URL
        :return: list[Post]. Posts parsed during this call
        """
        channel_username = utils.url.get_username_from_tg_url(channel_username)
        with bound_contextvars(username=channel_username):
            if not await self.db.table_exists(channel_username):
                await self.db.create_table_from_post(channel_username)
                await logger.ainfo("Created new table for channel")

            preview_url = utils.url.build_preview_url(username=channel_username)
            posts_result = []

            for _ in range(TelegramConfig.iteration_in_preview_count):
                try:
                    response = await self._get_preview_page(preview_url=preview_url)
                    if not response:
                        await logger.awarning("Can not get preview page")
                        await asyncio.sleep(TelegramConfig.sleep_after_error_request)
                        continue

                    if self.__forbidden_parse_preview(response=response):
                        await logger.awarning("Forbidden parsing preview")
                        break

                    parsed_posts = self._parse_posts_in_preview(
                        username=channel_username, response=response
                    )
                    if not parsed_posts:
                        await logger.awarning("No posts parsed from preview page")  # type: ignore
                        await self.db.drop_table_if_empty(channel_username)
                        await asyncio.sleep(TelegramConfig.sleep_after_error_request)
                        break

                    # Check before inserting: afterwards the post always exists.
                    first_post_exists = await self.db.post_exists(
                        channel_username, parsed_posts[0].url
                    )

                    # Always store the page. The insert ignores rows that are
                    # already present, so this is safe, and posts that share a
                    # page with an already stored first post are kept.
                    await self.db.insert_posts_batch(channel_username, parsed_posts)
                    posts_result.extend(parsed_posts)

                    if first_post_exists:
                        await logger.ainfo(
                            "Reached already stored posts, stopping pagination"
                        )
                        break

                    # The two branches of the former if/else were identical.
                    before_param_number = (
                        self.__parse_before_param_value(post_url=parsed_posts[-1].url)
                        - TelegramConfig.before_param_size
                    )
                    if before_param_number <= 0:
                        break

                    preview_url = utils.url.build_param_before_url(
                        url=preview_url, before=before_param_number
                    )

                except Exception as e:
                    await logger.aerror(
                        f"Error parsing channel {channel_username}: {e}"
                    )
                    break

            return posts_result

    async def parse(self):
        """Main parsing method"""
        await self.initialize()

        try:
            for channel_username in self.channels:
                try:
                    await self.parse_channel(channel_username)
                except Exception as e:
                    # One failing channel must not stop the others.
                    await logger.aerror(
                        f"Failed to parse channel {channel_username}: {e}"
                    )
                    continue
        finally:
            await self.close()
@@ -0,0 +1,78 @@
1
+ import re
2
+ from dataclasses import dataclass
3
+
4
+
5
@dataclass
class TagOptions:
    """Tag name plus attribute filter used to locate an element."""

    attrs: dict
    tag: str


def _class_matching(pattern: str) -> dict:
    """Build an attribute filter whose ``class`` must match regex *pattern*."""
    return {"class": re.compile(pattern)}


class PostParseConfig:
    """Lookup options for each piece of data read from a post block."""

    channel_id = TagOptions(
        attrs={"class": "tgme_widget_message text_not_supported_wrap js-widget_message"},
        tag="div",
    )

    post_block = TagOptions(
        attrs=_class_matching(r"tgme_widget_message_wrap js-widget_message_wrap"),
        tag="div",
    )

    post_url = TagOptions(attrs={"class": "tgme_widget_message_date"}, tag="a")

    replied_url = TagOptions(attrs={"class": "tgme_widget_message_reply"}, tag="a")

    text = TagOptions(
        attrs=_class_matching(r"tgme_widget_message_text js-message_text"),
        tag="div",
    )

    date = TagOptions(attrs={"class": "time"}, tag="time")

    photo_url = TagOptions(
        attrs=_class_matching(r"tgme_widget_message_photo_wrap"), tag="a"
    )

    video_url = TagOptions(
        attrs=_class_matching(r"tgme_widget_message_video_thumb"), tag="i"
    )

    round_video_url = TagOptions(
        attrs=_class_matching(
            r"tgme_widget_message_roundvideo js-message_roundvideo"
        ),
        tag="video",
    )

    # Links that open in a new tab and point to an http(s) address.
    url = TagOptions(
        attrs={
            "target": re.compile(r"_blank"),
            "href": re.compile(r"^https?://"),
        },
        tag="a",
    )

    url_preview = TagOptions(
        attrs=_class_matching(r"tgme_widget_message_link_preview"), tag="a"
    )

    file = TagOptions(
        attrs=_class_matching(r"tgme_widget_message_document_title"), tag="div"
    )

    file_extra = TagOptions(
        attrs=_class_matching(r"tgme_widget_message_document_extra"), tag="div"
    )

    # Hashtag links: href starts with "?q=%23".
    tag = TagOptions(attrs={"href": re.compile(r"^\?q=%23")}, tag="a")

    # Name and URL are taken from the same element.
    forwarded_from_name = TagOptions(
        attrs={"class": "tgme_widget_message_forwarded_from_name"}, tag="a"
    )

    forwarded_from_url = TagOptions(
        attrs={"class": "tgme_widget_message_forwarded_from_name"}, tag="a"
    )
telegram_pm/run.py ADDED
@@ -0,0 +1,49 @@
1
+ import sys
2
+ import signal
3
+ import asyncio
4
+
5
+ from telegram_pm.parsers.preview import PreviewParser
6
+ from telegram_pm.utils.logger import logger
7
+ from telegram_pm.config import TelegramConfig
8
+
9
+
10
class ParserRunner:
    """Run the preview parser in a loop until a shutdown is requested."""

    def __init__(self, db_path: str, channels: list[str], verbose: bool = False):
        """Store the run parameters and install SIGINT/SIGTERM handlers."""
        self.db_path = db_path
        self.channels = channels
        self.verbose = verbose

        self._shutdown = False

        # Both signals are routed to the same handler.
        for signal_number in (signal.SIGINT, signal.SIGTERM):
            signal.signal(signal_number, self.handle_signal)

    def handle_signal(self, signum, frame):
        """Log the signal, set the shutdown flag and exit the process."""
        logger.info(f"Received signal {signum}, shutting down...")
        self._shutdown = True
        sys.exit(0)

    async def run(self):
        """Parse all channels repeatedly, sleeping between cycles.

        An error in one cycle is logged and followed by the shorter
        error sleep; the parser is closed when the loop ends.
        """
        parser = PreviewParser(
            channels=self.channels, verbose=self.verbose, db_path=self.db_path
        )
        try:
            while not self._shutdown:
                try:
                    await parser.parse()
                    logger.info(
                        f"๐Ÿ’ค Sleep {TelegramConfig.sleep_time_seconds} seconds ... ๐Ÿ’ค"
                    )
                    await asyncio.sleep(TelegramConfig.sleep_time_seconds)
                except Exception as error:
                    logger.error(f"Error during parsing: {error}")
                    await asyncio.sleep(TelegramConfig.sleep_after_error_request)
        finally:
            if parser:
                await parser.close()
45
+
46
+
47
def run_parser(db_path: str, channels: list[str], verbose: bool = False):
    """Build a ``ParserRunner`` and run it to completion on a new event loop."""
    asyncio.run(
        ParserRunner(channels=channels, verbose=verbose, db_path=db_path).run()
    )
@@ -0,0 +1,8 @@
1
+ from telegram_pm.utils import logger, url, parse
2
+
3
+
4
+ __all__ = [
5
+ "logger",
6
+ "url",
7
+ "parse",
8
+ ]
@@ -0,0 +1,5 @@
1
+ import structlog
2
+ from structlog.typing import FilteringBoundLogger
3
+
4
+
5
+ logger: FilteringBoundLogger = structlog.get_logger()
@@ -0,0 +1,46 @@
1
+ import re
2
+ import json
3
+ import base64
4
+
5
+ from bs4 import BeautifulSoup, PageElement
6
+
7
+ from telegram_pm.parsers.tag_options import TagOptions
8
+
9
+
10
# Matches an http(s) URL. The pattern is split into scheme/host, top-level
# part and trailing path/query pieces; the adjacent literals are concatenated
# into a single pattern string.
URL_REGEX = re.compile(
    r"https?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}"
    r"\.[a-zA-Z0-9()]{1,6}\b"
    r"[-a-zA-Z0-9()@:%_+.~#?&/=]*"
)
13
+
14
+
15
def content_to_bs(content: str) -> BeautifulSoup:
    """Parse *content* with the ``html5lib`` parser and return the soup."""
    soup = BeautifulSoup(content, "html5lib")
    return soup
17
+
18
+
19
def extract_element(
    bs_content: BeautifulSoup, tag_ops: TagOptions
) -> list[PageElement]:
    """Return, as a plain list, every element of *bs_content* matching *tag_ops*."""
    return list(bs_content.find_all(tag_ops.tag, attrs=tag_ops.attrs))
24
+
25
+
26
def extract_url_from_style(style_content: str) -> str | None:
    """Return the first URL found in *style_content*, or None when there is none."""
    match = URL_REGEX.search(style_content)
    return match.group(0) if match else None
31
+
32
+
33
def channel_id_clean(id_str: str) -> int:
    """
    Return the numeric channel id from a channel id string.

    The part before the first "/" is taken and its first character dropped:
    c2233445566/14992 -> 2233445566
    """
    prefix, _, _ = id_str.partition("/")
    digits = prefix[1:]
    return int(digits)
40
+
41
+
42
def decode_channel_id(channel_id_base64: str) -> int:
    """
    Decode a base64-encoded JSON object and return its ``"c"`` value.

    The input may arrive with or without ``=`` padding. Padding is
    normalised to the exact amount base64 requires, so decoding does not
    depend on the decoder tolerating surplus ``=`` characters.

    :param channel_id_base64: base64 text of a JSON object holding a "c" key
    :return: the value stored under "c"
    :raises KeyError: when the decoded object has no "c" key
    """
    unpadded = channel_id_base64.rstrip("=")
    # base64 text length must be a multiple of 4.
    padded = unpadded + "=" * (-len(unpadded) % 4)
    payload = json.loads(base64.b64decode(padded))
    return payload["c"]
@@ -0,0 +1,34 @@
1
+ from urllib.parse import urljoin
2
+
3
+ from telegram_pm.config import TelegramConfig
4
+
5
+
6
def build_preview_url(username: str) -> str:
    """
    Build the preview page URL of a channel.
    username -> <base_url>/s/username
    :param username: Telegram username
    :return: str
    """
    preview_path = urljoin("/s/", username)
    return urljoin(TelegramConfig.base_url, preview_path)
14
+
15
+
16
+ def build_param_before_url(url: str, before: int | str) -> str:
17
+ """
18
+ Build preview URL with before parameter.
19
+ - https://t.me/s/username -> https://t.me/s/username?before
20
+ - https://t.me/s/username -> https://t.me/s/username?before=123
21
+ :param url: str - Preview URL
22
+ :param before: - Before parameter value
23
+ :return: str
24
+ """
25
+ return urljoin(url, f"?before={before}")
26
+
27
+
28
def get_username_from_tg_url(url: str) -> str:
    """
    Get username from Telegram URL.

    A value that starts with the configured base URL is reduced to its last
    path segment; any other value is returned unchanged. Trailing slashes
    are stripped first, so ``<base_url>/name/`` yields ``name`` instead of an
    empty string.
    :param url: str - Channel URL or plain username
    :return: str
    """
    if url.startswith(TelegramConfig.base_url):
        return url.rstrip("/").split("/")[-1]
    return url
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 aIligat0r
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,144 @@
1
+ Metadata-Version: 2.3
2
+ Name: telegram-pm
3
+ Version: 0.1.0
4
+ Summary: Telegram preview page parser
5
+ Author: Your Name
6
+ Author-email: you@example.com
7
+ Requires-Python: >=3.12
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Programming Language :: Python :: 3.13
11
+ Requires-Dist: aiosqlite (>=0.21.0,<0.22.0)
12
+ Requires-Dist: bs4 (>=0.0.2,<0.0.3)
13
+ Requires-Dist: html5lib (>=1.1,<2.0)
14
+ Requires-Dist: httpx (>=0.28.1,<0.29.0)
15
+ Requires-Dist: python-dotenv (>=1.1.0,<2.0.0)
16
+ Requires-Dist: retry (>=0.9.2,<0.10.0)
17
+ Requires-Dist: structlog (>=25.2.0,<26.0.0)
18
+ Requires-Dist: typer (>=0.15.2,<0.16.0)
19
+ Description-Content-Type: text/markdown
20
+
21
+ # Telegram Channels Monitor
22
+
23
+ ![Python](https://img.shields.io/badge/python-3.12%2B-blue)
24
+ ![License](https://img.shields.io/badge/license-MIT-green)
25
+
26
+ Tool for monitoring public Telegram channels available in WEB preview mode
27
+
28
+ ## 🌟 Features
29
+ 1. [x] Parsing recent messages from public Telegram channels
30
+ 2. [x] Extracting metadata and media attachments
31
+ 3. [x] Storing data in SQLite database
32
+ 4. [x] Support for forwarded messages and replies
33
+ 5. [x] Configurable data collection parameters
34
+
35
+ ## 🛠 Installation
36
+ 1. Ensure Python 3.12+ is installed (recommended)
37
+ 2. Clone repository
38
+ ```bash
39
+ git clone 'https://github.com/aIligat0r/tpm.git'
40
+ ```
41
+
42
+ ## ⚙️ Configuration
43
+ Configuration is read from a `.env` file or from environment variables; the defaults are defined in `telegram_pm/config.py`.
44
+
45
+ Parsing configurations:
46
+ * `TELEGRAM_PARSE_REPEAT_COUNT` - Number of requests (default `5`). 20 messages per request. (1 iter - last 20 messages)
47
+ * `TELEGRAM_SLEEP_TIME_SECONDS` - Number of seconds after which the next process of receiving data from channels will begin (default `60` seconds)
48
+ * `TELEGRAM_SLEEP_AFTER_ERROR_REQUEST` - Number of seconds to wait after a failed request (default `30`)
49
+
50
+ HTTP configurations:
51
+ * `HTTP_RETRIES` - Number of repeated request attempts (default `3`)
52
+ * `HTTP_BACKOFF` - Delay between attempts for failed requests (default `3` seconds)
53
+ * `HTTP_TIMEOUT` - Waiting for a response (default `30` seconds)
54
+
55
+ ## 🚀 Usage
56
+
57
+ #### 1. Build application:
58
+
59
+ Build docker image:
60
+ ```bash
61
+ docker build -t telegram_pm .
62
+ ```
63
+ Create poetry env:
64
+ * Install poetry:
65
+ ```bash
66
+ pip install poetry
67
+ ```
68
+ * Create poetry env and install packages:
69
+ ```bash
70
+ poetry install
71
+ ```
72
+
73
+ #### 2. Launching the app
74
+
75
+ | Options | Description | Required |
76
+ |-----------------------------------|-----------------------------------------------------------------------|----------------------------------------------------------------|
77
+ | `--db-path` | Path to the database file (created if it does not exist) | ❌ required |
78
+ | `--channels-filepath`/`--ch-file` | File of channel usernames (one Telegram username per line) | ❌ required (or usernames `--channel`/`--ch`) |
79
+ | `--channel`/`--ch` | Channel username; repeat the option to pass several | ❌ required (or file of channels `--channels-filepath`/`--chf`) |
80
+ | `--verbose`/`--v` | Verbose mode | ➖ |
81
+ | `--help`/`--h` | Help information | ➖ |
82
+
83
+ **Poetry**:
84
+ ```bash
85
+ poetry run telegram_pm --ch freegaza --ch BREAKINGNewsTG --db-path .\tg.db --v
86
+ ```
87
+ or
88
+ ```bash
89
+ poetry run telegram_pm --channels-filepath /path/to/monitoring_usernames.txt --db-path .\tg.db
90
+ ```
91
+ **Docker**:
92
+ ```bash
93
+ docker run -it --rm telegram_pm --ch freegaza --db-path test_tg.db --v
94
+ ```
95
+ or (if you want to transfer usernames in a file, then you need to mount the paths)
96
+ ```bash
97
+ $ mkdir ~/tpm_data_dir/ # create a folder for data
98
+ $ cp /path/to/channel/usernames.txt ~/tpm_data_dir/usernames.txt # copy the file with the usernames to the previously created folder
99
+ $ chmod 666 ~/tpm_data_dir/telegram_messages.sqlite && chmod 666 ~/tpm_data_dir/usernames.txt # grant access to use this folder from the container
100
+ ```
101
+ ```bash
102
+ docker run -it --rm \
103
+ -v ~/tpm_data_dir/telegram_messages.sqlite:/data/telegram_messages.sqlite \
104
+ -v ~/tpm_data_dir/usernames.txt:/data/usernames.txt \
105
+ telegram_pm --db-path /data/telegram_messages.sqlite --chf /data/usernames.txt
106
+ ```
107
+ ## 🗃️ Database Structure
108
+
109
+ The tables will be named as usernames. Each table is a username that was passed in the running parameters.
110
+
111
+ | Field | Type | Description |
112
+ |-----------------------|-----------------------------------|----------------------------------------------------------|
113
+ | `id` | **INTEGER** | Channel ID |
114
+ | `url` | **TEXT** | Message URL |
115
+ | `username` | **TEXT** | Channel username |
116
+ | `date` | **TEXT** _(ISO 8601)_ | Message date |
117
+ | `text` | **TEXT** | Message text |
118
+ | `replied_post_url` | **TEXT** | Replied message URL |
119
+ | `urls` | **JSON** | URLs from text |
120
+ | `photo_urls` | **JSON** | Photo URLs |
121
+ | `video_urls` | **JSON** | Video URLs |
122
+ | `created_at` | **TIMESTAMP** _(defaults to `CURRENT_TIMESTAMP`)_ | Record creation time |
123
+ | `url_preview` | **TEXT** | Text from preview URL |
124
+ | `round_video_url` | **TEXT** | URL to round video message |
125
+ | `files` | **JSON** | List of file names and their description |
126
+ | `tags` | **JSON** | List of tags from a message body |
127
+ | `forwarded_from_url` | **TEXT** | URL of the channel from which the message was forwarded |
128
+ | `forwarded_from_name` | **TEXT** | Name of the channel from which the message was forwarded |
129
+
130
+
131
+ ## ⚠️ Limitations
132
+ Works only with public channels
133
+
134
+ ## 🧮 Example of work
135
+ **_Verbose mode:_**
136
+
137
+ ![img.png](img_verbose_sample.png)
138
+
139
+ **_View database_**
140
+ ![img.png](img_view_tables.png)
141
+
142
+ ## 📜 License
143
+ MIT License
144
+
@@ -0,0 +1,22 @@
1
+ telegram_pm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ telegram_pm/config.py,sha256=Qa7QMgRpYiGPZ8dmsSR6eYSxXuKoSGmiGG1mMe4Rmmc,765
3
+ telegram_pm/database/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ telegram_pm/database/db.py,sha256=rSfqCbBYrD4E_Msb5q8ilY1QIPlq7vnVE_-dNlYOXaM,4716
5
+ telegram_pm/entities.py,sha256=-mdx3u1M7bKFtEXaLcaaBjLQg08NBW77c2VeNHQQ_Gw,646
6
+ telegram_pm/http_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ telegram_pm/http_client/client.py,sha256=xvBsoFDo4QaQtgcmpNr3NjtKnyryL7HSn71DR89Quec,718
8
+ telegram_pm/parsers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ telegram_pm/parsers/base.py,sha256=9GH7bJaqPueohRoK1OVOmjF9pY_fqaFizIc9Ak6qS-Y,22
10
+ telegram_pm/parsers/post.py,sha256=4wf4KBG6NBOFGpk8_GH88M1hbyjTWXTQgRgGaHXgB40,10469
11
+ telegram_pm/parsers/preview.py,sha256=Z9xdbFPktQQzTyfEzw0M0vANHXYZkfmarZAoXDfDl5I,6055
12
+ telegram_pm/parsers/tag_options.py,sha256=0YRQH5O8fpfReHRDXEThmFFyiacsUz-wlbjVFOLoiJ8,2040
13
+ telegram_pm/run.py,sha256=8g2AWgAEr6uqcYuEh67WyA3BXv_WuccwBUTaQ8blJU4,1653
14
+ telegram_pm/utils/__init__.py,sha256=loG7JOo8Th7vV7lYrVeCEhObguEaMQr7xRCmVkV7CM4,103
15
+ telegram_pm/utils/logger.py,sha256=RqwcrFNMzjQfqB-aC9w79g9WLbcj6GvokRDtj9ZPH1Y,123
16
+ telegram_pm/utils/parse.py,sha256=vSI4kNVvt2hqXLcOdp0MuCChG6fFqSrb17VzH6huqVQ,1167
17
+ telegram_pm/utils/url.py,sha256=mv5Lc4PZbyL4hQXku3sGzMt3lmGKjtlYhbmzL0fKeb8,941
18
+ telegram_pm-0.1.0.dist-info/entry_points.txt,sha256=dIvBN0V4aMrJKl7tB1qCYy7VM40uFqnuwcPibXfnSU0,40
19
+ telegram_pm-0.1.0.dist-info/LICENSE,sha256=kaLyGzbJPljgIIJrGiWc2611z1YfjYG8QsI6v0C_oug,1066
20
+ telegram_pm-0.1.0.dist-info/METADATA,sha256=ea4tZgKbhsV1dpvJWLmamzEOEKYDaagzU4xCyQt1Go8,7053
21
+ telegram_pm-0.1.0.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
22
+ telegram_pm-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.1.2
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ tpm=commands.cli:app
3
+