telegram-pm 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- telegram_pm/config.py +3 -0
- telegram_pm/http_client/client.py +20 -6
- telegram_pm/parsers/preview.py +39 -9
- telegram_pm/run.py +72 -11
- {telegram_pm-0.1.0.dist-info → telegram_pm-0.1.2.dist-info}/METADATA +36 -8
- {telegram_pm-0.1.0.dist-info → telegram_pm-0.1.2.dist-info}/RECORD +9 -9
- {telegram_pm-0.1.0.dist-info → telegram_pm-0.1.2.dist-info}/LICENSE +0 -0
- {telegram_pm-0.1.0.dist-info → telegram_pm-0.1.2.dist-info}/WHEEL +0 -0
- {telegram_pm-0.1.0.dist-info → telegram_pm-0.1.2.dist-info}/entry_points.txt +0 -0
telegram_pm/config.py
CHANGED
@@ -12,6 +12,9 @@ class HttpClientConfig(BaseConfig):
|
|
12
12
|
retries: int = int(environ.get("HTTP_RETRIES", 3))
|
13
13
|
backoff: int = int(environ.get("HTTP_BACKOFF", 3))
|
14
14
|
timeout: int = int(environ.get("HTTP_TIMEOUT", 30))
|
15
|
+
headers: dict[str, str] = {
|
16
|
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
|
17
|
+
}
|
15
18
|
|
16
19
|
|
17
20
|
class TelegramConfig(BaseConfig):
|
@@ -6,17 +6,31 @@ from telegram_pm.config import HttpClientConfig
|
|
6
6
|
|
7
7
|
|
8
8
|
class HttpClient:
|
9
|
-
def __init__(
|
9
|
+
def __init__(
|
10
|
+
self,
|
11
|
+
retries: int = HttpClientConfig.retries,
|
12
|
+
timeout: int = HttpClientConfig.timeout,
|
13
|
+
backoff: int = HttpClientConfig.backoff,
|
14
|
+
headers: dict[str, str] = HttpClientConfig.headers,
|
15
|
+
):
|
16
|
+
self._headers = headers
|
17
|
+
self._backoff = backoff
|
18
|
+
self._retries = retries
|
10
19
|
self.client = httpx.AsyncClient(
|
11
20
|
transport=httpx.AsyncHTTPTransport(
|
12
21
|
verify=False,
|
13
|
-
retries=
|
22
|
+
retries=retries,
|
14
23
|
),
|
15
|
-
timeout=
|
24
|
+
timeout=timeout,
|
16
25
|
verify=False,
|
17
26
|
)
|
18
27
|
|
19
|
-
@retry(backoff=HttpClientConfig.backoff, logger=logger) # type: ignore[arg-type]
|
20
28
|
async def request(self, url: str, method: str = "GET", **kwargs) -> httpx.Response:
|
21
|
-
|
22
|
-
|
29
|
+
@retry(backoff=self._backoff, logger=logger) # type: ignore[arg-type]
|
30
|
+
async def nested_request() -> httpx.Response:
|
31
|
+
response = await self.client.request(
|
32
|
+
method=method, url=url, headers=self._headers, **kwargs
|
33
|
+
)
|
34
|
+
return response
|
35
|
+
|
36
|
+
return await nested_request()
|
telegram_pm/parsers/preview.py
CHANGED
@@ -4,10 +4,9 @@ import httpx
|
|
4
4
|
from bs4 import BeautifulSoup
|
5
5
|
from structlog.contextvars import bound_contextvars
|
6
6
|
|
7
|
-
from telegram_pm import utils
|
7
|
+
from telegram_pm import utils, config
|
8
8
|
from telegram_pm.entities import Post
|
9
9
|
from telegram_pm.utils.logger import logger
|
10
|
-
from telegram_pm.config import TelegramConfig
|
11
10
|
from telegram_pm.parsers.base import BaseParser
|
12
11
|
from telegram_pm.parsers.post import PostsParser
|
13
12
|
from telegram_pm.http_client.client import HttpClient
|
@@ -24,9 +23,40 @@ class PreviewParser(BaseParser):
|
|
24
23
|
channels: list[str],
|
25
24
|
db_path: str,
|
26
25
|
verbose: bool = False,
|
26
|
+
tg_before_param_size: int = config.TelegramConfig.before_param_size,
|
27
|
+
tg_iteration_in_preview_count: int = config.TelegramConfig.iteration_in_preview_count,
|
28
|
+
tg_sleep_time_seconds: int = config.TelegramConfig.sleep_time_seconds,
|
29
|
+
tg_sleep_after_error_request: int = config.TelegramConfig.sleep_after_error_request,
|
30
|
+
http_retries: int = config.HttpClientConfig.retries,
|
31
|
+
http_backoff: int = config.HttpClientConfig.backoff,
|
32
|
+
http_timeout: int = config.HttpClientConfig.timeout,
|
33
|
+
http_headers: dict[str, str] = config.HttpClientConfig.headers,
|
27
34
|
):
|
35
|
+
"""
|
36
|
+
:param db_path: Path to sqlite database
|
37
|
+
:param channels: Channels list
|
38
|
+
:param verbose: Verbose mode
|
39
|
+
:param tg_before_param_size: 20 messages per request. (1 iter - last 20 messages)
|
40
|
+
:param tg_iteration_in_preview_count: Number of requests (default 5). 20 messages per request. (1 iter - last 20 messages)
|
41
|
+
:param tg_sleep_time_seconds: Number of seconds after which the next process of receiving data from channels will begin (default 60 seconds)
|
42
|
+
:param tg_sleep_after_error_request: Waiting after a failed requests (default 30)
|
43
|
+
:param http_retries: Number of repeated request attempts (default 3)
|
44
|
+
:param http_backoff: Delay between attempts for failed requests (default 3 seconds)
|
45
|
+
:param http_timeout: Waiting for a response (default 30 seconds)
|
46
|
+
:param http_headers: HTTP headers
|
47
|
+
"""
|
48
|
+
self._tg_sleep_after_error_request = tg_sleep_after_error_request
|
49
|
+
self._tg_sleep_time_seconds = tg_sleep_time_seconds
|
50
|
+
self._tg_iteration_in_preview_count = tg_iteration_in_preview_count
|
51
|
+
self._tg_before_param_size = tg_before_param_size
|
52
|
+
|
28
53
|
self.channels: list[str] = channels
|
29
|
-
self.http_client = HttpClient(
|
54
|
+
self.http_client = HttpClient(
|
55
|
+
retries=http_retries,
|
56
|
+
backoff=http_backoff,
|
57
|
+
timeout=http_timeout,
|
58
|
+
headers=http_headers,
|
59
|
+
)
|
30
60
|
self.post_parser = PostsParser(verbose=verbose)
|
31
61
|
self.db = DatabaseProcessor(db_path=db_path)
|
32
62
|
self._db_initialized = False
|
@@ -89,7 +119,7 @@ class PreviewParser(BaseParser):
|
|
89
119
|
posts_result = []
|
90
120
|
should_break = False
|
91
121
|
|
92
|
-
for parse_repeat in range(
|
122
|
+
for parse_repeat in range(self._tg_iteration_in_preview_count):
|
93
123
|
if should_break:
|
94
124
|
await logger.ainfo("No new posts yet")
|
95
125
|
break
|
@@ -98,7 +128,7 @@ class PreviewParser(BaseParser):
|
|
98
128
|
response = await self._get_preview_page(preview_url=preview_url)
|
99
129
|
if not response:
|
100
130
|
await logger.awarning("Can not get preview page")
|
101
|
-
await asyncio.sleep(
|
131
|
+
await asyncio.sleep(self._tg_sleep_after_error_request)
|
102
132
|
continue
|
103
133
|
|
104
134
|
if self.__forbidden_parse_preview(response=response):
|
@@ -111,7 +141,7 @@ class PreviewParser(BaseParser):
|
|
111
141
|
if not parsed_posts:
|
112
142
|
await logger.awarning("No posts parsed from preview page") # type: ignore
|
113
143
|
await self.db.drop_table_if_empty(channel_username)
|
114
|
-
await asyncio.sleep(
|
144
|
+
await asyncio.sleep(self._tg_sleep_after_error_request)
|
115
145
|
break
|
116
146
|
|
117
147
|
first_post_exists = await self.db.post_exists(
|
@@ -127,11 +157,11 @@ class PreviewParser(BaseParser):
|
|
127
157
|
before_param_number = self.__parse_before_param_value(
|
128
158
|
post_url=parsed_posts[-1].url
|
129
159
|
)
|
130
|
-
if before_param_number <=
|
131
|
-
before_param_number -=
|
160
|
+
if before_param_number <= self._tg_before_param_size:
|
161
|
+
before_param_number -= self._tg_before_param_size
|
132
162
|
else:
|
133
163
|
before_param_number = (
|
134
|
-
before_param_number -
|
164
|
+
before_param_number - self._tg_before_param_size
|
135
165
|
)
|
136
166
|
if before_param_number <= 0:
|
137
167
|
break
|
telegram_pm/run.py
CHANGED
@@ -2,17 +2,16 @@ import sys
|
|
2
2
|
import signal
|
3
3
|
import asyncio
|
4
4
|
|
5
|
-
from telegram_pm
|
5
|
+
from telegram_pm import config
|
6
6
|
from telegram_pm.utils.logger import logger
|
7
7
|
from telegram_pm.config import TelegramConfig
|
8
|
+
from telegram_pm.parsers.preview import PreviewParser
|
8
9
|
|
9
10
|
|
10
11
|
class ParserRunner:
|
11
|
-
def __init__(
|
12
|
-
self
|
13
|
-
|
14
|
-
self.verbose = verbose
|
15
|
-
|
12
|
+
def __init__(
|
13
|
+
self,
|
14
|
+
):
|
16
15
|
self._shutdown = False
|
17
16
|
|
18
17
|
# Setup signal handlers
|
@@ -24,9 +23,45 @@ class ParserRunner:
|
|
24
23
|
self._shutdown = True
|
25
24
|
sys.exit(0)
|
26
25
|
|
27
|
-
async def run(
|
26
|
+
async def run(
|
27
|
+
self,
|
28
|
+
db_path: str,
|
29
|
+
channels: list[str],
|
30
|
+
verbose: bool = False,
|
31
|
+
tg_before_param_size: int = config.TelegramConfig.before_param_size,
|
32
|
+
tg_iteration_in_preview_count: int = config.TelegramConfig.iteration_in_preview_count,
|
33
|
+
tg_sleep_time_seconds: int = config.TelegramConfig.sleep_time_seconds,
|
34
|
+
tg_sleep_after_error_request: int = config.TelegramConfig.sleep_after_error_request,
|
35
|
+
http_retries: int = config.HttpClientConfig.retries,
|
36
|
+
http_backoff: int = config.HttpClientConfig.backoff,
|
37
|
+
http_timeout: int = config.HttpClientConfig.timeout,
|
38
|
+
http_headers: dict[str, str] = config.HttpClientConfig.headers,
|
39
|
+
):
|
40
|
+
"""
|
41
|
+
:param db_path: Path to sqlite database
|
42
|
+
:param channels: Channels list
|
43
|
+
:param verbose: Verbose mode
|
44
|
+
:param tg_before_param_size: 20 messages per request. (1 iter - last 20 messages)
|
45
|
+
:param tg_iteration_in_preview_count: Number of requests (default 5). 20 messages per request. (1 iter - last 20 messages)
|
46
|
+
:param tg_sleep_time_seconds: Number of seconds after which the next process of receiving data from channels will begin (default 60 seconds)
|
47
|
+
:param tg_sleep_after_error_request: Waiting after a failed requests (default 30)
|
48
|
+
:param http_retries: Number of repeated request attempts (default 3)
|
49
|
+
:param http_backoff: Delay between attempts for failed requests (default 3 seconds)
|
50
|
+
:param http_timeout: Waiting for a response (default 30 seconds)
|
51
|
+
:param http_headers: HTTP headers
|
52
|
+
"""
|
28
53
|
parser = PreviewParser(
|
29
|
-
channels=
|
54
|
+
channels=channels,
|
55
|
+
verbose=verbose,
|
56
|
+
db_path=db_path,
|
57
|
+
tg_before_param_size=tg_before_param_size,
|
58
|
+
tg_iteration_in_preview_count=tg_iteration_in_preview_count,
|
59
|
+
tg_sleep_time_seconds=tg_sleep_time_seconds,
|
60
|
+
tg_sleep_after_error_request=tg_sleep_after_error_request,
|
61
|
+
http_retries=http_retries,
|
62
|
+
http_backoff=http_backoff,
|
63
|
+
http_timeout=http_timeout,
|
64
|
+
http_headers=http_headers,
|
30
65
|
)
|
31
66
|
try:
|
32
67
|
while not self._shutdown:
|
@@ -44,6 +79,32 @@ class ParserRunner:
|
|
44
79
|
await parser.close()
|
45
80
|
|
46
81
|
|
47
|
-
def
|
48
|
-
|
49
|
-
|
82
|
+
def run_tpm(
|
83
|
+
db_path: str,
|
84
|
+
channels: list[str],
|
85
|
+
verbose: bool = False,
|
86
|
+
tg_before_param_size: int = config.TelegramConfig.before_param_size,
|
87
|
+
tg_iteration_in_preview_count: int = config.TelegramConfig.iteration_in_preview_count,
|
88
|
+
tg_sleep_time_seconds: int = config.TelegramConfig.sleep_time_seconds,
|
89
|
+
tg_sleep_after_error_request: int = config.TelegramConfig.sleep_after_error_request,
|
90
|
+
http_retries: int = config.HttpClientConfig.retries,
|
91
|
+
http_backoff: int = config.HttpClientConfig.backoff,
|
92
|
+
http_timeout: int = config.HttpClientConfig.timeout,
|
93
|
+
http_headers: dict[str, str] = config.HttpClientConfig.headers,
|
94
|
+
):
|
95
|
+
runner = ParserRunner()
|
96
|
+
asyncio.run(
|
97
|
+
runner.run(
|
98
|
+
channels=channels,
|
99
|
+
verbose=verbose,
|
100
|
+
db_path=db_path,
|
101
|
+
tg_before_param_size=tg_before_param_size,
|
102
|
+
tg_iteration_in_preview_count=tg_iteration_in_preview_count,
|
103
|
+
tg_sleep_time_seconds=tg_sleep_time_seconds,
|
104
|
+
tg_sleep_after_error_request=tg_sleep_after_error_request,
|
105
|
+
http_retries=http_retries,
|
106
|
+
http_backoff=http_backoff,
|
107
|
+
http_timeout=http_timeout,
|
108
|
+
http_headers=http_headers,
|
109
|
+
)
|
110
|
+
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: telegram-pm
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.2
|
4
4
|
Summary: Telegram preview page parser
|
5
5
|
Author: Your Name
|
6
6
|
Author-email: you@example.com
|
@@ -32,15 +32,20 @@ Tool for monitoring public Telegram channels available in WEB preview mode
|
|
32
32
|
4. [x] Support for forwarded messages and replies
|
33
33
|
5. [x] Configurable data collection parameters
|
34
34
|
|
35
|
+
|
35
36
|
## 🛠 Installation
|
36
37
|
1. Ensure Python 3.12+ is installed (recommended)
|
37
38
|
2. Clone repository
|
38
39
|
```bash
|
39
40
|
git clone 'https://github.com/aIligat0r/tpm.git'
|
40
41
|
```
|
42
|
+
or
|
43
|
+
```bash
|
44
|
+
pip install telegram-pm
|
45
|
+
```
|
41
46
|
|
42
47
|
## ⚙️ Configuration
|
43
|
-
Configurations (file `.env` or `
|
48
|
+
Configurations (file `.env` or `telegram_pm/config.py`)
|
44
49
|
|
45
50
|
Parsing configurations:
|
46
51
|
* `TELEGRAM_PARSE_REPEAT_COUNT` - Number of requests (default `5`). Each request returns 20 messages (one iteration fetches the last 20 messages)
|
@@ -58,7 +63,7 @@ HTTP configurations:
|
|
58
63
|
|
59
64
|
Build docker image:
|
60
65
|
```bash
|
61
|
-
docker build -t
|
66
|
+
docker build -t tpm .
|
62
67
|
```
|
63
68
|
Create poetry env:
|
64
69
|
* Install poetry:
|
@@ -80,17 +85,17 @@ poetry install
|
|
80
85
|
| `--verbose`/`--v` | Verbose mode | ➖ |
|
81
86
|
| `--help`/`--h` | Help information | ➖ |
|
82
87
|
|
83
|
-
**Poetry
|
88
|
+
**Poetry:**
|
84
89
|
```bash
|
85
|
-
poetry run
|
90
|
+
poetry run tpm --ch freegaza --ch BREAKINGNewsTG --db-path .\tg.db --v
|
86
91
|
```
|
87
92
|
or
|
88
93
|
```bash
|
89
|
-
poetry run
|
94
|
+
poetry run tpm --channels-filepath /path/to/monitoring_usernames.txt --db-path .\tg.db
|
90
95
|
```
|
91
|
-
**Docker
|
96
|
+
**Docker:**
|
92
97
|
```bash
|
93
|
-
docker run -it --rm
|
98
|
+
docker run -it --rm tpm --ch freegaza --db-path test_tg.db --v
|
94
99
|
```
|
95
100
|
or (if you want to pass usernames in a file, you need to mount the paths)
|
96
101
|
```bash
|
@@ -104,6 +109,29 @@ docker run -it --rm \
|
|
104
109
|
-v ~/tpm_data_dir/usernames.txt:/data/usernames.txt \
|
105
110
|
telegram_pm --db-path /data/telegram_messages.sqlite --chf /data/usernames.txt
|
106
111
|
```
|
112
|
+
**Python:**
|
113
|
+
```python
|
114
|
+
from telegram_pm.run import run_tpm
|
115
|
+
|
116
|
+
|
117
|
+
run_tpm(
|
118
|
+
db_path="tg.db", # Path to sqlite database
|
119
|
+
channels=["channel1", "channel2"], # Channels list
|
120
|
+
verbose=True, # Verbose mode
|
121
|
+
|
122
|
+
# Configuration (optional)
|
123
|
+
tg_iteration_in_preview_count=5, # Number of requests (default 5). 20 messages per request. (1 iter - last 20 messages)
|
124
|
+
tg_sleep_time_seconds=60, # Seconds to wait before the next round of fetching data from channels begins (default 60)
|
125
|
+
tg_sleep_after_error_request=30, # Seconds to wait after a failed request (default 30)
|
126
|
+
http_retries=3, # Number of repeated request attempts (default 3)
|
127
|
+
http_backoff=3, # Delay between attempts for failed requests (default 3 seconds)
|
128
|
+
http_timeout=60, # Response timeout in seconds (default 30)
|
129
|
+
http_headers={ # HTTP headers
|
130
|
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
|
131
|
+
}
|
132
|
+
)
|
133
|
+
```
|
134
|
+
|
107
135
|
## 🗃️ Database Structure
|
108
136
|
|
109
137
|
Tables are named after usernames: each table corresponds to one username that was passed in the run parameters.
|
@@ -1,22 +1,22 @@
|
|
1
1
|
telegram_pm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
telegram_pm/config.py,sha256=
|
2
|
+
telegram_pm/config.py,sha256=w1BZPxy8adyUnVQeGjUseSlVNRgpf7ZGXi4ltCXIo1Y,939
|
3
3
|
telegram_pm/database/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
4
|
telegram_pm/database/db.py,sha256=rSfqCbBYrD4E_Msb5q8ilY1QIPlq7vnVE_-dNlYOXaM,4716
|
5
5
|
telegram_pm/entities.py,sha256=-mdx3u1M7bKFtEXaLcaaBjLQg08NBW77c2VeNHQQ_Gw,646
|
6
6
|
telegram_pm/http_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
telegram_pm/http_client/client.py,sha256=
|
7
|
+
telegram_pm/http_client/client.py,sha256=EYFiCFZcICntF7Lc1QHsqQ_CcGtNI6G8j-DLmt1VJG4,1149
|
8
8
|
telegram_pm/parsers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
9
|
telegram_pm/parsers/base.py,sha256=9GH7bJaqPueohRoK1OVOmjF9pY_fqaFizIc9Ak6qS-Y,22
|
10
10
|
telegram_pm/parsers/post.py,sha256=4wf4KBG6NBOFGpk8_GH88M1hbyjTWXTQgRgGaHXgB40,10469
|
11
|
-
telegram_pm/parsers/preview.py,sha256=
|
11
|
+
telegram_pm/parsers/preview.py,sha256=TvWy36NOWvMMms3vUdc96wGRuYCvHI8R896gxiKrnJQ,7887
|
12
12
|
telegram_pm/parsers/tag_options.py,sha256=0YRQH5O8fpfReHRDXEThmFFyiacsUz-wlbjVFOLoiJ8,2040
|
13
|
-
telegram_pm/run.py,sha256=
|
13
|
+
telegram_pm/run.py,sha256=dKMw2IrtCh3rkkyiwQHNQwRM97f887Z_LsKvUulomrI,4608
|
14
14
|
telegram_pm/utils/__init__.py,sha256=loG7JOo8Th7vV7lYrVeCEhObguEaMQr7xRCmVkV7CM4,103
|
15
15
|
telegram_pm/utils/logger.py,sha256=RqwcrFNMzjQfqB-aC9w79g9WLbcj6GvokRDtj9ZPH1Y,123
|
16
16
|
telegram_pm/utils/parse.py,sha256=vSI4kNVvt2hqXLcOdp0MuCChG6fFqSrb17VzH6huqVQ,1167
|
17
17
|
telegram_pm/utils/url.py,sha256=mv5Lc4PZbyL4hQXku3sGzMt3lmGKjtlYhbmzL0fKeb8,941
|
18
|
-
telegram_pm-0.1.
|
19
|
-
telegram_pm-0.1.
|
20
|
-
telegram_pm-0.1.
|
21
|
-
telegram_pm-0.1.
|
22
|
-
telegram_pm-0.1.
|
18
|
+
telegram_pm-0.1.2.dist-info/entry_points.txt,sha256=dIvBN0V4aMrJKl7tB1qCYy7VM40uFqnuwcPibXfnSU0,40
|
19
|
+
telegram_pm-0.1.2.dist-info/LICENSE,sha256=kaLyGzbJPljgIIJrGiWc2611z1YfjYG8QsI6v0C_oug,1066
|
20
|
+
telegram_pm-0.1.2.dist-info/METADATA,sha256=VY6WAWEI_pn83xycXPq-1d8k1kMx8tHKdulFIi2VHSA,8199
|
21
|
+
telegram_pm-0.1.2.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
|
22
|
+
telegram_pm-0.1.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|