telegram-pm 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- telegram_pm-0.1.0/LICENSE +21 -0
- telegram_pm-0.1.0/PKG-INFO +144 -0
- telegram_pm-0.1.0/README.md +123 -0
- telegram_pm-0.1.0/pyproject.toml +30 -0
- telegram_pm-0.1.0/telegram_pm/__init__.py +0 -0
- telegram_pm-0.1.0/telegram_pm/config.py +25 -0
- telegram_pm-0.1.0/telegram_pm/database/__init__.py +0 -0
- telegram_pm-0.1.0/telegram_pm/database/db.py +143 -0
- telegram_pm-0.1.0/telegram_pm/entities.py +20 -0
- telegram_pm-0.1.0/telegram_pm/http_client/__init__.py +0 -0
- telegram_pm-0.1.0/telegram_pm/http_client/client.py +22 -0
- telegram_pm-0.1.0/telegram_pm/parsers/__init__.py +0 -0
- telegram_pm-0.1.0/telegram_pm/parsers/base.py +1 -0
- telegram_pm-0.1.0/telegram_pm/parsers/post.py +280 -0
- telegram_pm-0.1.0/telegram_pm/parsers/preview.py +165 -0
- telegram_pm-0.1.0/telegram_pm/parsers/tag_options.py +78 -0
- telegram_pm-0.1.0/telegram_pm/run.py +49 -0
- telegram_pm-0.1.0/telegram_pm/utils/__init__.py +8 -0
- telegram_pm-0.1.0/telegram_pm/utils/logger.py +5 -0
- telegram_pm-0.1.0/telegram_pm/utils/parse.py +46 -0
- telegram_pm-0.1.0/telegram_pm/utils/url.py +34 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 aIligat0r
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1,144 @@
|
|
1
|
+
Metadata-Version: 2.3
|
2
|
+
Name: telegram-pm
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: Telegram preview page parser
|
5
|
+
Author: Your Name
|
6
|
+
Author-email: you@example.com
|
7
|
+
Requires-Python: >=3.12
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
10
|
+
Classifier: Programming Language :: Python :: 3.13
|
11
|
+
Requires-Dist: aiosqlite (>=0.21.0,<0.22.0)
|
12
|
+
Requires-Dist: bs4 (>=0.0.2,<0.0.3)
|
13
|
+
Requires-Dist: html5lib (>=1.1,<2.0)
|
14
|
+
Requires-Dist: httpx (>=0.28.1,<0.29.0)
|
15
|
+
Requires-Dist: python-dotenv (>=1.1.0,<2.0.0)
|
16
|
+
Requires-Dist: retry (>=0.9.2,<0.10.0)
|
17
|
+
Requires-Dist: structlog (>=25.2.0,<26.0.0)
|
18
|
+
Requires-Dist: typer (>=0.15.2,<0.16.0)
|
19
|
+
Description-Content-Type: text/markdown
|
20
|
+
|
21
|
+
# Telegram Channels Monitor
|
22
|
+
|
23
|
+

|
24
|
+

|
25
|
+
|
26
|
+
Tool for monitoring public Telegram channels available in WEB preview mode
|
27
|
+
|
28
|
+
## 🌟 Features
|
29
|
+
1. [x] Parsing recent messages from public Telegram channels
|
30
|
+
2. [x] Extracting metadata and media attachments
|
31
|
+
3. [x] Storing data in SQLite database
|
32
|
+
4. [x] Support for forwarded messages and replies
|
33
|
+
5. [x] Configurable data collection parameters
|
34
|
+
|
35
|
+
## 🛠 Installation
|
36
|
+
1. Ensure Python 3.12 or newer is installed (required)
|
37
|
+
2. Clone repository
|
38
|
+
```bash
|
39
|
+
git clone 'https://github.com/aIligat0r/tpm.git'
|
40
|
+
```
|
41
|
+
|
42
|
+
## ⚙️ Configuration
|
43
|
+
Configuration (via a `.env` file or `telegram_pm/config.py`)
|
44
|
+
|
45
|
+
Parsing configurations:
|
46
|
+
* `TELEGRAM_PARSE_REPEAT_COUNT` - Number of requests (default `5`). 20 messages per request. (1 iter - last 20 messages)
|
47
|
+
* `TELEGRAM_SLEEP_TIME_SECONDS` - Number of seconds after which the next process of receiving data from channels will begin (default `60` seconds)
|
48
|
+
* `TELEGRAM_SLEEP_AFTER_ERROR_REQUEST` - Wait time after a failed request (default `30` seconds)
|
49
|
+
|
50
|
+
HTTP configurations:
|
51
|
+
* `HTTP_RETRIES` - Number of repeated request attempts (default `3`)
|
52
|
+
* `HTTP_BACKOFF` - Delay between attempts for failed requests (default `3` seconds)
|
53
|
+
* `HTTP_TIMEOUT` - Waiting for a response (default `30` seconds)
|
54
|
+
|
55
|
+
## 🚀 Usage
|
56
|
+
|
57
|
+
#### 1. Build application:
|
58
|
+
|
59
|
+
Build docker image:
|
60
|
+
```bash
|
61
|
+
docker build -t telegram_pm .
|
62
|
+
```
|
63
|
+
Create poetry env:
|
64
|
+
* Install poetry:
|
65
|
+
```bash
|
66
|
+
pip install poetry
|
67
|
+
```
|
68
|
+
* Create poetry env and install packages:
|
69
|
+
```bash
|
70
|
+
poetry install
|
71
|
+
```
|
72
|
+
|
73
|
+
#### 2. Launching the app
|
74
|
+
|
75
|
+
| Options | Description | Required |
|
76
|
+
|-----------------------------------|-----------------------------------------------------------------------|----------------------------------------------------------------|
|
77
|
+
| `--db-path` | Path to the base (if not, it will be created) | ❌ required |
|
78
|
+
| `--channels-filepath`/`--ch-file` | File of channel usernames (file where in each line Telegram username) | ❌ required (or usernames `--channel`/`--ch`) |
|
79
|
+
| `--channel`/`--ch` | List of usernames that are passed by the parameter | ❌ required (or file of channels `--channels-filepath`/`--chf`) |
|
80
|
+
| `--verbose`/`--v` | Verbose mode | ➖ |
|
81
|
+
| `--help`/`--h` | Help information | ➖ |
|
82
|
+
|
83
|
+
**Poetry**:
|
84
|
+
```bash
|
85
|
+
poetry run telegram_pm --ch freegaza --ch BREAKINGNewsTG --db-path .\tg.db --v
|
86
|
+
```
|
87
|
+
or
|
88
|
+
```bash
|
89
|
+
poetry run telegram_pm --channels-filepath /path/to/monitoring_usernames.txt --db-path .\tg.db
|
90
|
+
```
|
91
|
+
**Docker**:
|
92
|
+
```bash
|
93
|
+
docker run -it --rm telegram_pm --ch freegaza --db-path test_tg.db --v
|
94
|
+
```
|
95
|
+
or (if you want to transfer usernames in a file, then you need to mount the paths)
|
96
|
+
```bash
|
97
|
+
$ mkdir ~/tpm_data_dir/ # create a folder for data
|
98
|
+
$ cp /path/to/channel/usernames.txt ~/tpm_data_dir/usernames.txt # copy the file with the usernames to the previously created folder
|
99
|
+
$ chmod 666 ~/tpm_data_dir/telegram_messages.sqlite && chmod 666 ~/tpm_data_dir/usernames.txt # grant access to use this folder from the container
|
100
|
+
```
|
101
|
+
```bash
|
102
|
+
docker run -it --rm \
|
103
|
+
-v ~/tpm_data_dir/telegram_messages.sqlite:/data/telegram_messages.sqlite \
|
104
|
+
-v ~/tpm_data_dir/usernames.txt:/data/usernames.txt \
|
105
|
+
telegram_pm --db-path /data/telegram_messages.sqlite --chf /data/usernames.txt
|
106
|
+
```
|
107
|
+
## 🗃️ Database Structure
|
108
|
+
|
109
|
+
The tables will be named as usernames. Each table is a username that was passed in the running parameters.
|
110
|
+
|
111
|
+
| Field | Type | Description |
|
112
|
+
|-----------------------|-----------------------------------|----------------------------------------------------------|
|
113
|
+
| `id` | **INTEGER** | Channel ID |
|
114
|
+
| `url` | **TEXT** | Message URL |
|
115
|
+
| `username` | **TEXT** | Channel username |
|
116
|
+
| `date` | **TEXT** _(ISO 8601)_ | Message date |
|
117
|
+
| `text` | **TEXT** | Message text |
|
118
|
+
| `replied_post_url` | **TEXT** | Replied message URL |
|
119
|
+
| `urls` | **JSON** | URLs from text |
|
120
|
+
| `photo_urls` | **JSON** | Photo URLs |
|
121
|
+
| `video_urls` | **JSON** | Video URLs |
|
122
|
+
| `created_at` | **CURRENT_DATETIME** _(ISO 8601)_ | Record creation time |
|
123
|
+
| `url_preview` | **TEXT** | Text from preview URL |
|
124
|
+
| `round_video_url` | **TEXT** | URL to round video message |
|
125
|
+
| `files` | **JSON** | List of file names and their description |
|
126
|
+
| `tags` | **JSON** | List of tags from a message body |
|
127
|
+
| `forwarded_from_url` | **TEXT** | URL of the channel from which the message was forwarded |
|
128
|
+
| `forwarded_from_name` | **TEXT** | Name of the channel from which the message was forwarded |
|
129
|
+
|
130
|
+
|
131
|
+
## ⚠️ Limitations
|
132
|
+
Works only with public channels
|
133
|
+
|
134
|
+
## 🧮 Example of work
|
135
|
+
**_Verbose mode:_**
|
136
|
+
|
137
|
+

|
138
|
+
|
139
|
+
**_View database_**
|
140
|
+

|
141
|
+
|
142
|
+
## 📜 License
|
143
|
+
MIT License
|
144
|
+
|
@@ -0,0 +1,123 @@
|
|
1
|
+
# Telegram Channels Monitor
|
2
|
+
|
3
|
+

|
4
|
+

|
5
|
+
|
6
|
+
Tool for monitoring public Telegram channels available in WEB preview mode
|
7
|
+
|
8
|
+
## 🌟 Features
|
9
|
+
1. [x] Parsing recent messages from public Telegram channels
|
10
|
+
2. [x] Extracting metadata and media attachments
|
11
|
+
3. [x] Storing data in SQLite database
|
12
|
+
4. [x] Support for forwarded messages and replies
|
13
|
+
5. [x] Configurable data collection parameters
|
14
|
+
|
15
|
+
## 🛠 Installation
|
16
|
+
1. Ensure Python 3.12 or newer is installed (required)
|
17
|
+
2. Clone repository
|
18
|
+
```bash
|
19
|
+
git clone 'https://github.com/aIligat0r/tpm.git'
|
20
|
+
```
|
21
|
+
|
22
|
+
## ⚙️ Configuration
|
23
|
+
Configuration (via a `.env` file or `telegram_pm/config.py`)
|
24
|
+
|
25
|
+
Parsing configurations:
|
26
|
+
* `TELEGRAM_PARSE_REPEAT_COUNT` - Number of requests (default `5`). 20 messages per request. (1 iter - last 20 messages)
|
27
|
+
* `TELEGRAM_SLEEP_TIME_SECONDS` - Number of seconds after which the next process of receiving data from channels will begin (default `60` seconds)
|
28
|
+
* `TELEGRAM_SLEEP_AFTER_ERROR_REQUEST` - Wait time after a failed request (default `30` seconds)
|
29
|
+
|
30
|
+
HTTP configurations:
|
31
|
+
* `HTTP_RETRIES` - Number of repeated request attempts (default `3`)
|
32
|
+
* `HTTP_BACKOFF` - Delay between attempts for failed requests (default `3` seconds)
|
33
|
+
* `HTTP_TIMEOUT` - Waiting for a response (default `30` seconds)
|
34
|
+
|
35
|
+
## 🚀 Usage
|
36
|
+
|
37
|
+
#### 1. Build application:
|
38
|
+
|
39
|
+
Build docker image:
|
40
|
+
```bash
|
41
|
+
docker build -t telegram_pm .
|
42
|
+
```
|
43
|
+
Create poetry env:
|
44
|
+
* Install poetry:
|
45
|
+
```bash
|
46
|
+
pip install poetry
|
47
|
+
```
|
48
|
+
* Create poetry env and install packages:
|
49
|
+
```bash
|
50
|
+
poetry install
|
51
|
+
```
|
52
|
+
|
53
|
+
#### 2. Launching the app
|
54
|
+
|
55
|
+
| Options | Description | Required |
|
56
|
+
|-----------------------------------|-----------------------------------------------------------------------|----------------------------------------------------------------|
|
57
|
+
| `--db-path` | Path to the base (if not, it will be created) | ❌ required |
|
58
|
+
| `--channels-filepath`/`--ch-file` | File of channel usernames (file where in each line Telegram username) | ❌ required (or usernames `--channel`/`--ch`) |
|
59
|
+
| `--channel`/`--ch` | List of usernames that are passed by the parameter | ❌ required (or file of channels `--channels-filepath`/`--chf`) |
|
60
|
+
| `--verbose`/`--v` | Verbose mode | ➖ |
|
61
|
+
| `--help`/`--h` | Help information | ➖ |
|
62
|
+
|
63
|
+
**Poetry**:
|
64
|
+
```bash
|
65
|
+
poetry run telegram_pm --ch freegaza --ch BREAKINGNewsTG --db-path .\tg.db --v
|
66
|
+
```
|
67
|
+
or
|
68
|
+
```bash
|
69
|
+
poetry run telegram_pm --channels-filepath /path/to/monitoring_usernames.txt --db-path .\tg.db
|
70
|
+
```
|
71
|
+
**Docker**:
|
72
|
+
```bash
|
73
|
+
docker run -it --rm telegram_pm --ch freegaza --db-path test_tg.db --v
|
74
|
+
```
|
75
|
+
or (if you want to transfer usernames in a file, then you need to mount the paths)
|
76
|
+
```bash
|
77
|
+
$ mkdir ~/tpm_data_dir/ # create a folder for data
|
78
|
+
$ cp /path/to/channel/usernames.txt ~/tpm_data_dir/usernames.txt # copy the file with the usernames to the previously created folder
|
79
|
+
$ chmod 666 ~/tpm_data_dir/telegram_messages.sqlite && chmod 666 ~/tpm_data_dir/usernames.txt # grant access to use this folder from the container
|
80
|
+
```
|
81
|
+
```bash
|
82
|
+
docker run -it --rm \
|
83
|
+
-v ~/tpm_data_dir/telegram_messages.sqlite:/data/telegram_messages.sqlite \
|
84
|
+
-v ~/tpm_data_dir/usernames.txt:/data/usernames.txt \
|
85
|
+
telegram_pm --db-path /data/telegram_messages.sqlite --chf /data/usernames.txt
|
86
|
+
```
|
87
|
+
## 🗃️ Database Structure
|
88
|
+
|
89
|
+
The tables will be named as usernames. Each table is a username that was passed in the running parameters.
|
90
|
+
|
91
|
+
| Field | Type | Description |
|
92
|
+
|-----------------------|-----------------------------------|----------------------------------------------------------|
|
93
|
+
| `id` | **INTEGER** | Channel ID |
|
94
|
+
| `url` | **TEXT** | Message URL |
|
95
|
+
| `username` | **TEXT** | Channel username |
|
96
|
+
| `date` | **TEXT** _(ISO 8601)_ | Message date |
|
97
|
+
| `text` | **TEXT** | Message text |
|
98
|
+
| `replied_post_url` | **TEXT** | Replied message URL |
|
99
|
+
| `urls` | **JSON** | URLs from text |
|
100
|
+
| `photo_urls` | **JSON** | Photo URLs |
|
101
|
+
| `video_urls` | **JSON** | Video URLs |
|
102
|
+
| `created_at` | **CURRENT_DATETIME** _(ISO 8601)_ | Record creation time |
|
103
|
+
| `url_preview` | **TEXT** | Text from preview URL |
|
104
|
+
| `round_video_url` | **TEXT** | URL to round video message |
|
105
|
+
| `files` | **JSON** | List of file names and their description |
|
106
|
+
| `tags` | **JSON** | List of tags from a message body |
|
107
|
+
| `forwarded_from_url` | **TEXT** | URL of the channel from which the message was forwarded |
|
108
|
+
| `forwarded_from_name` | **TEXT** | Name of the channel from which the message was forwarded |
|
109
|
+
|
110
|
+
|
111
|
+
## ⚠️ Limitations
|
112
|
+
Works only with public channels
|
113
|
+
|
114
|
+
## 🧮 Example of work
|
115
|
+
**_Verbose mode:_**
|
116
|
+
|
117
|
+

|
118
|
+
|
119
|
+
**_View database_**
|
120
|
+

|
121
|
+
|
122
|
+
## 📜 License
|
123
|
+
MIT License
|
@@ -0,0 +1,30 @@
|
|
1
|
+
[project]
|
2
|
+
name = "telegram-pm"
|
3
|
+
version = "0.1.0"
|
4
|
+
description = "Telegram preview page parser"
|
5
|
+
authors = [{name = "Your Name",email = "you@example.com"}]
|
6
|
+
readme = "README.md"
|
7
|
+
requires-python = ">=3.12"
|
8
|
+
packages = [{ include = "src" }, { include = "commands" }]
|
9
|
+
|
10
|
+
[tool.poetry.scripts]
|
11
|
+
tpm = "commands.cli:app"
|
12
|
+
|
13
|
+
[tool.poetry.dependencies]
|
14
|
+
httpx = "^0.28.1"
|
15
|
+
bs4 = "^0.0.2"
|
16
|
+
structlog = "^25.2.0"
|
17
|
+
typer = "^0.15.2"
|
18
|
+
retry = "^0.9.2"
|
19
|
+
python-dotenv = "^1.1.0"
|
20
|
+
html5lib = "^1.1"
|
21
|
+
aiosqlite = "^0.21.0"
|
22
|
+
|
23
|
+
[tool.poetry.group.dev.dependencies]
|
24
|
+
ruff = "^0.11.5"
|
25
|
+
pre-commit = "^4.2.0"
|
26
|
+
mypy = "^1.15.0"
|
27
|
+
|
28
|
+
[build-system]
|
29
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
30
|
+
build-backend = "poetry.core.masonry.api"
|
File without changes
|
@@ -0,0 +1,25 @@
|
|
1
|
+
from os import environ
|
2
|
+
|
3
|
+
from dotenv import load_dotenv
|
4
|
+
|
5
|
+
load_dotenv()
|
6
|
+
|
7
|
+
|
8
|
+
class BaseConfig:
    """Empty marker base class shared by the configuration classes below."""
|
9
|
+
|
10
|
+
|
11
|
+
class HttpClientConfig(BaseConfig):
    """HTTP client settings.

    Each value is read from the process environment once, when this class
    body is executed, and converted to ``int``; the literal passed as
    ``default`` is used when the variable is not set.
    """

    # HTTP_RETRIES: number of repeated request attempts.
    retries: int = int(environ.get("HTTP_RETRIES", default=3))
    # HTTP_BACKOFF: delay setting for failed requests.
    backoff: int = int(environ.get("HTTP_BACKOFF", default=3))
    # HTTP_TIMEOUT: how long to wait for a response.
    timeout: int = int(environ.get("HTTP_TIMEOUT", default=30))
|
15
|
+
|
16
|
+
|
17
|
+
class TelegramConfig(BaseConfig):
    """Telegram preview-page settings.

    Values are read from the process environment once, when this class body
    is executed; numeric settings are converted to ``int``.
    """

    # TELEGRAM_BASE_URL: prefix used when building and recognising post URLs.
    base_url: str = environ.get("TELEGRAM_BASE_URL", default="https://t.me")

    # TELEGRAM_BEFORE_PARAM_SIZE.
    # NOTE(review): presumably the page size of one preview request — confirm.
    before_param_size: int = int(environ.get("TELEGRAM_BEFORE_PARAM_SIZE", default=20))
    # TELEGRAM_PARSE_REPEAT_COUNT: number of preview requests per channel.
    iteration_in_preview_count: int = int(
        environ.get("TELEGRAM_PARSE_REPEAT_COUNT", default=5)
    )
    # TELEGRAM_SLEEP_TIME_SECONDS: pause before the next collection round.
    sleep_time_seconds: int = int(environ.get("TELEGRAM_SLEEP_TIME_SECONDS", default=60))
    # TELEGRAM_SLEEP_AFTER_ERROR_REQUEST: pause after a failed request.
    sleep_after_error_request: int = int(
        environ.get("TELEGRAM_SLEEP_AFTER_ERROR_REQUEST", default=30)
    )
|
File without changes
|
@@ -0,0 +1,143 @@
|
|
1
|
+
import json
|
2
|
+
from typing import List
|
3
|
+
from dataclasses import asdict
|
4
|
+
from contextlib import asynccontextmanager
|
5
|
+
|
6
|
+
import aiosqlite
|
7
|
+
|
8
|
+
from telegram_pm.entities import Post
|
9
|
+
|
10
|
+
|
11
|
+
class DatabaseProcessor:
    """Async SQLite storage for parsed posts.

    Every public method opens its own short-lived connection through
    ``_get_connection`` / ``_get_cursor``; no connection is kept open between
    calls. Table names are supplied by the caller and are interpolated into
    SQL as quoted identifiers, because SQLite cannot bind identifiers as
    parameters.
    """

    # Settings that SQLite scopes to a single connection. They have to be
    # re-applied on every new connection, otherwise they are silently lost.
    _CONNECTION_PRAGMAS = (
        "PRAGMA synchronous=NORMAL",
        "PRAGMA cache_size=-10000",  # negative value is in KiB: ~10 MB of cache
        "PRAGMA temp_store=MEMORY",
    )

    def __init__(self, db_path: str):
        """Remember the database file location; nothing is opened here.

        Args:
            db_path: Path handed to ``aiosqlite.connect`` for every operation.
        """
        self.db_path = db_path
        self._pool = None

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Return *name* as a double-quoted SQLite identifier.

        Embedded double quotes are doubled, so the value can never terminate
        the identifier and inject SQL.
        """
        return '"' + name.replace('"', '""') + '"'

    async def initialize(self):
        """Switch the database file to WAL journaling.

        ``journal_mode=WAL`` is stored in the database file, so setting it
        once is enough. The per-connection settings are applied by
        ``_get_connection`` on every connection instead.
        """
        async with self._get_connection() as conn:
            await conn.execute("PRAGMA journal_mode=WAL")
            await conn.commit()

    @asynccontextmanager
    async def _get_connection(self):
        """Yield a configured connection and always close it afterwards."""
        conn = await aiosqlite.connect(self.db_path, timeout=30, isolation_level=None)
        try:
            conn.row_factory = aiosqlite.Row
            for pragma in self._CONNECTION_PRAGMAS:
                await conn.execute(pragma)
            yield conn
        finally:
            await conn.close()

    @asynccontextmanager
    async def _get_cursor(self):
        """Yield a cursor; commit on success, roll back and re-raise on error."""
        async with self._get_connection() as conn:
            cursor = await conn.cursor()
            try:
                yield cursor
                await conn.commit()
            except Exception:
                await conn.rollback()
                # Bare raise keeps the original traceback intact.
                raise

    async def table_exists(self, table_name: str) -> bool:
        """Return True when ``sqlite_master`` lists a table named *table_name*."""
        async with self._get_cursor() as cursor:
            await cursor.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
                (table_name,),
            )
            return await cursor.fetchone() is not None

    async def create_table_from_post(self, table_name: str):
        """Create the posts table and its date index if they do not exist.

        Args:
            table_name: Name of the table to create.
        """
        columns = [
            "url TEXT PRIMARY KEY",
            "username TEXT",
            "id INTEGER",
            "date TEXT NOT NULL",
            "text TEXT",
            "replied_post_url TEXT",
            "urls TEXT",  # JSON
            "url_preview TEXT",
            "photo_urls TEXT",  # JSON
            "video_urls TEXT",  # JSON
            "round_video_url TEXT",
            "files TEXT",  # JSON
            "tags TEXT",  # JSON
            "created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP",
            "forwarded_from_url TEXT",
            "forwarded_from_name TEXT",
        ]
        table = self._quote_identifier(table_name)
        index = self._quote_identifier(f"idx_{table_name}_date")

        async with self._get_cursor() as cursor:
            await cursor.execute(
                f"""
                CREATE TABLE IF NOT EXISTS {table} (
                    {", ".join(columns)}
                )
                """
            )
            await cursor.execute(
                f"CREATE INDEX IF NOT EXISTS {index} ON {table}(date)"
            )

    async def insert_posts_batch(self, table_name: str, posts: List[Post]):
        """Insert *posts* into *table_name* in one ``executemany`` call.

        Rows whose ``url`` (the primary key) already exists are skipped by
        ``INSERT OR IGNORE``. List-valued fields are stored as JSON text.
        An empty *posts* list is a no-op.
        """
        if not posts:
            return

        columns = [
            "url",
            "username",
            "id",
            "date",
            "text",
            "replied_post_url",
            "urls",
            "url_preview",
            "photo_urls",
            "video_urls",
            "round_video_url",
            "files",
            "tags",
            "forwarded_from_url",
            "forwarded_from_name",
        ]
        json_fields = ("urls", "photo_urls", "video_urls", "files", "tags")

        placeholders = ", ".join(["?"] * len(columns))
        query = f"""
            INSERT OR IGNORE INTO {self._quote_identifier(table_name)}
            ({", ".join(columns)})
            VALUES ({placeholders})
        """

        # Serialise before opening the connection so it is held only for the write.
        data = []
        for post in posts:
            post_dict = asdict(post)
            for field in json_fields:
                post_dict[field] = json.dumps(post_dict[field])
            data.append(tuple(post_dict[col] for col in columns))

        async with self._get_cursor() as cursor:
            await cursor.executemany(query, data)

    async def is_table_empty(self, table_name: str) -> bool:
        """Return True when *table_name* holds no rows."""
        async with self._get_cursor() as cursor:
            await cursor.execute(
                f"SELECT 1 FROM {self._quote_identifier(table_name)} LIMIT 1"
            )
            return await cursor.fetchone() is None

    async def drop_table_if_empty(self, table_name: str):
        """Drop *table_name* when it exists and contains no rows."""
        if await self.table_exists(table_name) and await self.is_table_empty(
            table_name
        ):
            async with self._get_cursor() as cursor:
                await cursor.execute(
                    f"DROP TABLE {self._quote_identifier(table_name)}"
                )

    async def post_exists(self, table_name: str, url: str) -> bool:
        """Return True when a row with primary key *url* is in *table_name*."""
        query = (
            f"SELECT 1 FROM {self._quote_identifier(table_name)} "
            "WHERE url = ? LIMIT 1"
        )

        async with self._get_cursor() as cursor:
            await cursor.execute(query, (url,))
            return await cursor.fetchone() is not None

    async def close(self):
        """Close a ``conn`` attribute if one was attached to this instance.

        The class itself never stores a connection (each operation closes its
        own), so this normally does nothing; it is kept for callers that
        expect a ``close()`` method.
        """
        conn = getattr(self, "conn", None)
        if conn:
            await conn.close()
|
@@ -0,0 +1,20 @@
|
|
1
|
+
from dataclasses import dataclass, field
|
2
|
+
|
3
|
+
|
4
|
+
@dataclass
|
5
|
+
class Post:
|
6
|
+
username: str
|
7
|
+
url: str
|
8
|
+
date: str
|
9
|
+
id: int | None
|
10
|
+
text: str | None = None
|
11
|
+
replied_post_url: str | None = None
|
12
|
+
urls: list[str] = field(default_factory=list)
|
13
|
+
url_preview: str | None = None
|
14
|
+
photo_urls: list[str] = field(default_factory=list[str])
|
15
|
+
video_urls: list[str] = field(default_factory=list[str])
|
16
|
+
round_video_url: str | None = None
|
17
|
+
files: list[dict[str, str]] = field(default_factory=list)
|
18
|
+
tags: list[str] = field(default_factory=list[str])
|
19
|
+
forwarded_from_url: str | None = None
|
20
|
+
forwarded_from_name: str | None = None
|
File without changes
|
@@ -0,0 +1,22 @@
|
|
1
|
+
import httpx
|
2
|
+
from retry import retry
|
3
|
+
|
4
|
+
from telegram_pm.utils.logger import logger
|
5
|
+
from telegram_pm.config import HttpClientConfig
|
6
|
+
|
7
|
+
|
8
|
+
class HttpClient:
    """Small async wrapper around ``httpx.AsyncClient``.

    ``request`` retries transport-level failures (connection errors,
    timeouts) with a fixed pause between attempts, driven by
    ``HttpClientConfig.retries`` and ``HttpClientConfig.backoff``.
    """

    def __init__(self):
        """Build the shared ``httpx.AsyncClient`` from ``HttpClientConfig``."""
        # NOTE(review): TLS certificate verification is disabled (verify=False),
        # which lets a man-in-the-middle read or alter responses. Kept as-is so
        # existing deployments keep working; enable verification unless there
        # is a concrete reason not to.
        self.client = httpx.AsyncClient(
            transport=httpx.AsyncHTTPTransport(
                verify=False,
                retries=HttpClientConfig.retries,
            ),
            timeout=HttpClientConfig.timeout,
            verify=False,
        )

    async def request(self, url: str, method: str = "GET", **kwargs) -> httpx.Response:
        """Send a request, retrying on transport errors.

        The retry loop lives inside the coroutine on purpose: a synchronous
        retry decorator around an ``async def`` only sees the creation of the
        coroutine object, never the exception raised while it is awaited, so
        it cannot retry anything.

        Args:
            url: Target URL.
            method: HTTP method name.
            **kwargs: Passed through to ``httpx.AsyncClient.request``.

        Returns:
            The ``httpx.Response`` of the first attempt that succeeds.

        Raises:
            httpx.TransportError: When the final attempt fails as well.
        """
        import asyncio

        max_attempts = max(1, HttpClientConfig.retries + 1)
        attempt = 0
        while True:
            attempt += 1
            try:
                return await self.client.request(method=method, url=url, **kwargs)
            except httpx.TransportError as error:
                if attempt >= max_attempts:
                    raise
                logger.warning(
                    f"Request failed ({error!r}), attempt {attempt}/{max_attempts}; "
                    f"retrying in {HttpClientConfig.backoff} seconds"
                )
                await asyncio.sleep(HttpClientConfig.backoff)
|
File without changes
|
@@ -0,0 +1 @@
|
|
1
|
+
class BaseParser:
    """Empty marker base class for the parsers in this package."""
|
@@ -0,0 +1,280 @@
|
|
1
|
+
import typer
|
2
|
+
from bs4 import BeautifulSoup, PageElement
|
3
|
+
|
4
|
+
from telegram_pm import utils
|
5
|
+
from telegram_pm.entities import Post
|
6
|
+
from telegram_pm.utils.logger import logger
|
7
|
+
from telegram_pm.parsers.base import BaseParser
|
8
|
+
from telegram_pm.config import TelegramConfig
|
9
|
+
from telegram_pm.parsers.tag_options import PostParseConfig, TagOptions
|
10
|
+
|
11
|
+
|
12
|
+
class PostsParser(BaseParser):
|
13
|
+
"""
|
14
|
+
Posts parsers from preview page
|
15
|
+
"""
|
16
|
+
|
17
|
+
def __init__(self, verbose: bool = False):
    """Set up the parser.

    Args:
        verbose: When true, ``parse`` echoes every parsed post to the terminal.
    """
    # Tag/attribute lookup table used by all ``get_*`` methods.
    self._tag_ops = PostParseConfig
    self.__verbose: bool = verbose
|
20
|
+
|
21
|
+
@staticmethod
def get_post_attribute(
    post: PageElement,
    tab_ops: TagOptions,
    extract_field: str,
    _warn_log_enable: bool = True,
) -> str | None:
    """Read one value from the first element of *post* matching *tab_ops*.

    Args:
        post: Element to search in.
        tab_ops: Tag name and attribute filter of the wanted element.
        extract_field: ``"text"`` to return the element's text, otherwise the
            name of the attribute to return.
        _warn_log_enable: Log a warning when no element matches.

    Returns:
        The extracted value, or None when no element matches.
    """
    element = post.find(name=tab_ops.tag, attrs=tab_ops.attrs)  # type: ignore[attr-defined]
    if not element:
        if _warn_log_enable:
            logger.warning(f"Not found. '{tab_ops.tag}': '{tab_ops.attrs}'")
        return None
    return element.text if extract_field == "text" else element.get(extract_field)
|
36
|
+
|
37
|
+
def get_channel_id(self, post: PageElement) -> int | None:
    """Return the channel id decoded from the post's ``data-view`` attribute.

    Returns None, without logging a warning, when the attribute is missing
    or empty.
    """
    encoded_view = self.get_post_attribute(
        post=post,
        tab_ops=self._tag_ops.channel_id,
        extract_field="data-view",
        _warn_log_enable=False,
    )
    # NOTE(review): presumably a base64 payload — confirm in utils.parse.
    return utils.parse.decode_channel_id(encoded_view) if encoded_view else None
|
48
|
+
|
49
|
+
@staticmethod
def get_urls_from_styles(post: PageElement, tag_pos: TagOptions) -> list[str]:
    """Extract one URL from the ``style`` attribute of every matching element.

    Each element of *post* matching *tag_pos* contributes one entry, in
    document order; an element without a ``style`` attribute is passed to the
    extractor as an empty string.
    """
    styled_elements = post.find_all(name=tag_pos.tag, attrs=tag_pos.attrs)  # type: ignore[attr-defined]
    # NOTE(review): entries may be None if the extractor finds no URL — verify
    # against utils.parse.extract_url_from_style.
    return [  # type: ignore[return-value]
        utils.parse.extract_url_from_style(style_content=element.get("style", ""))
        for element in styled_elements
    ]
|
58
|
+
|
59
|
+
def get_posts(self, bs_preview_content: BeautifulSoup) -> list[PageElement]:
    """Return the post block elements found in the parsed preview page."""
    return utils.parse.extract_element(
        bs_content=bs_preview_content,
        tag_ops=self._tag_ops.post_block,
    )
|
65
|
+
|
66
|
+
def get_post_url(self, username: str, post: PageElement) -> str:
    """Return the post URL, rebuilt as ``<base_url>/<username>/<last segment>``.

    An href that does not start with ``TelegramConfig.base_url`` is returned
    unchanged.
    """
    href = self.get_post_attribute(
        post=post,
        tab_ops=self._tag_ops.post_url,
        extract_field="href",
    )
    base_prefix = f"{TelegramConfig.base_url}/"
    # NOTE(review): href is None when the link element is missing, in which
    # case the next line raises AttributeError — confirm this is acceptable.
    if not href.startswith(base_prefix):  # type: ignore[union-attr]
        return href  # type: ignore[return-value]
    last_segment = href.split("/")[-1]  # type: ignore[union-attr]
    return f"{TelegramConfig.base_url}/{username}/{last_segment}"
|
76
|
+
|
77
|
+
def get_post_date(self, post: PageElement) -> str:
    """Return the ``datetime`` attribute of the post's date element.

    A warning is logged and None is returned when the element is missing.
    """
    date_options = self._tag_ops.date
    published_at = self.get_post_attribute(post, date_options, "datetime")
    return published_at  # type: ignore[return-value]
|
83
|
+
|
84
|
+
def get_replied_url(self, post: PageElement) -> str | None:
    """Return the ``href`` of the replied-to message, or None if there is none."""
    reply_options = self._tag_ops.replied_url
    replied_href = self.get_post_attribute(
        post, reply_options, "href", _warn_log_enable=False
    )
    return replied_href
|
91
|
+
|
92
|
+
def get_forwarded_from_url(self, post: PageElement) -> str | None:
    """Return the ``href`` of the forward source, or None if not forwarded."""
    source_options = self._tag_ops.forwarded_from_url
    source_href = self.get_post_attribute(
        post, source_options, "href", _warn_log_enable=False
    )
    return source_href
|
99
|
+
|
100
|
+
def get_forwarded_from_name(self, post: PageElement) -> str | None:
    """Return the text of the forward-source element, or None if not forwarded."""
    source_options = self._tag_ops.forwarded_from_name
    source_name = self.get_post_attribute(
        post, source_options, "text", _warn_log_enable=False
    )
    return source_name
|
107
|
+
|
108
|
+
def get_text(self, post: PageElement) -> str | None:
    """Return the text of the post's message element, or None when it is missing."""
    text_options = self._tag_ops.text
    message_text = self.get_post_attribute(
        post, text_options, "text", _warn_log_enable=False
    )
    return message_text
|
115
|
+
|
116
|
+
def get_photo_urls(self, post: PageElement) -> list[str]:
    """Return the URLs taken from the styles of the post's photo elements."""
    photo_options = self._tag_ops.photo_url
    return self.get_urls_from_styles(post, photo_options)
|
121
|
+
|
122
|
+
def get_video_urls(self, post: PageElement) -> list[str]:
    """Return the URLs taken from the styles of the post's video elements."""
    video_options = self._tag_ops.video_url
    return self.get_urls_from_styles(post, video_options)
|
127
|
+
|
128
|
+
def get_round_video(self, post: PageElement) -> str | None:
    """Return the ``src`` of the round-video element, or None when it is missing."""
    round_video_options = self._tag_ops.round_video_url
    video_src = self.get_post_attribute(
        post, round_video_options, "src", _warn_log_enable=False
    )
    return video_src
|
135
|
+
|
136
|
+
def get_urls(self, post: PageElement) -> list[str]:
    """Return the distinct ``href`` values of the post's link elements.

    Duplicates are removed through a set, so the order of the result is not
    the document order.
    """
    link_options = self._tag_ops.url
    anchors = post.find_all(  # type: ignore[attr-defined]
        name=link_options.tag,
        attrs=link_options.attrs,
    )
    unique_hrefs = {anchor.get("href") for anchor in anchors}
    return list(unique_hrefs)
|
145
|
+
|
146
|
+
def get_url_preview(self, post: PageElement) -> str | None:
    """Return the text of the link-preview element, or None when it is missing."""
    preview_options = self._tag_ops.url_preview
    preview_text = self.get_post_attribute(
        post, preview_options, "text", _warn_log_enable=False
    )
    return preview_text
|
153
|
+
|
154
|
+
def get_files(self, post: PageElement) -> list[dict[str, str]]:
    """Return the files attached to *post*.

    Each entry is ``{"title": ..., "extra": ...}``: the text of a file
    element and the text of its next matching "extra" sibling. When a file
    element has no such sibling, ``"extra"`` is an empty string, so one
    incomplete attachment does not abort parsing of the whole page.
    """
    files: list[dict[str, str]] = []
    file_elements = post.find_all(  # type: ignore[attr-defined]
        name=self._tag_ops.file.tag,
        attrs=self._tag_ops.file.attrs,
    )
    for file_element in file_elements:
        # find_next_sibling returns None when nothing matches.
        extra_element = file_element.find_next_sibling(
            name=self._tag_ops.file_extra.tag,
            attrs=self._tag_ops.file_extra.attrs,
        )
        files.append(
            {
                "title": file_element.text,
                "extra": extra_element.text if extra_element is not None else "",
            }
        )
    return files
|
169
|
+
|
170
|
+
def get_tags(self, post: PageElement) -> list[str]:
    """Return the text of every tag element found in *post*, in document order."""
    tag_options = self._tag_ops.tag
    matched_elements = post.find_all(  # type: ignore[attr-defined]
        name=tag_options.tag,
        attrs=tag_options.attrs,
    )
    return [element.text for element in matched_elements]
|
176
|
+
|
177
|
+
def parse(self, username: str, bs_preview_content: BeautifulSoup) -> list[Post]:
    """Convert every post block of a preview page into a ``Post``.

    Args:
        username: Channel username stored on each post and used to build
            post URLs.
        bs_preview_content: Parsed preview page.

    Returns:
        One ``Post`` per post block, in page order. In verbose mode each
        post is also echoed to the terminal.
    """
    collected: list[Post] = []
    for element in self.get_posts(bs_preview_content=bs_preview_content):
        # Extraction order is kept stable so log output stays the same.
        channel_id = self.get_channel_id(element)
        post_url = self.get_post_url(username, element)
        post_date = self.get_post_date(element)
        replied_url = self.get_replied_url(element)
        message_text = self.get_text(element)
        photos = self.get_photo_urls(element)
        videos = self.get_video_urls(element)
        round_video = self.get_round_video(element)
        links = self.get_urls(element)
        link_preview = self.get_url_preview(element)
        attachments = self.get_files(element)
        hashtags = self.get_tags(element)
        forwarded_url = self.get_forwarded_from_url(element)
        forwarded_name = self.get_forwarded_from_name(element)

        parsed_post = Post(
            username=username,
            id=channel_id,
            url=post_url,
            date=post_date,
            replied_post_url=replied_url,
            text=message_text,
            photo_urls=photos,
            video_urls=videos,
            round_video_url=round_video,
            urls=links,
            url_preview=link_preview,
            files=attachments,
            tags=hashtags,
            forwarded_from_url=forwarded_url,
            forwarded_from_name=forwarded_name,
        )
        collected.append(parsed_post)
        if self.__verbose:
            self._print_post(post=parsed_post)
    return collected
|
202
|
+
|
203
|
+
@staticmethod
def _print_post(post: Post):
    """
    Pretty-print a parsed post to the terminal using typer styling.

    Username, date and URL are always printed; every other section is printed
    only when the matching field of ``post`` is truthy. The post text and the
    URL preview are cut to their first 50 characters.

    :param post: Post. Parsed post to display
    """
    echo, style, colors = typer.echo, typer.style, typer.colors
    separator = style("═" * 50, fg=colors.BRIGHT_MAGENTA)

    echo("\n" + separator)

    username_line = style("🎯 Username: ", fg=colors.BRIGHT_RED)
    username_line += style(post.username, fg=colors.RED)
    echo(username_line)

    date_line = style("📅 Date: ", fg=colors.BRIGHT_CYAN)
    date_line += style(post.date, fg=colors.WHITE)
    echo(date_line)

    url_line = style("🔗 URL: ", fg=colors.BRIGHT_CYAN)
    url_line += style(post.url, fg=colors.BRIGHT_BLUE, underline=True)
    echo(url_line)

    if post.replied_post_url:
        replied_line = style("↩️ Replied: ", fg=colors.BRIGHT_YELLOW)
        replied_line += style(post.replied_post_url, fg=colors.BLUE)
        echo(replied_line)

    if post.text:
        echo("\n💬💬💬")
        echo(style(post.text[:50], fg=colors.GREEN))
        echo("💬💬💬")

    # Photo, video and plain URL sections share one layout: heading, then one line per link.
    link_sections = (
        ("📷 Photo: ", colors.BRIGHT_RED, colors.RED, post.photo_urls),
        ("🎥 Video: ", colors.BRIGHT_RED, colors.RED, post.video_urls),
        ("🌐 URLs: ", colors.BRIGHT_MAGENTA, colors.MAGENTA, post.urls),
    )
    for heading, heading_color, item_color, links in link_sections:
        if not links:
            continue
        echo("\n" + style(heading, fg=heading_color))
        for link in links:
            echo(style(f" → {link}", fg=item_color))

    if post.url_preview:
        echo("\n👀👀👀")
        echo(style(f"🔍 URL preview: {post.url_preview[:50]}", fg=colors.GREEN))
        echo("👀👀👀")

    if post.round_video_url:
        echo("\n" + style(f"🔍 Round video: {post.round_video_url}", fg=colors.BLUE))

    if post.tags:
        tags_line = "\n" + style("⌗ Tags: ", fg=colors.BRIGHT_GREEN)
        tags_line += style(", ".join(post.tags), fg=colors.GREEN)
        echo(tags_line)

    if post.files:
        echo("\n" + style("📂 Files: ", fg=colors.BRIGHT_YELLOW))
        for entry in post.files:
            label = entry.get("title")
            if not label:
                continue
            details = entry.get("extra")
            if details:
                label = f"{label} ({details})"
            echo(style(f" → {label}", fg=colors.YELLOW))

    echo(separator + "\n")
|
@@ -0,0 +1,165 @@
|
|
1
|
+
import asyncio
|
2
|
+
|
3
|
+
import httpx
|
4
|
+
from bs4 import BeautifulSoup
|
5
|
+
from structlog.contextvars import bound_contextvars
|
6
|
+
|
7
|
+
from telegram_pm import utils
|
8
|
+
from telegram_pm.entities import Post
|
9
|
+
from telegram_pm.utils.logger import logger
|
10
|
+
from telegram_pm.config import TelegramConfig
|
11
|
+
from telegram_pm.parsers.base import BaseParser
|
12
|
+
from telegram_pm.parsers.post import PostsParser
|
13
|
+
from telegram_pm.http_client.client import HttpClient
|
14
|
+
from telegram_pm.database.db import DatabaseProcessor
|
15
|
+
|
16
|
+
|
17
|
+
class PreviewParser(BaseParser):
    """
    Telegram preview page parser.

    For every configured channel it downloads preview pages, parses the posts
    found on them with PostsParser and stores them through DatabaseProcessor.
    """

    def __init__(
        self,
        channels: list[str],
        db_path: str,
        verbose: bool = False,
    ):
        """
        :param channels: list[str]. Channels to parse (each value is passed through
            utils.url.get_username_from_tg_url before use)
        :param db_path: str. Path handed to DatabaseProcessor
        :param verbose: bool. Forwarded to PostsParser
        """
        self.channels: list[str] = channels
        self.http_client = HttpClient()
        self.post_parser = PostsParser(verbose=verbose)
        self.db = DatabaseProcessor(db_path=db_path)
        self._db_initialized = False
        self.verbose = verbose

    @staticmethod
    def __forbidden_parse_preview(response: httpx.Response) -> bool:
        """
        Check parsing availability
        :param response: httpx.Response
        :return: bool. If True, then you can't parse preview page
        """
        return response.status_code in (302,)

    @staticmethod
    def __parse_before_param_value(post_url: str) -> int:
        """
        Return the last path segment of a post URL as an integer.
        :param post_url: str. Post URL whose last "/"-separated part is numeric
        :return: int
        :raises ValueError: if the last segment is not an integer
        """
        before_value = post_url.split("/")[-1]
        return int(before_value)

    async def _get_preview_page(self, preview_url: str) -> httpx.Response:
        """
        Get preview page
        :param preview_url: str. Full preview URL
        :return: httpx.Response
        """
        return await self.http_client.request(url=preview_url)

    def _parse_posts_in_preview(
        self, username: str, response: httpx.Response
    ) -> list[Post]:
        """
        Parse the posts contained in a preview page response.
        :param username: str. Channel username
        :param response: httpx.Response. Response whose text is the preview page HTML
        :return: list[Post]
        """
        bs_content = BeautifulSoup(response.text, "html5lib")
        return self.post_parser.parse(username=username, bs_preview_content=bs_content)

    async def initialize(self):
        """Initialize database (skipped when it is already marked as initialized)"""
        if not self._db_initialized:
            await self.db.initialize()
            self._db_initialized = True

    async def close(self):
        """Clean up resources"""
        if hasattr(self.db, "close"):
            await self.db.close()
            # parse() closes the database after every run, and ParserRunner.run calls
            # parse() in a loop. Clear the flag so the next initialize() call is not skipped.
            # NOTE(review): assumes DatabaseProcessor.initialize() may be called again
            # after close() - confirm against database/db.py.
            self._db_initialized = False

    async def parse_channel(self, channel_username: str):
        """
        Parse single channel.

        Walks preview pages from newest to older ones (at most
        TelegramConfig.iteration_in_preview_count requests), inserting every page
        of posts into the channel table. Stops when the first post of a page is
        already stored, when no posts are parsed, when parsing is forbidden, when
        the next ``before`` value would not be positive, or on any error.

        :param channel_username: str. Channel username or Telegram URL
        :return: list[Post]. Posts inserted during this call
        """
        channel_username = utils.url.get_username_from_tg_url(channel_username)
        with bound_contextvars(username=channel_username):
            if not await self.db.table_exists(channel_username):
                await self.db.create_table_from_post(channel_username)
                await logger.ainfo("Created new table for channel")

            preview_url = utils.url.build_preview_url(username=channel_username)
            posts_result = []

            for _ in range(TelegramConfig.iteration_in_preview_count):
                try:
                    response = await self._get_preview_page(preview_url=preview_url)
                    if not response:
                        # Retry the same URL on the next iteration.
                        await logger.awarning("Can not get preview page")
                        await asyncio.sleep(TelegramConfig.sleep_after_error_request)
                        continue

                    if self.__forbidden_parse_preview(response=response):
                        await logger.awarning("Forbidden parsing preview")
                        break

                    parsed_posts = self._parse_posts_in_preview(
                        username=channel_username, response=response
                    )
                    if not parsed_posts:
                        await logger.awarning("No posts parsed from preview page")  # type: ignore
                        await self.db.drop_table_if_empty(channel_username)
                        await asyncio.sleep(TelegramConfig.sleep_after_error_request)
                        break

                    first_post_exists = await self.db.post_exists(
                        channel_username, parsed_posts[0].url
                    )
                    if first_post_exists:
                        # Stop right away; deferring the log to the next iteration
                        # dropped the message when this was the last iteration.
                        await logger.ainfo("No new posts yet")
                        break

                    await self.db.insert_posts_batch(channel_username, parsed_posts)
                    posts_result.extend(parsed_posts)

                    # Step the "before" parameter back by one page. Both branches of
                    # the former if/else performed this same subtraction.
                    before_param_number = (
                        self.__parse_before_param_value(post_url=parsed_posts[-1].url)
                        - TelegramConfig.before_param_size
                    )
                    if before_param_number <= 0:
                        break

                    preview_url = utils.url.build_param_before_url(
                        url=preview_url, before=before_param_number
                    )

                except Exception as e:
                    await logger.aerror(
                        f"Error parsing channel {channel_username}: {e}"
                    )
                    break

        return posts_result

    async def parse(self):
        """
        Main parsing method.

        Initializes the database, parses every configured channel (a failure in
        one channel is logged and does not stop the others) and always closes
        the resources afterwards.
        """
        await self.initialize()

        try:
            for channel_username in self.channels:
                try:
                    await self.parse_channel(channel_username)
                except Exception as e:
                    await logger.aerror(
                        f"Failed to parse channel {channel_username}: {e}"
                    )
                    continue
        finally:
            await self.close()
|
@@ -0,0 +1,78 @@
|
|
1
|
+
import re
|
2
|
+
from dataclasses import dataclass
|
3
|
+
|
4
|
+
|
5
|
+
@dataclass
class TagOptions:
    """
    Search criteria for one kind of HTML element: a tag name plus attribute filters.

    The positional field order is (attrs, tag).
    """

    # Attribute name -> expected value; this file uses plain strings and compiled regexes.
    attrs: dict
    # HTML tag name, e.g. "div" or "a".
    tag: str
|
9
|
+
|
10
|
+
|
11
|
+
class PostParseConfig:
    """
    Element selectors (tag name + attribute filters) for the parts of a post.

    Each attribute is a TagOptions. Attribute values given as compiled regexes
    are patterns; plain strings are literal values.
    """

    # --- post container and identity ---
    channel_id = TagOptions(
        tag="div",
        attrs={
            "class": "tgme_widget_message text_not_supported_wrap js-widget_message"
        },
    )
    post_block = TagOptions(
        tag="div",
        attrs={"class": re.compile(r"tgme_widget_message_wrap js-widget_message_wrap")},
    )
    post_url = TagOptions(tag="a", attrs={"class": "tgme_widget_message_date"})
    replied_url = TagOptions(tag="a", attrs={"class": "tgme_widget_message_reply"})

    # --- text and date ---
    text = TagOptions(
        tag="div",
        attrs={"class": re.compile(r"tgme_widget_message_text js-message_text")},
    )
    date = TagOptions(tag="time", attrs={"class": "time"})

    # --- media ---
    photo_url = TagOptions(
        tag="a", attrs={"class": re.compile(r"tgme_widget_message_photo_wrap")}
    )
    video_url = TagOptions(
        tag="i", attrs={"class": re.compile(r"tgme_widget_message_video_thumb")}
    )
    round_video_url = TagOptions(
        tag="video",
        attrs={
            "class": re.compile(r"tgme_widget_message_roundvideo js-message_roundvideo")
        },
    )

    # --- links ---
    # Anchors whose target matches "_blank" and whose href starts with http:// or https://.
    url = TagOptions(
        tag="a",
        attrs={
            "target": re.compile(r"_blank"),
            "href": re.compile(r"^https?://"),
        },
    )
    url_preview = TagOptions(
        tag="a", attrs={"class": re.compile(r"tgme_widget_message_link_preview")}
    )

    # --- attached documents ---
    file = TagOptions(
        tag="div", attrs={"class": re.compile(r"tgme_widget_message_document_title")}
    )
    file_extra = TagOptions(
        tag="div", attrs={"class": re.compile(r"tgme_widget_message_document_extra")}
    )

    # Anchors whose href starts with "?q=%23" ("%23" is the URL-encoded "#").
    tag = TagOptions(tag="a", attrs={"href": re.compile(r"^\?q=%23")})

    # --- forwarding info: both selectors target the same element ---
    forwarded_from_name = TagOptions(
        tag="a", attrs={"class": "tgme_widget_message_forwarded_from_name"}
    )
    forwarded_from_url = TagOptions(
        tag="a", attrs={"class": "tgme_widget_message_forwarded_from_name"}
    )
|
@@ -0,0 +1,49 @@
|
|
1
|
+
import sys
|
2
|
+
import signal
|
3
|
+
import asyncio
|
4
|
+
|
5
|
+
from telegram_pm.parsers.preview import PreviewParser
|
6
|
+
from telegram_pm.utils.logger import logger
|
7
|
+
from telegram_pm.config import TelegramConfig
|
8
|
+
|
9
|
+
|
10
|
+
class ParserRunner:
    """Runs PreviewParser over the given channels in a loop until shutdown is requested."""

    def __init__(self, db_path: str, channels: list[str], verbose: bool = False):
        """
        Store the settings and install SIGINT/SIGTERM handlers.

        :param db_path: str. Database path passed to PreviewParser
        :param channels: list[str]. Channels passed to PreviewParser
        :param verbose: bool. Verbosity flag passed to PreviewParser
        """
        self.db_path = db_path
        self.channels = channels
        self.verbose = verbose

        self._shutdown = False

        # Setup signal handlers
        for signum in (signal.SIGINT, signal.SIGTERM):
            signal.signal(signum, self.handle_signal)

    def handle_signal(self, signum, frame):
        """Signal handler: log the signal, mark the runner as shut down and exit with code 0."""
        logger.info(f"Received signal {signum}, shutting down...")
        self._shutdown = True
        sys.exit(0)

    async def run(self):
        """
        Parse all channels repeatedly.

        After a successful pass it sleeps TelegramConfig.sleep_time_seconds; after
        an error it logs it and sleeps TelegramConfig.sleep_after_error_request.
        The parser is closed when the loop is left.
        """
        parser = PreviewParser(
            channels=self.channels, verbose=self.verbose, db_path=self.db_path
        )
        try:
            while not self._shutdown:
                try:
                    await parser.parse()
                    pause = TelegramConfig.sleep_time_seconds
                    logger.info(
                        f"💤 Sleep {TelegramConfig.sleep_time_seconds} seconds ... 💤"
                    )
                except Exception as e:
                    logger.error(f"Error during parsing: {e}")
                    pause = TelegramConfig.sleep_after_error_request
                await asyncio.sleep(pause)
        finally:
            if parser:
                await parser.close()
|
45
|
+
|
46
|
+
|
47
|
+
def run_parser(db_path: str, channels: list[str], verbose: bool = False):
    """
    Create a ParserRunner and run it to completion in a new asyncio event loop.

    :param db_path: str. Database path
    :param channels: list[str]. Channels to parse
    :param verbose: bool. Verbosity flag
    """
    main_coroutine = ParserRunner(
        channels=channels, verbose=verbose, db_path=db_path
    ).run()
    asyncio.run(main_coroutine)
|
@@ -0,0 +1,46 @@
|
|
1
|
+
import re
|
2
|
+
import json
|
3
|
+
import base64
|
4
|
+
|
5
|
+
from bs4 import BeautifulSoup, PageElement
|
6
|
+
|
7
|
+
from telegram_pm.parsers.tag_options import TagOptions
|
8
|
+
|
9
|
+
|
10
|
+
# Matches an http:// or https:// URL (optional "www." prefix, a dotted host name,
# then any trailing path/query characters). Compiled once at import time.
URL_REGEX = re.compile(
    r"https?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_+.~#?&/=]*"
)
|
13
|
+
|
14
|
+
|
15
|
+
def content_to_bs(content: str) -> BeautifulSoup:
    """
    Parse an HTML string with the "html5lib" parser.

    :param content: str. HTML markup
    :return: BeautifulSoup
    """
    soup = BeautifulSoup(content, features="html5lib")
    return soup
|
17
|
+
|
18
|
+
|
19
|
+
def extract_element(
    bs_content: BeautifulSoup, tag_ops: TagOptions
) -> list[PageElement]:
    """
    Find every element matching the given tag options.

    :param bs_content: BeautifulSoup. Parsed document to search
    :param tag_ops: TagOptions. Tag name and attribute filters
    :return: list[PageElement]. All matches as a plain list
    """
    return list(bs_content.find_all(tag_ops.tag, attrs=tag_ops.attrs))
|
24
|
+
|
25
|
+
|
26
|
+
def extract_url_from_style(style_content: str) -> str | None:
    """
    Return the first URL matched by URL_REGEX inside a style string.

    :param style_content: str. Text to search
    :return: str | None. The matched URL, or None when nothing matches
    """
    match = URL_REGEX.search(style_content)
    return match.group(0) if match else None
|
31
|
+
|
32
|
+
|
33
|
+
def channel_id_clean(id_str: str) -> int:
    """
    Extract id from channel id string
    c2233445566/14992 -> 2233445566

    Takes the part before the first "/", drops its first character and
    converts the rest to int.
    """
    prefixed_id, _, _ = id_str.partition("/")
    return int(prefixed_id[1:])
|
40
|
+
|
41
|
+
|
42
|
+
def decode_channel_id(channel_id_base64: str) -> int:
    """
    Decode a base64-encoded JSON object and return the value under its "c" key.

    The input may come with full, partial or no "=" padding; the missing
    padding is computed from the string length before decoding.

    :param channel_id_base64: str. Base64 text of a JSON object containing "c"
    :return: the value stored under "c"
    :raises binascii.Error: if the text is not valid base64
    :raises KeyError: if the decoded object has no "c" key
    """
    # Base64 text must be a multiple of 4 characters long. Appending "==" only when
    # no "=" was present failed for values that kept one "=" but needed two.
    missing_padding = -len(channel_id_base64) % 4
    padded = channel_id_base64 + "=" * missing_padding
    channel_id = json.loads(base64.b64decode(padded))
    return channel_id["c"]
|
@@ -0,0 +1,34 @@
|
|
1
|
+
from urllib.parse import urljoin
|
2
|
+
|
3
|
+
from telegram_pm.config import TelegramConfig
|
4
|
+
|
5
|
+
|
6
|
+
def build_preview_url(username: str) -> str:
    """
    Build preview URL.
    username -> https://t.me/s/username
    :param username: Telegram username
    :return: str
    """
    preview_path = urljoin("/s/", username)
    return urljoin(TelegramConfig.base_url, preview_path)
|
14
|
+
|
15
|
+
|
16
|
+
def build_param_before_url(url: str, before: int | str) -> str:
|
17
|
+
"""
|
18
|
+
Build preview URL with before parameter.
|
19
|
+
- https://t.me/s/username -> https://t.me/s/username?before
|
20
|
+
- https://t.me/s/username -> https://t.me/s/username?before=123
|
21
|
+
:param url: str - Preview URL
|
22
|
+
:param before: - Before parameter value
|
23
|
+
:return: str
|
24
|
+
"""
|
25
|
+
return urljoin(url, f"?before={before}")
|
26
|
+
|
27
|
+
|
28
|
+
def get_username_from_tg_url(url: str) -> str:
    """
    Get username from Telegram URL.

    Values that start with TelegramConfig.base_url are reduced to their last
    path segment; anything else is returned unchanged.

    :param url: str. Telegram URL or a bare username
    :return: str
    """
    if url.startswith(TelegramConfig.base_url):
        # Drop trailing slashes first: ".../username/" would otherwise split
        # into an empty last segment and yield "" as the username.
        return url.rstrip("/").split("/")[-1]
    return url
|