warp-beacon 2.8.13.tar.gz → 2.8.15.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {warp_beacon-2.8.13/warp_beacon.egg-info → warp_beacon-2.8.15}/PKG-INFO +1 -1
- warp_beacon-2.8.15/warp_beacon/__version__.py +2 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/jobs/abstract.py +7 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/jobs/types.py +2 -1
- warp_beacon-2.8.15/warp_beacon/scraper/X/X.py +304 -0
- warp_beacon-2.8.15/warp_beacon/scraper/X/types.py +8 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/__init__.py +4 -1
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/fail_handler.py +22 -3
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/storage/__init__.py +5 -6
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/telegram/bot.py +18 -8
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/telegram/handlers.py +23 -25
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/uploader/__init__.py +52 -66
- {warp_beacon-2.8.13 → warp_beacon-2.8.15/warp_beacon.egg-info}/PKG-INFO +1 -1
- warp_beacon-2.8.13/warp_beacon/__version__.py +0 -2
- warp_beacon-2.8.13/warp_beacon/scraper/X/X.py +0 -330
- warp_beacon-2.8.13/warp_beacon/scraper/X/types.py +0 -7
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/LICENSE +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/MANIFEST.in +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/README.md +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/assets/cc-group-black.png +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/assets/placeholder.gif +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/etc/.gitignore +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/etc/accounts.json +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/etc/proxies.json +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/etc/warp_beacon.conf +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/etc/warp_beacon.service +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/pyproject.toml +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/setup.cfg +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/setup.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/__init__.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/compress/__init__.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/compress/video.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/jobs/__init__.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/jobs/download_job.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/jobs/upload_job.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/mediainfo/__init__.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/mediainfo/abstract.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/mediainfo/audio.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/mediainfo/silencer.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/mediainfo/video.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scheduler/__init__.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scheduler/instagram_human.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scheduler/scheduler.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/X/__init__.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/X/abstract.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/abstract.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/account_selector.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/exceptions.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/instagram/__init__.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/instagram/captcha.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/instagram/instagram.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/instagram/wb_instagrapi.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/link_resolver.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/utils.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/youtube/__init__.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/youtube/abstract.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/youtube/music.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/youtube/shorts.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/scraper/youtube/youtube.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/storage/mongo.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/telegram/__init__.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/telegram/caption_shortener.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/telegram/download_status.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/telegram/edit_message.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/telegram/placeholder_message.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/telegram/progress_bar.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/telegram/progress_file_reader.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/telegram/types.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/telegram/utils.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/warp_beacon.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon/yt_auth.py +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon.egg-info/SOURCES.txt +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon.egg-info/dependency_links.txt +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon.egg-info/entry_points.txt +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon.egg-info/requires.txt +0 -0
- {warp_beacon-2.8.13 → warp_beacon-2.8.15}/warp_beacon.egg-info/top_level.txt +0 -0
warp_beacon/jobs/abstract.py:

```diff
@@ -109,6 +109,9 @@ class AbstractJob(ABC):
 		if self.media_type == JobType.COLLECTION:
 			if not self.media_collection:
 				return True
+		elif self.media_type == JobType.TEXT:
+			if not self.message_text:
+				return True
 		elif not self.local_media_path:
 			return True
 		return False
@@ -129,9 +132,13 @@ class AbstractJob(ABC):
 			for j in i:
 				if os.path.exists(j.local_media_path):
 					os.unlink(j.local_media_path)
+		elif self.media_type == JobType.TEXT:
+			pass
 		else:
 			if os.path.exists(self.local_media_path):
 				os.unlink(self.local_media_path)
 			if self.local_compressed_media_path:
 				if os.path.exists(self.local_compressed_media_path):
 					os.unlink(self.local_compressed_media_path)
+
+		return True
```
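For context, the first hunk teaches the job emptiness check about the new `JobType.TEXT`: a text job is empty only when its `message_text` is blank, since it never carries a `local_media_path`; the second hunk makes file cleanup a no-op for text jobs, which own nothing on disk. A minimal, self-contained sketch of the resulting rule (a hypothetical stand-in class, not the real `AbstractJob`):

```python
from dataclasses import dataclass

@dataclass
class _JobSketch:
	"""Hypothetical stand-in for AbstractJob, reduced to the fields the hunks touch."""
	media_type: str = "text"      # stands in for JobType.TEXT
	message_text: str = ""
	local_media_path: str = ""

	def is_empty(self) -> bool:
		if self.media_type == "text":
			return not self.message_text  # TEXT jobs need text, not a file
		return not self.local_media_path

assert _JobSketch(message_text="hello").is_empty() is False
assert _JobSketch().is_empty() is True
```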
warp_beacon/scraper/X/X.py (new file):

```diff
@@ -0,0 +1,304 @@
+import os
+import time
+import logging
+from mimetypes import guess_extension, guess_type
+from urllib.parse import urlparse
+import requests
+import yt_dlp
+from playwright.sync_api import sync_playwright, Page
+
+from warp_beacon.telegram.utils import Utils
+from warp_beacon.scraper.utils import ScraperUtils
+from warp_beacon.jobs.types import JobType
+from warp_beacon.scraper.X.abstract import XAbstract
+
+from warp_beacon.scraper.exceptions import Unavailable
+
+class XScraper(XAbstract):
+	DOWNLOAD_DIR = "/tmp"
+
+	def extract_canonical_name(self, media: dict) -> str:
+		ret = ""
+		try:
+			if media.get("title", None):
+				ret = media["title"]
+			if media.get("description", ""):
+				ret += "\n" + media["description"]
+		except Exception as e:
+			logging.warning("Failed to extract canonical media name!")
+			logging.exception(e)
+
+		return ret
+
+	def generate_result(self, local_files: list, job_type: JobType, canonical_name: str = "", performer: str = "") -> list:
+		res = []
+		if local_files:
+			if job_type == JobType.COLLECTION:
+				chunks = []
+				for media_chunk in Utils.chunker(local_files, 10):
+					chunk = []
+					for media in media_chunk:
+						mime_type, _ = guess_type(media)
+						chunk.append({
+							"local_media_path": self.rename_local_file(media),
+							"canonical_name": canonical_name,
+							"media_type": JobType.VIDEO if "video" in mime_type else JobType.IMAGE,
+							"media_info": {}
+						})
+					chunks.append(chunk)
+				res.append({
+					"media_type": JobType.COLLECTION,
+					"canonical_name": canonical_name,
+					"items": chunks
+				})
+			else:
+				for local_file in local_files:
+					res.append({
+						"local_media_path": self.rename_local_file(local_file),
+						"performer": performer,
+						"canonical_name": canonical_name,
+						"media_type": job_type
+					})
+		logging.debug(res)
+		return res
+
+	def _download(self, url: str, timeout: int = 60) -> list:
+		res = []
+		post_text = ""
+		pw_proxy = None
+		if self.proxy:
+			dsn = self.proxy.get("dsn", "")
+			if dsn:
+				parsed = urlparse(dsn)
+				pw_proxy = {
+					"server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
+					"username": parsed.username,
+					"password": parsed.password
+				}
+				logging.info("[X] build proxy: %s", pw_proxy)
+
+		contains_images, contains_videos = False, False
+		images, videos = [], []
+		with sync_playwright() as p:
+			with p.chromium.launch(headless=True) as browser:
+				with browser.new_context(proxy=pw_proxy, ignore_https_errors=True) as context:
+					page = context.new_page()
+					page.goto(url, wait_until="networkidle", timeout=(timeout*1000))
+					page.wait_for_selector("article[role='article']", timeout=(timeout*1000))
+
+					contains_videos = self.tweet_contains_video(page)
+					contains_images = self.tweet_contains_images(page)
+
+					if contains_images:
+						post_text, images = self.download_images(page, timeout)
+
+					if not contains_images and not contains_videos:
+						post_text = self.extract_post_text(page)
+
+		if contains_videos:
+			media_info, videos = self.download_videos(url, timeout)
+			if media_info:
+				post_text = self.extract_canonical_name(media_info)
+
+		if not images and not videos:
+			if not post_text:
+				raise Unavailable("Content unvailable")
+			logging.info("[X]: Sending text message")
+			res.append({
+				"message_text": post_text,
+				"media_type": JobType.TEXT
+			})
+			return res
+
+		if len(images) > 1 or len(videos) > 1:
+			logging.info("[X]: uploading collection")
+			content = images + videos
+			res.extend(self.generate_result(content, JobType.COLLECTION, canonical_name=post_text))
+		else:
+			logging.info("[X]: uploading media")
+			for job_type, content in {JobType.IMAGE: images, JobType.VIDEO: videos}.items():
+				if content:
+					res.extend(self.generate_result(content, job_type, canonical_name=post_text))
+
+		return res
+
+	def download_videos(self, url: str, timeout: int = 60) -> tuple[dict, list[str]]:
+		local_files = []
+		media_info = {}
+		time_name = str(time.time()).replace('.', '_')
+		ydl_opts = {
+			'socket_timeout': timeout,
+			'outtmpl': f'{self.DOWNLOAD_DIR}/x_download_{time_name}_%(id)s.%(ext)s',
+			'quiet': False,
+			'force_generic_extractor': False,
+			#'noplaylist': True,
+			'merge_output_format': 'mp4',
+			'dump_single_json': False,
+			'nocheckcertificate': True,
+			'progress_hooks': [self.dlp_on_progress],
+		}
+		if self.proxy:
+			proxy_dsn = self.proxy.get("dsn", "")
+			logging.info("[X] Using proxy DSN '%s'", proxy_dsn)
+			if proxy_dsn:
+				ydl_opts["proxy"] = proxy_dsn
+
+		with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+			info = ydl.extract_info(url, download=False)
+			media_info = info
+			entries = info.get("entries", [info])
+
+			for entry in entries:
+				ret = ydl.download([entry['webpage_url']])
+				if ret == 0:
+					file_path = ydl.prepare_filename(entry)
+					if isinstance(file_path, str):
+						local_files.append(file_path)
+					else:
+						local_files.extend(file_path)
+
+		return media_info, local_files
+
+	def adaptive_chunk_size(self, content_length: int) -> int:
+		if content_length < 100_000:
+			return 2048
+		elif content_length < 5_000_000:
+			return 8192
+		elif content_length < 100_000_000:
+			return 32768
+		else:
+			return 65536
+
+	def get_extension_from_headers(self, response: requests.Response) -> str:
+		content_type = response.headers.get("Content-Type", "")
+		return guess_extension(content_type) or ".jpg"
+
+	def download_images(self, page: Page, timeout: int) -> tuple[str, list[str]]:
+		downloaded_imgs = []
+		headers = {
+			"User-Agent": ScraperUtils.get_ua(),
+			"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+			"Accept-Language": "en-us,en;q=0.5",
+			"Sec-Fetch-Mode": "navigate"
+		}
+		proxies = None
+		if self.proxy:
+			proxies = {"https": self.proxy.get("dsn", ""), "http": self.proxy.get("dsn", "")}
+
+		image_urls, post_text = self.extract_image_urls_from_x_post(page, timeout)
+
+		if not image_urls:
+			logging.error("[X] Content images are not found!")
+			return downloaded_imgs
+
+		time_name = str(time.time()).replace('.', '_')
+		for i, img_url in enumerate(set(image_urls)):
+			downloaded = 0
+			if "?name=small" in img_url:
+				img_url = img_url.replace("?name=small", "?name=orig")
+			with requests.get(
+				img_url,
+				headers=headers,
+				timeout=timeout,
+				stream=True,
+				verify=False,
+				proxies=proxies) as request:
+
+				request.raise_for_status()
+
+				parsed = urlparse(img_url)
+				ext = os.path.splitext(parsed.path)[1]
+				if not ext:
+					ext = self.get_extension_from_headers(request)
+				filename = f"x_download_{time_name}_{i}{ext}"
+				filepath = os.path.join(self.DOWNLOAD_DIR, filename)
+
+				content_length = int(request.headers.get("Content-Length", 0))
+
+				with open(filepath, "wb") as f:
+					#request.raw.decode_content = True
+					chunk_size = self.adaptive_chunk_size(content_length)
+					for chunk in request.iter_content(chunk_size=chunk_size):
+						if chunk:
+							f.write(chunk)
+							downloaded += len(chunk)
+							self.download_progress(
+								total=content_length or None,
+								bytes_transferred=downloaded,
+								path=filepath
+							)
+				downloaded_imgs.append(filepath)
+
+		return post_text, downloaded_imgs
+
+	def extract_image_urls_from_x_post(self, page: Page, timeout: int) -> tuple[list[str], str]:
+		img_urls, post_text = [], ''
+
+		page.wait_for_selector("img[src*='pbs.twimg.com/media']", timeout=(timeout*1000))
+		post_text = self.extract_post_text(page)
+
+		image_elements = page.query_selector_all("img")
+		image_urls = []
+
+		for img in image_elements:
+			src = img.get_attribute("src")
+			if src and "pbs.twimg.com/media" in src:
+				image_urls.append(src)
+
+		img_urls = list(set(image_urls))
+		return img_urls, post_text
+
+	def tweet_contains_video(self, page: Page) -> bool:
+		try:
+			return bool(
+				page.query_selector("article video") or
+				page.query_selector("div[data-testid='videoPlayer']") or
+				page.query_selector("div[aria-label='Embedded video']")
+			)
+		except Exception:
+			pass
+		return False
+
+	def tweet_contains_images(self, page: Page) -> bool:
+		try:
+			image_elements = page.query_selector_all("img")
+			image_urls = [
+				img.get_attribute("src")
+				for img in image_elements
+				if img.get_attribute("src") and "pbs.twimg.com/media" in img.get_attribute("src")
+			]
+			return bool(image_urls)
+		except Exception:
+			pass
+		return False
+
+	def extract_post_text(self, page: Page) -> str:
+		try:
+			text_fragments = []
+
+			# find tweetText containers (in main and quoted)
+			containers = page.query_selector_all('div[data-testid="tweetText"]')
+			for container in containers:
+				fragments = []
+
+				# find <span> and <img alt=...> inside text
+				for node in container.query_selector_all("span, img"):
+					tag = node.evaluate("node => node.tagName.toLowerCase()")
+					if tag == "span":
+						value = node.inner_text().strip()
+						if value:
+							fragments.append(value)
+					elif tag == "img":
+						# emoji as image
+						alt = node.get_attribute("alt")
+						if alt:
+							fragments.append(alt)
+
+				if fragments:
+					text_fragments.append("".join(fragments))
+
+			return "\n\n".join(text_fragments).strip()
+
+		except Exception as e:
+			logging.warning("X: [extract_post_text] error", exc_info=e)
+			return ""
```
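Two notes on the new scraper. A tweet with neither images nor video now yields a single `JobType.TEXT` item carrying `message_text`, the payload the rest of this release threads through jobs, storage, and the bot. And `adaptive_chunk_size` scales the streamed read size to the response's `Content-Length`, so small thumbnails are not read into oversized buffers and large videos are not read 2 KiB at a time. The thresholds, restated as a standalone sketch:

```python
def adaptive_chunk_size(content_length: int) -> int:
	"""Mirror of XScraper.adaptive_chunk_size above: bigger bodies, bigger reads."""
	if content_length < 100_000:        # under ~100 KB: thumbnails, avatars
		return 2048
	elif content_length < 5_000_000:    # under 5 MB: typical photos
		return 8192
	elif content_length < 100_000_000:  # under 100 MB: short videos
		return 32768
	return 65536                        # anything larger

assert adaptive_chunk_size(50_000) == 2048
assert adaptive_chunk_size(50_000_000) == 32768
```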
warp_beacon/scraper/__init__.py:

```diff
@@ -318,7 +318,8 @@ class AsyncDownloader(object):
 					if items:
 						# success
 						for job in fail_handler.get_failed_jobs():
-							self.queue_task(job)
+							self.queue_task(job["job"])
+						# media info processing
 						for item in items:
 							media_info = {"filesize": 0}
 							if item["media_type"] == JobType.VIDEO:
@@ -372,6 +373,8 @@ class AsyncDownloader(object):
 							job_args["media_collection"] = item["items"]
 							if item.get("save_items", None) is not None:
 								job_args["save_items"] = item.get("save_items", False)
+						elif item["media_type"] == JobType.TEXT:
+							job_args["message_text"] = item.get("message_text", "")
 						else:
 							job_args["local_media_path"] = item["local_media_path"]
 							if item.get("local_compressed_media_path", None):
```
warp_beacon/scraper/fail_handler.py:

```diff
@@ -16,12 +16,15 @@ class FailHandler(object):
 		self.client.close()
 
 	def store_failed_job(self, job: DownloadJob) -> int:
-		db_id =
+		db_id = ""
 		try:
 			job_serilized = pickle.dumps(job)
 			db_id = self.db.insert_one(
 				{
-					"job_data": job_serilized
+					"job_data": job_serilized,
+					"uniq_id": job.uniq_id,
+					"message_id": job.message_id,
+					"chat_id": job.chat_id
 				}).inserted_id
 		except Exception as e:
 			logging.error("Failed to store job as failed!")
@@ -33,10 +36,26 @@ class FailHandler(object):
 		try:
 			cursor = self.db.find()
 			for document in cursor:
-				ret.append(
+				ret.append({
+					"_id": document["_id"],
+					"job": pickle.loads(document["job_data"]),
+					"uniq_id": document.get("uniq_id"),
+					"message_id": document.get("message_id"),
+					"chat_id": document.get("chat_id")
+				})
 			if clean:
 				self.db.delete_many({})
 		except Exception as e:
 			logging.error("Failed to get failed jobs!")
 			logging.exception(e)
 		return ret
+
+	def remove_failed_job(self, uniq_id: str) -> bool:
+		try:
+			result = self.db.delete_one({"uniq_id": uniq_id})
+			if result.deleted_count > 0:
+				return True
+		except Exception as e:
+			logging.error("Failed to remove failed job!", exc_info=e)
+
+		return False
```
warp_beacon/storage/__init__.py:

```diff
@@ -67,7 +67,8 @@ class Storage(object):
 					"uniq_id": document["uniq_id"],
 					"tg_file_id": document["tg_file_id"],
 					"media_type": document["media_type"],
-					"canonical_name": document.get("canonical_name")
+					"canonical_name": document.get("canonical_name"),
+					"message_text": document.get("message_text")
 				})
 		except Exception as e:
 			logging.error("Error occurred while trying to read from the database!")
@@ -82,13 +83,10 @@ class Storage(object):
 	def db_lookup_id(self, uniq_id: str) -> list[dict]:
 		return self.db_find(uniq_id)
 
-	def add_media(self, tg_file_ids: list[str], media_url: str, media_type: str, origin: str, canonical_name: str = "") -> list[int]:
+	def add_media(self, tg_file_ids: list[str], media_url: str, media_type: str, origin: str, canonical_name: str = "", message_text: str = "") -> list[int]:
 		uniq_id = self.compute_uniq(media_url)
 		media_ids = []
 		for tg_file_id in tg_file_ids:
-			if not tg_file_id:
-				logging.warning("Passed empty `tg_file_id`! Skipping.")
-				continue
 			if self.db_lookup_id(uniq_id):
 				logging.info("Detected existing uniq_id, skipping storage write operation")
 				continue
@@ -98,7 +96,8 @@ class Storage(object):
 				"media_type": media_type,
 				"tg_file_id": tg_file_id,
 				"origin": origin,
-				"canonical_name": canonical_name
+				"canonical_name": canonical_name,
+				"message_text": message_text
 			}).inserted_id)
 
 		return media_ids
```
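Alongside the new `message_text` column, note the dropped guard: `add_media` previously skipped empty `tg_file_id` values, which would have discarded the `[None]` entry a text-only post produces (see the `handlers.py` hunk below). An illustrative call for caching a text post, with argument values assumed:

```python
# Illustrative: caching a text-only X post. tg_file_ids is [None] because a
# TEXT job uploads no Telegram media; only the post body is stored.
storage.add_media(
	tg_file_ids=[None],
	media_url="https://x.com/user/status/123",  # assumed example URL
	media_type="text",                          # JobType.TEXT.value
	origin="x",                                 # job.job_origin.value
	message_text="post body",
)
```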
warp_beacon/telegram/bot.py:

```diff
@@ -393,6 +393,9 @@ class Bot(object):
 				tg_chunk.append(anim)
 			mediafs.append(tg_chunk)
 			args["media"] = mediafs
+		elif job.media_type == JobType.TEXT:
+			args["text"] = f"<b>Post text:</b><pre>{job.message_text}</pre>\n\n{self.build_signature_caption(job)}"
+			args["parse_mode"] = ParseMode.HTML
 
 		args["chat_id"] = job.chat_id
 
@@ -412,7 +415,7 @@ class Bot(object):
 		if render_donates:
 			keyboard_buttons[0].append(InlineKeyboardButton("❤ Donate", url=os.environ.get("DONATE_LINK", "https://pay.cryptocloud.plus/pos/W5BMtNQt5bJFoW2E")))
 
-		if keyboard_buttons[0]:
+		if keyboard_buttons[0]:
 			args["reply_markup"] = InlineKeyboardMarkup(keyboard_buttons)
 
 		return args
@@ -425,9 +428,14 @@ class Bot(object):
 		while not retry_amount >= max_retries:
 			try:
 				reply_message = None
-				if job.media_type in (JobType.VIDEO, JobType.IMAGE, JobType.AUDIO, JobType.ANIMATION):
-					if job.media_type in (JobType.VIDEO, JobType.AUDIO):
-						await Utils.ensure_me_loaded(self.client)
+				if job.media_type in (JobType.VIDEO, JobType.IMAGE, JobType.AUDIO, JobType.ANIMATION, JobType.TEXT):
+					#if job.media_type in (JobType.VIDEO, JobType.AUDIO):
+					#	await Utils.ensure_me_loaded(self.client)
+					if job.media_type == JobType.TEXT:
+						if job.placeholder_message_id:
+							await self.placeholder.remove(job.chat_id, job.placeholder_message_id)
+							job.placeholder_message_id = None
+
 					if job.placeholder_message_id:
 						try:
 							reply_message = await self.editor.edit(**self.build_tg_args(job))
@@ -440,7 +448,8 @@ class Bot(object):
 							JobType.VIDEO: self.client.send_video,
 							JobType.IMAGE: self.client.send_photo,
 							JobType.AUDIO: self.client.send_audio,
-							JobType.ANIMATION: self.client.send_animation
+							JobType.ANIMATION: self.client.send_animation,
+							JobType.TEXT: self.client.send_message
 						}
 						try:
 							while True:
@@ -460,9 +469,10 @@ class Bot(object):
 									job_args[reality.value.lower()] = job_args.pop(expectation.value.lower())
 									reply_message = await send_funcs[reality](**job_args)
 
-						tg_file_id = Utils.extract_file_id(reply_message)
-						tg_file_ids.append(tg_file_id)
-						job.tg_file_id = tg_file_id
+						if reply_message:
+							tg_file_id = Utils.extract_file_id(reply_message)
+							tg_file_ids.append(tg_file_id)
+							job.tg_file_id = tg_file_id
 					logging.info("Uploaded media file with type '%s' tg_file_id is '%s'", job.media_type.value, job.tg_file_id)
 				elif job.media_type == JobType.COLLECTION:
 					col_job_args = self.build_tg_args(job)
```
warp_beacon/telegram/handlers.py:

```diff
@@ -14,8 +14,6 @@ from warp_beacon.jobs.upload_job import UploadJob
 from warp_beacon.jobs import Origin
 from warp_beacon.jobs.types import JobType
 from warp_beacon.scraper.link_resolver import LinkResolver
-from warp_beacon.scraper.fail_handler import FailHandler
-from warp_beacon.storage.mongo import DBClient
 
 class Handlers(object):
 	storage = None
@@ -25,12 +23,7 @@ class Handlers(object):
 	def __init__(self, bot: "Bot") -> None:
 		self.bot = bot
 		self.storage = bot.storage
-
-		for job in FailHandler(DBClient()).get_failed_jobs(clean=False):
-			self.bot.uploader.add_callback(
-				job.placeholder_message_id,
-				self.upload_wrapper
-			)
+		self.bot.uploader.uploader_wrapper = self.upload_wrapper
 
 	async def help(self, _: Client, message: Message) -> None:
 		"""Send a message when the command /help is issued."""
@@ -111,6 +104,14 @@ class Handlers(object):
 						origin=job.job_origin.value,
 						canonical_name=common_canonical_name
 					)
+				elif job.media_type == JobType.TEXT:
+					self.storage.add_media(
+						tg_file_ids=[None],
+						media_url=job.url,
+						media_type=job.media_type.value,
+						origin=job.job_origin.value,
+						message_text=job.message_text
+					)
 				else:
 					self.storage.add_media(
 						tg_file_ids=[','.join(tg_file_ids)],
@@ -139,11 +140,6 @@ class Handlers(object):
 					text="Failed to create message placeholder. Please check your bot Internet connection."
 				)
 
-			self.bot.uploader.add_callback(
-				job.placeholder_message_id,
-				self.upload_wrapper
-			)
-
 			self.bot.downloader.queue_task(job)
 		except Exception as e:
 			logging.error("Failed to schedule download task!")
@@ -215,6 +211,7 @@ class Handlers(object):
 			elif ent_len:
 				media_type = JobType[entities[0]["media_type"].upper()]
 				canonical_name = entities[0]["canonical_name"]
+				message_text = entities[0]["message_text"]
 				await self.bot.upload_job(
 					UploadJob(
 						url=url,
@@ -228,22 +225,23 @@ class Handlers(object):
 						chat_type=message.chat.type,
 						source_username=Utils.extract_message_author(message),
 						canonical_name=canonical_name,
-						message_leftover=msg_leftover
+						message_leftover=msg_leftover,
+						message_text=message_text
 					)
 				)
 			else:
 				if await self.queue_job(DownloadJob.build(
-					url=url,
-					message_id=effective_message_id,
-					chat_id=chat.id,
-					user_id=message.from_user.id,
-					in_process=self.bot.uploader.is_inprocess(uniq_id),
-					uniq_id=uniq_id,
-					job_origin=origin,
-					source_username=Utils.extract_message_author(message),
-					chat_type=chat.type,
-					message_leftover=msg_leftover
-				)):
+					url=url,
+					message_id=effective_message_id,
+					chat_id=chat.id,
+					user_id=message.from_user.id,
+					in_process=self.bot.uploader.is_inprocess(uniq_id),
+					uniq_id=uniq_id,
+					job_origin=origin,
+					source_username=Utils.extract_message_author(message),
+					chat_type=chat.type,
+					message_leftover=msg_leftover
+				)):
 					self.bot.uploader.set_inprocess(uniq_id)
 
 		if chat.type not in (ChatType.GROUP, ChatType.SUPERGROUP) and not urls:
```