warp-beacon 2.8.13.tar.gz → 2.8.14.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. {warp_beacon-2.8.13/warp_beacon.egg-info → warp_beacon-2.8.14}/PKG-INFO +1 -1
  2. warp_beacon-2.8.14/warp_beacon/__version__.py +2 -0
  3. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/jobs/abstract.py +7 -0
  4. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/jobs/types.py +2 -1
  5. warp_beacon-2.8.14/warp_beacon/scraper/X/X.py +304 -0
  6. warp_beacon-2.8.14/warp_beacon/scraper/X/types.py +8 -0
  7. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/__init__.py +3 -0
  8. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/storage/__init__.py +5 -6
  9. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/telegram/bot.py +18 -8
  10. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/telegram/handlers.py +22 -12
  11. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/uploader/__init__.py +5 -2
  12. {warp_beacon-2.8.13 → warp_beacon-2.8.14/warp_beacon.egg-info}/PKG-INFO +1 -1
  13. warp_beacon-2.8.13/warp_beacon/__version__.py +0 -2
  14. warp_beacon-2.8.13/warp_beacon/scraper/X/X.py +0 -330
  15. warp_beacon-2.8.13/warp_beacon/scraper/X/types.py +0 -7
  16. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/LICENSE +0 -0
  17. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/MANIFEST.in +0 -0
  18. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/README.md +0 -0
  19. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/assets/cc-group-black.png +0 -0
  20. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/assets/placeholder.gif +0 -0
  21. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/etc/.gitignore +0 -0
  22. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/etc/accounts.json +0 -0
  23. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/etc/proxies.json +0 -0
  24. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/etc/warp_beacon.conf +0 -0
  25. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/etc/warp_beacon.service +0 -0
  26. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/pyproject.toml +0 -0
  27. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/setup.cfg +0 -0
  28. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/setup.py +0 -0
  29. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/__init__.py +0 -0
  30. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/compress/__init__.py +0 -0
  31. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/compress/video.py +0 -0
  32. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/jobs/__init__.py +0 -0
  33. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/jobs/download_job.py +0 -0
  34. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/jobs/upload_job.py +0 -0
  35. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/mediainfo/__init__.py +0 -0
  36. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/mediainfo/abstract.py +0 -0
  37. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/mediainfo/audio.py +0 -0
  38. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/mediainfo/silencer.py +0 -0
  39. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/mediainfo/video.py +0 -0
  40. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scheduler/__init__.py +0 -0
  41. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scheduler/instagram_human.py +0 -0
  42. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scheduler/scheduler.py +0 -0
  43. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/X/__init__.py +0 -0
  44. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/X/abstract.py +0 -0
  45. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/abstract.py +0 -0
  46. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/account_selector.py +0 -0
  47. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/exceptions.py +0 -0
  48. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/fail_handler.py +0 -0
  49. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/instagram/__init__.py +0 -0
  50. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/instagram/captcha.py +0 -0
  51. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/instagram/instagram.py +0 -0
  52. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/instagram/wb_instagrapi.py +0 -0
  53. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/link_resolver.py +0 -0
  54. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/utils.py +0 -0
  55. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/youtube/__init__.py +0 -0
  56. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/youtube/abstract.py +0 -0
  57. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/youtube/music.py +0 -0
  58. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/youtube/shorts.py +0 -0
  59. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/youtube/youtube.py +0 -0
  60. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/storage/mongo.py +0 -0
  61. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/telegram/__init__.py +0 -0
  62. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/telegram/caption_shortener.py +0 -0
  63. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/telegram/download_status.py +0 -0
  64. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/telegram/edit_message.py +0 -0
  65. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/telegram/placeholder_message.py +0 -0
  66. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/telegram/progress_bar.py +0 -0
  67. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/telegram/progress_file_reader.py +0 -0
  68. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/telegram/types.py +0 -0
  69. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/telegram/utils.py +0 -0
  70. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/warp_beacon.py +0 -0
  71. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/yt_auth.py +0 -0
  72. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon.egg-info/SOURCES.txt +0 -0
  73. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon.egg-info/dependency_links.txt +0 -0
  74. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon.egg-info/entry_points.txt +0 -0
  75. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon.egg-info/requires.txt +0 -0
  76. {warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon.egg-info/top_level.txt +0 -0
{warp_beacon-2.8.13/warp_beacon.egg-info → warp_beacon-2.8.14}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warp_beacon
-Version: 2.8.13
+Version: 2.8.14
 Summary: Telegram bot for expanding external media links
 Home-page: https://github.com/sb0y/warp_beacon
 Author: Andrey Bagrintsev

warp_beacon-2.8.14/warp_beacon/__version__.py (new file)
@@ -0,0 +1,2 @@
+__version__ = "2.8.14"
+

{warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/jobs/abstract.py
@@ -109,6 +109,9 @@ class AbstractJob(ABC):
 		if self.media_type == JobType.COLLECTION:
 			if not self.media_collection:
 				return True
+		elif self.media_type == JobType.TEXT:
+			if not self.message_text:
+				return True
 		elif not self.local_media_path:
 			return True
 		return False
@@ -129,9 +132,13 @@ class AbstractJob(ABC):
 				for j in i:
 					if os.path.exists(j.local_media_path):
 						os.unlink(j.local_media_path)
+		elif self.media_type == JobType.TEXT:
+			pass
 		else:
 			if os.path.exists(self.local_media_path):
 				os.unlink(self.local_media_path)
 			if self.local_compressed_media_path:
 				if os.path.exists(self.local_compressed_media_path):
 					os.unlink(self.local_compressed_media_path)
+
+		return True

{warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/jobs/types.py
@@ -6,4 +6,5 @@ class JobType(str, Enum):
 	IMAGE = "image",
 	AUDIO = "audio",
 	COLLECTION = "collection"
-	ANIMATION = "animation"
+	ANIMATION = "animation"
+	TEXT = "text"

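An aside on the hunk above: several JobType members keep a trailing comma. A quick standalone check (not part of the package) shows why this is harmless when the enum mixes in str:

```python
from enum import Enum

# With a str mixin, `"image",` becomes the one-tuple ("image",), which
# Enum passes to str() as constructor args, so the final member value is
# still the plain string "image".
class JobType(str, Enum):
	IMAGE = "image",
	TEXT = "text"

assert JobType.IMAGE.value == "image"
assert JobType.TEXT.value == "text"
print(repr(JobType.IMAGE.value))  # 'image'
```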
warp_beacon-2.8.14/warp_beacon/scraper/X/X.py (new file)
@@ -0,0 +1,304 @@
+import os
+import time
+import logging
+from mimetypes import guess_extension, guess_type
+from urllib.parse import urlparse
+import requests
+import yt_dlp
+from playwright.sync_api import sync_playwright, Page
+
+from warp_beacon.telegram.utils import Utils
+from warp_beacon.scraper.utils import ScraperUtils
+from warp_beacon.jobs.types import JobType
+from warp_beacon.scraper.X.abstract import XAbstract
+
+from warp_beacon.scraper.exceptions import Unavailable
+
+class XScraper(XAbstract):
+	DOWNLOAD_DIR = "/tmp"
+
+	def extract_canonical_name(self, media: dict) -> str:
+		ret = ""
+		try:
+			if media.get("title", None):
+				ret = media["title"]
+			if media.get("description", ""):
+				ret += "\n" + media["description"]
+		except Exception as e:
+			logging.warning("Failed to extract canonical media name!")
+			logging.exception(e)
+
+		return ret
+
+	def generate_result(self, local_files: list, job_type: JobType, canonical_name: str = "", performer: str = "") -> list:
+		res = []
+		if local_files:
+			if job_type == JobType.COLLECTION:
+				chunks = []
+				for media_chunk in Utils.chunker(local_files, 10):
+					chunk = []
+					for media in media_chunk:
+						mime_type, _ = guess_type(media)
+						chunk.append({
+							"local_media_path": self.rename_local_file(media),
+							"canonical_name": canonical_name,
+							"media_type": JobType.VIDEO if "video" in mime_type else JobType.IMAGE,
+							"media_info": {}
+						})
+					chunks.append(chunk)
+				res.append({
+					"media_type": JobType.COLLECTION,
+					"canonical_name": canonical_name,
+					"items": chunks
+				})
+			else:
+				for local_file in local_files:
+					res.append({
+						"local_media_path": self.rename_local_file(local_file),
+						"performer": performer,
+						"canonical_name": canonical_name,
+						"media_type": job_type
+					})
+		logging.debug(res)
+		return res
+
+	def _download(self, url: str, timeout: int = 60) -> list:
+		res = []
+		post_text = ""
+		pw_proxy = None
+		if self.proxy:
+			dsn = self.proxy.get("dsn", "")
+			if dsn:
+				parsed = urlparse(dsn)
+				pw_proxy = {
+					"server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
+					"username": parsed.username,
+					"password": parsed.password
+				}
+				logging.info("[X] build proxy: %s", pw_proxy)
+
+		contains_images, contains_videos = False, False
+		images, videos = [], []
+		with sync_playwright() as p:
+			with p.chromium.launch(headless=True) as browser:
+				with browser.new_context(proxy=pw_proxy, ignore_https_errors=True) as context:
+					page = context.new_page()
+					page.goto(url, wait_until="networkidle", timeout=(timeout*1000))
+					page.wait_for_selector("article[role='article']", timeout=(timeout*1000))
+
+					contains_videos = self.tweet_contains_video(page)
+					contains_images = self.tweet_contains_images(page)
+
+					if contains_images:
+						post_text, images = self.download_images(page, timeout)
+
+					if not contains_images and not contains_videos:
+						post_text = self.extract_post_text(page)
+
+		if contains_videos:
+			media_info, videos = self.download_videos(url, timeout)
+			if media_info:
+				post_text = self.extract_canonical_name(media_info)
+
+		if not images and not videos:
+			if not post_text:
+				raise Unavailable("Content unvailable")
+			logging.info("[X]: Sending text message")
+			res.append({
+				"message_text": post_text,
+				"media_type": JobType.TEXT
+			})
+			return res
+
+		if len(images) > 1 or len(videos) > 1:
+			logging.info("[X]: uploading collection")
+			content = images + videos
+			res.extend(self.generate_result(content, JobType.COLLECTION, canonical_name=post_text))
+		else:
+			logging.info("[X]: uploading media")
+			for job_type, content in {JobType.IMAGE: images, JobType.VIDEO: videos}.items():
+				if content:
+					res.extend(self.generate_result(content, job_type, canonical_name=post_text))
+
+		return res
+
+	def download_videos(self, url: str, timeout: int = 60) -> tuple[dict, list[str]]:
+		local_files = []
+		media_info = {}
+		time_name = str(time.time()).replace('.', '_')
+		ydl_opts = {
+			'socket_timeout': timeout,
+			'outtmpl': f'{self.DOWNLOAD_DIR}/x_download_{time_name}_%(id)s.%(ext)s',
+			'quiet': False,
+			'force_generic_extractor': False,
+			#'noplaylist': True,
+			'merge_output_format': 'mp4',
+			'dump_single_json': False,
+			'nocheckcertificate': True,
+			'progress_hooks': [self.dlp_on_progress],
+		}
+		if self.proxy:
+			proxy_dsn = self.proxy.get("dsn", "")
+			logging.info("[X] Using proxy DSN '%s'", proxy_dsn)
+			if proxy_dsn:
+				ydl_opts["proxy"] = proxy_dsn
+
+		with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+			info = ydl.extract_info(url, download=False)
+			media_info = info
+			entries = info.get("entries", [info])
+
+			for entry in entries:
+				ret = ydl.download([entry['webpage_url']])
+				if ret == 0:
+					file_path = ydl.prepare_filename(entry)
+					if isinstance(file_path, str):
+						local_files.append(file_path)
+					else:
+						local_files.extend(file_path)
+
+		return media_info, local_files
+
+	def adaptive_chunk_size(self, content_length: int) -> int:
+		if content_length < 100_000:
+			return 2048
+		elif content_length < 5_000_000:
+			return 8192
+		elif content_length < 100_000_000:
+			return 32768
+		else:
+			return 65536
+
+	def get_extension_from_headers(self, response: requests.Response) -> str:
+		content_type = response.headers.get("Content-Type", "")
+		return guess_extension(content_type) or ".jpg"
+
+	def download_images(self, page: Page, timeout: int) -> tuple[str, list[str]]:
+		downloaded_imgs = []
+		headers = {
+			"User-Agent": ScraperUtils.get_ua(),
+			"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+			"Accept-Language": "en-us,en;q=0.5",
+			"Sec-Fetch-Mode": "navigate"
+		}
+		proxies = None
+		if self.proxy:
+			proxies = {"https": self.proxy.get("dsn", ""), "http": self.proxy.get("dsn", "")}
+
+		image_urls, post_text = self.extract_image_urls_from_x_post(page, timeout)
+
+		if not image_urls:
+			logging.error("[X] Content images are not found!")
+			return downloaded_imgs
+
+		time_name = str(time.time()).replace('.', '_')
+		for i, img_url in enumerate(set(image_urls)):
+			downloaded = 0
+			if "?name=small" in img_url:
+				img_url = img_url.replace("?name=small", "?name=orig")
+			with requests.get(
+				img_url,
+				headers=headers,
+				timeout=timeout,
+				stream=True,
+				verify=False,
+				proxies=proxies) as request:
+
+				request.raise_for_status()
+
+				parsed = urlparse(img_url)
+				ext = os.path.splitext(parsed.path)[1]
+				if not ext:
+					ext = self.get_extension_from_headers(request)
+				filename = f"x_download_{time_name}_{i}{ext}"
+				filepath = os.path.join(self.DOWNLOAD_DIR, filename)
+
+				content_length = int(request.headers.get("Content-Length", 0))
+
+				with open(filepath, "wb") as f:
+					#request.raw.decode_content = True
+					chunk_size = self.adaptive_chunk_size(content_length)
+					for chunk in request.iter_content(chunk_size=chunk_size):
+						if chunk:
+							f.write(chunk)
+							downloaded += len(chunk)
+							self.download_progress(
+								total=content_length or None,
+								bytes_transferred=downloaded,
+								path=filepath
+							)
+				downloaded_imgs.append(filepath)
+
+		return post_text, downloaded_imgs
+
+	def extract_image_urls_from_x_post(self, page: Page, timeout: int) -> tuple[list[str], str]:
+		img_urls, post_text = [], ''
+
+		page.wait_for_selector("img[src*='pbs.twimg.com/media']", timeout=(timeout*1000))
+		post_text = self.extract_post_text(page)
+
+		image_elements = page.query_selector_all("img")
+		image_urls = []
+
+		for img in image_elements:
+			src = img.get_attribute("src")
+			if src and "pbs.twimg.com/media" in src:
+				image_urls.append(src)
+
+		img_urls = list(set(image_urls))
+		return img_urls, post_text
+
+	def tweet_contains_video(self, page: Page) -> bool:
+		try:
+			return bool(
+				page.query_selector("article video") or
+				page.query_selector("div[data-testid='videoPlayer']") or
+				page.query_selector("div[aria-label='Embedded video']")
+			)
+		except Exception:
+			pass
+		return False
+
+	def tweet_contains_images(self, page: Page) -> bool:
+		try:
+			image_elements = page.query_selector_all("img")
+			image_urls = [
+				img.get_attribute("src")
+				for img in image_elements
+				if img.get_attribute("src") and "pbs.twimg.com/media" in img.get_attribute("src")
+			]
+			return bool(image_urls)
+		except Exception:
+			pass
+		return False
+
+	def extract_post_text(self, page: Page) -> str:
+		try:
+			text_fragments = []
+
+			# find tweetText containers (in main and quoted)
+			containers = page.query_selector_all('div[data-testid="tweetText"]')
+			for container in containers:
+				fragments = []
+
+				# find <span> and <img alt=...> inside text
+				for node in container.query_selector_all("span, img"):
+					tag = node.evaluate("node => node.tagName.toLowerCase()")
+					if tag == "span":
+						value = node.inner_text().strip()
+						if value:
+							fragments.append(value)
+					elif tag == "img":
+						# emoji as image
+						alt = node.get_attribute("alt")
+						if alt:
+							fragments.append(alt)
+
+				if fragments:
+					text_fragments.append("".join(fragments))
+
+			return "\n\n".join(text_fragments).strip()
+
+		except Exception as e:
+			logging.warning("X: [extract_post_text] error", exc_info=e)
+		return ""

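For context on the proxy handling in `_download` above, here is a minimal standalone sketch of the DSN-to-Playwright mapping; the DSN value below is made up:

```python
from urllib.parse import urlparse

# Rebuilds the proxy dict shape Playwright's browser context expects from
# a single DSN string of the form scheme://user:pass@host:port.
def build_pw_proxy(dsn: str) -> dict:
	parsed = urlparse(dsn)
	return {
		"server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
		"username": parsed.username,
		"password": parsed.password,
	}

print(build_pw_proxy("http://alice:secret@127.0.0.1:3128"))
# {'server': 'http://127.0.0.1:3128', 'username': 'alice', 'password': 'secret'}
```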
warp_beacon-2.8.14/warp_beacon/scraper/X/types.py (new file)
@@ -0,0 +1,8 @@
+from enum import Flag, auto
+
+class XMediaType(Flag):
+	UNKNOWN = 0
+	VIDEO = auto()
+	IMAGE = auto()
+	MIXED = auto()
+	PLAYLIST = auto()

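The switch from a plain Enum to Flag (the removed enum appears further down in this diff) allows bitwise composition. A small standalone demonstration, with the caveat that MIXED remains a distinct bit rather than an alias for VIDEO | IMAGE:

```python
from enum import Flag, auto

class XMediaType(Flag):
	UNKNOWN = 0
	VIDEO = auto()     # 1
	IMAGE = auto()     # 2
	MIXED = auto()     # 4 -- its own bit, not VIDEO | IMAGE
	PLAYLIST = auto()  # 8

combo = XMediaType.VIDEO | XMediaType.IMAGE
print(combo == XMediaType.MIXED)  # False (3 != 4)
print(XMediaType.VIDEO in combo)  # True: membership test works on Flag
print(bool(XMediaType.UNKNOWN))   # False: the zero-valued member is falsy
```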
{warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/scraper/__init__.py
@@ -319,6 +319,7 @@ class AsyncDownloader(object):
 					# success
 					for job in fail_handler.get_failed_jobs():
 						self.queue_task(job)
+				# media info processing
 				for item in items:
 					media_info = {"filesize": 0}
 					if item["media_type"] == JobType.VIDEO:
@@ -372,6 +373,8 @@ class AsyncDownloader(object):
 						job_args["media_collection"] = item["items"]
 						if item.get("save_items", None) is not None:
 							job_args["save_items"] = item.get("save_items", False)
+					elif item["media_type"] == JobType.TEXT:
+						job_args["message_text"] = item.get("message_text", "")
 					else:
 						job_args["local_media_path"] = item["local_media_path"]
 						if item.get("local_compressed_media_path", None):

{warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/storage/__init__.py
@@ -67,7 +67,8 @@ class Storage(object):
 				"uniq_id": document["uniq_id"],
 				"tg_file_id": document["tg_file_id"],
 				"media_type": document["media_type"],
-				"canonical_name": document.get("canonical_name")
+				"canonical_name": document.get("canonical_name"),
+				"message_text": document.get("message_text")
 			})
 		except Exception as e:
 			logging.error("Error occurred while trying to read from the database!")
@@ -82,13 +83,10 @@ class Storage(object):
 	def db_lookup_id(self, uniq_id: str) -> list[dict]:
 		return self.db_find(uniq_id)
 
-	def add_media(self, tg_file_ids: list[str], media_url: str, media_type: str, origin: str, canonical_name: str = "") -> list[int]:
+	def add_media(self, tg_file_ids: list[str], media_url: str, media_type: str, origin: str, canonical_name: str = "", message_text: str = "") -> list[int]:
 		uniq_id = self.compute_uniq(media_url)
 		media_ids = []
 		for tg_file_id in tg_file_ids:
-			if not tg_file_id:
-				logging.warning("Passed empty `tg_file_id`! Skipping.")
-				continue
 			if self.db_lookup_id(uniq_id):
 				logging.info("Detected existing uniq_id, skipping storage write operation")
 				continue
@@ -98,7 +96,8 @@ class Storage(object):
 				"media_type": media_type,
 				"tg_file_id": tg_file_id,
 				"origin": origin,
-				"canonical_name": canonical_name
+				"canonical_name": canonical_name,
+				"message_text": message_text
 			}).inserted_id)
 
 		return media_ids

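To illustrate the new message_text round-trip, a hedged sketch against a bare pymongo collection; the connection string and database/collection names are placeholders, not Storage's actual configuration:

```python
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # assumed local instance
coll = client["warp_beacon"]["media"]              # illustrative names

coll.insert_one({
	"uniq_id": "abc123",
	"media_type": "text",
	"tg_file_id": None,   # TEXT posts carry no Telegram file id
	"origin": "x",
	"canonical_name": "",
	"message_text": "post body",
})

doc = coll.find_one({"uniq_id": "abc123"})
# .get() mirrors the change in db_find: documents written before 2.8.14
# lack the field, so .get() yields None instead of raising KeyError.
print(doc.get("message_text"))  # 'post body'
```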
{warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/telegram/bot.py
@@ -393,6 +393,9 @@ class Bot(object):
 					tg_chunk.append(anim)
 				mediafs.append(tg_chunk)
 			args["media"] = mediafs
+		elif job.media_type == JobType.TEXT:
+			args["text"] = f"<b>Post text:</b><pre>{job.message_text}</pre>\n\n{self.build_signature_caption(job)}"
+			args["parse_mode"] = ParseMode.HTML
 
 		args["chat_id"] = job.chat_id
 
@@ -412,7 +415,7 @@ class Bot(object):
 		if render_donates:
 			keyboard_buttons[0].append(InlineKeyboardButton("❤ Donate", url=os.environ.get("DONATE_LINK", "https://pay.cryptocloud.plus/pos/W5BMtNQt5bJFoW2E")))
 
-		if keyboard_buttons[0]: #job.short_text or render_donates:
+		if keyboard_buttons[0]:
 			args["reply_markup"] = InlineKeyboardMarkup(keyboard_buttons)
 
 		return args
@@ -425,9 +428,14 @@ class Bot(object):
 		while not retry_amount >= max_retries:
 			try:
 				reply_message = None
-				if job.media_type in (JobType.VIDEO, JobType.IMAGE, JobType.AUDIO, JobType.ANIMATION):
-					if job.media_type in (JobType.VIDEO, JobType.AUDIO):
-						await Utils.ensure_me_loaded(self.client)
+				if job.media_type in (JobType.VIDEO, JobType.IMAGE, JobType.AUDIO, JobType.ANIMATION, JobType.TEXT):
+					#if job.media_type in (JobType.VIDEO, JobType.AUDIO):
+					#	await Utils.ensure_me_loaded(self.client)
+					if job.media_type == JobType.TEXT:
+						if job.placeholder_message_id:
+							await self.placeholder.remove(job.chat_id, job.placeholder_message_id)
+							job.placeholder_message_id = None
+
 					if job.placeholder_message_id:
 						try:
 							reply_message = await self.editor.edit(**self.build_tg_args(job))
@@ -440,7 +448,8 @@ class Bot(object):
 						JobType.VIDEO: self.client.send_video,
 						JobType.IMAGE: self.client.send_photo,
 						JobType.AUDIO: self.client.send_audio,
-						JobType.ANIMATION: self.client.send_animation
+						JobType.ANIMATION: self.client.send_animation,
+						JobType.TEXT: self.client.send_message
 					}
 					try:
 						while True:
@@ -460,9 +469,10 @@ class Bot(object):
 								job_args[reality.value.lower()] = job_args.pop(expectation.value.lower())
 							reply_message = await send_funcs[reality](**job_args)
 
-					tg_file_id = Utils.extract_file_id(reply_message)
-					tg_file_ids.append(tg_file_id)
-					job.tg_file_id = tg_file_id
+					if reply_message:
+						tg_file_id = Utils.extract_file_id(reply_message)
+						tg_file_ids.append(tg_file_id)
+						job.tg_file_id = tg_file_id
 					logging.info("Uploaded media file with type '%s' tg_file_id is '%s'", job.media_type.value, job.tg_file_id)
 				elif job.media_type == JobType.COLLECTION:
 					col_job_args = self.build_tg_args(job)

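A toy illustration of the dispatch-table pattern above and why the new `if reply_message:` guard matters; the senders below are stand-ins, not Pyrogram's API:

```python
from enum import Enum

class JobType(str, Enum):
	VIDEO = "video"
	TEXT = "text"

def send_video(**kwargs):
	return {"file_id": "BAAC-fake-id"}  # pretend media upload

def send_message(**kwargs):
	return {"file_id": None}            # plain text: nothing to extract

send_funcs = {JobType.VIDEO: send_video, JobType.TEXT: send_message}

# A TEXT send returns a message with no attached media, so file-id
# bookkeeping has to be skipped rather than attempted.
reply = send_funcs[JobType.TEXT](chat_id=1, text="post body")
if reply and reply.get("file_id"):
	print("store", reply["file_id"])
else:
	print("text message, no file id to store")  # this branch runs
```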
{warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/telegram/handlers.py
@@ -111,6 +111,14 @@ class Handlers(object):
 				origin=job.job_origin.value,
 				canonical_name=common_canonical_name
 			)
+		elif job.media_type == JobType.TEXT:
+			self.storage.add_media(
+				tg_file_ids=[None],
+				media_url=job.url,
+				media_type=job.media_type.value,
+				origin=job.job_origin.value,
+				message_text=job.message_text
+			)
 		else:
 			self.storage.add_media(
 				tg_file_ids=[','.join(tg_file_ids)],
@@ -215,6 +223,7 @@ class Handlers(object):
 			elif ent_len:
 				media_type = JobType[entities[0]["media_type"].upper()]
 				canonical_name = entities[0]["canonical_name"]
+				message_text = entities[0]["message_text"]
 				await self.bot.upload_job(
 					UploadJob(
 						url=url,
@@ -228,22 +237,23 @@ class Handlers(object):
 						chat_type=message.chat.type,
 						source_username=Utils.extract_message_author(message),
 						canonical_name=canonical_name,
-						message_leftover=msg_leftover
+						message_leftover=msg_leftover,
+						message_text=message_text
 					)
 				)
 			else:
 				if await self.queue_job(DownloadJob.build(
-				url=url,
-				message_id=effective_message_id,
-				chat_id=chat.id,
-				user_id=message.from_user.id,
-				in_process=self.bot.uploader.is_inprocess(uniq_id),
-				uniq_id=uniq_id,
-				job_origin=origin,
-				source_username=Utils.extract_message_author(message),
-				chat_type=chat.type,
-				message_leftover=msg_leftover
-				)):
+					url=url,
+					message_id=effective_message_id,
+					chat_id=chat.id,
+					user_id=message.from_user.id,
+					in_process=self.bot.uploader.is_inprocess(uniq_id),
+					uniq_id=uniq_id,
+					job_origin=origin,
+					source_username=Utils.extract_message_author(message),
+					chat_type=chat.type,
+					message_leftover=msg_leftover
+				)):
 					self.bot.uploader.set_inprocess(uniq_id)
 
 			if chat.type not in (ChatType.GROUP, ChatType.SUPERGROUP) and not urls:

{warp_beacon-2.8.13 → warp_beacon-2.8.14}/warp_beacon/uploader/__init__.py
@@ -89,7 +89,7 @@ class AsyncUploader(object):
 		while self.allow_loop:
 			try:
 				try:
-					job = self.job_queue.get()
+					job: UploadJob = self.job_queue.get()
 					if job is self.__JOE_BIDEN_WAKEUP:
 						break
 					if job.is_message_to_admin and job.message_text and self.admin_message_callback:
@@ -118,7 +118,10 @@ class AsyncUploader(object):
 					message_id = job.placeholder_message_id
 
 					if not in_process and not job.job_failed and not job.job_warning and not job.replay:
-						logging.info("Accepted upload job, file(s): '%s'", path)
+						if job.media_type == JobType.TEXT:
+							logging.info("Uploading job text: '%s'", job.message_text)
+						else:
+							logging.info("Accepted upload job, file(s): '%s'", path)
 
 					try:
 						if message_id in self.callbacks:

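The `job: UploadJob` annotation in the hunk above is purely static. A self-contained sketch of what it buys; the UploadJob here is a stand-in for warp_beacon.jobs.upload_job.UploadJob:

```python
import queue

class UploadJob:  # stand-in with a couple of representative attributes
	message_text: str = ""
	media_type: str = "text"

q: "queue.Queue[UploadJob]" = queue.Queue()
q.put(UploadJob())

# Queue.get() is untyped at runtime; the annotation only tells type
# checkers and IDEs what comes out, so access like job.message_text can
# be verified instead of guessed.
job: UploadJob = q.get()
print(type(job).__name__)  # UploadJob
```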
{warp_beacon-2.8.13 → warp_beacon-2.8.14/warp_beacon.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warp_beacon
-Version: 2.8.13
+Version: 2.8.14
 Summary: Telegram bot for expanding external media links
 Home-page: https://github.com/sb0y/warp_beacon
 Author: Andrey Bagrintsev

warp_beacon-2.8.13/warp_beacon/__version__.py (deleted)
@@ -1,2 +0,0 @@
-__version__ = "2.8.13"
-

warp_beacon-2.8.13/warp_beacon/scraper/X/X.py (deleted)
@@ -1,330 +0,0 @@
-import os
-import time
-import logging
-from mimetypes import guess_extension
-from urllib.parse import urlparse
-import requests
-import yt_dlp
-from playwright.sync_api import sync_playwright, Page
-
-from warp_beacon.telegram.utils import Utils
-from warp_beacon.scraper.utils import ScraperUtils
-from warp_beacon.scraper.X.types import XMediaType
-from warp_beacon.jobs.types import JobType
-from warp_beacon.scraper.X.abstract import XAbstract
-
-class XScraper(XAbstract):
-	DOWNLOAD_DIR = "/tmp"
-
-	def extract_canonical_name(self, media: dict) -> str:
-		ret = ""
-		try:
-			if media.get("title", None):
-				ret = media["title"]
-			if media.get("description", ""):
-				ret += "\n" + media["description"]
-		except Exception as e:
-			logging.warning("Failed to extract canonical media name!")
-			logging.exception(e)
-
-		return ret
-
-	def get_media_type(self, media_info: dict) -> XMediaType:
-		media_type = XMediaType.UNKNOWN
-		#logging.info("[X] post info: '%s'", media_info)
-
-		if 'ext' in media_info:
-			logging.info("[X] Format: '%s'", media_info['ext'])
-		if 'formats' in media_info:
-			logging.info("[X] Contains video.")
-			media_type = XMediaType.VIDEO
-		elif 'thumbnails' in media_info:
-			logging.info("[X] contains images.")
-			media_type = XMediaType.IMAGE
-		else:
-			logging.info("[X] No media found.")
-
-		return media_type
-
-	def _download(self, url: str, timeout: int = 60) -> list:
-		res = []
-		job_type = JobType.UNKNOWN
-		time_name = str(time.time()).replace('.', '_')
-		ydl_opts = {
-			'socket_timeout': timeout,
-			'outtmpl': f'{self.DOWNLOAD_DIR}/x_download_{time_name}.%(ext)s',
-			'quiet': False,
-			'force_generic_extractor': False,
-			'noplaylist': True,
-			'merge_output_format': 'mp4',
-			'dump_single_json': True,
-			'nocheckcertificate': True,
-			'progress_hooks': [self.dlp_on_progress],
-		}
-
-		if self.proxy:
-			proxy_dsn = self.proxy.get("dsn", "")
-			logging.info("[X] Using proxy DSN '%s'", proxy_dsn)
-			if proxy_dsn:
-				ydl_opts["proxy"] = proxy_dsn
-
-		local_file, media_info, media_type, post_text = "", {}, XMediaType.UNKNOWN, ""
-		#tweet_contains_video, tweet_contains_images = False, False
-
-		#with sync_playwright() as p:
-		#	with p.chromium.launch(headless=True) as browser:
-		#		with browser.new_context(proxy=proxy, ignore_https_errors=True) as context:
-		#			page = context.new_page()
-		#			page.goto(url, wait_until="networkidle", timeout=(timeout*1000))
-		#			tweet_contains_video = self.tweet_contains_video(page)
-		#			tweet_contains_images = self.tweet_contains_images(page)
-
-		with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-			try:
-				media_info = ydl.extract_info(url, download=False)
-				media_type = self.get_media_type(media_info)
-				if media_type == XMediaType.VIDEO:
-					local_file = self.download_video(url, ydl, media_info)
-					post_text = self.extract_canonical_name(media_info)
-					job_type = JobType.VIDEO
-			except yt_dlp.utils.DownloadError as e:
-				msg = str(e).lower()
-				if "no video could be found in this tweet" in msg:
-					logging.warning("[X] yt_dlp failed to extract info. Falling back to image scraping.")
-					media_type = XMediaType.IMAGE
-				else:
-					raise
-
-		images = []
-		if media_type == XMediaType.IMAGE:
-			job_type = JobType.IMAGE
-			images, post_text = self.download_images(url, timeout)
-			if images:
-				if len(images) > 1:
-					job_type = JobType.COLLECTION
-				else:
-					local_file = images[0]
-
-		if job_type == JobType.COLLECTION:
-			chunks = []
-			for media_chunk in Utils.chunker(images, 10):
-				chunk = []
-				for media in media_chunk:
-					chunk.append({
-						"local_media_path": self.rename_local_file(media),
-						"canonical_name": post_text,
-						"media_type": JobType.IMAGE
-					})
-				chunks.append(chunk)
-			res.append({
-				"media_type": JobType.COLLECTION,
-				"items": chunks
-			})
-		else:
-			if local_file:
-				res.append({
-					"local_media_path": self.rename_local_file(local_file),
-					"performer": media_info.get("uploader", "Unknown"),
-					"canonical_name": post_text,
-					"media_type": job_type
-				})
-
-		return res
-
-	def adaptive_chunk_size(self, content_length: int) -> int:
-		if content_length < 100_000:
-			return 2048
-		elif content_length < 5_000_000:
-			return 8192
-		elif content_length < 100_000_000:
-			return 32768
-		else:
-			return 65536
-
-	def download_video(self, url: str, ydl: yt_dlp.YoutubeDL, media_info: dict) -> str:
-		local_file = ""
-		ydl.download([url])
-		local_file = ydl.prepare_filename(media_info)
-		logging.debug("Temp filename: '%s'", local_file)
-		if local_file:
-			local_file = self.rename_local_file(local_file)
-		return local_file
-
-	def get_extension_from_headers(self, response: requests.Response) -> str:
-		content_type = response.headers.get("Content-Type", "")
-		return guess_extension(content_type) or ".jpg"
-
-	def download_images(self, url: str, timeout: int = 60) -> tuple[list[str], str]:
-		downloaded_imgs = []
-		headers = {
-			"User-Agent": ScraperUtils.get_ua(),
-			"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-			"Accept-Language": "en-us,en;q=0.5",
-			"Sec-Fetch-Mode": "navigate"
-		}
-		proxies = None
-		if self.proxy:
-			proxies = {"https": self.proxy.get("dsn", ""), "http": self.proxy.get("dsn", "")}
-
-		image_urls, post_text = self.extract_image_urls_from_x_post(url, timeout=timeout)
-
-		if not image_urls:
-			logging.error("[X] Content images are not found!")
-			return downloaded_imgs
-
-		time_name = str(time.time()).replace('.', '_')
-		for i, img_url in enumerate(set(image_urls)):
-			downloaded = 0
-			if "?name=small" in img_url:
-				img_url = img_url.replace("?name=small", "?name=orig")
-			with requests.get(
-				img_url,
-				headers=headers,
-				timeout=timeout,
-				stream=True,
-				verify=False,
-				proxies=proxies) as request:
-
-				request.raise_for_status()
-
-				parsed = urlparse(img_url)
-				ext = os.path.splitext(parsed.path)[1]
-				if not ext:
-					ext = self.get_extension_from_headers(request)
-				filename = f"x_download_{time_name}_{i}{ext}"
-				filepath = os.path.join(self.DOWNLOAD_DIR, filename)
-
-				content_length = int(request.headers.get("Content-Length", 0))
-
-				with open(filepath, "wb") as f:
-					#request.raw.decode_content = True
-					chunk_size = self.adaptive_chunk_size(content_length)
-					for chunk in request.iter_content(chunk_size=chunk_size):
-						if chunk:
-							f.write(chunk)
-							downloaded += len(chunk)
-							self.download_progress(
-								total=content_length or None,
-								bytes_transferred=downloaded,
-								path=filepath
-							)
-				downloaded_imgs.append(filepath)
-
-		return downloaded_imgs, post_text
-
-	def extract_post_text(self, page: Page) -> str:
-		try:
-			tweet_texts = []
-			# collecting text blocks from post
-			containers = page.query_selector_all('div[data-testid="tweetText"]')
-			for container in containers:
-				try:
-					spans = container.query_selector_all("span")
-					if spans:
-						for span in spans:
-							text = span.inner_text().strip()
-							if text:
-								tweet_texts.append(text)
-					else:
-						# to span's try container itself
-						text = container.inner_text().strip()
-						if text:
-							tweet_texts.append(text)
-				except Exception:
-					continue
-
-			return " ".join(tweet_texts).strip()
-		except Exception as e:
-			logging.warning("Failed to extract tweet text.", exc_info=e)
-
-		return ""
-
-	def extract_image_urls_from_x_post(self, url: str, timeout: int = 60) -> tuple[list[str], str]:
-		img_urls, post_text = [], ''
-
-		proxy = None
-		if self.proxy:
-			dsn = self.proxy.get("dsn", "")
-			if dsn:
-				parsed = urlparse(dsn)
-				proxy = {
-					"server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
-					"username": parsed.username,
-					"password": parsed.password
-				}
-				logging.info("[X] build proxy: %s", proxy)
-
-		with sync_playwright() as p:
-			with p.chromium.launch(headless=True) as browser:
-				with browser.new_context(proxy=proxy, ignore_https_errors=True) as context:
-					page = context.new_page()
-					page.goto(url, wait_until="networkidle", timeout=(timeout*1000))
-
-					#page.wait_for_timeout(3000)
-					page.wait_for_selector("img[src*='pbs.twimg.com/media']", timeout=(timeout*1000))
-					post_text = self.extract_post_text(page)
-
-					image_elements = page.query_selector_all("img")
-					image_urls = []
-
-					for img in image_elements:
-						src = img.get_attribute("src")
-						if src and "pbs.twimg.com/media" in src:
-							image_urls.append(src)
-
-					img_urls = list(set(image_urls))
-		return img_urls, post_text
-
-	def get_media_type_from_info_and_dom(self, media_info: dict, page: Page) -> XMediaType:
-		is_video = (
-			media_info.get("vcodec") != "none" or
-			media_info.get("ext") in {"mp4", "mov", "mkv"} or
-			any(
-				f.get("vcodec") not in (None, "none")
-				for f in media_info.get("formats", [])
-			)
-		)
-
-		try:
-			image_elements = page.query_selector_all("img")
-			image_urls = [
-				img.get_attribute("src")
-				for img in image_elements
-				if img.get_attribute("src") and "pbs.twimg.com/media" in img.get_attribute("src")
-			]
-			has_images = bool(image_urls)
-		except Exception:
-			has_images = False
-
-		if is_video and has_images:
-			return XMediaType.MIXED
-		elif is_video:
-			return XMediaType.VIDEO
-		elif has_images:
-			return XMediaType.IMAGE
-
-		return XMediaType.UNKNOWN
-
-	def tweet_contains_video(self, page: Page) -> bool:
-		try:
-			return bool(
-				page.query_selector("article video") or
-				page.query_selector("div[data-testid='videoPlayer']") or
-				page.query_selector("div[aria-label='Embedded video']")
-			)
-		except Exception:
-			pass
-		return False
-
-	def tweet_contains_images(self, page: Page) -> bool:
-		try:
-			image_elements = page.query_selector_all("img")
-			image_urls = [
-				img.get_attribute("src")
-				for img in image_elements
-				if img.get_attribute("src") and "pbs.twimg.com/media" in img.get_attribute("src")
-			]
-			return bool(image_urls)
-		except Exception:
-			pass
-		return False

warp_beacon-2.8.13/warp_beacon/scraper/X/types.py (deleted)
@@ -1,7 +0,0 @@
-import enum
-
-class XMediaType(enum.Enum):
-	UNKNOWN = 0
-	VIDEO = 1
-	IMAGE = 2
-	MIXED = 3