warp-beacon 2.7.31__tar.gz → 2.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {warp_beacon-2.7.31/warp_beacon.egg-info → warp_beacon-2.8.0}/PKG-INFO +1 -1
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/setup.py +4 -0
- warp_beacon-2.8.0/warp_beacon/__version__.py +2 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/jobs/__init__.py +1 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scheduler/instagram_human.py +17 -7
- warp_beacon-2.8.0/warp_beacon/scraper/X/X.py +215 -0
- warp_beacon-2.8.0/warp_beacon/scraper/X/abstract.py +139 -0
- warp_beacon-2.8.0/warp_beacon/scraper/X/types.py +6 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scraper/__init__.py +4 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scraper/account_selector.py +10 -4
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scraper/youtube/abstract.py +1 -5
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/storage/__init__.py +2 -2
- warp_beacon-2.8.0/warp_beacon/telegram/__init__.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/telegram/utils.py +3 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0/warp_beacon.egg-info}/PKG-INFO +1 -1
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon.egg-info/SOURCES.txt +4 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon.egg-info/top_level.txt +4 -0
- warp_beacon-2.7.31/warp_beacon/__version__.py +0 -2
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/LICENSE +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/MANIFEST.in +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/README.md +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/assets/cc-group-black.png +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/assets/placeholder.gif +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/etc/.gitignore +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/etc/accounts.json +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/etc/proxies.json +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/etc/warp_beacon.conf +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/etc/warp_beacon.service +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/pyproject.toml +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/setup.cfg +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/__init__.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/compress/__init__.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/compress/video.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/jobs/abstract.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/jobs/download_job.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/jobs/types.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/jobs/upload_job.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/mediainfo/__init__.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/mediainfo/abstract.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/mediainfo/audio.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/mediainfo/silencer.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/mediainfo/video.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scheduler/__init__.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scheduler/scheduler.py +0 -0
- {warp_beacon-2.7.31/warp_beacon/scraper/instagram → warp_beacon-2.8.0/warp_beacon/scraper/X}/__init__.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scraper/abstract.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scraper/exceptions.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scraper/fail_handler.py +0 -0
- {warp_beacon-2.7.31/warp_beacon/scraper/youtube → warp_beacon-2.8.0/warp_beacon/scraper/instagram}/__init__.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scraper/instagram/captcha.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scraper/instagram/instagram.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scraper/instagram/wb_instagrapi.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scraper/link_resolver.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scraper/utils.py +0 -0
- {warp_beacon-2.7.31/warp_beacon/telegram → warp_beacon-2.8.0/warp_beacon/scraper/youtube}/__init__.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scraper/youtube/music.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scraper/youtube/shorts.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/scraper/youtube/youtube.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/storage/mongo.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/telegram/bot.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/telegram/caption_shortener.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/telegram/download_status.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/telegram/edit_message.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/telegram/handlers.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/telegram/placeholder_message.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/telegram/progress_bar.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/telegram/progress_file_reader.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/telegram/types.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/uploader/__init__.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/warp_beacon.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon/yt_auth.py +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon.egg-info/dependency_links.txt +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon.egg-info/entry_points.txt +0 -0
- {warp_beacon-2.7.31 → warp_beacon-2.8.0}/warp_beacon.egg-info/requires.txt +0 -0
```diff
--- warp_beacon-2.7.31/setup.py
+++ warp_beacon-2.8.0/setup.py
@@ -52,6 +52,7 @@ setup(
 		'warp_beacon/scraper',
 		'warp_beacon/scraper/instagram',
 		'warp_beacon/scraper/youtube',
+		'warp_beacon/scraper/X',
 		'warp_beacon/mediainfo',
 		'warp_beacon/jobs',
 		'warp_beacon/compress',
@@ -91,6 +92,9 @@ setup(
 		"warp_beacon/scraper/youtube/youtube",
 		"warp_beacon/scraper/youtube/shorts",
 		"warp_beacon/scraper/youtube/music",
+		"warp_beacon/scraper/X/abstract",
+		"warp_beacon/scraper/X/X",
+		"warp_beacon/scraper/X/types",
 		"warp_beacon/scraper/fail_handler",
 		"warp_beacon/scraper/link_resolver",
 		"warp_beacon/scraper/utils",
```
```diff
--- warp_beacon-2.7.31/warp_beacon/scheduler/instagram_human.py
+++ warp_beacon-2.8.0/warp_beacon/scheduler/instagram_human.py
@@ -5,7 +5,7 @@ from datetime import datetime
 
 import logging
 
-from instagrapi.types import UserShort
+from instagrapi.types import UserShort, Story
 from warp_beacon.scraper.instagram.instagram import InstagramScraper
 
 class InstagramHuman(object):
@@ -69,7 +69,7 @@ class InstagramHuman(object):
 
 	def watch_stories(self) -> None:
 		logging.info("Simulating stories watch ...")
-		stories = []
+		stories: list[Story] = []
 		try:
 			raw_tray = self.reel_tray_feed_if_needed()
 			if not raw_tray:
@@ -77,16 +77,17 @@ class InstagramHuman(object):
 				return
 			#logging.info("raw_tray: %s", str(raw_tray))
 			tray = raw_tray.get("tray", [])
-			if
+			filtered_tray = [item for item in tray if item.get("seen", 0) == 0]
+			if filtered_tray:
 				reels_tray = []
 				target_len = random.randint(1, 5)
-				for el in
+				for el in filtered_tray[:target_len]:
 					user = el["user"]
 					user_id = user["pk"]
-
-					reels_tray.append({"user_id": user_id})
+					reels_tray.append({"user_id": user_id})
 
 			for el in reels_tray:
+				# amount?
 				_stories = self.scrapler.download_hndlr(self.scrapler.cl.user_stories, el["user_id"])
 				self.operations_count += 1
 				if _stories:
@@ -106,7 +107,16 @@ class InstagramHuman(object):
 					explore_user = m.user
 					self.operations_count += 1
 					break
-
+				pause = 0.0
+				if m.media_type == 2:
+					pause = m.video_duration or random.uniform(2, 20)
+				elif m.media_type == 8:
+					pause = random.uniform(2, 31)
+				else: # img, etc.
+					pause = random.uniform(2, 14)
+				logging.info("Pause for '%.2f' sec ...", round(pause, 2))
+				time.sleep(pause)
+				#self.random_pause()
 
 		if seen:
 			self.scrapler.download_hndlr(self.scrapler.cl.media_seen, seen)
```
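The pause values added above key off instagrapi's numeric media_type codes (1 is a photo, 2 a video, 8 an album/carousel). A minimal standalone sketch of the same idea, with field names assumed to match instagrapi's `Media` type rather than taken verbatim from the package:

```python
# Sketch of the human-like pause introduced above (not the package's API).
# media_type codes follow instagrapi: 1 = photo, 2 = video, 8 = album.
import random
import time

def human_pause(media_type: int, video_duration: float | None = None) -> float:
	if media_type == 2:      # video: idle for roughly its duration
		pause = video_duration or random.uniform(2, 20)
	elif media_type == 8:    # album/carousel: longer browsing window
		pause = random.uniform(2, 31)
	else:                    # photo and anything else
		pause = random.uniform(2, 14)
	time.sleep(pause)
	return pause
```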
```diff
--- /dev/null
+++ warp_beacon-2.8.0/warp_beacon/scraper/X/X.py
@@ -0,0 +1,215 @@
+import os
+import time
+import logging
+from mimetypes import guess_extension
+from urllib.parse import urlparse
+import requests
+import yt_dlp
+from playwright.sync_api import sync_playwright
+
+from warp_beacon.scraper.utils import ScraperUtils
+from warp_beacon.scraper.X.types import XMediaType
+from warp_beacon.jobs.types import JobType
+from warp_beacon.scraper.X.abstract import XAbstract
+
+class XScraper(XAbstract):
+	DOWNLOAD_DIR = "/tmp"
+
+	def extract_canonical_name(self, media: dict) -> str:
+		ret = ""
+		try:
+			if media.get("title", None):
+				ret = media["title"]
+			if media.get("description", ""):
+				ret += "\n" + media["description"]
+		except Exception as e:
+			logging.warning("Failed to extract canonical media name!")
+			logging.exception(e)
+
+		return ret
+
+	def get_media_type(self, media_info: dict) -> XMediaType:
+		media_type = XMediaType.UNKNOWN
+		#logging.info("[X] post info: '%s'", media_info)
+
+		if 'ext' in media_info:
+			logging.info("[X] Format: '%s'", media_info['ext'])
+		if 'formats' in media_info:
+			logging.info("[X] Contains video.")
+			media_type = XMediaType.VIDEO
+		elif 'thumbnails' in media_info:
+			logging.info("[X] contains images.")
+			media_type = XMediaType.IMAGE
+		else:
+			logging.info("[X] No media found.")
+
+		return media_type
+
+	def _download(self, url: str, timeout: int = 60) -> list:
+		res = []
+		job_type = JobType.UNKNOWN
+		time_name = str(time.time()).replace('.', '_')
+		ydl_opts = {
+			'socket_timeout': timeout,
+			'outtmpl': f'{self.DOWNLOAD_DIR}/x_download_{time_name}.%(ext)s',
+			'quiet': False,
+			'force_generic_extractor': False,
+			'noplaylist': True,
+			'merge_output_format': 'mp4',
+			'dump_single_json': True,
+		}
+
+		if self.proxy:
+			proxy_dsn = self.proxy.get("dsn", "")
+			logging.info("[X] Using proxy DSN '%s'", proxy_dsn)
+			if proxy_dsn:
+				ydl_opts["proxy"] = proxy_dsn
+
+		local_file, media_info, media_type, post_text = "", {}, XMediaType.UNKNOWN, ""
+		with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+			try:
+				media_info = ydl.extract_info(url, download=False)
+				media_type = self.get_media_type(media_info)
+				if media_type == XMediaType.VIDEO:
+					local_file = self.download_video(url, ydl, media_info)
+					post_text = self.extract_canonical_name(media_info)
+					job_type = JobType.VIDEO
+			except yt_dlp.utils.DownloadError:
+				logging.warning("[X] yt_dlp failed to extract info. Falling back to image scraping.")
+				media_type = XMediaType.IMAGE
+
+		if media_type == XMediaType.IMAGE:
+			job_type = JobType.IMAGE
+			images, post_text = self.download_images(url, timeout)
+			if images:
+				local_file = images[0]
+
+		if local_file:
+			res.append({
+				"local_media_path": local_file,
+				"performer": media_info.get("uploader", "Unknown"),
+				'progress_hooks': [self.dlp_on_progress],
+				#"thumb": thumbnail,
+				"canonical_name": post_text,
+				"media_type": job_type
+			})
+
+		return res
+
+	def adaptive_chunk_size(self, content_length: int) -> int:
+		if content_length < 100_000:
+			return 2048
+		elif content_length < 5_000_000:
+			return 8192
+		elif content_length < 100_000_000:
+			return 32768
+		else:
+			return 65536
+
+	def download_video(self, url: str, ydl: yt_dlp.YoutubeDL, media_info: dict) -> str:
+		local_file = ""
+		ydl.download([url])
+		local_file = ydl.prepare_filename(media_info)
+		logging.debug("Temp filename: '%s'", local_file)
+		if local_file:
+			local_file = self.rename_local_file(local_file)
+		return local_file
+
+	def get_extension_from_headers(self, response: requests.Response) -> str:
+		content_type = response.headers.get("Content-Type", "")
+		return guess_extension(content_type) or ".jpg"
+
+	def download_images(self, url: str, timeout: int = 60) -> tuple[list[str], str]:
+		downloaded_imgs = []
+		headers = {
+			"User-Agent": ScraperUtils.get_ua(),
+			"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+			"Accept-Language": "en-us,en;q=0.5",
+			"Sec-Fetch-Mode": "navigate"
+		}
+		proxies = None
+		if self.proxy:
+			proxies = {"https": self.proxy.get("dsn", ""), "http": self.proxy.get("dsn", "")}
+
+		image_urls, post_text = self.extract_image_urls_from_x_post(url, timeout=timeout)
+
+		if not image_urls:
+			logging.error("[X] Content images are not found!")
+			return downloaded_imgs
+
+		time_name = str(time.time()).replace('.', '_')
+		for i, img_url in enumerate(set(image_urls)):
+			downloaded = 0
+			if "?name=small" in img_url:
+				img_url = img_url.replace("?name=small", "?name=orig")
+			with requests.get(
+				img_url,
+				headers=headers,
+				timeout=timeout,
+				stream=True,
+				verify=False,
+				proxies=proxies) as request:
+
+				request.raise_for_status()
+
+				parsed = urlparse(img_url)
+				ext = os.path.splitext(parsed.path)[1]
+				if not ext:
+					ext = self.get_extension_from_headers(request)
+				filename = f"x_download_{time_name}_{i}{ext}"
+				filepath = os.path.join(self.DOWNLOAD_DIR, filename)
+
+				content_length = int(request.headers.get("Content-Length", 0))
+
+				with open(filepath, "wb") as f:
+					#request.raw.decode_content = True
+					chunk_size = self.adaptive_chunk_size(content_length)
+					for chunk in request.iter_content(chunk_size=chunk_size):
+						if chunk:
+							f.write(chunk)
+							downloaded += len(chunk)
+							self.download_progress(
+								total=content_length or None,
+								bytes_transferred=downloaded,
+								path=filepath
+							)
+				downloaded_imgs.append(filepath)
+
+		return downloaded_imgs, post_text
+
+	def extract_image_urls_from_x_post(self, url: str, timeout: int = 60) -> tuple[list[str], str]:
+		img_urls, post_text = [], ''
+
+		proxy = None
+		if self.proxy:
+			dsn = self.proxy.get("dsn", "")
+			if dsn:
+				parsed = urlparse(dsn)
+				proxy = {
+					"server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
+					"username": parsed.username,
+					"password": parsed.password
+				}
+
+		with sync_playwright() as p:
+			with p.chromium.launch(headless=True) as browser:
+				with browser.new_context(proxy=proxy) as context:
+					page = context.new_page()
+					page.goto(url, wait_until="networkidle", timeout=(timeout*1000))
+
+					#page.wait_for_timeout(3000)
+					page.wait_for_selector("img[src*='pbs.twimg.com/media']", timeout=(timeout*1000))
+					text_element = page.wait_for_selector('[data-testid="tweetText"]', timeout=(timeout*1000))
+					text = text_element.inner_text()
+
+					image_elements = page.query_selector_all("img")
+					image_urls = []
+
+					for img in image_elements:
+						src = img.get_attribute("src")
+						if src and "pbs.twimg.com/media" in src:
+							image_urls.append(src)
+
+					img_urls = list(set(image_urls))
+					post_text = str(text)
+		return img_urls, post_text
```
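The new XScraper decides between the video and image paths by probing the post with yt_dlp first and only falling back to Playwright scraping of `pbs.twimg.com/media` images when extraction fails. A self-contained sketch of that probe step, with option names taken from the diff above; the status URL is hypothetical:

```python
# Probe an X post without downloading, mirroring XScraper._download's dispatch.
import yt_dlp

url = "https://x.com/someuser/status/1234567890"  # hypothetical example URL
opts = {"noplaylist": True, "dump_single_json": True, "quiet": True}
with yt_dlp.YoutubeDL(opts) as ydl:
	try:
		info = ydl.extract_info(url, download=False)
		# 'formats' implies a video post; 'thumbnails' alone implies images
		kind = "video" if "formats" in info else "image" if "thumbnails" in info else "unknown"
		print(kind, info.get("uploader", "Unknown"))
	except yt_dlp.utils.DownloadError:
		print("yt_dlp could not extract info: fall back to image scraping")
```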
```diff
--- /dev/null
+++ warp_beacon-2.8.0/warp_beacon/scraper/X/abstract.py
@@ -0,0 +1,139 @@
+import http.client
+import io
+import logging
+import os
+import socket
+import ssl
+import time
+import urllib
+from typing import Callable, Optional, Union
+
+import playwright
+import playwright.sync_api
+import requests
+import urllib3
+#from pytubefix.exceptions import VideoUnavailable, VideoPrivate, MaxRetriesExceeded
+import yt_dlp
+
+from warp_beacon.jobs.download_job import DownloadJob
+from warp_beacon.scraper.abstract import ScraperAbstract
+from warp_beacon.scraper.exceptions import (BadProxy, TimeOut, Unavailable,
+	extract_exception_message)
+from warp_beacon.telegram.types import ReportType
+
+
+class XAbstract(ScraperAbstract):
+	DOWNLOAD_DIR = "/tmp"
+	X_MAX_RETRIES_DEFAULT = 8
+	X_PAUSE_BEFORE_RETRY_DEFAULT = 3
+	X_TIMEOUT_DEFAULT = 15
+	X_TIMEOUT_INCREMENT_DEFAULT = 20
+
+	def __init__(self, account: tuple, proxy: dict=None) -> None:
+		super().__init__(account, proxy)
+		self._download_progress_threshold = 0
+
+	def validate_session(self) -> int:
+		return 0
+
+	def download_hndlr(self, func: Callable, *args: tuple[Union[str, int, dict, tuple, bool]], **kwargs: dict[Union[str, int, dict, tuple, bool]]) -> Optional[Union[list, dict, str, io.BytesIO]]:
+		ret_val = None
+		max_retries = int(os.environ.get("X_MAX_RETRIES", default=self.X_MAX_RETRIES_DEFAULT))
+		pause_secs = int(os.environ.get("X_PAUSE_BEFORE_RETRY", default=self.X_PAUSE_BEFORE_RETRY_DEFAULT))
+		timeout = int(os.environ.get("X_TIMEOUT", default=self.X_TIMEOUT_DEFAULT))
+		timeout_increment = int(os.environ.get("X_TIMEOUT_INCREMENT", default=self.X_TIMEOUT_INCREMENT_DEFAULT))
+		retries = 0
+		while max_retries >= retries:
+			try:
+				kwargs["timeout"] = timeout
+				ret_val = func(*args, **kwargs)
+				break
+			except urllib3.exceptions.ProxyError as e:
+				logging.warning("Proxy error!")
+				raise BadProxy(extract_exception_message(e.original_error))
+			except (socket.timeout,
+					ssl.SSLError,
+					http.client.IncompleteRead,
+					http.client.HTTPException,
+					requests.RequestException,
+					urllib.error.URLError,
+					urllib.error.HTTPError,
+					playwright.sync_api.TimeoutError) as e:
+				if hasattr(e, "code") and (int(e.code) == 403 or int(e.code) == 400):
+					raise Unavailable(extract_exception_message(e))
+				if hasattr(e, "reason") and "Remote end closed connection without response" in str(e.reason):
+					raise Unavailable(extract_exception_message(e))
+				logging.warning("X read timeout! Retrying in '%d' seconds ...", pause_secs)
+				logging.info("Your `X_MAX_RETRIES` values is '%d'", max_retries)
+				logging.exception(extract_exception_message(e))
+				if max_retries <= retries:
+					#self.remove_tmp_files()
+					raise TimeOut(extract_exception_message(e))
+				retries += 1
+				timeout += timeout_increment
+				time.sleep(pause_secs)
+			except yt_dlp.utils.DownloadError as e:
+				raise Unavailable(extract_exception_message(e))
+			except yt_dlp.utils.GeoRestrictedError:
+				raise Unavailable(extract_exception_message(e))
+			except yt_dlp.utils.PostProcessingError as e:
+				raise Unavailable(extract_exception_message(e))
+			except yt_dlp.utils.ExtractorError as e:
+				raise Unavailable(extract_exception_message(e))
+			except yt_dlp.utils.MaxDownloadsReached as e:
+				raise Unavailable(extract_exception_message(e))
+			except yt_dlp.utils.UnavailableVideoError as e:
+				raise Unavailable(extract_exception_message(e))
+			except yt_dlp.utils.ThrottledDownload as e:
+				raise Unavailable(extract_exception_message(e))
+
+		return ret_val
+
+	def download_progress(self, total: int | None, bytes_transferred: int, path: str) -> None:
+		if not total:
+			return
+		percentage_of_completion = round(bytes_transferred / (total or 1) * 100)
+		if percentage_of_completion >= self._download_progress_threshold:
+			logging.debug("[Download] X file '%s', %d", path, percentage_of_completion)
+			msg = {
+				"action": "report_download_status",
+				"current": bytes_transferred,
+				"total": total or 0,
+				"message_id": self.job.placeholder_message_id,
+				"chat_id": self.job.chat_id,
+				"completed": percentage_of_completion >= 100,
+				"report_type": ReportType.PROGRESS
+			}
+			self.status_pipe.send(msg)
+			self._download_progress_threshold += 20
+
+	def dlp_on_progress(self, params: dict) -> None:
+		if params.get("status", "") == "downloading":
+			total_size = int(params.get("total_bytes") or params.get("total_bytes_estimate") or 0)
+			if not total_size or total_size < 0:
+				logging.warning("[Download worker][yt_dlp]: total_size is '%d'", total_size)
+				return
+			bytes_downloaded = int(params.get("downloaded_bytes", 0))
+			percentage_of_completion = bytes_downloaded / (total_size or 1) * 100
+			if total_size == 0 or percentage_of_completion >= self._download_progress_threshold:
+				msg = {
+					"action": "report_download_status",
+					"current": bytes_downloaded,
+					"total": total_size,
+					"message_id": self.job.placeholder_message_id,
+					"chat_id": self.job.chat_id,
+					"completed": percentage_of_completion >= 100,
+					"report_type": ReportType.PROGRESS
+				}
+				self.status_pipe.send(msg)
+				logging.debug("[Download worker][yt_dlp] Downloaded %d%%", percentage_of_completion)
+				if total_size > 0:
+					self._download_progress_threshold += 20
+
+	def _download(self, url: str, timeout: int = 60) -> list:
+		raise NotImplementedError("You should to implement _download method")
+
+	def download(self, job: DownloadJob) -> list:
+		self.job = job
+		ret = self.download_hndlr(self._download, job.url)
+		return ret
```
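`download_hndlr` above implements retry-with-growing-timeout: each attempt injects a larger `timeout=` kwarg into the wrapped callable and sleeps between rounds. A minimal sketch of just that loop, stripped of the proxy and yt_dlp error mapping (names here are illustrative, not the package's API):

```python
# Retry a callable, enlarging its timeout= kwarg on every failed attempt.
import time

def retry_with_growing_timeout(func, *args, max_retries=8, pause_secs=3,
                               timeout=15, increment=20, **kwargs):
	retries = 0
	while max_retries >= retries:
		try:
			kwargs["timeout"] = timeout
			return func(*args, **kwargs)
		except TimeoutError:
			if max_retries <= retries:
				raise
			retries += 1
			timeout += increment   # give slow endpoints more room next round
			time.sleep(pause_secs)
```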
```diff
--- warp_beacon-2.7.31/warp_beacon/scraper/__init__.py
+++ warp_beacon-2.8.0/warp_beacon/scraper/__init__.py
@@ -148,6 +148,10 @@ class AsyncDownloader(object):
 				elif job.job_origin is Origin.YOUTUBE:
 					from warp_beacon.scraper.youtube.youtube import YoutubeScraper
 					actor = YoutubeScraper(selector.get_current(), proxy)
+				elif job.job_origin is Origin.X:
+					from warp_beacon.scraper.X.X import XScraper
+					actor = XScraper(selector.get_current(), proxy)
+
 				actor.send_message_to_admin_func = self.send_message_to_admin
 				actor.request_yt_auth = self.request_yt_auth
 				actor.auth_event = self.auth_event
```
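This dispatch presupposes a new `Origin.X` member; the one-line addition to `warp_beacon/jobs/__init__.py` in the file list above (+1) is presumably that enum entry. A hypothetical sketch of the resulting enum (member values are assumptions, not taken from the diff):

```python
# Assumed shape of the Origin enum after 2.8.0; values are illustrative only.
from enum import Enum

class Origin(Enum):
	INSTAGRAM = "instagram"
	YOUTUBE = "youtube"
	X = "x"  # new in 2.8.0, enables the XScraper branch above
```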
```diff
--- warp_beacon-2.7.31/warp_beacon/scraper/account_selector.py
+++ warp_beacon-2.8.0/warp_beacon/scraper/account_selector.py
@@ -111,12 +111,15 @@ class AccountSelector(object):
 	def get_last_proxy(self) -> Optional[dict]:
 		return self.accounts_meta_data.get("last_proxy", None)
 
-	def get_proxy_list(self) -> List[dict]:
+	def get_proxy_list(self, ipv4: bool = False) -> List[dict]:
 		matched_proxy = []
 		try:
 			acc_id, acc_data = self.get_current()
 			current_acc_pid = acc_data.get("proxy_id", "").strip()
 			for proxy in self.proxies:
+				if ipv4:
+					if proxy.get("ip_version", '') not in ("v4", "both"):
+						continue
 				pid = proxy.get("id", "").strip()
 				if pid and current_acc_pid and pid == current_acc_pid:
 					if "override_force_ipv6" in proxy:
@@ -129,10 +132,10 @@ class AccountSelector(object):
 
 		return matched_proxy
 
-	def get_random_account_proxy(self) -> Optional[dict]:
+	def get_random_account_proxy(self, ipv4: bool = False) -> Optional[dict]:
 		if self.proxies:
 			try:
-				matched_proxy = self.get_proxy_list()
+				matched_proxy = self.get_proxy_list(ipv4)
 				if matched_proxy:
 					if len(matched_proxy) > 1:
 						random.seed(random.seed(time.time_ns() ^ int.from_bytes(os.urandom(len(matched_proxy)), "big")))
@@ -204,7 +207,10 @@ class AccountSelector(object):
 		if not self.current.get("enabled", True):
 			logging.info("Account '%d' is disabled. Probing next ...", idx)
 			self.next()
-
+		ipv4 = False
+		if module_origin is Origin.X:
+			ipv4 = True
+		self.current_proxy = self.get_random_account_proxy(ipv4)
 
 	def next(self) -> dict:
 		idx = self.account_index[self.current_module_name].value
```
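The new `ipv4` flag keeps only proxies whose `ip_version` is "v4" or "both", and the last hunk above enables it whenever `module_origin` is `Origin.X`. A hypothetical sketch of `etc/proxies.json` entries illustrating the fields the selector reads ("v6" as a third value is an assumption):

```python
# Hypothetical proxies.json-style entries showing the fields the selector uses.
proxies = [
	{"id": "p1", "dsn": "http://user:pass@203.0.113.7:8080", "ip_version": "v4"},
	{"id": "p2", "dsn": "http://user:pass@[2001:db8::1]:8080", "ip_version": "v6"},
	{"id": "p3", "dsn": "http://user:pass@203.0.113.9:8080", "ip_version": "both"},
]

def filter_ipv4(proxies: list[dict]) -> list[dict]:
	# mirrors the new check: keep only proxies usable over IPv4
	return [p for p in proxies if p.get("ip_version", "") in ("v4", "both")]

print([p["id"] for p in filter_ipv4(proxies)])  # ['p1', 'p3'] for Origin.X jobs
```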
```diff
--- warp_beacon-2.7.31/warp_beacon/scraper/youtube/abstract.py
+++ warp_beacon-2.8.0/warp_beacon/scraper/youtube/abstract.py
@@ -381,11 +381,7 @@ class YoutubeAbstract(ScraperAbstract):
 			logging.warning("Download failed, trying to download with yt_dlp")
 			logging.exception(e)
 
-
-			ret = self.download_hndlr(self._download_yt_dlp, job.url, thumbnail=thumbnail)
-		except NotImplementedError:
-			logging.info("yt_dlp is not supported for this submodule yet")
-			raise Unavailable("Сontent unvailable")
+			ret = self.download_hndlr(self._download_yt_dlp, job.url, thumbnail=thumbnail)
 
 		return ret
 
```
```diff
--- warp_beacon-2.7.31/warp_beacon/storage/__init__.py
+++ warp_beacon-2.8.0/warp_beacon/storage/__init__.py
@@ -45,9 +45,9 @@ class Storage(object):
 			yt_vid_id = yt_vid_id_list.pop() if yt_vid_id_list else ""
 			if yt_vid_id:
 				path = urlparse(url).path.strip('/').replace("watch", ("yt_music" if parse_mode is UrlParseMode.YT_MUSIC else "youtube"))
-				return
+				return f"{path}/{yt_vid_id}".strip('/')
 			else:
-				raise ValueError("Failed to generate uniq_id for url '
+				raise ValueError(f"Failed to generate uniq_id for url '{url}'")
 
 		path = urlparse(url).path.strip('/')
 		return path
```
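The fix completes a previously truncated return, so a YouTube watch URL now produces a uniq_id of the form `youtube/<video_id>`. A worked example of just that expression (standalone sketch; the class's `UrlParseMode` handling is omitted):

```python
# Worked example of the repaired uniq_id expression for a YouTube URL.
from urllib.parse import urlparse

url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
yt_vid_id = "dQw4w9WgXcQ"  # extracted upstream from the query string
path = urlparse(url).path.strip('/').replace("watch", "youtube")
print(f"{path}/{yt_vid_id}".strip('/'))  # -> youtube/dQw4w9WgXcQ
```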
```diff
--- warp_beacon-2.7.31/warp_beacon.egg-info/SOURCES.txt
+++ warp_beacon-2.8.0/warp_beacon.egg-info/SOURCES.txt
@@ -43,6 +43,10 @@ warp_beacon/scraper/exceptions.py
 warp_beacon/scraper/fail_handler.py
 warp_beacon/scraper/link_resolver.py
 warp_beacon/scraper/utils.py
+warp_beacon/scraper/X/X.py
+warp_beacon/scraper/X/__init__.py
+warp_beacon/scraper/X/abstract.py
+warp_beacon/scraper/X/types.py
 warp_beacon/scraper/instagram/__init__.py
 warp_beacon/scraper/instagram/captcha.py
 warp_beacon/scraper/instagram/instagram.py
```
```diff
--- warp_beacon-2.7.31/warp_beacon.egg-info/top_level.txt
+++ warp_beacon-2.8.0/warp_beacon.egg-info/top_level.txt
@@ -15,6 +15,10 @@ warp_beacon/scheduler
 warp_beacon/scheduler/instagram_human
 warp_beacon/scheduler/scheduler
 warp_beacon/scraper
+warp_beacon/scraper/X
+warp_beacon/scraper/X/X
+warp_beacon/scraper/X/abstract
+warp_beacon/scraper/X/types
 warp_beacon/scraper/abstract
 warp_beacon/scraper/account_selector
 warp_beacon/scraper/exceptions
```