warp-beacon 2.6.17.tar.gz → 2.6.18.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {warp_beacon-2.6.17/warp_beacon.egg-info → warp_beacon-2.6.18}/PKG-INFO +1 -1
- warp_beacon-2.6.18/warp_beacon/__version__.py +2 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/jobs/abstract.py +4 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scheduler/instagram_human.py +26 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/__init__.py +13 -3
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/abstract.py +17 -11
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/instagram/instagram.py +13 -4
- {warp_beacon-2.6.17 → warp_beacon-2.6.18/warp_beacon.egg-info}/PKG-INFO +1 -1
- warp_beacon-2.6.17/warp_beacon/__version__.py +0 -2
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/LICENSE +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/MANIFEST.in +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/README.md +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/assets/placeholder.gif +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/etc/.gitignore +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/etc/accounts.json +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/etc/proxies.json +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/etc/warp_beacon.conf +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/etc/warp_beacon.service +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/pyproject.toml +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/setup.cfg +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/setup.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/__init__.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/compress/__init__.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/compress/video.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/jobs/__init__.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/jobs/download_job.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/jobs/types.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/jobs/upload_job.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/mediainfo/__init__.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/mediainfo/abstract.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/mediainfo/audio.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/mediainfo/silencer.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/mediainfo/video.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scheduler/__init__.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scheduler/scheduler.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/account_selector.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/exceptions.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/fail_handler.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/instagram/__init__.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/instagram/captcha.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/link_resolver.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/youtube/__init__.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/youtube/abstract.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/youtube/music.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/youtube/shorts.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/youtube/youtube.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/storage/__init__.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/storage/mongo.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/telegram/__init__.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/telegram/bot.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/telegram/caption_shortener.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/telegram/handlers.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/telegram/placeholder_message.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/telegram/utils.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/uploader/__init__.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/warp_beacon.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/yt_auth.py +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon.egg-info/SOURCES.txt +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon.egg-info/dependency_links.txt +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon.egg-info/entry_points.txt +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon.egg-info/requires.txt +0 -0
- {warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon.egg-info/top_level.txt +0 -0
{warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/jobs/abstract.py

```diff
@@ -47,6 +47,8 @@ class JobSettings(TypedDict):
 	message_leftover: str
 	replay: bool
 	short_text: bool
+	scroll_content: bool
+	last_pk: int
 
 class AbstractJob(ABC):
 	job_id: uuid.UUID = None
@@ -86,6 +88,8 @@ class AbstractJob(ABC):
 	message_leftover: str = ""
 	replay: bool = False
 	short_text: bool = False
+	scroll_content: bool = False
+	last_pk: int = 0
 
 	def __init__(self, **kwargs: Unpack[JobSettings]) -> None:
 		if kwargs:
```
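The two new fields are added in lockstep: once in the `JobSettings` TypedDict, which types the `**kwargs` of `AbstractJob.__init__`, and once as class-level defaults on `AbstractJob`, so older call sites that never pass them keep working. A minimal sketch of the pattern, assuming the usual kwargs-to-attributes loop (the diff only shows the `if kwargs:` guard):

```python
from typing import TypedDict, Unpack  # Unpack needs Python 3.11+ (or typing_extensions)

class JobSettings(TypedDict, total=False):
	replay: bool
	short_text: bool
	scroll_content: bool  # new in 2.6.18
	last_pk: int          # new in 2.6.18

class AbstractJob:
	scroll_content: bool = False
	last_pk: int = 0

	def __init__(self, **kwargs: Unpack[JobSettings]) -> None:
		if kwargs:
			for key, value in kwargs.items():  # assumed body; class defaults remain for absent keys
				setattr(self, key, value)

job = AbstractJob(scroll_content=True, last_pk=3573)
print(job.scroll_content, job.last_pk)  # True 3573
```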
{warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scheduler/instagram_human.py

```diff
@@ -16,6 +16,32 @@ class InstagramHuman(object):
 		self.scrapler = scrapler
 		self.operations_count = 0
 
+	def watch_content(self, media: list) -> None:
+		for m in media[:random.randint(2, 4)]:
+			try:
+				logging.info("Wathing content with pk '%d'", m.pk)
+				content = self.scrapler.cl.media_info(m.pk)
+				logging.info("Watched content with id '%d'", content.pk)
+				self.operations_count += 1
+				time.sleep(random.uniform(2, 5))
+			except Exception as e:
+				logging.warning("Exception while watching content")
+				logging.exception(e)
+
+	def scroll_content(self, last_pk: int) -> None:
+		if random.random() > 0.2:
+			logging.info("Starting to watch related reels with media_pk '%d'", last_pk)
+			media = self.scrapler.download_hndlr(self.scrapler.cl.reels, amount=random.randint(4, 10), last_media_pk=last_pk)
+			self.operations_count += 1
+			self.watch_content(media)
+
+		if random.random() > 0.3:
+			time.sleep(random.uniform(2, 5))
+			logging.info("Starting to explore reels with media_pk '%d'", last_pk)
+			media = self.scrapler.download_hndlr(self.scrapler.cl.explore_reels, amount=random.randint(4, 10), last_media_pk=last_pk)
+			self.operations_count += 1
+			self.watch_content(media)
+
 	def simulate_activity(self) -> None:
 		now = datetime.now()
 		hour = now.hour
```
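The new `watch_content`/`scroll_content` pair drives the instagrapi client (`media_info`, `reels`, `explore_reels`) through probabilistic branches and jittered sleeps, so a batch of fetches paces itself like a person skimming a feed rather than a burst of API calls. A standalone sketch of that pacing pattern with no real Instagram client (the lambda at the bottom stands in for `scrapler.cl.reels`/`explore_reels`):

```python
import logging
import random
import time

logging.basicConfig(level=logging.INFO)

def watch_content(media: list) -> int:
	"""View 2-4 items with human-like dwell times; return operations performed."""
	operations = 0
	for pk in media[:random.randint(2, 4)]:
		logging.info("Watching content with pk '%d'", pk)
		operations += 1
		time.sleep(random.uniform(2, 5))  # jittered pause between views
	return operations

def scroll_content(fetch, last_pk: int) -> int:
	"""Browse related media with ~80% probability, mirroring the randomized branch."""
	operations = 0
	if random.random() > 0.2:
		media = fetch(amount=random.randint(4, 10), last_media_pk=last_pk)
		operations += 1 + watch_content(media)
	return operations

# toy fetcher returning fake media pks
print(scroll_content(lambda amount, last_media_pk: list(range(amount)), last_pk=3573))
```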
{warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/__init__.py

```diff
@@ -147,6 +147,13 @@ class AsyncDownloader(object):
 				# job retry loop
 				while self.allow_loop.value == 1:
 					try:
+						if job.scroll_content and job.last_pk and job.job_origin is Origin.INSTAGRAM:
+							logging.info("Scrolling relative content with pk '%s'", job.last_pk)
+							operations = actor.scroll_content(last_pk=job.last_pk)
+							if operations:
+								selector.inc_ig_request_count(amount=operations)
+							logging.info("Scrolling done")
+							break
 						if job.session_validation and job.job_origin in (Origin.INSTAGRAM, Origin.YOUTUBE):
 							if job.job_origin is Origin.INSTAGRAM:
 								if selector.get_ig_request_count() >= int(os.environ.get("IG_REQUESTS_PER_ACCOUNT", default="10")):
@@ -361,6 +368,12 @@ class AsyncDownloader(object):
 								)
 							else:
 								self.uploader.queue_task(upload_job)
+							# watch related reels to simulate human
+							if item.get("last_pk", 0) and "reel/" in job.url:
+								self.queue_task(DownloadJob.build(
+									scroll_content=True,
+									last_pk=int(item.get("last_pk", 0))
+								))
 						else:
 							logging.info("Job already in work in parallel worker. Redirecting job to upload worker.")
 							self.uploader.queue_task(job.to_upload_job())
@@ -384,9 +397,6 @@ class AsyncDownloader(object):
 					logging.error("Error inside download worker!")
 					logging.exception(e)
 					self.notify_task_failed(job)
-				finally:
-					if actor:
-						actor.restore_gai()
 			except Empty:
 				pass
 			except Exception as e:
```
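These three hunks close a loop: when a reel download succeeds, the worker re-queues a synthetic job flagged `scroll_content=True` carrying the media's `last_pk`; on the next pass the retry loop short-circuits that job into `actor.scroll_content()`, charges the extra operations against the per-account request budget, and never enters the download/upload path. The removed `finally: actor.restore_gai()` matches the next file, where resolver restoration moves into `ScraperAbstract.__init__` instead of per-job teardown. A simplified sketch of the re-queue-and-short-circuit flow, with a plain `queue.Queue` standing in for the real scheduler:

```python
import queue
from dataclasses import dataclass

@dataclass
class Job:
	url: str = ""
	scroll_content: bool = False
	last_pk: int = 0

tasks: queue.Queue = queue.Queue()

def on_download_success(job: Job, item: dict) -> None:
	# mirrors the diff: only reels with a known last_pk spawn a follow-up job
	if item.get("last_pk", 0) and "reel/" in job.url:
		tasks.put(Job(scroll_content=True, last_pk=int(item["last_pk"])))

def worker_step() -> None:
	job = tasks.get()
	if job.scroll_content and job.last_pk:
		# the real worker calls actor.scroll_content() and bumps the request count
		print(f"simulating scroll from pk {job.last_pk}")
		return  # short-circuit: synthetic jobs never reach download/upload
	print(f"downloading {job.url}")

on_download_success(Job(url="https://www.instagram.com/reel/abc/"), {"last_pk": 3573})
worker_step()
```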
{warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/abstract.py

```diff
@@ -3,6 +3,7 @@ import pathlib
 
 import socket
 import requests.packages.urllib3.util.connection as urllib3_cn
+import multiprocessing
 
 from abc import ABC, abstractmethod
 from typing import Callable, Union
@@ -22,16 +23,19 @@ class ScraperAbstract(ABC):
 	proxy = None
 
 	def __init__(self, account: tuple, proxy: dict=None) -> None:
+		self._gai_lock = multiprocessing.Lock()
 		self.account_index = account[0]
 		self.account = account[1]
 		self.proxy = proxy
 		if self.account.get("force_ipv6", False):
 			self.force_ipv6()
-
-	def __del__(self) -> None:
-		if self.account.get("force_ipv6", False):
+		else:
 			self.restore_gai()
 
+	#def __del__(self) -> None:
+	#	if self.account.get("force_ipv6", False):
+	#		self.restore_gai()
+
 	@abstractmethod
 	def download(self, url: str) -> bool:
 		raise NotImplementedError
@@ -86,13 +90,15 @@ class ScraperAbstract(ABC):
 			if urllib3_cn.HAS_IPV6:
 				family = socket.AF_INET6 # force ipv6 only if it is available
 			return family
-
-		self.original_gai_family
-
-
+		with self._gai_lock:
+			if self.original_gai_family is None:
+				self.original_gai_family = urllib3_cn.allowed_gai_family
+				logging.info("Forcing IPv6 ...")
+			urllib3_cn.allowed_gai_family = allowed_gai_family
 
 	def restore_gai(self) -> None:
-
-
-
-
+		with self._gai_lock:
+			if self.original_gai_family:
+				logging.info("Restoring normal IP stack ...")
+				urllib3_cn.allowed_gai_family = self.original_gai_family
+				self.original_gai_family = None
```
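`force_ipv6()` works by monkey-patching `allowed_gai_family` in `urllib3.util.connection`, the hook urllib3 consults when resolving addresses for new connections. The 2.6.18 version wraps patch and restore in a lock and saves the pristine hook exactly once, so concurrent or repeated calls cannot capture an already-patched function, and it replaces the nondeterministic `__del__` teardown with an explicit restore. A self-contained, module-level sketch of the same technique (the class in the diff keeps this state on `self` instead):

```python
import multiprocessing
import socket

import requests.packages.urllib3.util.connection as urllib3_cn

_gai_lock = multiprocessing.Lock()
_original_gai_family = None

def force_ipv6() -> None:
	def allowed_gai_family() -> socket.AddressFamily:
		family = socket.AF_INET
		if urllib3_cn.HAS_IPV6:
			family = socket.AF_INET6  # force IPv6 only if the host supports it
		return family

	global _original_gai_family
	with _gai_lock:
		if _original_gai_family is None:
			# save the pristine hook once, so a second call cannot
			# overwrite it with an already-patched function
			_original_gai_family = urllib3_cn.allowed_gai_family
		urllib3_cn.allowed_gai_family = allowed_gai_family

def restore_gai() -> None:
	global _original_gai_family
	with _gai_lock:
		if _original_gai_family:
			urllib3_cn.allowed_gai_family = _original_gai_family
			_original_gai_family = None

force_ipv6()   # subsequent urllib3/requests connections resolve AF_INET6 only
restore_gai()  # back to default dual-stack behaviour
```

Note that the lock only serializes patch/restore; the patched attribute is ordinary module state, so each worker process patches its own copy regardless of the `multiprocessing.Lock`.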
{warp_beacon-2.6.17 → warp_beacon-2.6.18}/warp_beacon/scraper/instagram/instagram.py

```diff
@@ -109,6 +109,14 @@ class InstagramScraper(ScraperAbstract):
 		inst_human.simulate_activity()
 		self.safe_write_session()
 		return inst_human.operations_count
+
+	def scroll_content(self, last_pk: int) -> None:
+		from warp_beacon.scheduler.instagram_human import InstagramHuman
+		self.load_session()
+		inst_human = InstagramHuman(self)
+		inst_human.scroll_content(last_pk)
+		self.safe_write_session()
+		return inst_human.operations_count
 
 	def scrap(self, url: str) -> tuple[str]:
 		self.load_session()
@@ -208,7 +216,8 @@ class InstagramScraper(ScraperAbstract):
 		self.cl.request_timeout = int(os.environ.get("IG_REQUEST_TIMEOUT", default=60))
 		path = self.download_hndlr(self.cl.video_download_by_url, url, folder='/tmp')
 		return {"local_media_path": str(path), "canonical_name": self.extract_canonical_name(media_info), \
-			"media_type": JobType.VIDEO, "media_info": {"duration": round(media_info.video_duration)}}
+			"media_type": JobType.VIDEO, "media_info": {"duration": round(media_info.video_duration), \
+			"last_pk": media_info.pk}}
 
 	def download_photo(self, url: str, media_info: Media) -> dict:
 		path = str(self.download_hndlr(self.cl.photo_download_by_url, url, folder='/tmp'))
@@ -217,7 +226,7 @@ class InstagramScraper(ScraperAbstract):
 			path = InstagramScraper.convert_webp_to_png(path)
 		if ".heic" in path_lowered:
 			path = InstagramScraper.convert_heic_to_png(path)
-		return {"local_media_path": path, "canonical_name": self.extract_canonical_name(media_info), "media_type": JobType.IMAGE}
+		return {"local_media_path": path, "canonical_name": self.extract_canonical_name(media_info), "media_type": JobType.IMAGE, "last_pk": media_info.pk}
 
 	def download_story(self, story_info: Story) -> dict:
 		path, media_type, media_info = "", JobType.UNKNOWN, {}
@@ -259,7 +268,7 @@ class InstagramScraper(ScraperAbstract):
 		for media_chunk in Utils.chunker(media_info.resources, 10):
 			chunk = []
 			for media in media_chunk:
-				_media_info = self.download_hndlr(self.cl.media_info, media.pk)
+				_media_info = self.download_hndlr(self.cl.media_info, media.pk, use_cache=False)
 				if media.media_type == 1: # photo
 					chunk.append(self.download_photo(url=_media_info.thumbnail_url, media_info=_media_info))
 				elif media.media_type == 2: # video
@@ -287,7 +296,7 @@ class InstagramScraper(ScraperAbstract):
 		try:
 			scrap_type, media_id = self.scrap(job.url)
 			if scrap_type == "media":
-				media_info = self.download_hndlr(self.cl.media_info, media_id)
+				media_info = self.download_hndlr(self.cl.media_info, media_id, use_cache=False)
 				logging.info("media_type is '%d', product_type is '%s'", media_info.media_type, media_info.product_type)
 				if media_info.media_type == 2 and media_info.product_type == "clips": # Reels
 					res.append(self.download_video(url=media_info.video_url, media_info=media_info))
```
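Both `media_info` call sites now pass `use_cache=False`, which tells instagrapi's `Client.media_info` to bypass its in-memory media cache, while `download_video`/`download_photo` thread each item's `last_pk` into the result dict so the worker can seed the follow-up scroll job. Plausibly the cache bypass ensures a fresh fetch (and real request traffic) rather than stale cached metadata; a toy cache illustrating the behavioural difference (the class below is a stand-in, not the real instagrapi client):

```python
class FakeClient:
	"""Toy stand-in showing cached vs. forced-fresh lookups."""
	def __init__(self) -> None:
		self._cache: dict = {}
		self._calls = 0

	def media_info(self, pk: int, use_cache: bool = True) -> dict:
		self._calls += 1  # every call counts, but only misses "hit the network"
		if use_cache and pk in self._cache:
			return self._cache[pk]
		info = {"pk": pk, "fetched_on_call": self._calls}  # simulated network fetch
		self._cache[pk] = info
		return info

cl = FakeClient()
a = cl.media_info(1)                   # fetched, then cached
b = cl.media_info(1)                   # cache hit: possibly stale data
c = cl.media_info(1, use_cache=False)  # forced refetch
print(a["fetched_on_call"], b["fetched_on_call"], c["fetched_on_call"])  # 1 1 3
```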