warp_beacon-1.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etc/warp_beacon/warp_beacon.conf +10 -0
- lib/systemd/system/warp_beacon.service +14 -0
- warp_beacon/__init__.py +0 -0
- warp_beacon/__version__.py +2 -0
- warp_beacon/jobs/__init__.py +0 -0
- warp_beacon/jobs/abstract.py +60 -0
- warp_beacon/jobs/download_job.py +23 -0
- warp_beacon/jobs/upload_job.py +26 -0
- warp_beacon/mediainfo/__init__.py +0 -0
- warp_beacon/mediainfo/video.py +80 -0
- warp_beacon/scrapler/__init__.py +155 -0
- warp_beacon/scrapler/abstract.py +16 -0
- warp_beacon/scrapler/instagram.py +191 -0
- warp_beacon/storage/__init__.py +82 -0
- warp_beacon/uploader/__init__.py +118 -0
- warp_beacon/warp_beacon.py +360 -0
- warp_beacon-1.0.2.dist-info/LICENSE +201 -0
- warp_beacon-1.0.2.dist-info/METADATA +286 -0
- warp_beacon-1.0.2.dist-info/RECORD +22 -0
- warp_beacon-1.0.2.dist-info/WHEEL +5 -0
- warp_beacon-1.0.2.dist-info/entry_points.txt +5 -0
- warp_beacon-1.0.2.dist-info/top_level.txt +14 -0
lib/systemd/system/warp_beacon.service ADDED
@@ -0,0 +1,14 @@
+[Unit]
+Description=Telegram bot for expanding external media links
+After=network-online.target syslog.target network.target remote-fs.target nss-lookup.target multi-user.target
+
+[Service]
+User=root
+Group=root
+Type=simple
+Restart=always
+EnvironmentFile=/etc/warp_beacon/warp_beacon.conf
+ExecStart=/opt/venvs/warp-beacon/bin/warp_beacon
+
+[Install]
+WantedBy=multi-user.target
warp_beacon/__init__.py ADDED
File without changes

warp_beacon/jobs/__init__.py ADDED
File without changes
warp_beacon/jobs/abstract.py ADDED
@@ -0,0 +1,60 @@
+from abc import ABC, abstractmethod
+from typing import TypedDict
+from typing_extensions import Unpack
+import uuid
+
+class JobSettings(TypedDict):
+    job_id: uuid.UUID
+    message_id: int
+    local_media_path: str
+    media_info: dict
+    url: str
+    uniq_id: str
+    tg_file_id: str
+    in_process: bool
+    job_failed: bool
+    media_type: str
+    job_failed_msg: str
+    effective_url: str
+    save_items: bool
+    media_collection: list
+
+class AbstractJob(ABC):
+    job_id: uuid.UUID = None
+    message_id: int = 0
+    local_media_path: str = ""
+    media_info: dict = {}
+    url: str = ""
+    uniq_id: str = ""
+    tg_file_id: str = ""
+    media_type: str = "video"
+    in_process: bool = False
+    job_failed: bool = False
+    job_failed_msg: str = ""
+    effective_url: str = ""
+    save_items: bool = False
+    media_collection: list = []
+
+    def __init__(self, **kwargs: Unpack[JobSettings]) -> None:
+        if kwargs:
+            self.__dict__.update(kwargs)
+        self.job_id = uuid.uuid4()
+
+    def __del__(self) -> None:
+        pass
+
+    def __str__(self) -> str:
+        return str(self.to_dict())
+
+    def __repr__(self) -> str:
+        return str(self.to_dict())
+
+    def to_dict(self) -> dict:
+        d = {}
+        for key in dir(self.__class__):
+            if not key.startswith('_'):
+                value = getattr(self, key)
+                if not callable(value):
+                    d[key] = value
+
+        return d
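Note: JobSettings and AbstractJob carry all job state as plain attributes; kwargs passed to the constructor are copied straight into the instance dict and a fresh job_id is assigned. AbstractJob declares no abstract methods, so it can be instantiated directly. A minimal sketch of this behaviour (illustrative only; it assumes the warp_beacon package from this diff is importable, and the example values are hypothetical):

# Illustrative sketch only -- URL and message_id are made-up values.
from warp_beacon.jobs.abstract import AbstractJob

job = AbstractJob(url="https://example.com/some/post/", message_id=42)
print(job.job_id)     # fresh uuid4 assigned in __init__ regardless of kwargs
print(job.to_dict())  # class attributes plus the kwargs applied above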
warp_beacon/jobs/download_job.py ADDED
@@ -0,0 +1,23 @@
+from typing import TypedDict
+from typing_extensions import Unpack
+
+from warp_beacon.jobs.upload_job import UploadJob
+from warp_beacon.jobs.abstract import AbstractJob, JobSettings
+
+class DownloadJob(AbstractJob):
+    def __init__(self, **kwargs: Unpack[JobSettings]) -> None:
+        super(DownloadJob, self).__init__(**kwargs)
+    def __del__(self) -> None:
+        pass
+
+    @staticmethod
+    def build(**kwargs: Unpack[JobSettings]) -> "DownloadJob":
+        return DownloadJob(**kwargs)
+
+    def to_upload_job(self, **kwargs: Unpack[JobSettings]) -> AbstractJob:
+        d = self.to_dict()
+        d.update(kwargs)
+        if "media_collection" in d:
+            for k, v in enumerate(d["media_collection"]):
+                d["media_collection"][k] = UploadJob.build(**v)
+        return UploadJob.build(**d)
warp_beacon/jobs/upload_job.py ADDED
@@ -0,0 +1,26 @@
+from typing import TypedDict
+from typing_extensions import Unpack
+
+from warp_beacon.jobs.abstract import AbstractJob, JobSettings
+
+class UploadJob(AbstractJob):
+    def __init__(self, **kwargs: Unpack[JobSettings]) -> None:
+        super(UploadJob, self).__init__(**kwargs)
+    def __del__(self) -> None:
+        pass
+
+    @staticmethod
+    def build(**kwargs: Unpack[JobSettings]) -> "UploadJob":
+        return UploadJob(**kwargs)
+
+    def to_download_job(self, **kwargs: Unpack[JobSettings]) -> AbstractJob:
+        from warp_beacon.jobs.download_job import DownloadJob
+        d = self.to_dict()
+        d.update(kwargs)
+        return DownloadJob.build(**d)
+
+    def set_flag(self, key: str, value: bool) -> "UploadJob":
+        if key in self.__dict__:
+            self.__dict__[key] = value
+
+        return self
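Note: DownloadJob and UploadJob are symmetric views of the same job state; to_upload_job() and to_download_job() copy the attribute dict across and rebuild the other type. A hedged usage sketch (not part of the package; values are hypothetical):

# Illustrative sketch only -- URL, path and message_id are made-up values.
from warp_beacon.jobs.download_job import DownloadJob

dl_job = DownloadJob.build(url="https://example.com/media/1/", message_id=7)
up_job = dl_job.to_upload_job(local_media_path="/tmp/media.mp4", media_type="video")
up_job.set_flag("job_failed", False)  # set_flag only touches keys already present in __dict__
print(type(up_job).__name__, up_job.uniq_id == dl_job.uniq_id)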
warp_beacon/mediainfo/__init__.py ADDED
File without changes
warp_beacon/mediainfo/video.py ADDED
@@ -0,0 +1,80 @@
+import io, os
+from typing import Optional
+import cv2
+
+class VideoInfo(object):
+    vid = None
+    # need for filesize
+    filename = ""
+
+    def __init__(self, filename: str) -> None:
+        self.vid = cv2.VideoCapture(filename)
+        self.filename = filename
+
+    def __del__(self) -> None:
+        self.vid.release()
+
+    def get_demensions(self) -> dict:
+        res = {"width": None, "height": None}
+        if self.vid.isOpened():
+            res["width"] = int(self.vid.get(cv2.CAP_PROP_FRAME_WIDTH))
+            res["height"] = int(self.vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+        return res
+
+    def get_duration(self) -> int:
+        duration_in_seconds = None
+        if self.vid.isOpened():
+            fps = self.vid.get(cv2.CAP_PROP_FPS)
+            total_no_frames = self.vid.get(cv2.CAP_PROP_FRAME_COUNT)
+            duration_in_seconds = int(total_no_frames / fps)
+
+        return duration_in_seconds
+
+    def get_filesize(self) -> float:
+        size = os.path.getsize(self.filename)
+        return round(size/(pow(1024,2)), 2)
+
+    def get_finfo(self, except_info: tuple=()) -> dict:
+        res = {}
+        res.update(self.get_demensions())
+        if "duration" not in except_info:
+            res["duration"] = self.get_duration()
+        if "filesize" not in except_info:
+            res["filesize"] = self.get_filesize()
+        return res
+
+    def shrink_image_to_fit(self, img):
+        height, width = img.shape[:2]
+        max_height = 320
+        max_width = 320
+
+        # only shrink if img is bigger than required
+        if max_height < height or max_width < width:
+            # get scaling factor
+            scaling_factor = max_height / float(height)
+            if max_width/float(width) < scaling_factor:
+                scaling_factor = max_width / float(width)
+            # resize image
+            img = cv2.resize(img, None, fx=scaling_factor, fy=scaling_factor, interpolation=cv2.INTER_AREA)
+
+        return img
+
+    def generate_thumbnail(self) -> Optional[io.BytesIO]:
+        if self.vid.isOpened():
+            count = 4
+            success = True
+            while success:
+                self.vid.set(cv2.CAP_PROP_POS_MSEC,(count*1000))
+                success, image = self.vid.read()
+                if success:
+                    image = self.shrink_image_to_fit(image)
+                    success, buffer = cv2.imencode(".jpg", image)
+                    if success:
+                        io_buf = io.BytesIO(buffer)
+                        io_buf.seek(0)
+                        #io_buf.name = "thumbnail.png"
+                        return io_buf
+                count += 1
+
+        return None
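Note: VideoInfo wraps an OpenCV VideoCapture to report dimensions, duration, file size (in mebibytes) and a JPEG thumbnail capped at 320 px on either side. A minimal sketch, assuming some local video file exists (the path is hypothetical):

# Illustrative sketch only -- /tmp/sample.mp4 is a hypothetical local file.
from warp_beacon.mediainfo.video import VideoInfo

info = VideoInfo("/tmp/sample.mp4")
print(info.get_finfo())            # {"width": ..., "height": ..., "duration": ..., "filesize": ...}
thumb = info.generate_thumbnail()  # io.BytesIO with JPEG data, or None
if thumb is not None:
    with open("/tmp/thumb.jpg", "wb") as f:
        f.write(thumb.getvalue())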
warp_beacon/scrapler/__init__.py ADDED
@@ -0,0 +1,155 @@
+from typing import Optional
+import multiprocessing
+import time
+import logging
+from requests.exceptions import ConnectTimeout, HTTPError
+from instagrapi.exceptions import MediaNotFound, UnknownError
+
+from warp_beacon.mediainfo.video import VideoInfo
+from warp_beacon.uploader import AsyncUploader
+from warp_beacon.jobs.download_job import DownloadJob
+
+CONST_CPU_COUNT = multiprocessing.cpu_count()
+
+class AsyncDownloader(object):
+    workers = []
+    allow_loop = None
+    job_queue = multiprocessing.Queue()
+    uploader = None
+    workers_count = CONST_CPU_COUNT
+
+    def __init__(self, uploader: AsyncUploader, workers_count: int=CONST_CPU_COUNT) -> None:
+        self.allow_loop = multiprocessing.Value('i', 1)
+        self.uploader = uploader
+        self.workers_count = workers_count
+
+    def __del__(self) -> None:
+        self.stop_all()
+
+    def start(self) -> None:
+        for _ in range(self.workers_count):
+            proc = multiprocessing.Process(target=self.do_work)
+            self.workers.append(proc)
+            proc.start()
+
+    def get_media_info(self, path: str, fr_media_info: dict={}) -> Optional[dict]:
+        media_info = None
+        try:
+            if path:
+                video_info = VideoInfo(path)
+                media_info = video_info.get_finfo(tuple(fr_media_info.keys()))
+                media_info.update(fr_media_info)
+                logging.info("Media file info: %s", media_info)
+                media_info["thumb"] = video_info.generate_thumbnail()
+        except Exception as e:
+            logging.error("Failed to process media info!")
+            logging.exception(e)
+
+        return media_info
+
+    def do_work(self) -> None:
+        logging.info("download worker started")
+        while self.allow_loop.value == 1:
+            try:
+                job = None
+                try:
+                    job = self.job_queue.get()
+                    actor = None
+                    try:
+                        items = []
+                        if "instagram.com/" in job.url:
+                            if not job.in_process:
+                                from warp_beacon.scrapler.instagram import InstagramScrapler
+                                actor = InstagramScrapler()
+                                while True:
+                                    try:
+                                        logging.info("Downloading URL '%s'", job.url)
+                                        items = actor.download(job.url)
+                                        break
+                                    except ConnectTimeout as e:
+                                        logging.error("ConnectTimeout download error!")
+                                        logging.exception(e)
+                                        time.sleep(2)
+                                    except MediaNotFound as e:
+                                        logging.warning("MediaNotFound occurred!")
+                                        logging.exception(e)
+                                        self.uploader.queue_task(job.to_upload_job(
+                                            job_failed=True,
+                                            job_failed_msg="Unable to access to media under this URL. Seems like the media is private.")
+                                        )
+                                        break
+                                    except (UnknownError, Exception) as e:
+                                        logging.warning("UnknownError occurred!")
+                                        logging.exception(e)
+                                        exception_msg = ""
+                                        if hasattr(e, "message"):
+                                            exception_msg = e.message
+                                        else:
+                                            exception_msg = str(e)
+                                        if "geoblock_required" in exception_msg:
+                                            self.uploader.queue_task(job.to_upload_job(
+                                                job_failed=True,
+                                                job_failed_msg="This content does not accessible for bot account. Seems like author blocked certain region.")
+                                            )
+                                            break
+                                        self.uploader.queue_task(job.to_upload_job(
+                                            job_failed=True,
+                                            job_failed_msg="WOW, unknown error occured! Please send service logs to developer via email: andrey@bagrintsev.me.")
+                                        )
+                                        break
+
+                                if items:
+                                    for item in items:
+                                        media_info = {"filesize": 0}
+                                        if item["media_type"] == "video":
+                                            media_info = self.get_media_info(item["local_media_path"], item["media_info"])
+                                        elif item["media_type"] == "collection":
+                                            for v in item["items"]:
+                                                if v["media_type"] == "video":
+                                                    col_media_info = self.get_media_info(v["local_media_path"], v["media_info"])
+                                                    media_info["filesize"] += int(col_media_info.get("filesize", 0))
+                                                    v["media_info"] = col_media_info
+
+                                        job_args = {"media_type": item["media_type"], "media_info": media_info}
+                                        if item["media_type"] == "collection":
+                                            job_args["media_collection"] = item["items"]
+                                            if item.get("save_items", None) is not None:
+                                                job_args["save_items"] = item.get("save_items", False)
+                                        else:
+                                            job_args["local_media_path"] = item["local_media_path"]
+
+                                        upload_job = job.to_upload_job(**job_args)
+                                        self.uploader.queue_task(upload_job)
+                            else:
+                                logging.info("Job already in work in parallel worker. Redirecting job to upload worker.")
+                                self.uploader.queue_task(job.to_upload_job())
+                    except HTTPError as e:
+                        logging.error("HTTP error inside download worker!")
+                        logging.exception(e)
+                    except Exception as e:
+                        logging.error("Error inside download worker!")
+                        logging.exception(e)
+                        self.notify_task_failed(job)
+                        #self.queue_task(url=item["url"], message_id=item["message_id"], item_in_process=item["in_process"], uniq_id=item["uniq_id"])
+                except multiprocessing.Queue.empty:
+                    pass
+            except Exception as e:
+                logging.error("Exception occurred inside worker!")
+                logging.exception(e)
+
+    def stop_all(self) -> None:
+        self.allow_loop.value = 0
+        for proc in self.workers:
+            if proc.is_alive():
+                logging.info("stopping process #%d", proc.pid)
+                proc.terminate()
+                proc.join()
+                logging.info("process #%d stopped", proc.pid)
+        self.workers.clear()
+
+    def queue_task(self, job: DownloadJob) -> str:
+        self.job_queue.put_nowait(job)
+        return str(job.job_id)
+
+    def notify_task_failed(self, job: DownloadJob) -> None:
+        self.uploader.queue_task(job.to_upload_job(job_failed=True))
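Note: AsyncDownloader fans work out over one worker process per CPU and hands every result (or failure) to the uploader via queue_task(). A rough wiring sketch; the real AsyncUploader lives in warp_beacon/uploader/__init__.py (not shown in this excerpt), so a stub with the same queue_task() method stands in for it here, and the Instagram URL is a placeholder:

# Illustrative sketch only -- StubUploader stands in for warp_beacon.uploader.AsyncUploader.
from warp_beacon.scrapler import AsyncDownloader
from warp_beacon.jobs.download_job import DownloadJob

class StubUploader(object):
    def queue_task(self, job):
        print("upload job received:", job)

downloader = AsyncDownloader(uploader=StubUploader(), workers_count=2)
downloader.start()  # spawns worker processes running do_work()
job_id = downloader.queue_task(DownloadJob.build(url="https://www.instagram.com/reel/XXXX/", message_id=1))
print("queued job", job_id)
# ... later, once the workers are no longer needed ...
downloader.stop_all()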
warp_beacon/scrapler/abstract.py ADDED
@@ -0,0 +1,16 @@
+from abc import ABC, abstractmethod
+
+class ScraplerAbstract(ABC):
+    def __init__(self) -> None:
+        pass
+
+    def __del__(self) -> None:
+        pass
+
+    @abstractmethod
+    def scrap(self, url: str) -> str:
+        raise NotImplementedError
+
+    @abstractmethod
+    def download(self, url: str) -> bool:
+        raise NotImplementedError
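Note: ScraplerAbstract only fixes the two-method contract that concrete scrapers must honour. A toy subclass (illustrative only, not part of the package):

# Illustrative sketch only -- a do-nothing scraper honouring the ScraplerAbstract contract.
from warp_beacon.scrapler.abstract import ScraplerAbstract

class DummyScrapler(ScraplerAbstract):
    def scrap(self, url: str) -> str:
        return url    # would normally resolve the URL to a media id

    def download(self, url: str) -> bool:
        return True   # would normally fetch the media to local storage

print(DummyScrapler().scrap("https://example.com/p/123/"))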
warp_beacon/scrapler/instagram.py ADDED
@@ -0,0 +1,191 @@
+import os
+from pathlib import Path
+import time
+import json
+from typing import Callable, Optional, Union
+
+import requests
+import urllib3
+import logging
+
+from instagrapi.mixins.story import Story
+from instagrapi.types import Media
+from instagrapi import Client
+from instagrapi.exceptions import LoginRequired, PleaseWaitFewMinutes
+
+from warp_beacon.scrapler.abstract import ScraplerAbstract
+
+INST_SESSION_FILE = "/var/warp_beacon/inst_session.json"
+
+class InstagramScrapler(ScraplerAbstract):
+    cl = None
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.cl = Client()
+
+    def safe_write_session(self) -> None:
+        tmp_fname = "%s~" % INST_SESSION_FILE
+        with open(tmp_fname, 'w+') as f:
+            f.write(json.dumps(self.cl.get_settings()))
+        if os.path.isfile(INST_SESSION_FILE):
+            os.unlink(INST_SESSION_FILE)
+        os.rename(tmp_fname, INST_SESSION_FILE)
+
+    def load_session(self) -> None:
+        if os.path.exists(INST_SESSION_FILE):
+            self.cl.load_settings(INST_SESSION_FILE)
+        else:
+            self.login()
+
+    def login(self) -> None:
+        self.cl = Client()
+        username = os.environ.get("INSTAGRAM_LOGIN", default=None)
+        password = os.environ.get("INSTAGRAM_PASSWORD", default=None)
+        verification_code = os.environ.get("INSTAGRAM_VERIFICATION_CODE", default="")
+        if username is not None and password is not None:
+            self.cl.login(username=username, password=password, verification_code=verification_code)
+            self.safe_write_session()
+
+    def scrap(self, url: str) -> tuple[str]:
+        self.load_session()
+        def _scrap() -> tuple[str]:
+            if "stories" in url:
+                url_last_part = list(filter(None, url.split('/')))[-1]
+                if url_last_part.isnumeric():
+                    return "story", self.scrap_story(url)
+                else:
+                    return "stories", url_last_part
+            else:
+                return "media", self.scrap_media(url)
+        try:
+            return _scrap()
+        except LoginRequired as e:
+            logging.warning("Session error. Trying to relogin...")
+            logging.exception(e)
+            self.login()
+            return _scrap()
+
+    def scrap_stories(self, username: str) -> list[Story]:
+        user_info = self.cl.user_info_by_username(username)
+        logging.info("user_id is '%s'", user_info.pk)
+        return self.cl.user_stories(user_id=user_info.pk)
+
+    def scrap_story(self, url: str) -> str:
+        story_id = self.cl.story_pk_from_url(url)
+        logging.info("story_id is '%s'", story_id)
+        return story_id
+
+    def scrap_media(self, url: str) -> str:
+        media_id = self.cl.media_pk_from_url(url)
+        logging.info("media_id is '%s'", media_id)
+        return media_id
+
+    def __download_hndlr(self, func: Callable, *args: tuple[str], **kwargs: dict[str]) -> Union[Path, Media]:
+        ret_val = {}
+        max_retries = int(os.environ.get("IG_MAX_RETRIES", default=5))
+        retries = 0
+        while max_retries >= retries:
+            try:
+                ret_val = func(*args, **kwargs)
+                break
+            except (requests.exceptions.ConnectionError,
+                requests.exceptions.ReadTimeout,
+                urllib3.exceptions.ReadTimeoutError,
+                urllib3.exceptions.ConnectionError) as e:
+                logging.warning("Instagram read timeout! Retrying in 2 seconds ...")
+                logging.info("Your `IG_MAX_RETRIES` values is %d", max_retries)
+                logging.exception(e)
+                if max_retries == retries:
+                    raise e
+                retries += 1
+                time.sleep(2)
+
+        return ret_val
+
+
+    def download_video(self, url: str, media_info: dict) -> dict:
+        path = self.__download_hndlr(self.cl.video_download_by_url, url, folder='/tmp')
+        return {"local_media_path": str(path), "media_type": "video", "media_info": {"duration": media_info.video_duration}}
+
+    def download_photo(self, url: str) -> dict:
+        path = self.__download_hndlr(self.cl.photo_download_by_url, url, folder='/tmp')
+        return {"local_media_path": str(path), "media_type": "image"}
+
+    def download_story(self, story_info: Story) -> dict:
+        path, media_type, media_info = "", "", {}
+        logging.info("Story id is '%s'", story_info.id)
+        effective_story_id = story_info.id
+        if '_' in effective_story_id:
+            st_parts = effective_story_id.split('_')
+            if len(st_parts) > 1:
+                effective_story_id = st_parts[0]
+        logging.info("Effective story id is '%s'", effective_story_id)
+        effective_url = "https://www.instagram.com/stories/%s/%s/" % (story_info.user.username, effective_story_id)
+        if story_info.media_type == 1: # photo
+            path = self.__download_hndlr(self.cl.story_download_by_url, url=story_info.thumbnail_url, folder='/tmp')
+            media_type = "image"
+        elif story_info.media_type == 2: # video
+            path = self.__download_hndlr(self.cl.story_download_by_url, url=story_info.video_url, folder='/tmp')
+            media_type = "video"
+            media_info["duration"] = story_info.video_duration
+
+        return {"local_media_path": str(path), "media_type": media_type, "media_info": media_info, "effective_url": effective_url}
+
+    def download_stories(self, stories: list[Story]) -> dict:
+        res = []
+        for story in stories:
+            res.append(self.download_story(story_info=story))
+
+        return {"media_type": "collection", "save_items": True, "items": res}
+
+    def download_album(self, media_info: dict) -> dict:
+        res = []
+        for i in media_info.resources:
+            _media_info = self.cl.media_info(i.pk)
+            if i.media_type == 1: # photo
+                res.append(self.download_photo(url=_media_info.thumbnail_url))
+            elif i.media_type == 2: # video
+                res.append(self.download_video(url=_media_info.video_url, media_info=_media_info))
+
+        return {"media_type": "collection", "items": res}
+
+    def download(self, url: str) -> Optional[list[dict]]:
+        res = []
+        while True:
+            try:
+                scrap_type, media_id = self.scrap(url)
+                if scrap_type == "media":
+                    media_info = self.__download_hndlr(self.cl.media_info, media_id)
+                    logging.info("media_type is '%d', product_type is '%s'", media_info.media_type, media_info.product_type)
+                    if media_info.media_type == 2 and media_info.product_type == "clips": # Reels
+                        res.append(self.download_video(url=media_info.video_url, media_info=media_info))
+                    elif media_info.media_type == 1: # Photo
+                        res.append(self.download_photo(url=media_info.thumbnail_url))
+                    elif media_info.media_type == 8: # Album
+                        res.append(self.download_album(media_info=media_info))
+                elif scrap_type == "story":
+                    story_info = self.cl.story_info(media_id)
+                    logging.info("media_type for story is '%d'", story_info.media_type)
+                    res.append(self.download_story(story_info=story_info))
+                elif scrap_type == "stories":
+                    logging.info("Stories download mode")
+                    res.append(self.download_stories(self.scrap_stories(media_id)))
+                break
+            except PleaseWaitFewMinutes as e:
+                logging.warning("Please wait a few minutes error. Trying to relogin ...")
+                logging.exception(e)
+                wait_timeout = int(os.environ.get("IG_WAIT_TIMEOUT", default=5))
+                logging.info("Waiting %d seconds according configuration option `IG_WAIT_TIMEOUT`", wait_timeout)
+                if res:
+                    for i in res:
+                        if i["media_type"] == "collection":
+                            for j in i["items"]:
+                                if os.path.exists(j["local_media_path"]):
+                                    os.unlink(j["local_media_path"])
+                        else:
+                            if os.path.exists(i["local_media_path"]):
+                                os.unlink(i["local_media_path"])
+                os.unlink(INST_SESSION_FILE)
+                time.sleep(wait_timeout)
+        return res
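Note: InstagramScrapler drives instagrapi. Credentials come from INSTAGRAM_LOGIN / INSTAGRAM_PASSWORD (plus an optional INSTAGRAM_VERIFICATION_CODE), the session is cached in /var/warp_beacon/inst_session.json, and retry behaviour is tuned with IG_MAX_RETRIES and IG_WAIT_TIMEOUT. A hedged usage sketch; it needs real credentials, a reachable Instagram URL and a writable /var/warp_beacon directory to actually do anything:

# Illustrative sketch only -- credentials and URL are placeholders.
import os

os.environ.setdefault("INSTAGRAM_LOGIN", "bot_account")
os.environ.setdefault("INSTAGRAM_PASSWORD", "secret")
os.environ.setdefault("IG_MAX_RETRIES", "5")

from warp_beacon.scrapler.instagram import InstagramScrapler

scrapler = InstagramScrapler()
# the session is written to /var/warp_beacon/inst_session.json on first login
items = scrapler.download("https://www.instagram.com/reel/XXXX/")
for item in items:
    print(item["media_type"], item.get("local_media_path"))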
warp_beacon/storage/__init__.py ADDED
@@ -0,0 +1,82 @@
+import os
+#from typing import Optional
+import logging
+
+from urllib.parse import urlparse
+
+from pymongo import MongoClient
+
+VIDEO_STORAGE_DIR = os.environ.get("VIDEO_STORAGE_DIR", default="/var/warp_beacon/videos")
+
+class Storage(object):
+    client = None
+    db = None
+    def __init__(self) -> None:
+        if not os.path.isdir(VIDEO_STORAGE_DIR):
+            os.mkdir(VIDEO_STORAGE_DIR)
+
+        self.client = MongoClient(
+            host=os.environ.get("MONGODB_HOST", default='127.0.0.1'),
+            port=int(os.environ.get("MONGODB_PORT", default=27017)),
+            username=os.environ.get("MONGODB_USER", default='root'),
+            password=os.environ.get("MONGODB_PASSWORD", default="changeme"))
+        self.db = self.client.media.media
+
+    def __del__(self) -> None:
+        if self.client:
+            self.client.close()
+
+    @staticmethod
+    def compute_uniq(url: str) -> str:
+        path = urlparse(url).path.strip('/')
+        return path
+
+    def db_find(self, uniq_id: str) -> list[dict]:
+        document = None
+        ret = []
+        try:
+            logging.debug("uniq_id to search is '%s'", uniq_id)
+            cursor = self.db.find({"uniq_id": uniq_id})
+            for document in cursor:
+                ret.append({"uniq_id": document["uniq_id"], "tg_file_id": document["tg_file_id"], "media_type": document["media_type"]})
+        except Exception as e:
+            logging.error("Error occurred while trying to read from the database!")
+            logging.exception(e)
+        return ret
+
+    def db_lookup(self, url: str) -> dict:
+        uniq_id = self.compute_uniq(url)
+        doc = self.db_find(uniq_id)
+        return doc
+
+    def db_lookup_id(self, uniq_id: str) -> list[dict]:
+        return self.db_find(uniq_id)
+
+    def add_media(self, tg_file_ids: list[str], media_url: str, media_type: str, origin: str) -> list[int]:
+        uniq_id = self.compute_uniq(media_url)
+        media_ids = []
+        for tg_file_id in tg_file_ids:
+            if self.db_lookup_id(uniq_id):
+                logging.info("Detected existing uniq_id, skipping storage write operation")
+                continue
+            media_ids += str(self.db.insert_one({"uniq_id": uniq_id, "media_type": media_type, "tg_file_id": tg_file_id, "origin": origin}).inserted_id)
+
+        return media_ids
+
+    def get_random(self) -> dict:
+        ret = {}
+        try:
+            cursor = self.db.aggregate([
+                { "$match": { "tg_file_id": { "$exists": True } } },
+                { "$sample": { "size": 1 } }
+            ])
+            tmp = list(cursor)
+            if tmp:
+                ret = tmp.pop()
+        except Exception as e:
+            logging.error("Error occurred while trying to read from the database!")
+            logging.exception(e)
+        return ret
+
+
+
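Note: Storage keys every record by the path portion of the media URL (compute_uniq) and stores the Telegram file id alongside it in MongoDB; connection settings come from MONGODB_HOST, MONGODB_PORT, MONGODB_USER and MONGODB_PASSWORD, and duplicates of an existing uniq_id are skipped on write. A minimal sketch, assuming a reachable MongoDB instance (file id and origin values are hypothetical):

# Illustrative sketch only -- assumes MongoDB is reachable with the configured credentials.
from warp_beacon.storage import Storage

storage = Storage()
url = "https://www.instagram.com/reel/XXXX/"
print(Storage.compute_uniq(url))   # "reel/XXXX"
storage.add_media(tg_file_ids=["BAACAgIAAx..."], media_url=url, media_type="video", origin="instagram")
print(storage.db_lookup(url))      # [{"uniq_id": ..., "tg_file_id": ..., "media_type": ...}]
print(storage.get_random())        # random stored document, if any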