cobweb-launcher 0.1.24__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cobweb-launcher might be problematic.

@@ -0,0 +1,199 @@
+ import time
+
+ import setting
+ import inspect
+ import threading
+ import importlib
+
+ from cobweb.base import Seed, Queue
+ from cobweb.utils.tools import dynamic_load_class
+
+
+ class Launcher(threading.Thread):
+
+     SEEDS = []
+
+     __DOING__ = {}
+
+     __CUSTOM_FUNC__ = {
+         "download": None,
+         "download_midware": None,
+         "parse": None,
+     }
+
+     __LAUNCHER_QUEUE__ = {
+         "new": Queue(),
+         "todo": Queue(),
+         "done": Queue(),
+     }
+
+     __LAUNCHER_FUNC__ = [
+         "_reset",
+         "_scheduler",
+         "_insert",
+         "_refresh",
+         "_delete",
+     ]
+
+     def __init__(self, task, project, custom_setting=None):
+         super().__init__()
+         self.task = task
+         self.project = project
+
+         self._stop = threading.Event()  # stop event
+         self._pause = threading.Event()  # pause event
+
+         if custom_setting:
+             setting_ = dict()
+             if isinstance(custom_setting, dict):
+                 setting_ = custom_setting
+             else:
+                 if isinstance(custom_setting, str):
+                     custom_setting = importlib.import_module(custom_setting)
+                 if not inspect.ismodule(custom_setting):
+                     raise Exception
+                 for k, v in custom_setting.__dict__.items():
+                     if not k.startswith("__") and not inspect.ismodule(v):
+                         setting_[k] = v
+             for k, v in setting_.items():
+                 setattr(setting, k, v)
+
+         self._Crawler = dynamic_load_class(setting.CRAWLER)
+         self._Pipeline = dynamic_load_class(setting.PIPELINE)
+
+         self._scheduler_wait_seconds = setting.SCHEDULER_WAIT_SECONDS
+         self._todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
+         self._new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
+         self._done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
+         self._upload_queue_wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
+         self._seed_reset_seconds = setting.SEED_RESET_SECONDS
+
+         self._todo_queue_size = setting.TODO_QUEUE_SIZE
+         self._new_queue_max_size = setting.NEW_QUEUE_MAX_SIZE
+         self._done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
+         self._upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE
+
+         self._done_model = setting.DONE_MODEL
+
+         self._upload_queue = Queue()
+
+     @property
+     def start_seeds(self):
+         return [Seed(seed) for seed in self.SEEDS]
+
+     @property
+     def request(self):
+         """
+         Register a custom request function.
+         use case:
+             from cobweb.base import Request, BaseItem
+             @launcher.request
+             def request(seed: Seed) -> Union[Request, BaseItem]:
+                 ...
+                 return Request(seed.url, seed)
+         """
+         def decorator(func):
+             self.__CUSTOM_FUNC__["request"] = func
+         return decorator
+
+     @property
+     def download(self):
+         """
+         Register a custom download function.
+         use case:
+             from cobweb.base import Request, Response, Seed, BaseItem
+             @launcher.download
+             def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
+                 ...
+                 yield Response(item.seed, response)
+         """
+         def decorator(func):
+             self.__CUSTOM_FUNC__["download"] = func
+         return decorator
+
+     @property
+     def parse(self):
+         """
+         Register a custom parse function; xxxItem stands for a user-defined storage item type.
+         use case:
+             from cobweb.base import Request, Response
+             @launcher.parse
+             def parse(item: Response) -> BaseItem:
+                 ...
+                 yield xxxItem(seed, **kwargs)
+         """
+         def decorator(func):
+             self.__CUSTOM_FUNC__["parse"] = func
+         return decorator
+
+     def _remove_doing_seeds(self, seeds):
+         for seed in seeds:
+             self.__DOING__.pop(seed, None)
+
+     def _execute_heartbeat(self):
+         pass
+
+     def _reset(self):
+         pass
+
+     def _scheduler(self):
+         pass
+
+     def _insert(self):
+         pass
+
+     def _refresh(self):
+         pass
+
+     def _delete(self):
+         pass
+
+     def _execute(self):
+         for func_name in self.__LAUNCHER_FUNC__:
+             threading.Thread(name=func_name, target=getattr(self, func_name)).start()
+             time.sleep(2)
+
+     def _polling(self):
+
+         check_empty_times = 0
+
+         while not self._stop.is_set():
+
+             queue_not_empty_count = 0
+
+             for q in self.__LAUNCHER_QUEUE__.values():
+                 if q.length != 0:
+                     queue_not_empty_count += 1
+
+             if self._pause.is_set() and queue_not_empty_count != 0:
+                 self._pause.clear()
+                 self._execute()
+
+             elif queue_not_empty_count == 0:
+                 check_empty_times += 1
+             else:
+                 check_empty_times = 0
+
+             if check_empty_times > 2:
+                 check_empty_times = 0
+                 self.__DOING__ = {}
+                 self._pause.set()
+
+     def run(self):
+         threading.Thread(target=self._execute_heartbeat).start()
+
+         self._Crawler(
+             upload_queue=self._upload_queue,
+             custom_func=self.__CUSTOM_FUNC__,
+             launcher_queue=self.__LAUNCHER_QUEUE__,
+         ).start()
+
+         self._Pipeline(
+             upload_queue=self._upload_queue,
+             done_queue=self.__LAUNCHER_QUEUE__["done"],
+             upload_queue_size=self._upload_queue_max_size,
+             upload_wait_seconds=self._upload_queue_wait_seconds
+         ).start()
+
+         self._execute()
+         self._polling()
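
The three properties above register user callbacks by stashing them in __CUSTOM_FUNC__, which run() then hands to the crawler. A minimal usage sketch pieced together from the docstrings (the task/project names, the seed dict, and the response payload are illustrative, not package API):

    from typing import Union

    from cobweb.base import Request, Response, Seed, BaseItem
    from cobweb.launchers import Launcher

    launcher = Launcher(task="demo_task", project="demo_project")
    launcher.SEEDS = [{"url": "https://example.com"}]  # wrapped into Seed objects by start_seeds

    @launcher.download
    def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
        # fetch item.seed however you like, then hand the payload to the parser
        yield Response(item.seed, "<html>...</html>")

    @launcher.parse
    def parse(item: Response) -> BaseItem:
        # emit storage items; a real project would yield its own BaseItem subclass
        ...

    launcher.start()  # Launcher is a threading.Thread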
@@ -0,0 +1,174 @@
+ import time
+ import threading
+
+ from cobweb.db import RedisDB
+ from cobweb.base import Seed, logger
+ from cobweb.launchers import Launcher
+ from cobweb.constant import DealModel, LogTemplate
+
+
+ class LauncherPro(Launcher):
+
+     def __init__(self, task, project, custom_setting=None):
+         super().__init__(task, project, custom_setting)
+         self._todo = "{%s:%s}:todo" % (project, task)
+         self._done = "{%s:%s}:done" % (project, task)
+         self._fail = "{%s:%s}:fail" % (project, task)
+         self._heartbeat = "heartbeat:%s_%s" % (project, task)
+         self._reset_lock = "lock:reset:%s_%s" % (project, task)
+         self._heartbeat_lock = "lock:heartbeat:%s_%s" % (project, task)
+         self._db = RedisDB()
+
+         self._heartbeat_start_event = threading.Event()
+         self._redis_queue_empty_event = threading.Event()
+
+     @property
+     def heartbeat(self):
+         return self._db.exists(self._heartbeat)
+
+     def _execute_heartbeat(self):
+         while not self._stop.is_set():
+             if self._heartbeat_start_event.is_set():
+                 if self._db.lock(self._heartbeat_lock, t=1):
+                     self._db.setex(self._heartbeat, 3)
+                     time.sleep(1)
+             time.sleep(0.5)
+
+     def _reset(self):
+         """
+         Check for expired seeds and re-add them to the redis cache.
+         """
+         first = True
+         while not self._pause.is_set():
+             reset_wait_seconds = 15
+             if self._db.lock(self._reset_lock, t=120):
+                 if not self.heartbeat:
+                     self._heartbeat_start_event.set()
+
+                 _min = -int(time.time()) + self._seed_reset_seconds \
+                     if self.heartbeat or not first else "-inf"
+
+                 self._db.members(
+                     self._todo, 0,
+                     _min=_min, _max="(0"
+                 )
+                 self._db.delete(self._reset_lock)
+                 reset_wait_seconds = 60
+
+             time.sleep(reset_wait_seconds)
+             first = False
+
+     def _scheduler(self):
+         """
+         Schedule tasks: fetch seeds from the redis queue and record them in the doing dict.
+         """
+         if self.start_seeds:
+             self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
+         while not self._pause.is_set():
+             if not self._db.zcount(self._todo, 0, "(1000"):
+                 time.sleep(self._scheduler_wait_seconds)
+                 continue
+             if self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
+                 time.sleep(self._todo_queue_full_wait_seconds)
+                 continue
+             members = self._db.members(
+                 self._todo, int(time.time()),
+                 count=self._todo_queue_size,
+                 _min=0, _max="(1000"
+             )
+             for member, priority in members:
+                 seed = Seed(member, priority=priority)
+                 self.__LAUNCHER_QUEUE__['todo'].push(seed)
+                 self.__DOING__[seed.to_string] = seed.params.priority
+
+     def _insert(self):
+         """
+         Add new seeds to the redis queue.
+         """
+         while not self._pause.is_set():
+             seeds = {}
+             status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
+             for _ in range(self._new_queue_max_size):
+                 seed = self.__LAUNCHER_QUEUE__['new'].pop()
+                 if not seed:
+                     break
+                 seeds[seed.to_string] = seed.params.priority
+             if seeds:
+                 self._db.zadd(self._todo, seeds, nx=True)
+             if status:
+                 time.sleep(self._new_queue_wait_seconds)
+
+     def _refresh(self):
+         """
+         Refresh the expiry of in-flight (doing) seeds so _reset does not re-consume them.
+         """
+         while not self._pause.is_set():
+             if self.__DOING__:
+                 refresh_time = int(time.time())
+                 seeds = {k: -refresh_time - v / 1000 for k, v in self.__DOING__.items()}
+                 self._db.zadd(self._todo, item=seeds, xx=True)
+             time.sleep(30)
+
+     def _delete(self):
+         """
+         Remove seeds from the queue: route them to the done or fail queue by status,
+         and drop their entries from the doing dict.
+         """
+         while not self._pause.is_set():
+             seeds, s_seeds, f_seeds = [], [], []
+             status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
+
+             for _ in range(self._done_queue_max_size):
+                 seed = self.__LAUNCHER_QUEUE__['done'].pop()
+                 if not seed:
+                     break
+                 if seed.params.identifier == DealModel.fail:
+                     f_seeds.append(seed.to_string)
+                 elif self._done_model == 1:
+                     s_seeds.append(seed.to_string)
+                 else:
+                     seeds.append(seed.to_string)
+             if seeds:
+                 self._db.zrem(self._todo, *seeds)
+             if s_seeds:
+                 self._db.done([self._todo, self._done], *s_seeds)
+             if f_seeds:
+                 self._db.done([self._todo, self._fail], *f_seeds)
+
+             self._remove_doing_seeds(seeds)
+
+             if status:
+                 time.sleep(self._done_queue_wait_seconds)
+
+     def _polling(self):
+         check_empty_times = 0
+         while not self._stop.is_set():
+             queue_not_empty_count = 0
+             polling_wait_seconds = 30
+             if not self._db.zcard(self._todo):
+                 for q in self.__LAUNCHER_QUEUE__.values():
+                     if q.length != 0:
+                         queue_not_empty_count += 1
+                 if self._pause.is_set() and queue_not_empty_count != 0:
+                     self._pause.clear()
+                     self._execute()
+                 elif queue_not_empty_count == 0:
+                     polling_wait_seconds = 3
+                     check_empty_times += 1
+                 else:
+                     check_empty_times = 0
+                 if check_empty_times > 2:
+                     check_empty_times = 0
+                     self.__DOING__ = {}
+                     self._pause.set()
+             if not self._pause.is_set():
+                 logger.info(LogTemplate.launcher_pro_polling.format(
+                     task=self.task,
+                     doing_len=len(self.__DOING__.keys()),
+                     todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
+                     done_len=self.__LAUNCHER_QUEUE__['done'].length,
+                     redis_seed_count=self._db.zcount(self._todo, "-inf", "+inf"),
+                     redis_todo_len=self._db.zcount(self._todo, 0, "(1000"),
+                     redis_doing_len=self._db.zcount(self._todo, "-inf", "(0"),
+                     upload_len=self._upload_queue.length
+                 ))
+             time.sleep(polling_wait_seconds)
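
Everything here hangs off a single Redis sorted set ({project:task}:todo) whose score encodes seed state: pending seeds carry their priority in [0, 1000), _refresh rewrites in-flight seeds to a negative timestamp-based score, and _reset reclaims seeds whose negative score has aged past SEED_RESET_SECONDS. A sketch of the score arithmetic as read from the code above (values illustrative, not package API):

    import time

    now = int(time.time())
    priority = 500

    pending_score = priority                 # selected by zcount/members over [0, "(1000")
    doing_score = -now - priority / 1000     # written by _refresh for in-flight seeds

    SEED_RESET_SECONDS = 600
    reset_floor = -now + SEED_RESET_SECONDS  # the _min used by _reset once a heartbeat exists

    # Any member scored in [reset_floor, "(0") was last refreshed more than
    # SEED_RESET_SECONDS ago and is eligible to be pushed back to pending.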
@@ -0,0 +1,2 @@
+ from .base_pipeline import Pipeline
+ from .loghub_pipeline import LoghubPipeline
@@ -0,0 +1,54 @@
+ import time
+ import threading
+
+ from abc import ABC, abstractmethod
+ from cobweb.base import BaseItem, Queue, logger
+
+
+ class Pipeline(threading.Thread, ABC):
+
+     def __init__(
+         self,
+         done_queue: Queue,
+         upload_queue: Queue,
+         upload_queue_size: int,
+         upload_wait_seconds: int
+     ):
+         super().__init__()
+         self.done_queue = done_queue
+         self.upload_queue = upload_queue
+         self.upload_queue_size = upload_queue_size
+         self.upload_wait_seconds = upload_wait_seconds
+
+     @abstractmethod
+     def build(self, item: BaseItem) -> dict:
+         pass
+
+     @abstractmethod
+     def upload(self, table: str, data: list) -> bool:
+         pass
+
+     def run(self):
+         while True:
+             status = self.upload_queue.length < self.upload_queue_size
+             if status:
+                 time.sleep(self.upload_wait_seconds)
+             data_info, seeds = {}, []
+             for _ in range(self.upload_queue_size):
+                 item = self.upload_queue.pop()
+                 if not item:
+                     break
+                 data = self.build(item)
+                 seeds.append(item.seed)
+                 data_info.setdefault(item.table, []).append(data)
+             for table, datas in data_info.items():
+                 try:
+                     self.upload(table, datas)
+                     status = True
+                 except Exception as e:
+                     logger.info(e)
+                     status = False
+             if status:
+                 self.done_queue.push(seeds)
+
+
@@ -0,0 +1,34 @@
+ import json
+ import setting
+
+ from cobweb.base import BaseItem
+ from cobweb.pipelines import Pipeline
+ from aliyun.log import LogClient, LogItem, PutLogsRequest
+
+
+ class LoghubPipeline(Pipeline):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.client = LogClient(**setting.LOGHUB_CONFIG)
+
+     def build(self, item: BaseItem):
+         log_item = LogItem()
+         temp = item.to_dict
+         for key, value in temp.items():
+             if not isinstance(value, str):
+                 temp[key] = json.dumps(value, ensure_ascii=False)
+         contents = sorted(temp.items())
+         log_item.set_contents(contents)
+         return log_item
+
+     def upload(self, table, datas):
+         request = PutLogsRequest(
+             project=setting.LOGHUB_PROJECT,
+             logstore=table,
+             topic=setting.LOGHUB_TOPIC,
+             source=setting.LOGHUB_SOURCE,
+             logitems=datas,
+             compress=True
+         )
+         self.client.put_logs(request=request)
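
Note that build() here returns an aliyun-log LogItem rather than the dict the base class hints at, and set_contents expects (key, value) string pairs, which is why non-string values are JSON-encoded first. Sketched with the SDK (field values illustrative):

    import json

    from aliyun.log import LogItem

    row = {"url": "https://example.com", "meta": {"depth": 1}}
    row = {k: v if isinstance(v, str) else json.dumps(v, ensure_ascii=False)
           for k, v in row.items()}

    log_item = LogItem()
    log_item.set_contents(sorted(row.items()))  # sorted (key, value) tuples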
cobweb/setting.py CHANGED
@@ -1,13 +1,62 @@
  import os
 
+ # redis db config
+ REDIS_CONFIG = {
+     "host": os.getenv("REDIS_HOST"),
+     "password": os.getenv("REDIS_PASSWORD"),
+     "port": int(os.getenv("REDIS_PORT", 6379)),
+     "db": int(os.getenv("REDIS_DB", 0)),
+ }
 
- # model: 0, 1, 2
- MODEL = int(os.getenv("MODEL", "0"))
+ # loghub db config
+ LOGHUB_TOPIC = os.getenv("LOGHUB_TOPIC")
+ LOGHUB_SOURCE = os.getenv("LOGHUB_SOURCE")
+ LOGHUB_PROJECT = os.getenv("LOGHUB_PROJECT")
+ LOGHUB_CONFIG = {
+     "endpoint": os.getenv("LOGHUB_ENDPOINT"),
+     "accessKeyId": os.getenv("LOGHUB_ACCESS_KEY"),
+     "accessKey": os.getenv("LOGHUB_SECRET_KEY")
+ }
 
- # wait time before the score is reset, default 10 minutes
- RESET_SCORE = int(os.getenv("RESET_SCORE", "600"))
+ # oss util config
+ OSS_BUCKET = os.getenv("OSS_BUCKET")
+ OSS_ENDPOINT = os.getenv("OSS_ENDPOINT")
+ OSS_ACCESS_KEY = os.getenv("OSS_ACCESS_KEY")
+ OSS_SECRET_KEY = os.getenv("OSS_SECRET_KEY")
+ OSS_MIN_UPLOAD_SIZE = 1024 * 100
+ OSS_CHUNK_SIZE = 1024 ** 2
 
- # TTL of the spider queue check lock, 30s by default
- CHECK_LOCK_TIME = int(os.getenv("CHECK_LOCK_TIME", 30))
+ # crawler class to use
+ CRAWLER = "cobweb.crawlers.CrawlerAir"
 
+ # data upload pipeline
+ PIPELINE = "cobweb.pipelines.loghub_pipeline.LoghubPipeline"
 
+
+ # Launcher wait times
+ SCHEDULER_WAIT_SECONDS = 15  # scheduler wait time
+ TODO_QUEUE_FULL_WAIT_SECONDS = 5  # wait time while the todo queue is full
+ NEW_QUEUE_WAIT_SECONDS = 30  # new queue wait time
+ DONE_QUEUE_WAIT_SECONDS = 15  # done queue wait time
+ UPLOAD_QUEUE_WAIT_SECONDS = 15  # upload queue wait time
+ SEED_RESET_SECONDS = 600  # seed reset interval
+
+
+ # Launcher queue sizes
+ TODO_QUEUE_SIZE = 100  # todo queue size
+ NEW_QUEUE_MAX_SIZE = 100  # new queue max size
+ DONE_QUEUE_MAX_SIZE = 100  # done queue max size
+ UPLOAD_QUEUE_MAX_SIZE = 100  # upload queue max size
+
+ # DONE_MODEL in (0, 1), seed completion mode
+ DONE_MODEL = 0  # 0: drop a seed from the queue on success, push failures to the fail queue; 1: push successes to the done queue and failures to the fail queue
+
+ # DOWNLOAD_MODEL in (0, 1), download mode
+ DOWNLOAD_MODEL = 0  # 0: generic download; 1: file download
+
+ # spider
+ SPIDER_THREAD_NUM = 10
+ SPIDER_MAX_RETRIES = 5
+
+ # response content types filtered out during file download
+ FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
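
These module-level names are exactly what Launcher.__init__ overrides through its custom_setting argument: a dict, an already-imported module, or a dotted module path whose non-dunder attributes get copied onto cobweb's setting module. A sketch (the module name my_setting and the values are illustrative):

    from cobweb.launchers import Launcher

    custom = {
        "SPIDER_THREAD_NUM": 20,
        "TODO_QUEUE_SIZE": 200,
        "PIPELINE": "cobweb.pipelines.loghub_pipeline.LoghubPipeline",
    }
    launcher = Launcher("demo_task", "demo_project", custom_setting=custom)

    # equivalent, via a module path:
    # launcher = Launcher("demo_task", "demo_project", custom_setting="my_setting")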
@@ -0,0 +1,3 @@
+ from .oss import OssUtil
+ from .tools import *
+
cobweb/utils/oss.py ADDED
@@ -0,0 +1,87 @@
+ import setting
+
+ from requests import Response
+ from oss2 import Auth, Bucket, models
+ from cobweb.exceptions import oss_db_exception
+ from base.decorators import decorator_oss_db
+
+
+ class OssUtil:
+
+     def __init__(
+         self,
+         bucket=None,
+         endpoint=None,
+         access_key=None,
+         secret_key=None,
+         chunk_size=None,
+         min_upload_size=None,
+     ):
+         self.bucket = bucket or setting.OSS_BUCKET
+         self.endpoint = endpoint or setting.OSS_ENDPOINT
+         self.chunk_size = int(chunk_size or setting.OSS_CHUNK_SIZE)
+         self.min_upload_size = int(min_upload_size or setting.OSS_MIN_UPLOAD_SIZE)
+         self._auth = Auth(
+             access_key_id=access_key or setting.OSS_ACCESS_KEY,
+             access_key_secret=secret_key or setting.OSS_SECRET_KEY
+         )
+         self._client = Bucket(
+             auth=self._auth,
+             endpoint=self.endpoint,
+             bucket_name=self.bucket,
+         )
+
+     def exists(self, key: str) -> bool:
+         return self._client.object_exists(key)
+
+     def head(self, key: str) -> models.HeadObjectResult:
+         return self._client.head_object(key)
+
+     @decorator_oss_db(exception=oss_db_exception.OssDBInitPartError)
+     def init_part(self, key) -> models.InitMultipartUploadResult:
+         """Initialize a multipart upload."""
+         return self._client.init_multipart_upload(key)
+
+     @decorator_oss_db(exception=oss_db_exception.OssDBPutObjError)
+     def put(self, key, data) -> models.PutObjectResult:
+         """Upload a whole object."""
+         return self._client.put_object(key, data)
+
+     @decorator_oss_db(exception=oss_db_exception.OssDBPutPartError)
+     def put_part(self, key, upload_id, position, data) -> models.PutObjectResult:
+         """Upload a single part."""
+         return self._client.upload_part(key, upload_id, position, data)
+
+     @decorator_oss_db(exception=oss_db_exception.OssDBMergeError)
+     def merge(self, key, upload_id, parts=None) -> models.PutObjectResult:
+         """Merge the uploaded parts."""
+         headers = None if parts else {"x-oss-complete-all": "yes"}
+         return self._client.complete_multipart_upload(key, upload_id, parts, headers=headers)
+
+     @decorator_oss_db(exception=oss_db_exception.OssDBAppendObjError)
+     def append(self, key, position, data) -> models.AppendObjectResult:
+         """Append data to an appendable object."""
+         return self._client.append_object(key, position, data)
+
+     def iter_data(self, data, chunk_size=None):
+         chunk_size = chunk_size or self.chunk_size
+         if isinstance(data, Response):
+             for part_data in data.iter_content(chunk_size):
+                 yield part_data
+         if isinstance(data, bytes):
+             for i in range(0, len(data), chunk_size):
+                 yield data[i:i + chunk_size]
+
+     def assemble(self, ready_data, data, chunk_size=None):
+         upload_data = b""
+         ready_data = ready_data + data
+         chunk_size = chunk_size or self.chunk_size
+         if len(ready_data) >= chunk_size:
+             upload_data = ready_data[:chunk_size]
+             ready_data = ready_data[chunk_size:]
+         return ready_data, upload_data
+
+     def content_length(self, key: str) -> int:
+         head = self.head(key)
+         return head.content_length
+
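
iter_data, assemble, put_part and merge compose into a chunked-upload loop. A sketch built only from OssUtil's own methods (the key and payload are illustrative; merge() with parts=None relies on the x-oss-complete-all header set above):

    from cobweb.utils import OssUtil

    util = OssUtil()
    key = "demo-site/example.bin"

    upload_id = util.init_part(key).upload_id
    ready, position = b"", 1

    for chunk in util.iter_data(b"x" * (3 * 1024 ** 2)):
        ready, upload_data = util.assemble(ready, chunk)
        if upload_data:
            util.put_part(key, upload_id, position, upload_data)
            position += 1
    if ready:  # flush the tail that never reached chunk_size
        util.put_part(key, upload_id, position, ready)
    util.merge(key, upload_id)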
cobweb/utils/tools.py ADDED
@@ -0,0 +1,42 @@
+ import re
+ import hashlib
+ from typing import Union
+ from importlib import import_module
+
+
+ def md5(text: Union[str, bytes]) -> str:
+     if isinstance(text, str):
+         text = text.encode('utf-8')
+     return hashlib.md5(text).hexdigest()
+
+
+ def build_path(site, url, file_type):
+     return f"{site}/{md5(url)}.{file_type}"
+
+
+ def format_size(content_length: int) -> str:
+     units = ["KB", "MB", "GB", "TB"]
+     for i in range(4):
+         num = content_length / (1024 ** (i + 1))
+         if num < 1024:
+             return f"{round(num, 2)} {units[i]}"
+
+
+ def dynamic_load_class(model_info):
+     if isinstance(model_info, str):
+         if "import" in model_info:
+             model_path, class_name = re.search(
+                 r"from (.*?) import (.*?)$", model_info
+             ).groups()
+             model = import_module(model_path)
+             class_object = getattr(model, class_name)
+         else:
+             model_path, class_name = model_info.rsplit(".", 1)
+             model = import_module(model_path)
+             class_object = getattr(model, class_name)
+         return class_object
+     raise TypeError()
+
+
+ def download_log_info(item: dict) -> str:
+     return "\n".join([" " * 12 + f"{k.ljust(14)}: {v}" for k, v in item.items()])
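
dynamic_load_class accepts either a dotted path or a "from ... import ..." string; both resolve to the same class object, which is how setting.CRAWLER and setting.PIPELINE are loaded. Usage sketch:

    from cobweb.utils.tools import dynamic_load_class

    cls_a = dynamic_load_class("cobweb.pipelines.loghub_pipeline.LoghubPipeline")
    cls_b = dynamic_load_class("from cobweb.pipelines.loghub_pipeline import LoghubPipeline")
    assert cls_a is cls_b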
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cobweb-launcher
- Version: 0.1.24
+ Version: 1.0.0
  Summary: spider_hole
  Home-page: https://github.com/Juannie-PP/cobweb
  Author: Juannie-PP