cobweb-launcher 0.1.8__py3-none-any.whl → 1.2.41__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- cobweb/__init__.py +2 -11
- cobweb/base/__init__.py +9 -0
- cobweb/base/basic.py +297 -0
- cobweb/base/common_queue.py +30 -0
- cobweb/base/decorators.py +40 -0
- cobweb/base/dotting.py +35 -0
- cobweb/base/item.py +46 -0
- cobweb/{log.py → base/log.py} +4 -6
- cobweb/base/request.py +82 -0
- cobweb/base/response.py +23 -0
- cobweb/base/seed.py +114 -0
- cobweb/constant.py +94 -0
- cobweb/crawlers/__init__.py +1 -0
- cobweb/crawlers/base_crawler.py +144 -0
- cobweb/crawlers/crawler.py +209 -0
- cobweb/crawlers/file_crawler.py +98 -0
- cobweb/db/__init__.py +2 -2
- cobweb/db/api_db.py +82 -0
- cobweb/db/redis_db.py +125 -218
- cobweb/exceptions/__init__.py +1 -0
- cobweb/exceptions/oss_db_exception.py +28 -0
- cobweb/launchers/__init__.py +3 -0
- cobweb/launchers/launcher.py +235 -0
- cobweb/launchers/launcher_air.py +88 -0
- cobweb/launchers/launcher_api.py +209 -0
- cobweb/launchers/launcher_pro.py +208 -0
- cobweb/pipelines/__init__.py +3 -0
- cobweb/pipelines/pipeline.py +69 -0
- cobweb/pipelines/pipeline_console.py +22 -0
- cobweb/pipelines/pipeline_loghub.py +34 -0
- cobweb/schedulers/__init__.py +3 -0
- cobweb/schedulers/scheduler_api.py +72 -0
- cobweb/schedulers/scheduler_redis.py +72 -0
- cobweb/setting.py +67 -6
- cobweb/utils/__init__.py +5 -0
- cobweb/utils/bloom.py +58 -0
- cobweb/utils/dotting.py +32 -0
- cobweb/utils/oss.py +94 -0
- cobweb/utils/tools.py +42 -0
- cobweb_launcher-1.2.41.dist-info/METADATA +205 -0
- cobweb_launcher-1.2.41.dist-info/RECORD +44 -0
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/WHEEL +1 -1
- cobweb/bbb.py +0 -191
- cobweb/db/oss_db.py +0 -127
- cobweb/db/scheduler/__init__.py +0 -0
- cobweb/db/scheduler/default.py +0 -8
- cobweb/db/scheduler/textfile.py +0 -27
- cobweb/db/storer/__init__.py +0 -0
- cobweb/db/storer/console.py +0 -9
- cobweb/db/storer/loghub.py +0 -54
- cobweb/db/storer/redis.py +0 -15
- cobweb/db/storer/textfile.py +0 -15
- cobweb/decorators.py +0 -16
- cobweb/distributed/__init__.py +0 -0
- cobweb/distributed/launcher.py +0 -243
- cobweb/distributed/models.py +0 -143
- cobweb/interface.py +0 -34
- cobweb/single/__init__.py +0 -0
- cobweb/single/launcher.py +0 -231
- cobweb/single/models.py +0 -134
- cobweb/single/nest.py +0 -153
- cobweb/task.py +0 -50
- cobweb/utils.py +0 -90
- cobweb_launcher-0.1.8.dist-info/METADATA +0 -45
- cobweb_launcher-0.1.8.dist-info/RECORD +0 -31
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/LICENSE +0 -0
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/top_level.txt +0 -0
cobweb/utils/oss.py
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
from typing import List
|
2
|
+
from cobweb import setting
|
3
|
+
from requests import Response
|
4
|
+
from oss2 import Auth, Bucket, models, PartIterator
|
5
|
+
from cobweb.exceptions import oss_db_exception
|
6
|
+
from cobweb.base.decorators import decorator_oss_db
|
7
|
+
|
8
|
+
|
9
|
+
class OssUtil:
    """Convenience wrapper around an ``oss2.Bucket`` client.

    Every constructor argument that is left falsy falls back to the matching
    ``OSS_*`` value in ``cobweb.setting``.
    """

    def __init__(
            self,
            bucket=None,
            endpoint=None,
            access_key=None,
            secret_key=None,
            chunk_size=None,
            min_upload_size=None,
            **kwargs
    ):
        """Build the credentials object and the bucket client.

        Extra keyword arguments are forwarded unchanged to ``Bucket``.
        """
        self.bucket = bucket or setting.OSS_BUCKET
        self.endpoint = endpoint or setting.OSS_ENDPOINT
        self.chunk_size = int(chunk_size or setting.OSS_CHUNK_SIZE)
        self.min_upload_size = int(min_upload_size or setting.OSS_MIN_UPLOAD_SIZE)

        key_id = access_key or setting.OSS_ACCESS_KEY
        key_secret = secret_key or setting.OSS_SECRET_KEY
        self._auth = Auth(access_key_id=key_id, access_key_secret=key_secret)
        self._client = Bucket(
            auth=self._auth,
            endpoint=self.endpoint,
            bucket_name=self.bucket,
            **kwargs
        )

    def exists(self, key: str) -> bool:
        """Return the client's ``object_exists`` result for *key*."""
        return self._client.object_exists(key)

    def head(self, key: str) -> models.HeadObjectResult:
        """Return the client's ``head_object`` result for *key*."""
        return self._client.head_object(key)

    @decorator_oss_db(exception=oss_db_exception.OssDBInitPartError)
    def init_part(self, key) -> models.InitMultipartUploadResult:
        """Initialise a multipart upload for *key*."""
        return self._client.init_multipart_upload(key)

    @decorator_oss_db(exception=oss_db_exception.OssDBPutObjError)
    def put(self, key, data) -> models.PutObjectResult:
        """Upload *data* as the object *key*."""
        return self._client.put_object(key, data)

    @decorator_oss_db(exception=oss_db_exception.OssDBPutPartError)
    def put_part(self, key, upload_id, position, data) -> models.PutObjectResult:
        """Upload one part of a multipart upload."""
        return self._client.upload_part(key, upload_id, position, data)

    def list_part(self, key, upload_id):  # -> List[models.ListPartsResult]:
        """Return the parts of a multipart upload as a list."""
        return list(PartIterator(self._client, key, upload_id))

    @decorator_oss_db(exception=oss_db_exception.OssDBMergeError)
    def merge(self, key, upload_id, parts=None) -> models.PutObjectResult:
        """Complete a multipart upload (merge its parts)."""
        if parts:
            headers = None
        else:
            # No explicit part list: send the "complete all" header instead.
            # NOTE(review): presumably this asks the service to merge every
            # uploaded part - confirm against the OSS API documentation.
            headers = {"x-oss-complete-all": "yes"}
        return self._client.complete_multipart_upload(key, upload_id, parts, headers=headers)

    @decorator_oss_db(exception=oss_db_exception.OssDBAppendObjError)
    def append(self, key, position, data) -> models.AppendObjectResult:
        """Append *data* to the object *key* at *position*."""
        return self._client.append_object(key, position, data)

    def iter_data(self, data, chunk_size=None):
        """Yield *data* in pieces of at most ``chunk_size`` bytes.

        A ``requests.Response`` is streamed through ``iter_content``; a
        ``bytes`` object is sliced. Any other type yields nothing.
        """
        step = chunk_size or self.chunk_size
        if isinstance(data, Response):
            yield from data.iter_content(step)
        if isinstance(data, bytes):
            for offset in range(0, len(data), step):
                yield data[offset:offset + step]

    def assemble(self, ready_data, data, chunk_size=None):
        """Append *data* to the pending buffer and split off one full chunk.

        Returns ``(remaining_buffer, upload_data)``; ``upload_data`` is
        ``b""`` while fewer than ``chunk_size`` bytes are buffered.
        """
        buffered = ready_data + data
        limit = chunk_size or self.chunk_size
        if len(buffered) < limit:
            return buffered, b""
        return buffered[limit:], buffered[:limit]

    def content_length(self, key: str) -> int:
        """Return ``content_length`` from the head result of *key*."""
        return self.head(key).content_length
|
94
|
+
|
cobweb/utils/tools.py
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
import re
|
2
|
+
import hashlib
|
3
|
+
from typing import Union
|
4
|
+
from importlib import import_module
|
5
|
+
|
6
|
+
|
7
|
+
def md5(text: Union[str, bytes]) -> str:
    """Return the hexadecimal MD5 digest of *text*.

    ``str`` input is encoded as UTF-8 before hashing; any other input is
    handed to ``hashlib.md5`` unchanged.
    """
    payload = text.encode('utf-8') if isinstance(text, str) else text
    return hashlib.md5(payload).hexdigest()
|
11
|
+
|
12
|
+
|
13
|
+
def build_path(site, url, file_type):
    """Return ``"<site>/<md5 of url>.<file_type>"``."""
    digest = md5(url)
    return f"{site}/{digest}.{file_type}"
|
15
|
+
|
16
|
+
|
17
|
+
def format_size(content_length: int) -> str:
    """Format a byte count as a human-readable size string.

    The value is expressed in the smallest of KB, MB, GB, TB for which the
    number is below 1024, rounded to two decimals (so sizes under 1 KB come
    out as a fraction of a KB, e.g. ``"0.5 KB"``).

    Sizes of 1024 TB or more are reported in TB; previously the function
    fell off the end of the loop and returned ``None`` for them.

    :param content_length: size in bytes.
    :return: a string such as ``"1.5 MB"``.
    """
    units = ("KB", "MB", "GB", "TB")
    for exponent, unit in enumerate(units, start=1):
        num = content_length / 1024 ** exponent
        if num < 1024:
            break
    # After the loop ``num``/``unit`` hold either the first fitting unit or,
    # for very large inputs, the value expressed in the largest unit (TB).
    return f"{round(num, 2)} {unit}"
|
23
|
+
|
24
|
+
|
25
|
+
def dynamic_load_class(model_info):
    """Import and return the object described by *model_info*.

    Two string forms are accepted:

    * an import statement: ``"from package.module import Name"``
    * a dotted path: ``"package.module.Name"``

    The statement form is recognised by its actual shape rather than by the
    substring ``"import"``, so dotted paths that merely contain that word
    (for example ``"importlib.import_module"``) are resolved correctly.

    :param model_info: import statement or dotted path, as a ``str``.
    :return: the attribute ``Name`` of the imported module.
    :raises TypeError: if *model_info* is not a ``str``.
    :raises ValueError: if the string matches neither accepted form.
    :raises ImportError: if the module cannot be imported.
    :raises AttributeError: if the module has no attribute ``Name``.
    """
    if not isinstance(model_info, str):
        raise TypeError(
            f"model_info must be a str, got {type(model_info).__name__}"
        )

    statement = re.match(r"\s*from\s+(\S+)\s+import\s+(\S+)\s*$", model_info)
    if statement:
        model_path, class_name = statement.groups()
    else:
        model_path, _, class_name = model_info.strip().rpartition(".")
        if not model_path or not class_name:
            raise ValueError(
                "model_info must look like 'from pkg.mod import Name' "
                f"or 'pkg.mod.Name', got {model_info!r}"
            )

    return getattr(import_module(model_path), class_name)
|
39
|
+
|
40
|
+
|
41
|
+
# def download_log_info(item:dict) -> str:
|
42
|
+
# return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])
|
@@ -0,0 +1,205 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: cobweb-launcher
|
3
|
+
Version: 1.2.41
|
4
|
+
Summary: spider_hole
|
5
|
+
Home-page: https://github.com/Juannie-PP/cobweb
|
6
|
+
Author: Juannie-PP
|
7
|
+
Author-email: 2604868278@qq.com
|
8
|
+
License: MIT
|
9
|
+
Keywords: cobweb-launcher, cobweb
|
10
|
+
Platform: UNKNOWN
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
12
|
+
Requires-Python: >=3.7
|
13
|
+
Description-Content-Type: text/markdown
|
14
|
+
License-File: LICENSE
|
15
|
+
Requires-Dist: requests (>=2.19.1)
|
16
|
+
Requires-Dist: oss2 (>=2.18.1)
|
17
|
+
Requires-Dist: redis (>=4.4.4)
|
18
|
+
Requires-Dist: aliyun-log-python-sdk
|
19
|
+
Requires-Dist: mmh3
|
20
|
+
|
21
|
+
# cobweb
|
22
|
+
cobweb是一个基于python的分布式爬虫调度框架,目前支持分布式爬虫,单机爬虫,支持自定义数据库,支持自定义数据存储,支持自定义数据处理等操作。
|
23
|
+
|
24
|
+
cobweb主要由3个模块和一个配置文件组成:Launcher启动器、Crawler采集器、Pipeline存储和setting配置文件。
|
25
|
+
1. Launcher启动器:用于启动爬虫任务,控制爬虫任务的执行流程,以及数据存储和数据处理。
|
26
|
+
框架提供两种启动器模式:LauncherAir、LauncherPro,分别对应单机爬虫模式和分布式调度模式。
|
27
|
+
2. Crawler采集器:用于控制采集流程、数据下载和数据处理。
|
28
|
+
框架提供了基础的采集器,用于控制采集流程、数据下载和数据处理,用户也可在创建任务时自定义请求、下载和解析方法,具体看使用方法介绍。
|
29
|
+
3. Pipeline存储:用于存储采集到的数据,支持自定义数据存储和数据处理。框架提供了Console和Loghub两种存储方式,用户也可继承Pipeline抽象类自定义存储方式。
|
30
|
+
4. setting配置文件:用于配置采集器、存储器、队列长度、采集线程数等参数,框架提供了默认配置,用户也可自定义配置。
|
31
|
+
## 安装
|
32
|
+
```
|
33
|
+
pip3 install --upgrade cobweb-launcher
|
34
|
+
```
|
35
|
+
## 使用方法介绍
|
36
|
+
### 1. 任务创建
|
37
|
+
- LauncherAir任务创建
|
38
|
+
```python
|
39
|
+
from cobweb import LauncherAir
|
40
|
+
|
41
|
+
# 创建启动器
|
42
|
+
app = LauncherAir(task="test", project="test")
|
43
|
+
|
44
|
+
# 设置采集种子
|
45
|
+
app.SEEDS = [{
|
46
|
+
"url": "https://www.baidu.com"
|
47
|
+
}]
|
48
|
+
...
|
49
|
+
# 启动任务
|
50
|
+
app.start()
|
51
|
+
```
|
52
|
+
- LauncherPro任务创建
|
53
|
+
LauncherPro依赖redis实现分布式调度,使用LauncherPro启动器需要完成环境变量的配置或自定义setting文件中的redis配置,如何配置查看`2. 自定义配置文件参数`
|
54
|
+
```python
|
55
|
+
from cobweb import LauncherPro
|
56
|
+
|
57
|
+
# 创建启动器
|
58
|
+
app = LauncherPro(
|
59
|
+
task="test",
|
60
|
+
project="test"
|
61
|
+
)
|
62
|
+
...
|
63
|
+
# 启动任务
|
64
|
+
app.start()
|
65
|
+
```
|
66
|
+
### 2. 自定义配置文件参数
|
67
|
+
- 通过自定义setting文件,配置文件导入字符串方式
|
68
|
+
> 默认配置文件:import cobweb.setting
|
69
|
+
> 不推荐!!!目前有bug,随缘使用...
|
70
|
+
例如:同级目录下自定义创建了setting.py文件。
|
71
|
+
```python
|
72
|
+
from cobweb import LauncherAir
|
73
|
+
|
74
|
+
app = LauncherAir(
|
75
|
+
task="test",
|
76
|
+
project="test",
|
77
|
+
setting="import setting"
|
78
|
+
)
|
79
|
+
|
80
|
+
...
|
81
|
+
|
82
|
+
app.start()
|
83
|
+
```
|
84
|
+
- 自定义修改setting中对象值
|
85
|
+
```python
|
86
|
+
from cobweb import LauncherPro
|
87
|
+
|
88
|
+
# 创建启动器
|
89
|
+
app = LauncherPro(
|
90
|
+
task="test",
|
91
|
+
project="test",
|
92
|
+
REDIS_CONFIG = {
|
93
|
+
"host": ...,
|
94
|
+
"password":...,
|
95
|
+
"port": ...,
|
96
|
+
"db": ...
|
97
|
+
}
|
98
|
+
)
|
99
|
+
...
|
100
|
+
# 启动任务
|
101
|
+
app.start()
|
102
|
+
```
|
103
|
+
### 3. 自定义请求
|
104
|
+
`@app.request`使用装饰器封装自定义请求方法,作用于发生请求前的操作,返回Request对象或继承于BaseItem对象,用于控制请求参数。
|
105
|
+
```python
|
106
|
+
from typing import Union
|
107
|
+
from cobweb import LauncherAir
|
108
|
+
from cobweb.base import Seed, Request, BaseItem
|
109
|
+
|
110
|
+
app = LauncherAir(
|
111
|
+
task="test",
|
112
|
+
project="test"
|
113
|
+
)
|
114
|
+
|
115
|
+
...
|
116
|
+
|
117
|
+
@app.request
|
118
|
+
def request(seed: Seed) -> Union[Request, BaseItem]:
|
119
|
+
# 可自定义headers,代理,构造请求参数等操作
|
120
|
+
proxies = {"http": ..., "https": ...}
|
121
|
+
yield Request(seed.url, seed, ..., proxies=proxies, timeout=15)
|
122
|
+
# yield xxxItem(seed, ...) # 跳过请求和解析直接进入数据存储流程
|
123
|
+
|
124
|
+
...
|
125
|
+
|
126
|
+
app.start()
|
127
|
+
```
|
128
|
+
> 默认请求方法
|
129
|
+
> def request(seed: Seed) -> Union[Request, BaseItem]:
|
130
|
+
> yield Request(seed.url, seed, timeout=5)
|
131
|
+
### 4. 自定义下载
|
132
|
+
`@app.download`使用装饰器封装自定义下载方法,作用于发生请求时的操作,返回Response对象或继承于BaseItem对象,用于控制请求参数。
|
133
|
+
```python
|
134
|
+
from typing import Union
|
135
|
+
from cobweb import LauncherAir
|
136
|
+
from cobweb.base import Request, Response, BaseItem
|
137
|
+
|
138
|
+
app = LauncherAir(
|
139
|
+
task="test",
|
140
|
+
project="test"
|
141
|
+
)
|
142
|
+
|
143
|
+
...
|
144
|
+
|
145
|
+
@app.download
|
146
|
+
def download(item: Request) -> Union[BaseItem, Response]:
|
147
|
+
...
|
148
|
+
response = ...
|
149
|
+
...
|
150
|
+
yield Response(item.seed, response, ...) # 返回Response对象,进行解析
|
151
|
+
# yield xxxItem(seed, ...) # 跳过请求和解析直接进入数据存储流程
|
152
|
+
|
153
|
+
...
|
154
|
+
|
155
|
+
app.start()
|
156
|
+
```
|
157
|
+
> 默认下载方法
|
158
|
+
> def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
|
159
|
+
> response = item.download()
|
160
|
+
> yield Response(item.seed, response, **item.to_dict)
|
161
|
+
### 5. 自定义解析
|
162
|
+
自定义解析需要由一个存储数据类和解析方法组成。存储数据类继承于BaseItem的对象,规定存储表名及字段,
|
163
|
+
解析方法返回继承于BaseItem的对象,yield返回进行控制数据存储流程。
|
164
|
+
```python
|
165
|
+
from typing import Union
|
166
|
+
from cobweb import LauncherAir
|
167
|
+
from cobweb.base import Seed, Response, BaseItem
|
168
|
+
|
169
|
+
class TestItem(BaseItem):
|
170
|
+
__TABLE__ = "test_data" # 表名
|
171
|
+
__FIELDS__ = "field1, field2, field3" # 字段名
|
172
|
+
|
173
|
+
app = LauncherAir(
|
174
|
+
task="test",
|
175
|
+
project="test"
|
176
|
+
)
|
177
|
+
|
178
|
+
...
|
179
|
+
|
180
|
+
@app.parse
|
181
|
+
def parse(item: Response) -> Union[Seed, BaseItem]:
|
182
|
+
...
|
183
|
+
yield TestItem(item.seed, field1=..., field2=..., field3=...)
|
184
|
+
# yield Seed(...) # 构造新种子推送至消费队列
|
185
|
+
|
186
|
+
...
|
187
|
+
|
188
|
+
app.start()
|
189
|
+
```
|
190
|
+
> 默认解析方法
|
191
|
+
> def parse(item: Response) -> Union[Seed, BaseItem]:
|
192
|
+
> upload_item = item.to_dict
|
193
|
+
> upload_item["text"] = item.response.text
|
194
|
+
> yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
|
195
|
+
## need deal
|
196
|
+
- 队列优化完善,使用queue的机制wait()同步各模块执行?
|
197
|
+
- 日志功能完善,单机模式调度和保存数据写入文件,结构化输出各任务日志
|
198
|
+
- 去重过滤(布隆过滤器等)
|
199
|
+
- 单机防丢失
|
200
|
+
- excel、mysql、redis数据完善
|
201
|
+
|
202
|
+
> 未更新流程图!!!
|
203
|
+
![img.png](https://image-luyuan.oss-cn-hangzhou.aliyuncs.com/image/D2388CDC-B9E5-4CE4-9F2C-7D173763B6A8.png)
|
204
|
+
|
205
|
+
|
@@ -0,0 +1,44 @@
|
|
1
|
+
cobweb/__init__.py,sha256=CBd2oByCfc5EmH2dCZYVHkxXYZG-oWrLyTtZU5sEoP0,96
|
2
|
+
cobweb/constant.py,sha256=zy3XYsc1qp2B76_Fn_hVQ8eGHlPBd3OFlZK2cryE6FY,2839
|
3
|
+
cobweb/setting.py,sha256=47HZsw40HLpsmOmvij1lyQALPQQCN_tWlKZ0wbn2MtM,2216
|
4
|
+
cobweb/base/__init__.py,sha256=4gwWWQ0Q8cYG9cD7Lwf4XMqRGc5M_mapS3IczR6zeCE,222
|
5
|
+
cobweb/base/basic.py,sha256=Z56SSLB3I2IGHWCCcSy0Qbfzj8Qbg_po3gP32q1jh4k,7741
|
6
|
+
cobweb/base/common_queue.py,sha256=W7PPZZFl52j3Mc916T0imHj7oAUelA6aKJwW-FecDPE,872
|
7
|
+
cobweb/base/decorators.py,sha256=wDCaQ94aAZGxks9Ljc0aXq6omDXT1_yzFy83ZW6VbVI,930
|
8
|
+
cobweb/base/dotting.py,sha256=lfFXXqnVP__hxlW3qH5Bnuq69KtnFaQLbcz1M8e2Ajg,1239
|
9
|
+
cobweb/base/item.py,sha256=hYheVTV2Bozp4iciJpE2ZwBIXkaqBg4QQkRccP8yoVk,1049
|
10
|
+
cobweb/base/log.py,sha256=L01hXdk3L2qEm9X1FOXQ9VmWIoHSELe0cyZvrdAN61A,2003
|
11
|
+
cobweb/base/request.py,sha256=tEkgMVUfdQI-kZuzWuiit9P_q4Q9-_RZh9aXXpc0314,2352
|
12
|
+
cobweb/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406
|
13
|
+
cobweb/base/seed.py,sha256=Uz_VBRlAxNYQcFHk3tsZFMlU96yPOedHaWGTvk-zKd8,2908
|
14
|
+
cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
|
15
|
+
cobweb/crawlers/base_crawler.py,sha256=ee_WSDnPQpPTk6wlFuY2UEx5L3hcsAZFcr6i3GLSry8,5751
|
16
|
+
cobweb/crawlers/crawler.py,sha256=UojWdymPCwit0MOkqHsYRoe4hXyHdZhgh7-MBPfrhQo,8373
|
17
|
+
cobweb/crawlers/file_crawler.py,sha256=2Sjbdgxzqd41WykKUQE3QQlGai3T8k-pmHNmPlTchjQ,4454
|
18
|
+
cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
|
19
|
+
cobweb/db/api_db.py,sha256=bDc5dJQxq4z04h70KUTHd0OqUOEY7Cm3wcNJZtTvJIM,3015
|
20
|
+
cobweb/db/redis_db.py,sha256=fumNZJiio-uQqRcSrymx8eJ1PqsdOwITe_Y-9JOXxrQ,4298
|
21
|
+
cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
|
22
|
+
cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
|
23
|
+
cobweb/launchers/__init__.py,sha256=qMuVlQcjErVK67HyKFZEsXf_rfZD5ODjx1QucSCKMOM,114
|
24
|
+
cobweb/launchers/launcher.py,sha256=sPts-xlgxoeIfl1fn1XR2XVZxLzt7He9xrYDfTHRAGo,7029
|
25
|
+
cobweb/launchers/launcher_air.py,sha256=KAk_M8F3029cXYe7m4nn3Nzyi89lbxJ2cqZjqW8iZ0E,2832
|
26
|
+
cobweb/launchers/launcher_api.py,sha256=YFqCTRvKn6icBLWTR1VxkU0WEIte2F7fv_LgPkifqdo,7885
|
27
|
+
cobweb/launchers/launcher_pro.py,sha256=B5FdxvuENRL3XrMl74ENdP1uNgnZOaYCUUfBfM0t3io,7842
|
28
|
+
cobweb/pipelines/__init__.py,sha256=zSUsGtx6smbs2iXBXvYynReKSgky-3gjqaAtKVnA_OU,105
|
29
|
+
cobweb/pipelines/pipeline.py,sha256=4TJLX0sUHRxYndF5A4Vs5btUGI-wigkOcFvhTW1hLXI,2009
|
30
|
+
cobweb/pipelines/pipeline_console.py,sha256=NEh-4zhuVAQOqwXLsqeb-rcNZ9_KXFUpL3otUTL5qBs,754
|
31
|
+
cobweb/pipelines/pipeline_loghub.py,sha256=xZ6D55BGdiM71WUv83jyLGbEyUwhBHLJRZoXthBxxTs,1019
|
32
|
+
cobweb/schedulers/__init__.py,sha256=y7Lv_7b0zfTl0OhIONb_8u1K1C9gVlBA-xz_XG_kI9g,85
|
33
|
+
cobweb/schedulers/scheduler_api.py,sha256=pFEdS1H4zuzxwMhCV-G7CoLz-rEOPv4EVo3xZUXTyDo,2199
|
34
|
+
cobweb/schedulers/scheduler_redis.py,sha256=E5fjc3nNld8GbUhUGT7uY4smRejj2J2ZIzp2g6lhxFM,2205
|
35
|
+
cobweb/utils/__init__.py,sha256=Ev2LZZ1-S56iQYDqFZrqadizEv4Gk8Of-DraH-_WnKY,109
|
36
|
+
cobweb/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
|
37
|
+
cobweb/utils/dotting.py,sha256=PgsWdM-724Jy-MZWUsaygNWV-huqLMmdLgop7gaBxlo,872
|
38
|
+
cobweb/utils/oss.py,sha256=gyt8-UB07tVphZLQXMOf-JTJwU-mWq8KZkOXKkAf3uk,3513
|
39
|
+
cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
|
40
|
+
cobweb_launcher-1.2.41.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
|
41
|
+
cobweb_launcher-1.2.41.dist-info/METADATA,sha256=ZuTN2RXJGQB6qWfjgTtcvwoVrjxvS6-ho0z7V9BTR8A,6510
|
42
|
+
cobweb_launcher-1.2.41.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
|
43
|
+
cobweb_launcher-1.2.41.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
|
44
|
+
cobweb_launcher-1.2.41.dist-info/RECORD,,
|
cobweb/bbb.py
DELETED
@@ -1,191 +0,0 @@
|
|
1
|
-
# from typing import Iterable
|
2
|
-
import json
|
3
|
-
import time
|
4
|
-
import hashlib
|
5
|
-
from .log import log
|
6
|
-
from .utils import struct_queue_name
|
7
|
-
from collections import deque, namedtuple
|
8
|
-
|
9
|
-
|
10
|
-
class Queue:
    """Small wrapper around a ``collections.deque``.

    ``AttributeError`` raised while pushing or popping is logged through
    ``log.exception`` instead of being propagated.
    """

    def __init__(self):
        self._queue = deque()

    @property
    def length(self) -> int:
        """Number of entries currently held."""
        return len(self._queue)

    def push(self, data, left: bool = False, direct_insertion: bool = False):
        """Add *data* to the queue.

        Falsy *data* is ignored. A ``Seed`` (or anything when
        *direct_insertion* is true) is stored as a single entry; a list or
        tuple is unpacked into the queue; any other value is dropped.
        *left* selects the left end of the deque instead of the right.
        """
        try:
            if not data:
                return None
            if direct_insertion or isinstance(data, Seed):
                if left:
                    self._queue.appendleft(data)
                else:
                    self._queue.append(data)
            elif isinstance(data, (list, tuple)):
                if left:
                    self._queue.extendleft(data)
                else:
                    self._queue.extend(data)
        except AttributeError as e:
            log.exception(e)

    def pop(self, left: bool = True):
        """Remove and return one entry, or ``None`` when the queue is empty.

        Entries are taken from the left end unless *left* is false.
        """
        try:
            if left:
                return self._queue.popleft()
            return self._queue.pop()
        except IndexError:
            return None
        except AttributeError as e:
            log.exception(e)
            return None
|
55
|
-
|
56
|
-
|
57
|
-
class Seed:
    """A crawl seed: arbitrary fields plus scheduling metadata.

    Fields come from *seed_info* (a dict, a JSON object string, or a plain
    URL string) and from extra keyword arguments. The metadata attributes
    ``_priority``, ``_version`` and ``_retry`` are filled from the
    constructor arguments unless the fields already set them to a truthy
    value. ``sid`` is the MD5 of the JSON-serialised fields (metadata
    excluded) unless it was supplied.

    Reading an attribute that was never set returns ``None``.
    """

    def __init__(
            self,
            seed_info=None,
            priority=300,
            version=0,
            retry=0,
            **kwargs
    ):
        """
        :param seed_info: dict of fields, JSON object string/bytes, or URL.
        :param priority: clamped into the range 1..999.
        :param version: falls back to the current Unix time when zero.
        :param retry: initial retry counter.
        :raises TypeError: if *seed_info* is truthy but not str/bytes/dict.
        """
        if seed_info:
            if isinstance(seed_info, (str, bytes)):
                try:
                    item = json.loads(seed_info)
                except json.JSONDecodeError:
                    item = None
                if isinstance(item, dict):
                    for k, v in item.items():
                        setattr(self, k, v)
                else:
                    # Not a JSON object (invalid JSON, or a bare scalar such
                    # as "123"): treat the text itself as the URL. Bytes are
                    # decoded so the seed stays JSON-serialisable.
                    if isinstance(seed_info, bytes):
                        seed_info = seed_info.decode("utf-8")
                    setattr(self, "url", seed_info)
            elif isinstance(seed_info, dict):
                for k, v in seed_info.items():
                    setattr(self, k, v)
            else:
                raise TypeError(
                    f"seed type error, "
                    f"must be str or dict! "
                    f"seed_info: {seed_info}"
                )
        for k, v in kwargs.items():
            setattr(self, k, v)
        # Missing attributes read as None (see __getattr__), so these checks
        # cover both "never set" and "set to a falsy value".
        if not self._priority:
            self._priority = min(max(1, int(priority)), 999)
        if not self._version:
            self._version = int(version) or int(time.time())
        if not self._retry:
            self._retry = retry
        if not self.sid:
            self.init_id()

    def init_id(self):
        """Set ``sid`` to the MD5 hex digest of ``format_seed``."""
        item_string = self.format_seed
        seed_id = hashlib.md5(item_string.encode()).hexdigest()
        setattr(self, "sid", seed_id)

    def __setitem__(self, key, value):
        setattr(self, key, value)

    def __getitem__(self, item):
        return getattr(self, item)

    def __getattr__(self, name):
        # Only invoked when normal lookup fails: unknown fields read as None.
        return None

    def __str__(self):
        return json.dumps(self.__dict__, ensure_ascii=False)

    def __repr__(self):
        chars = [f"{k}={v}" for k, v in self.__dict__.items()]
        return f'{self.__class__.__name__}({", ".join(chars)})'

    @property
    def dict_seed(self):
        """Copy of the fields without the scheduling metadata."""
        seed = self.__dict__.copy()
        del seed["_priority"]
        del seed["_version"]
        del seed["_retry"]
        return seed

    @property
    def format_seed(self):
        """``dict_seed`` serialised as JSON (non-ASCII kept as-is)."""
        return json.dumps(self.dict_seed, ensure_ascii=False)
|
127
|
-
|
128
|
-
|
129
|
-
class DBItem:
    """A record bound to one of the namedtuple "tables" of its class.

    Tables are registered per class with ``init_item``. An instance picks
    the table whose field names equal the keyword arguments it was built
    with and stores the values in ``struct_data``. Assigning a field
    replaces ``struct_data`` with an updated namedtuple.

    Reading an attribute that was never set returns ``None``.
    """

    def __init__(self, **kwargs):
        """
        :param kwargs: field values; the key set must equal the field set of
            one registered table.
        :raises ValueError: if no registered table has exactly these fields.
        """
        tables = getattr(type(self), "__table__", None) or []
        wanted = set(kwargs)
        # The index is tracked in a local: assigning ``self._index`` here
        # would go through the guarded __setattr__ and fail with
        # "no struct_data" for every table after the first.
        for index, table in enumerate(tables):
            if wanted == set(table._fields):
                break
        else:
            raise ValueError(
                f"no table of {type(self).__name__} has fields {sorted(wanted)}"
            )

        self.__setattr__("_index", index, True)
        self.__setattr__("struct_data", table(**kwargs), True)
        self.__setattr__("db_name", self.__class__.__name__, True)
        self.__setattr__("table_name", self.struct_data.__class__.__name__, True)

    @classmethod
    def init_item(cls, table_name, fields):
        """Register a table (namedtuple type) and make sure its queue exists.

        :param table_name: name of the namedtuple type to create.
        :param fields: field specification passed to ``namedtuple``.
        """
        queue_name = struct_queue_name(cls.__name__, table_name)
        if getattr(cls, queue_name, None) is None:
            setattr(cls, queue_name, Queue())

        if getattr(cls, "__table__", None) is None:
            cls.__table__ = []

        table = namedtuple(table_name, fields)

        # NOTE(review): ``table`` is a freshly created type, so this
        # membership test looks like it can never be true - confirm whether
        # duplicates should be detected by name/fields instead.
        if table in getattr(cls, "__table__"):
            raise Exception()
        getattr(cls, "__table__").append(table)

    def queue(self):
        """Return the class-level queue registered for this item's table."""
        queue_name = struct_queue_name(self.db_name, self.table_name)
        return getattr(self.__class__, queue_name)

    def __setitem__(self, key, value):
        self.__setattr__(key, value)

    def __getitem__(self, item):
        # Field names are accepted as well as positional indexes, mirroring
        # __setitem__, which addresses fields by name.
        if isinstance(item, str):
            return getattr(self.struct_data, item)
        return self.struct_data[item]

    def __getattr__(self, name):
        # Only invoked when normal lookup fails: unknown names read as None.
        return None

    def __setattr__(self, key, value, init=None):
        """Set a real attribute (``init`` truthy) or update a table field."""
        if init:
            super().__setattr__(key, value)
        elif self.struct_data is None:
            raise Exception(f"no struct_data")
        else:
            self.__setattr__(
                "struct_data",
                self.struct_data._replace(**{key: value}),
                init=True
            )

    def __str__(self):
        return json.dumps(self.struct_data._asdict(), ensure_ascii=False)

    def __repr__(self):
        return f'{self.__class__.__name__}:{self.struct_data}'
|
191
|
-
|