cobweb-launcher 0.1.8__py3-none-any.whl → 1.2.41__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- cobweb/__init__.py +2 -11
- cobweb/base/__init__.py +9 -0
- cobweb/base/basic.py +297 -0
- cobweb/base/common_queue.py +30 -0
- cobweb/base/decorators.py +40 -0
- cobweb/base/dotting.py +35 -0
- cobweb/base/item.py +46 -0
- cobweb/{log.py → base/log.py} +4 -6
- cobweb/base/request.py +82 -0
- cobweb/base/response.py +23 -0
- cobweb/base/seed.py +114 -0
- cobweb/constant.py +94 -0
- cobweb/crawlers/__init__.py +1 -0
- cobweb/crawlers/base_crawler.py +144 -0
- cobweb/crawlers/crawler.py +209 -0
- cobweb/crawlers/file_crawler.py +98 -0
- cobweb/db/__init__.py +2 -2
- cobweb/db/api_db.py +82 -0
- cobweb/db/redis_db.py +125 -218
- cobweb/exceptions/__init__.py +1 -0
- cobweb/exceptions/oss_db_exception.py +28 -0
- cobweb/launchers/__init__.py +3 -0
- cobweb/launchers/launcher.py +235 -0
- cobweb/launchers/launcher_air.py +88 -0
- cobweb/launchers/launcher_api.py +209 -0
- cobweb/launchers/launcher_pro.py +208 -0
- cobweb/pipelines/__init__.py +3 -0
- cobweb/pipelines/pipeline.py +69 -0
- cobweb/pipelines/pipeline_console.py +22 -0
- cobweb/pipelines/pipeline_loghub.py +34 -0
- cobweb/schedulers/__init__.py +3 -0
- cobweb/schedulers/scheduler_api.py +72 -0
- cobweb/schedulers/scheduler_redis.py +72 -0
- cobweb/setting.py +67 -6
- cobweb/utils/__init__.py +5 -0
- cobweb/utils/bloom.py +58 -0
- cobweb/utils/dotting.py +32 -0
- cobweb/utils/oss.py +94 -0
- cobweb/utils/tools.py +42 -0
- cobweb_launcher-1.2.41.dist-info/METADATA +205 -0
- cobweb_launcher-1.2.41.dist-info/RECORD +44 -0
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/WHEEL +1 -1
- cobweb/bbb.py +0 -191
- cobweb/db/oss_db.py +0 -127
- cobweb/db/scheduler/__init__.py +0 -0
- cobweb/db/scheduler/default.py +0 -8
- cobweb/db/scheduler/textfile.py +0 -27
- cobweb/db/storer/__init__.py +0 -0
- cobweb/db/storer/console.py +0 -9
- cobweb/db/storer/loghub.py +0 -54
- cobweb/db/storer/redis.py +0 -15
- cobweb/db/storer/textfile.py +0 -15
- cobweb/decorators.py +0 -16
- cobweb/distributed/__init__.py +0 -0
- cobweb/distributed/launcher.py +0 -243
- cobweb/distributed/models.py +0 -143
- cobweb/interface.py +0 -34
- cobweb/single/__init__.py +0 -0
- cobweb/single/launcher.py +0 -231
- cobweb/single/models.py +0 -134
- cobweb/single/nest.py +0 -153
- cobweb/task.py +0 -50
- cobweb/utils.py +0 -90
- cobweb_launcher-0.1.8.dist-info/METADATA +0 -45
- cobweb_launcher-0.1.8.dist-info/RECORD +0 -31
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/LICENSE +0 -0
- {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/top_level.txt +0 -0
cobweb/db/oss_db.py
DELETED
@@ -1,127 +0,0 @@
|
|
1
|
-
import oss2
|
2
|
-
from typing import Union
|
3
|
-
from oss2.models import PartInfo
|
4
|
-
from requests import Response
|
5
|
-
from cobweb import log
|
6
|
-
|
7
|
-
|
8
|
-
class OssDB:
    """Upload helper around a single Aliyun OSS bucket.

    Data is streamed in ``chunk_size`` pieces.  Payloads that fill at least
    one full chunk go through the multipart API; smaller payloads are sent
    with a single ``put_object`` call, and payloads not larger than
    ``min_size`` are skipped.
    """

    def __init__(
            self,
            bucket_name,
            endpoint,
            access_key,
            secret_key,
            chunk_size,
            min_size
    ):
        """Create the OSS auth/bucket handles.

        :param bucket_name: name of the target bucket
        :param endpoint: OSS endpoint passed to ``oss2.Bucket``
        :param access_key: access key id
        :param secret_key: access key secret
        :param chunk_size: part size in bytes; falsy values fall back to 1 MiB
        :param min_size: minimum payload size in bytes; falsy values fall
            back to 1024
        """
        self.endpoint = endpoint
        self.bucket_name = bucket_name
        self.auth = oss2.Auth(
            access_key_id=access_key,
            access_key_secret=secret_key
        )
        self.bucket = oss2.Bucket(
            auth=self.auth,
            endpoint=endpoint,
            bucket_name=bucket_name
        )
        self.chunk_size = chunk_size or 1024 ** 2
        self.min_size = min_size or 1024

    @staticmethod
    def format_upload_len(length):
        """Return ``length`` (bytes) as a human readable ``"<num> <unit>"`` string.

        The smallest unit of KB/MB/GB/TB whose value does not exceed 1024 is
        used; anything larger than that is still reported in TB instead of
        silently returning ``None``.

        :raises ValueError: if ``length`` is ``None`` or 0
        """
        if not length:
            raise ValueError("Length cannot be None or 0")

        units = ("KB", "MB", "GB", "TB")
        for exponent, unit in enumerate(units, start=1):
            num = length / (1024 ** exponent)
            if num <= 1024:
                break
        # When the loop runs to completion ``num``/``unit`` hold the TB value.
        return f"{round(num, 2)} {unit}"

    def assemble(self, ready_data, part_data):
        """Append ``part_data`` to the buffer and cut off one full chunk if available.

        :return: ``(remaining_buffer, chunk_or_None)``
        """
        upload_data = None
        ready_data = ready_data + part_data
        if len(ready_data) >= self.chunk_size:
            upload_data = ready_data[:self.chunk_size]
            ready_data = ready_data[self.chunk_size:]

        return ready_data, upload_data

    def iter_data(self, data):
        """Yield ``data`` in pieces of at most ``chunk_size`` bytes.

        ``bytes`` are sliced; a ``Response`` is streamed through
        ``iter_content``.  Any other type yields nothing.
        """
        if isinstance(data, bytes):
            for start in range(0, len(data), self.chunk_size):
                yield data[start:start + self.chunk_size]
        elif isinstance(data, Response):
            for part_data in data.iter_content(self.chunk_size):
                yield part_data

    def _upload_part(self, oss_path, upload_id, parts, payload):
        """Upload ``payload`` as the next part and record it in ``parts``."""
        part_index = len(parts) + 1
        upload_info = self.bucket.upload_part(
            oss_path, upload_id, part_index, payload
        )
        parts.append(PartInfo(part_index, upload_info.etag))

    def _abort_upload(self, oss_path, upload_id, headers=None):
        """Best-effort abort of a multipart upload; failures are only logged."""
        if not upload_id:
            return
        try:
            self.bucket.abort_multipart_upload(oss_path, upload_id, headers)
        except Exception as e:
            log.exception("abort multipart upload exception: " + str(e))

    def upload_split(
            self, oss_path: str,
            data: "Union[bytes, Response]",
            timeout: int = 300,
    ):
        """Upload ``data`` to ``oss_path``.

        :param oss_path: object key inside the bucket
        :param data: raw bytes or a streaming ``requests`` response
        :param timeout: used to build the ``Expires`` header (``timeout * 1000``)
        :return: ``True`` when the call finished without error (including the
            case where the payload was too small and was skipped),
            ``False`` otherwise
        """
        parts = []
        status = False
        upload_id = None
        ready_data = b""
        upload_data_len = 0
        headers = {"Expires": str(timeout * 1000)}
        try:
            upload_id = self.bucket.init_multipart_upload(oss_path).upload_id
            for part_data in self.iter_data(data):
                upload_data_len += len(part_data)
                ready_data, upload_data = self.assemble(ready_data, part_data)
                if upload_data:
                    self._upload_part(oss_path, upload_id, parts, upload_data)

            format_upload = self.format_upload_len(upload_data_len)

            if parts:
                # Flush the tail (if any) and always complete: the payload may
                # be an exact multiple of chunk_size, leaving no tail at all.
                if ready_data:
                    self._upload_part(oss_path, upload_id, parts, ready_data)
                self.bucket.complete_multipart_upload(
                    oss_path, upload_id, parts
                )
                log.info(
                    f"split upload, file path: {oss_path}"
                    f", file size: {format_upload}"
                )

            else:
                # No part was sent, so the multipart session is not needed.
                self._abort_upload(oss_path, upload_id)
                upload_id = None
                if len(ready_data) > self.min_size:
                    self.bucket.put_object(oss_path, ready_data, headers)
                    log.info(
                        f"upload file, file path: {oss_path}"
                        f", file size: {format_upload}"
                    )
                else:
                    log.info(
                        f"file size smaller than min size! "
                        f"file size: {format_upload}"
                    )
            status = True
        except ValueError as e:
            log.exception(str(e))
        except oss2.exceptions.RequestError as e:
            # Rebuild the bucket handle after a request-level failure.
            self.bucket = oss2.Bucket(
                auth=self.auth,
                endpoint=self.endpoint,
                bucket_name=self.bucket_name
            )
            log.exception("oss timeout! " + str(e))
        except Exception as e:
            # upload_id is None when init_multipart_upload itself failed.
            self._abort_upload(oss_path, upload_id, headers)
            log.exception("upload file exception: " + str(e))

        return status
|
127
|
-
|
cobweb/db/scheduler/__init__.py
DELETED
File without changes
|
cobweb/db/scheduler/default.py
DELETED
cobweb/db/scheduler/textfile.py
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
from cobweb import log, Seed, SchedulerInterface
|
2
|
-
|
3
|
-
|
4
|
-
class Textfile(SchedulerInterface):
    """Scheduler that reads seeds line by line from the text file ``self.table``."""

    # File offset where the next call to schedule() resumes; None means start.
    index = None

    def schedule(self):
        """Read up to ``self.length`` lines and return them as ``Seed`` objects.

        Reading resumes from the offset remembered in ``self.index``.  When
        the end of the file is reached ``self.stop`` is set to ``True``.
        Blank lines are skipped rather than being mistaken for end of file.

        :return: list of seeds, or ``None`` if the file is missing or a
            ``TypeError`` is raised while reading
        """
        try:
            seeds = []
            with open(self.table, "r") as fp:
                fp.seek(self.index or 0, 0)
                for _ in range(self.length):
                    line = fp.readline()
                    if not line:
                        # readline() returns "" only at real end of file.
                        log.info("scheduler end!")
                        self.stop = True
                        break
                    data = line.strip()
                    if data:
                        seeds.append(Seed(data))
                self.index = fp.tell()
            return seeds
        except FileNotFoundError:
            log.error("task table not found!")
            return None
        except TypeError:
            log.error("task table type error!")
            return None
|
cobweb/db/storer/__init__.py
DELETED
File without changes
|
cobweb/db/storer/console.py
DELETED
cobweb/db/storer/loghub.py
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
import json
|
2
|
-
from aliyun.log import LogClient, LogItem, PutLogsRequest
|
3
|
-
from cobweb import log, StorerInterface
|
4
|
-
|
5
|
-
|
6
|
-
class Loghub(StorerInterface):
    """Storer that ships items to an Aliyun Log Service logstore."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Created lazily on the first store() call.
        self.client = None

    def init_loghub_clint(self):
        """Build the ``LogClient`` from ``self.config``.

        :return: ``True`` when the client was created, ``False`` otherwise
            (the failure is logged and ``self.client`` is reset to ``None``)
        """
        try:
            self.client = LogClient(
                self.config['endpoint'],
                self.config['access_key_id'],
                self.config['access_key']
            )
            return True
        except Exception as e:
            log.exception(e)
            self.client = None
            return False

    def store(self, data_list):
        """Send ``data_list`` to the logstore named by ``self.table``.

        Each item is converted with ``_asdict()``; non-string values are
        JSON-encoded before being put into the log item.

        :return: ``True`` on success, ``False`` on any failure
        """
        try:
            if not self.client and not self.init_loghub_clint():
                # Without a client there is nothing to send to; the cause
                # was already logged by init_loghub_clint().
                return False

            log_items = []
            for item in data_list:
                record = {
                    key: value if isinstance(value, str)
                    else json.dumps(value, ensure_ascii=False)
                    for key, value in item._asdict().items()
                }
                log_item = LogItem()
                # set_contents is given (key, value) pairs sorted by key
                log_item.set_contents(sorted(record.items()))
                log_items.append(log_item)
            request = PutLogsRequest(
                project=self.config["project"],
                logstore=self.table,
                topic=self.config["topic"],
                source=self.config.get("source"),
                logitems=log_items,
                compress=True
            )
            self.client.put_logs(request=request)
            log.info(f"save data, data length: {len(data_list)}")
            return True
        except Exception as e:
            log.exception(e)
            return False
|
54
|
-
|
cobweb/db/storer/redis.py
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
from cobweb import log, StorerInterface
|
2
|
-
|
3
|
-
|
4
|
-
class Redis(StorerInterface):
    """Storer that appends items, one per line, to the file ``self.table``.

    NOTE(review): despite the class name nothing here talks to Redis; the
    body is identical to the text-file storer — confirm this is intended.
    """

    def store(self, data_list):
        """Append ``str(item)`` for every item in ``data_list`` to the file.

        Every record is terminated with a newline so that consecutive
        batches do not run together on one line.

        :return: ``True`` on success, ``False`` on any failure (logged)
        """
        try:
            data_str = "".join(f"{data}\n" for data in data_list)
            with open(self.table, "a") as fp:
                fp.write(data_str)
            log.info(f"save data, data length: {len(data_list)}")
            return True
        except Exception as e:
            log.exception(e)
            return False
|
15
|
-
|
cobweb/db/storer/textfile.py
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
from cobweb import log, StorerInterface
|
2
|
-
|
3
|
-
|
4
|
-
class Textfile(StorerInterface):
    """Storer that appends items, one per line, to the text file ``self.table``."""

    def store(self, data_list):
        """Append ``str(item)`` for every item in ``data_list`` to the file.

        Every record is terminated with a newline so that consecutive
        batches do not run together on one line.

        :return: ``True`` on success, ``False`` on any failure (logged)
        """
        try:
            data_str = "".join(f"{data}\n" for data in data_list)
            with open(self.table, "a") as fp:
                fp.write(data_str)
            log.info(f"save data, data length: {len(data_list)}")
            return True
        except Exception as e:
            log.exception(e)
            return False
|
15
|
-
|
cobweb/decorators.py
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
from functools import wraps
|
2
|
-
from cobweb import log
|
3
|
-
|
4
|
-
|
5
|
-
def check_redis_status(func):
    """Decorator: run ``func`` and turn any exception into a logged ``False``.

    The wrapped callable returns whatever ``func`` returns; if ``func``
    raises, the exception is logged and ``False`` is returned instead.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as exc:
            log.exception(exc)
            return False

    return wrapper
|
16
|
-
|
cobweb/distributed/__init__.py
DELETED
File without changes
|
cobweb/distributed/launcher.py
DELETED
@@ -1,243 +0,0 @@
|
|
1
|
-
import time
|
2
|
-
import threading
|
3
|
-
from threading import Thread
|
4
|
-
|
5
|
-
from .models import Scheduler, Spider, Storer
|
6
|
-
from cobweb import log, Queue, DBItem, RedisDB
|
7
|
-
from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
|
8
|
-
from cobweb.utils import (
|
9
|
-
struct_queue_name as sqn,
|
10
|
-
restore_table_name as rtn,
|
11
|
-
parse_import_model as pim,
|
12
|
-
)
|
13
|
-
|
14
|
-
|
15
|
-
def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
    """Periodically log crawler progress and decide when the run is over.

    After an initial 30 second delay the loop samples the spider, scheduler
    and storer queues every few seconds and logs a status report.  Once the
    scheduler has stopped and neither in-memory seeds nor running spiders
    remain, ``last`` is set, and when the storer queues and both redis
    counters are empty the loop either keeps waiting (``MODEL`` truthy) or
    ends.  ``stop`` is set after the loop ends.

    :param stop: event set when the loop exits
    :param last: event set while the final-drain check runs, cleared each cycle
    :param spider: object exposing ``spider_in_progress.length``
    :param scheduler: object exposing ``stop`` and ``queue.length``
    :param storer_list: storers exposing ``queue.length``
    :param ready_seed_length: callable returning the redis ready-seed count
    :param spider_queue_length: callable returning the redis spider-seed count
    """
    log.info("run check thread after 30 seconds...")
    time.sleep(30)
    report_template = """
------------------- check: {0} ------------------
running_spider_thread_num: {1}
redis_ready_seed_length: {2}
redis_spider_seed_length: {3}
memory_seed_queue_length: {4}
storer_upload_queue_length_info:
{5}
----------------------- end -----------------------"""

    def queue_line(storer):
        # One report line per storer.
        return f"{storer.__class__.__name__} storer queue length: {storer.queue.length}"

    while True:
        status = "running"
        active_spiders = spider.spider_in_progress.length
        ready_in_redis = ready_seed_length()
        spiders_in_redis = spider_queue_length()
        seeds_in_memory = scheduler.queue.length
        queue_lines = [queue_line(storer) for storer in storer_list]

        if scheduler.stop and not (seeds_in_memory or active_spiders):
            if not MODEL:
                log.info("spider is done?")
            last.set()
            time.sleep(3)

            # Re-sample the storer queues after the pause.
            all_flushed = True
            queue_lines = []
            for storer in storer_list:
                if storer.queue.length:
                    all_flushed = False
                queue_lines.append(queue_line(storer))

            if all_flushed and not (ready_in_redis or spiders_in_redis):
                if not MODEL:
                    log.info("spider done!")
                    break
                log.info("waiting for push seeds...")
                status = "waiting"
                time.sleep(30)

        last.clear()

        queue_report = "\n ".join(queue_lines) if queue_lines else "None"
        log.info(report_template.format(
            status,
            active_spiders,
            ready_in_redis,
            spiders_in_redis,
            seeds_in_memory,
            queue_report
        ))

        time.sleep(3)
    stop.set()
|
84
|
-
|
85
|
-
|
86
|
-
def launcher(task):
    """
    Task launch decorator.

    Applying the returned decorator to a crawl function builds the
    scheduler, spider and storers described by ``task`` and immediately
    starts all worker threads.

    :param task: task configuration object
    """
    def decorator(func):
        """
        Wire up and start every worker thread for ``func``.

        ``func`` is called as ``func(item, seed)``.  For each configured
        storer, ``item`` gets an attribute named after the lower-cased
        storer class (for example ``item.loghub``), e.g.

            task.fields = "a,b"
            func(item, seed)
                a = "a"
                b = "b"
                data = {"a": "a", "b": "b"}
                yield item.Loghub(**data)
                yield item.Loghub(a=a, b=b)

        NOTE(review): this function has no return statement, so the
        decorated name is rebound to ``None``.
        """
        storer_list = []

        # Event signalling the final drain phase
        last = threading.Event()
        # Event that stops crawling
        stop = threading.Event()

        # Initialise the redis handle
        redis_db = RedisDB(
            task.project, task.task_name, task.redis_info,
            model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
        )

        log.info("初始化cobweb!")

        seed_queue = Queue()

        if task.scheduler_info is None:
            task.scheduler_info = dict()
        scheduler_info = task.scheduler_info

        # Build the scheduler class dynamically from the configured backend
        sql = scheduler_info.get("sql")
        table = scheduler_info.get("table")
        size = scheduler_info.get("size")
        scheduler_config = scheduler_info.get("config")
        scheduler_db = scheduler_info.get("db", "default")
        DB, class_name = pim(scheduler_db, "scheduler")
        SchedulerTmp = type(class_name, (Scheduler, DB), {})

        # Initialise the scheduler
        scheduler = SchedulerTmp(
            table=table, sql=sql, size=size, queue=seed_queue,
            length=task.scheduler_queue_length, config=scheduler_config
        )

        # Initialise the spider
        spider = Spider(seed_queue, task.max_retries)

        # Normalise the storer configuration to a list
        storer_info_list = task.storer_info or []
        if not isinstance(storer_info_list, list):
            storer_info_list = [storer_info_list]

        # Fresh item namespace handed to the crawl function
        item = type("Item", (object,), {"redis_client": redis_db.client})()

        for storer_info in storer_info_list:
            storer_db = storer_info["db"]
            fields = storer_info["fields"]
            storer_table = storer_info.get("table", "console")
            storer_config = storer_info.get("config")

            StorerDB, class_name = pim(storer_db, "storer")
            StorerTmp = type(class_name, (Storer, StorerDB), {})

            db_name = class_name.lower()
            if not getattr(item, db_name, None):
                setattr(item, db_name, type(db_name, (DBItem,), {}))

            storer_item = getattr(item, db_name)
            storer_item.init_item(storer_table, fields)

            queue = getattr(storer_item, sqn(db_name, storer_table))
            # Initialise the storer
            storer_list.append(StorerTmp(
                table=rtn(table_name=storer_table), fields=fields,
                length=task.storer_queue_length,
                queue=queue, config=storer_config
            ))

        Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
        Thread(target=redis_db.set_heartbeat, args=(stop,)).start()

        # Push the initial seeds
        redis_db.add_seed(task.seeds)

        # Start the scheduler thread that feeds the redis queue
        Thread(
            target=scheduler.schedule_seed,
            args=(
                redis_db.ready_seed_length,
                redis_db.get_scheduler_lock,
                redis_db.add_seed
            )
        ).start()

        # Start the scheduler thread that feeds the in-memory task queue
        Thread(
            target=scheduler.schedule_task,
            args=(
                stop, redis_db.get_seed,
                redis_db.ready_seed_length
            )
        ).start()

        # Start the spider threads
        for _ in range(task.spider_num):
            Thread(
                target=spider.spider_task,
                args=(
                    stop, func, item,
                    redis_db.del_seed
                )
            ).start()

        # Start one thread per storer
        for storer in storer_list:
            Thread(
                target=storer.store_task,
                args=(
                    stop, last,
                    redis_db.reset_seed,
                    redis_db.set_storer
                )
            ).start()

        # Start the monitoring thread
        Thread(
            target=check,
            args=(
                stop, last, spider,
                scheduler, storer_list,
                redis_db.ready_seed_length,
                redis_db.spider_queue_length,
            )
        ).start()

    return decorator
|
243
|
-
|
cobweb/distributed/models.py
DELETED
@@ -1,143 +0,0 @@
|
|
1
|
-
import time
|
2
|
-
from hashlib import md5
|
3
|
-
from cobweb import log, Queue, Seed
|
4
|
-
from cobweb.utils import issubclass_cobweb_inf
|
5
|
-
|
6
|
-
# from pympler import asizeof
|
7
|
-
|
8
|
-
|
9
|
-
class Scheduler:
    """Mixin providing the two scheduler worker loops.

    It reads ``self.stop``, ``self.size``, ``self.length``, ``self.queue``
    and calls ``self.schedule()``, all of which must be supplied by the
    class it is combined with.
    """

    def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
        """Keep producing seeds with ``self.schedule()`` until ``self.stop`` is set.

        :param ready_seed_length: callable returning the current ready-seed count
        :param get_scheduler_lock: callable; seeds are produced only when it
            returns a truthy value
        :param add_seed: callable receiving the result of ``self.schedule()``
        :raises Exception: if the class is not a ``SchedulerInterface`` subclass
        """
        if not issubclass_cobweb_inf(self.__class__, "SchedulerInterface"):
            raise Exception("not have schedule function!")

        # The "Default" scheduler has nothing to produce.
        if self.__class__.__name__ == "Default":
            self.stop = True
            return None

        while not self.stop:
            if ready_seed_length() > self.size:
                # Enough seeds are waiting already; back off.
                time.sleep(15)
                continue
            if get_scheduler_lock():
                add_seed(self.schedule())

        log.info(f"close thread: schedule_seed")

    def schedule_task(self, stop, get_seed, ready_seed_length):
        """Move seeds into ``self.queue`` until ``stop`` is set.

        :param stop: event that ends the loop
        :param get_seed: callable taking a count and returning seeds
        :param ready_seed_length: callable returning the current ready-seed count
        """
        time.sleep(3)
        while not stop.is_set():
            if not ready_seed_length():
                pause = 5
            elif self.queue.length >= self.length:
                pause = 3
            else:
                pause = None

            if pause is not None:
                time.sleep(pause)
                continue

            self.queue.push(get_seed(self.length))
        log.info(f"close thread: schedule_task")
|
47
|
-
|
48
|
-
|
49
|
-
class Spider:
    """Worker that pops seeds from a queue and runs the crawl function on them."""

    def __init__(self, queue, max_retries=5):
        """
        :param queue: seed queue shared with the scheduler
        :param max_retries: a seed whose ``_retry`` reaches this value is dropped
        """
        self.queue = queue
        self.max_retries = max_retries
        # One entry per seed currently being processed.
        self.spider_in_progress = Queue()

    def spider_task(self, stop, func, item, del_seed):
        """Process seeds until ``stop`` is set.

        Each value yielded by ``func(item, seed)`` is handled by kind:
        objects with a truthy ``table_name`` are collected for storage,
        ``Seed`` objects and lists/tuples are pushed back as new seeds, and
        a ``bool`` becomes the status of the current seed.

        :param stop: event that ends the loop
        :param func: crawl function, called as ``func(item, seed)``
        :param item: object passed through to ``func``
        :param del_seed: callable ``del_seed(seed, spider_status=...)``
        """
        while not stop.is_set():
            seed = self.queue.pop()
            if not seed:
                time.sleep(3)
                continue
            if seed._retry >= self.max_retries:
                del_seed(seed, spider_status=False)
                continue

            try:
                self.spider_in_progress.push(1, direct_insertion=True)
                yielded = 0
                status = None
                store_queue = None
                store_data = []

                for result in func(item, seed):
                    yielded += 1
                    if getattr(result, "table_name", None):
                        if not store_queue:
                            store_queue = result.queue()
                        store_data.append(result.struct_data)
                    elif isinstance(result, Seed):
                        self.queue.push(result)
                    elif isinstance(result, (list, tuple)):
                        new_seeds = [
                            entry if isinstance(entry, Seed) else Seed(entry)
                            for entry in result
                        ]
                        self.queue.push(new_seeds)
                    elif isinstance(result, bool):
                        status = result

                if store_queue and store_data:
                    # The seed is pushed along with the data it produced.
                    store_data.append(seed)
                    store_queue.push(store_data)

                if status:
                    del_seed(seed, spider_status=True)
                elif not yielded or status is False:
                    seed._retry += 1
                    self.queue.push(seed)

            except Exception as exc:
                seed._retry += 1
                self.queue.push(seed)
                log.info(f"{str(seed)} -> {str(exc)}")
            finally:
                self.spider_in_progress.pop()
        log.info(f"close thread: spider")
|
102
|
-
|
103
|
-
|
104
|
-
class Storer:
    """Mixin providing the storer worker loop.

    It reads ``self.table``, ``self.queue``, ``self.length`` and calls
    ``self.store()``, all of which must be supplied by the class it is
    combined with.
    """

    def store_task(self, stop, last, reset_seed, set_storer):
        """Drain ``self.queue`` in batches and hand them to ``self.store``.

        A batch is taken when ``last`` is set or the queue holds at least
        ``self.length`` entries.  ``Seed`` entries found in the queue are
        collected separately and reported through ``set_storer`` on success
        or ``reset_seed`` on failure.

        :param stop: event that ends the loop
        :param last: event forcing a drain regardless of queue length
        :param reset_seed: callable receiving the seeds of a failed batch
        :param set_storer: callable ``set_storer(store_key_id, seeds)`` for a
            stored batch
        :raises Exception: if the instance has no ``store`` attribute
        """
        if not issubclass_cobweb_inf(self.__class__, "StorerInterface"):
            return None

        if not getattr(self, "store", None):
            raise Exception("not have store function!")

        storer_name = self.__class__.__name__ + self.table
        store_key_id = md5(storer_name.encode()).hexdigest()

        while not stop.is_set():
            batch_due = last.is_set() or self.queue.length >= self.length
            if batch_due:
                seeds = []
                data_list = []

                while True:
                    entry = self.queue.pop()
                    if not entry:
                        break
                    if not isinstance(entry, Seed):
                        data_list.append(entry)
                        continue
                    seeds.append(entry)
                    # Only cut the batch right after a seed.
                    if len(data_list) >= self.length:
                        break

                if data_list:
                    if self.store(data_list):
                        set_storer(store_key_id, seeds)
                    else:
                        reset_seed(seeds)
                    # Look for the next batch without sleeping.
                    continue

            time.sleep(3)

        log.info(f"close thread: {storer_name}")
|