cobweb-launcher 3.1.22__py3-none-any.whl → 3.1.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/base/__init__.py +2 -0
- cobweb/base/task_queue.py +179 -0
- cobweb/launchers/distributor.py +62 -53
- cobweb/launchers/launcher.py +17 -18
- cobweb/launchers/uploader.py +42 -28
- cobweb/schedulers/scheduler.py +4 -45
- cobweb/schedulers/scheduler_with_redis.py +70 -64
- {cobweb_launcher-3.1.22.dist-info → cobweb_launcher-3.1.23.dist-info}/METADATA +1 -1
- {cobweb_launcher-3.1.22.dist-info → cobweb_launcher-3.1.23.dist-info}/RECORD +12 -11
- {cobweb_launcher-3.1.22.dist-info → cobweb_launcher-3.1.23.dist-info}/LICENSE +0 -0
- {cobweb_launcher-3.1.22.dist-info → cobweb_launcher-3.1.23.dist-info}/WHEEL +0 -0
- {cobweb_launcher-3.1.22.dist-info → cobweb_launcher-3.1.23.dist-info}/top_level.txt +0 -0
cobweb/base/__init__.py
CHANGED
cobweb/base/task_queue.py
ADDED
@@ -0,0 +1,179 @@
+import time
+import threading
+from enum import Enum
+from hashlib import md5
+from dataclasses import dataclass
+from typing import Dict, Any, Optional, List
+
+
+class Status(Enum):
+    PENDING = 0      # pending
+    PROCESSING = 1   # in progress
+    FINISHED = 2     # finished
+    INSERT = 3       # failed
+    UPLOAD = 4       # expired
+
+
+@dataclass
+class Task:
+    task_id: str                       # unique seed ID
+    data: Any                          # seed payload
+    status: Status                     # current status
+    priority: int                      # priority (lower value = higher priority)
+    created_at: float                  # creation timestamp
+    parent_id: Optional[str] = None    # parent seed ID
+    children_ids: List[str] = None     # list of child seed IDs
+    ttl_seconds: Optional[int] = None  # optional TTL in seconds
+
+    def __post_init__(self):
+        if self.children_ids is None:
+            self.children_ids = []
+
+
+class TaskQueue:
+
+    def __init__(self, cleanup_interval=60):
+        self._tasks: Dict[str, Task] = {}
+        self._lock = threading.Lock()
+        # self.cleanup_interval = cleanup_interval
+        # self._start_cleanup_task()
+
+    # def _start_cleanup_task(self):
+    #     """Start a background thread that cleans up expired seeds."""
+    #     def run():
+    #         while True:
+    #             time.sleep(self.cleanup_interval)
+    #             self._cleanup_expired_seeds()
+    #     threading.Thread(target=run, daemon=True).start()
+
+    def length(self) -> int:
+        with self._lock:
+            return len(self._tasks)
+
+    def status_length(self, status) -> int:
+        with self._lock:
+            return len([it for it in self._tasks.values() if it.status == status])
+
+    def get_task(self, task_id) -> Task:
+        with self._lock:
+            if task_id in self._tasks:
+                return self._tasks[task_id]
+
+    def get_task_by_status(self, status: list, limit: int = None) -> List[Task]:
+        with self._lock:
+            if not isinstance(status, list):
+                status = [status]
+            task_list = [it for it in self._tasks.values() if it.status in status]
+            task_list.sort(key=lambda x: (x.priority, x.created_at))
+            return task_list[:limit] if limit else task_list
+
+    def get_pending_task(self) -> Task:
+        with self._lock:
+            if items := [it for it in self._tasks.values() if it.status == Status.PENDING]:
+                items.sort(key=lambda x: (x.priority, x.created_at))
+                task_item = items[0]
+                task_item.status = Status.PROCESSING
+                self._tasks[task_item.task_id] = task_item
+                return task_item
+
+    def pop_task(self, status) -> Task:
+        with self._lock:
+            if items := [it for it in self._tasks.values() if it.status == status]:
+                items.sort(key=lambda x: (x.priority, x.created_at))
+                task_item = items[0]
+
+                to_remove = set()
+                queue = [task_item.task_id]
+
+                while queue:
+                    current = queue.pop(0)
+                    if current in self._tasks:
+                        to_remove.add(current)
+                        queue.extend(self._tasks[current].children_ids)
+                        del self._tasks[current]
+
+                for tid in to_remove:
+                    if task_item := self._tasks.get(tid):
+                        if task_item.parent_id in self._tasks:
+                            if tid in self._tasks[task_item.parent_id].children_ids:
+                                self._tasks[task_item.parent_id].children_ids.remove(tid)
+
+    def add_task(
+            self,
+            task_id: str = None,
+            data: Any = None,
+            status=Status.PENDING,
+            priority: int = 500,
+            parent_id: Optional[str] = None,
+            ttl_seconds: Optional[int] = None
+    ) -> bool:
+        """Add a new seed; a parent seed may be specified."""
+        with self._lock:
+            if not task_id:
+                task_id = md5(str(time.time()).encode()).hexdigest()
+
+            if task_id in self._tasks:
+                return False  # avoid duplicate insertion
+
+            task_item = Task(
+                task_id=task_id,
+                data=data,
+                status=status,
+                priority=priority,
+                created_at=int(time.time()),
+                parent_id=parent_id,
+                ttl_seconds=ttl_seconds
+            )
+            self._tasks[task_id] = task_item
+
+            if parent_id and parent_id in self._tasks:
+                self._tasks[parent_id].children_ids.append(task_id)
+
+            return True
+
+    def update_task(self, task_id, status, data=None) -> Task:
+        with self._lock:
+            task_item = self._tasks[task_id]
+            task_item.status = status
+            if data:
+                task_item.data = data
+
+            for tid in task_item.children_ids:
+                if self._tasks[tid].status == Status.INSERT:
+                    del self._tasks[tid]
+
+            task_item.children_ids = []
+            self._tasks[task_id] = task_item
+
+            return task_item
+
+    def remove(self, task_ids: list) -> bool:
+        with self._lock:
+            for task_id in task_ids:
+                if task_item := self._tasks.get(task_id):
+
+                    if task_item.children_ids:
+                        continue
+
+                    if task_item.parent_id in self._tasks:
+                        if task_id in self._tasks[task_item.parent_id].children_ids:
+                            self._tasks[task_item.parent_id].children_ids.remove(task_id)
+
+                    del self._tasks[task_id]
+
+    def count_children(self, task_id: str) -> int:
+        with self._lock:
+            if task_id in self._tasks:
+                return len(self._tasks[task_id].children_ids)
+        return 0
+
+    # def _cleanup_expired_seeds(self):
+    #     now = time.time()
+    #     expired_ids = []
+    #     with self._lock:
+    #         for seed_id, seed in self._seeds.items():
+    #             if seed.ttl_seconds and now - seed.created_at > seed.ttl_seconds:
+    #                 expired_ids.append(seed_id)
+    #         for seed_id in expired_ids:
+    #             self._seeds[seed_id] = self._seeds[seed_id]._replace(status=SeedStatus.EXPIRED)
+    #     print(f"Cleaned up {len(expired_ids)} expired seeds")
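The new in-memory TaskQueue is what the launcher, distributor, uploader and scheduler threads now share instead of separate Queue objects. As a quick orientation, here is a minimal usage sketch of the class above; it is illustrative only and not part of the release: the seed ID and payload values are made up, and the import path follows the cobweb.base exports shown in the other files of this diff.

from cobweb.base import TaskQueue, Status

tq = TaskQueue()

# A pending seed, plus an item yielded while crawling it (stored as a child task).
tq.add_task(task_id="seed-1", data={"url": "https://example.com"}, priority=100)
tq.add_task(data={"table": "demo", "row": 1}, status=Status.UPLOAD, parent_id="seed-1")

print(tq.count_children("seed-1"))        # 1

task = tq.get_pending_task()              # highest-priority PENDING task, now PROCESSING
tq.update_task(task.task_id, status=Status.FINISHED)

print(tq.status_length(Status.UPLOAD))    # 1: the child item still waits for the uploader
print(tq.status_length(Status.FINISHED))  # 1: the seed now waits for the scheduler's delete()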
cobweb/launchers/distributor.py
CHANGED
@@ -2,15 +2,15 @@ import time
 import threading
 import traceback

-from typing import Callable, Type
 from inspect import isgenerator
+from typing import Callable, Type
 from urllib.parse import urlparse
 from requests import Response as Res

 from cobweb.crawlers import Crawler
-from cobweb.constant import DealModel, LogTemplate
 from cobweb.utils import LoghubDot, check_pause
-from cobweb.
+from cobweb.constant import DealModel, LogTemplate
+from cobweb.base import Seed, Status, TaskQueue, BaseItem, Request, Response, logger


 class Distributor(threading.Thread):
@@ -19,10 +19,7 @@ class Distributor(threading.Thread):
         self,
         task: str,
         project: str,
-
-        todo: Queue,
-        done: Queue,
-        upload: Queue,
+        task_queue: TaskQueue,
         stop: threading.Event,
         pause: threading.Event,
         callback_register: Callable,
@@ -34,10 +31,8 @@ class Distributor(threading.Thread):
         self.stop = stop
         self.pause = pause

-        self.
-
-        self.done = done
-        self.upload = upload
+        self.task_queue = task_queue
+
         self.callback_register = callback_register
         self.Crawler = SpiderCrawler

@@ -46,69 +41,83 @@ class Distributor(threading.Thread):
         self.thread_num = setting.SPIDER_THREAD_NUM
         self.max_retries = setting.SPIDER_MAX_RETRIES
         self.record_failed = setting.RECORD_FAILED_SPIDER
-        self.loghub_dot = LoghubDot(stop=stop) # todo
+        self.loghub_dot = LoghubDot(stop=stop)  # todo

         logger.debug(f"Distribute instance attrs: {self.__dict__}")

-    def distribute(self,
+    def distribute(self, task_id, item, _id: int):
         if isinstance(item, Request):
-            seed.params.start_time = time.time()
-            self.process(
+            item.seed.params.start_time = time.time()
+            self.process(task_id=task_id, item=item, callback=self.Crawler.download, _id=1)
+
         elif isinstance(item, Response):
             if _id == 2:
                 raise TypeError("parse function can't yield a Response instance")
             dot = isinstance(item.response, Res)
-            #
-            self.
-
+            self.spider_logging(item.seed, item, dot=dot)  # todo: update
+            self.process(task_id=task_id, item=item, callback=self.Crawler.parse, _id=2)
+
         elif isinstance(item, BaseItem):
-            self.
+            self.task_queue.add_task(
+                data=item,
+                status=Status.UPLOAD,
+                parent_id=task_id
+            )
+
         elif isinstance(item, Seed):
-            self.
-
-
-
-
-
-
-
-
-        else:
-            if record_failed := self.record_failed:
-                try:
-                    response = Response(seed, "failed", max_retries=True)
-                    self.process(response, seed, self.Crawler.parse, _id=2)
-                except Exception as e:
-                    msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
-                    logger.error(msg = msg)
-                    record_failed = False
-            if not record_failed:
-                self.done.push(seed)
-        else:
+            self.task_queue.add_task(
+                task_id=item.sid,
+                data=item,
+                status=Status.INSERT,
+                priority=item.params.priority,
+                parent_id=task_id
+            )
+
+        elif isinstance(item, str) and item != DealModel.done:
             raise TypeError("yield value type error!")

-    def process(self,
-
-        if not isgenerator(
+    def process(self, task_id, item, callback, _id: int):
+        iterators = callback(item)
+        if not isgenerator(iterators):
             raise TypeError(f"{callback.__name__} function isn't a generator!")
-        for
-            self.distribute(
+        for it in iterators:
+            self.distribute(task_id=task_id, item=it, _id=_id)

     @check_pause
     def spider(self):
-
-
+        if task_item := self.task_queue.get_pending_task():
+            seed = task_item.data
+            status = Status.FINISHED
+            task_id = task_item.task_id
+
             try:
-                self.process(
+                self.process(task_id=task_id, item=seed, callback=self.Crawler.request, _id=0)
+
             except Exception as e:
-
+
+                seed.params.retry += 1
+                url, _status = seed.url, e.__class__.__name__
+
                 msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
                 if getattr(e, "response", None) and isinstance(e.response, Res):
                     url = e.response.request.url
-
-
-                self.spider_logging(seed, None, error=True, url=url, status=
-
+                    _status = e.response.status_code
+
+                self.spider_logging(seed, None, error=True, url=url, status=_status, msg=msg)
+
+                if seed.params.retry < self.max_retries:
+                    status = Status.PENDING
+
+                elif self.record_failed:
+                    try:
+                        response = Response(seed, "failed", max_retries=True)
+                        self.process(task_id=task_id, item=response, callback=self.Crawler.parse, _id=2)
+                    except Exception as e:
+                        msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                        logger.error(msg=msg)
+
+            finally:
+                self.task_queue.update_task(task_id, status=status, data=seed)

     def spider_logging(
         self, seed,
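The new spider() error path above boils down to a single status decision per claimed task. A condensed, stand-alone restatement of that rule (resolve_status is a hypothetical helper written for illustration, not a function in the module):

from cobweb.base import Status

def resolve_status(retry: int, max_retries: int) -> Status:
    # After a download/parse exception the seed's retry counter has already been
    # incremented; below max_retries the seed goes back to PENDING for another pass,
    # otherwise it stays FINISHED so the scheduler's delete() can clear it from redis.
    return Status.PENDING if retry < max_retries else Status.FINISHED

assert resolve_status(retry=1, max_retries=5) is Status.PENDING
assert resolve_status(retry=5, max_retries=5) is Status.FINISHED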
cobweb/launchers/launcher.py
CHANGED
@@ -5,20 +5,15 @@ import threading
 import importlib

 from cobweb import setting
-from cobweb.
+from cobweb.launchers.uploader import Uploader
 from cobweb.utils.tools import dynamic_load_class
 from cobweb.launchers.distributor import Distributor
-from cobweb.
+from cobweb.base import Seed, logger, TaskQueue, Status
 from typing import Optional, Union, Dict, Any, Callable


 class Launcher:

-    _NEW_QUEUE_ = Queue()
-    _TODO_QUEUE_ = Queue()
-    _DONE_QUEUE_ = Queue()
-    _UPLOAD_QUEUE_ = Queue()
-
     __REGISTER_FUNC__: Dict[str, Callable] = {}
     __WORKER_THREAD__: Dict[str, threading.Thread] = {}

@@ -33,6 +28,7 @@ class Launcher:
         self._pause = threading.Event()  # pause event

         _setting = self._load_custom_settings(custom_setting)
+
         _setting.update(kwargs)
         for key, value in _setting.items():
             setattr(setting, key.upper(), value)
@@ -40,6 +36,8 @@ class Launcher:
         self._done_model = setting.DONE_MODEL
         self._task_model = setting.TASK_MODEL

+        self._task_queue = TaskQueue()
+
         self.Scheduler = dynamic_load_class(setting.SCHEDULER)
         self.SpiderCrawler = dynamic_load_class(setting.CRAWLER)
         self.SpiderPipeline = dynamic_load_class(setting.PIPELINE)
@@ -108,7 +106,15 @@ class Launcher:

     def start_seeds(self, seeds: list[Union[str, Dict]]) -> list[Seed]:
         seed_list = [Seed(seed) for seed in seeds]
-
+        for seed in seed_list:
+            self._task_queue.add_task(
+                task_id=seed.sid,
+                data=seed,
+                status=Status.PENDING,
+                priority=seed.params.priority,
+                parent_id=None,
+                ttl_seconds=None
+            )
         return seed_list

     def _register(self, func: Callable, tag: str = "launcher"):
@@ -139,20 +145,14 @@ class Launcher:
             project=self.project,
             stop=self._stop,
             pause=self._pause,
-
-            todo=self._TODO_QUEUE_,
-            done=self._DONE_QUEUE_,
-            upload=self._UPLOAD_QUEUE_,
+            task_queue=self._task_queue,
             callback_register=self._register
         ).start()

         Distributor(
             task=self.task,
             project=self.project,
-
-            todo=self._TODO_QUEUE_,
-            done=self._DONE_QUEUE_,
-            upload=self._UPLOAD_QUEUE_,
+            task_queue=self._task_queue,
             callback_register=self._register,
             stop=self._stop, pause=self._pause,
             SpiderCrawler=self.SpiderCrawler
@@ -161,8 +161,7 @@ class Launcher:
         Uploader(
             task=self.task, project=self.project,
             stop=self._stop, pause=self._pause,
-
-            upload=self._UPLOAD_QUEUE_,
+            task_queue=self._task_queue,
             callback_register=self._register,
             SpiderPipeline=self.SpiderPipeline
         ).start()
cobweb/launchers/uploader.py
CHANGED
@@ -2,7 +2,7 @@ import time
 import threading
 from typing import Callable, Type
 from cobweb.pipelines import Pipeline
-from cobweb.base import
+from cobweb.base import TaskQueue, logger, Status
 from cobweb.utils import check_pause


@@ -14,7 +14,7 @@ class Uploader(threading.Thread):
         project: str,
         stop: threading.Event,
         pause: threading.Event,
-
+        task_queue: TaskQueue,
         callback_register: Callable,
         SpiderPipeline: Type[Pipeline]
     ):
@@ -25,8 +25,7 @@ class Uploader(threading.Thread):
         self.stop = stop
         self.pause = pause

-        self.
-        self.upload = upload
+        self.task_queue = task_queue
         self.callback_register = callback_register

         from cobweb import setting
@@ -40,30 +39,45 @@ class Uploader(threading.Thread):

     @check_pause
     def upload_data(self):
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if task_list := self.task_queue.get_task_by_status(
+                status=Status.UPLOAD, limit=self.upload_size
+        ):
+            try:
+                data_info, task_ids = dict(), set()
+                for task_item in task_list:
+                    upload_data = self.pipeline.build(task_item.data)
+                    data_info.setdefault(task_item.data.table, []).append(upload_data)
+
+                for table, datas in data_info.items():
+                    try:
+                        self.pipeline.upload(table, datas)
+                    except Exception as e:
+                        logger.info(e)
+            except Exception as e:
+                logger.info(e)
+
+        if self.task_queue.status_length(status=Status.UPLOAD) < self.upload_size:
+            time.sleep(self.wait_seconds)
+
+        # data_info, seeds = {}, []
+        # try:
+        #     for _ in range(self.upload_size):
+        #         item = self.upload.pop()
+        #         if not item:
+        #             break
+        #         # seeds.append(item.seed)
+        #         data = self.pipeline.build(item)
+        #         data_info.setdefault(item.table, []).append(data)
+        #     for table, datas in data_info.items():
+        #         try:
+        #             self.pipeline.upload(table, datas)
+        #             # TODO: upload metrics dot
+        #         except Exception as e:
+        #             logger.info(e)
+        # except Exception as e:
+        #     logger.info(e)
+        # if self.upload.length < self.upload_size:
+        #     time.sleep(self.wait_seconds)

     def run(self):
         self.callback_register(self.upload_data, tag="Uploader")
cobweb/schedulers/scheduler.py
CHANGED
@@ -1,15 +1,13 @@
 import threading

-
 from typing import Callable
-from cobweb.base import
+from cobweb.base import TaskQueue
 from abc import ABC, abstractmethod


 class Scheduler(ABC, threading.Thread):

-
-    __LAUNCHER_FUNC__ = ["_reset", "_scheduler", "_insert", "_refresh", "_delete"]
+    # __LAUNCHER_FUNC__ = ["_reset", "_scheduler", "_insert", "_refresh", "_delete"]

     def __init__(
         self,
@@ -17,10 +15,7 @@ class Scheduler(ABC, threading.Thread):
         project,
         stop: threading.Event,
         pause: threading.Event,
-
-        todo: Queue,
-        done: Queue,
-        upload: Queue,
+        task_queue: TaskQueue,
         callback_register: Callable
     ):
         super().__init__()
@@ -44,45 +39,9 @@ class Scheduler(ABC, threading.Thread):
         self.stop = stop
         self.pause = pause

-        self.
-        self.todo = todo
-        self.done = done
-        self.upload = upload
+        self.task_queue = task_queue

         self.callback_register = callback_register
-        self.lock = threading.Lock()
-
-    def is_empty(self):
-        return all(queue.empty() for queue in (self.new, self.todo, self.done, self.upload))
-
-    def set_working_items(self, item_info: dict = None):
-        if not item_info:
-            return False
-        with self.lock:
-            self.__WORKING_ITEMS__.update(item_info)
-
-    def get_working_items(self) -> dict:
-        with self.lock:
-            return self.__WORKING_ITEMS__.copy()
-
-    def remove_working_items(self, items: list[str] = None) -> int:
-        if not items:
-            return 0
-        with self.lock:
-            deleted_count = 0
-            for item in items:
-                if item in self.__WORKING_ITEMS__:
-                    del self.__WORKING_ITEMS__[item]
-                    deleted_count += 1
-            return deleted_count
-
-    def get_working_items_count(self) -> int:
-        with self.lock:
-            return len(self.__WORKING_ITEMS__)
-
-    def clear_working_items(self):
-        with self.lock:
-            self.__WORKING_ITEMS__.clear()

     @abstractmethod
     def reset(self):
cobweb/schedulers/scheduler_with_redis.py
CHANGED
@@ -4,7 +4,7 @@ import threading
 from typing import Callable
 from cobweb.db import RedisDB, ApiDB
 from cobweb.utils import check_pause
-from cobweb.base import
+from cobweb.base import Seed, logger, TaskQueue, Status
 from cobweb.constant import LogTemplate
 from .scheduler import Scheduler
 use_api = bool(os.getenv("REDIS_API_HOST", 0))
@@ -18,13 +18,10 @@ class RedisScheduler(Scheduler):
         project,
         stop: threading.Event,
         pause: threading.Event,
-
-        todo: Queue,
-        done: Queue,
-        upload: Queue,
+        task_queue: TaskQueue,
         callback_register: Callable
     ):
-        super().__init__(task, project, stop, pause,
+        super().__init__(task, project, stop, pause, task_queue, callback_register)
         self.todo_key = f"{{{project}:{task}}}:todo"
         self.done_key = f"{{{project}:{task}}}:done"
         self.fail_key = f"{{{project}:{task}}}:fail"
@@ -38,14 +35,13 @@ class RedisScheduler(Scheduler):
        Check expired seeds and re-add them to the redis cache
         """
         while not self.stop.is_set():
-            if self.db.lock(self.reset_lock_key, t=
+            if self.db.lock(self.reset_lock_key, t=360):

-
-                _min = -int(time.time()) + 3600
+                _min = -int(time.time()) + self.seed_reset_seconds
                 self.db.members(self.todo_key, 0, _min=_min, _max="(0")
                 self.db.delete(self.reset_lock_key)

-            time.sleep(
+            time.sleep(self.seed_reset_seconds)

     @check_pause
     def schedule(self):
@@ -56,42 +52,43 @@ class RedisScheduler(Scheduler):
             time.sleep(self.scheduler_wait_seconds)
             return

-        if self.
+        if self.task_queue.status_length(Status.PENDING) >= self.todo_queue_size:
            time.sleep(self.todo_queue_full_wait_seconds)
            return

-        members
+        if members := self.db.members(
            self.todo_key, int(time.time()),
            count=self.todo_queue_size,
            _min=0, _max="(1000"
-        )
-
-
-
-
-
-
-
-
-
-            # self.set_working_items(item_info)
-            self.todo.push(seeds)
-            # time.sleep(1)
+        ):
+            for member, priority in members:
+                seed = Seed(member, priority=int(priority % 1000))
+                self.task_queue.add_task(
+                    task_id=seed.sid,
+                    data=seed,
+                    status=Status.PENDING,
+                    priority=seed.params.priority
+                )

     @check_pause
     def insert(self):
         """
         Add new seeds to the redis queue
         """
-
-
-
-
+        if task_list := self.task_queue.get_task_by_status(
+                status=Status.INSERT, limit=self.new_queue_max_size
+        ):
+            seed_info, task_ids = dict(), set()

-
-
+            for task_item in task_list:
+                seed = task_item.data
+                task_ids.add(task_item.task_id)
+                seed_info[seed.to_string] = seed.params.priority

-
+            self.db.zadd(self.todo_key, seed_info, nx=True)
+            self.task_queue.remove(task_ids)
+
+        if self.task_queue.status_length(status=Status.INSERT) < self.new_queue_max_size:
             time.sleep(self.scheduler_wait_seconds)

     @check_pause
@@ -99,27 +96,28 @@ class RedisScheduler(Scheduler):
         """
         Refresh the expiration time of doing seeds so reset() does not re-consume them
         """
-        if
+        if task_list := self.task_queue.get_task_by_status(
+                status=[Status.PENDING, Status.PROCESSING, Status.FINISHED],
+        ):
             refresh_time = int(time.time())
-            seed_info = {
+            seed_info = {it.data.to_string: -refresh_time - it.data.params.priority / 1000 for it in task_list}
             self.db.zadd(self.todo_key, seed_info, xx=True)
-
-        time.sleep(20)
+        time.sleep(self.seed_reset_seconds // 3)

     @check_pause
     def delete(self):
         """
         Remove queued seeds: add them to the success or failure queue by status and drop the doing index
         """
-
-
-
-
-
-
-        self.
-
-        if self.
+        if task_list := self.task_queue.get_task_by_status(
+                status=Status.FINISHED, limit=self.done_queue_max_size
+        ):
+            zrem_items = [it.data.to_string for it in task_list]
+            remove_task_ids = [it.task_id for it in task_list]
+            self.db.zrem(self.todo_key, *zrem_items)
+            self.task_queue.remove(remove_task_ids)
+
+        if self.task_queue.status_length(status=Status.FINISHED) < self.done_queue_max_size:
             time.sleep(self.done_queue_wait_seconds)

     def run(self):
@@ -129,32 +127,41 @@ class RedisScheduler(Scheduler):
             self.callback_register(func, tag="scheduler")

         while not self.stop.is_set():
-
-
-
-
+            todo_len = self.task_queue.status_length(status=Status.PENDING)
+            doing_len = self.task_queue.status_length(status=Status.PROCESSING)
+            done_len = self.task_queue.status_length(status=Status.FINISHED)
+            upload_len = self.task_queue.status_length(status=Status.UPLOAD)
+
+            redis_doing_count = self.db.zcount(self.todo_key, "-inf", "(0")
+            redis_todo_len = self.db.zcount(self.todo_key, 0, "(1000")
+            redis_seed_count = self.db.zcard(self.todo_key)

             if self.pause.is_set():
                 execute_time = int(time.time()) - start_time
                 if not self.task_model and execute_time > self.before_scheduler_wait_seconds:
                     logger.info("Done! ready to close thread...")
                     self.stop.set()
-                elif
+                elif redis_todo_len:
                     logger.info(
-                        f"Recovery {self.task} task run!
+                        f"Recovery {self.task} task run!"
+                        f"Todo seeds count: {redis_todo_len}"
+                        f", queue length: {redis_seed_count}"
+                    )
                     self.pause.clear()
                 else:
                     logger.info("Pause! waiting for resume...")

-            elif self.
-
-
-
+            elif self.task_queue.length() == 0:
+                if redis_seed_count:
+                    logger.info(
+                        f"Todo seeds count: {redis_todo_len}"
+                        f", queue length: {redis_seed_count}"
+                    )
                     self.pause.clear()
                 else:
                     count = 0
                     for _ in range(3):
-                        if not
+                        if not redis_seed_count:
                             count += 1
                             time.sleep(5)
                             logger.info("Checking count...")
@@ -162,19 +169,18 @@ class RedisScheduler(Scheduler):
                             break
                     if count >= 3:
                         logger.info("Todo queue is empty! Pause set...")
-                        self.clear_working_items()
                         self.pause.set()

             else:
                 logger.info(LogTemplate.launcher_pro_polling.format(
                     task=self.task,
-                    doing_len=
-                    todo_len=
-                    done_len=
-                    redis_seed_count=
-                    redis_todo_len=
-                    redis_doing_len=
-                    upload_len=
+                    doing_len=doing_len,
+                    todo_len=todo_len,
+                    done_len=done_len,
+                    redis_seed_count=redis_seed_count,
+                    redis_todo_len=redis_todo_len,
+                    redis_doing_len=redis_doing_count,
+                    upload_len=upload_len,
                 ))

             time.sleep(30)
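The scheduler methods above all key off the score stored on the {project:task}:todo sorted set. Reading the calls shown in this diff (the zcount bounds, the refresh() comprehension, and priority % 1000 in schedule()), the convention appears to be: schedulable seeds carry their priority as a score in [0, 1000), while in-flight seeds carry a negative score derived from the time they were claimed. A small sketch of that reading; this is an inference from the shown code, not documented package behavior:

import time

def pending_score(priority: int) -> int:
    # Matched by schedule(): members(..., _min=0, _max="(1000")
    return priority

def doing_score(priority: int, claimed_at: int) -> float:
    # Matched by refresh(): {seed: -refresh_time - priority / 1000}
    # and by the monitoring zcount(todo_key, "-inf", "(0").
    return -claimed_at - priority / 1000

print(pending_score(300))                       # 300
print(doing_score(300, claimed_at=1700000000))  # -1700000000.3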
{cobweb_launcher-3.1.22.dist-info → cobweb_launcher-3.1.23.dist-info}/RECORD
CHANGED
@@ -1,13 +1,14 @@
 cobweb/__init__.py,sha256=YdBi3uytEFRXan155xU1kKMpiUKUupO2RGeJyXmH0zk,129
 cobweb/constant.py,sha256=zy3XYsc1qp2B76_Fn_hVQ8eGHlPBd3OFlZK2cryE6FY,2839
 cobweb/setting.py,sha256=rHPQfc4a1xMTbkt3_KXBfUomhYcbTXogsz7ew-QsqHw,1670
-cobweb/base/__init__.py,sha256=
+cobweb/base/__init__.py,sha256=NanSxJr0WsqjqCNOQAlxlkt-vQEsERHYBzacFC057oI,222
 cobweb/base/common_queue.py,sha256=hYdaM70KrWjvACuLKaGhkI2VqFCnd87NVvWzmnfIg8Q,1423
 cobweb/base/item.py,sha256=1bS4U_3vzI2jzSSeoEbLoLT_5CfgLPopWiEYtaahbvw,1674
 cobweb/base/logger.py,sha256=Vsg1bD4LXW91VgY-ANsmaUu-mD88hU_WS83f7jX3qF8,2011
 cobweb/base/request.py,sha256=MBYYjWpbRQRulPG0zPbK0DO3LKmScqQ4tBzFXekYkao,2652
 cobweb/base/response.py,sha256=g8e5H0hEiRfqseh3nD7t6a1rhIJYRMV7nI47kqNOd-U,446
 cobweb/base/seed.py,sha256=ddaWCq_KaWwpmPl1CToJlfCxEEnoJ16kjo6azJs9uls,5000
+cobweb/base/task_queue.py,sha256=3ScPKnjlPEuuCzWyG9D2iHiND3L9lLM7fo1LNOkw8CY,6337
 cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
 cobweb/crawlers/crawler.py,sha256=ZZVZJ17RWuvzUFGLjqdvyVZpmuq-ynslJwXQzdm_UdQ,709
 cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
@@ -16,24 +17,24 @@ cobweb/db/redis_db.py,sha256=X7dUpW50QcmRPjYlYg7b-fXF_fcjuRRk3DBx2ggetXk,7687
 cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
 cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
 cobweb/launchers/__init__.py,sha256=6_v2jd2sgj6YnOB1nPKiYBskuXVb5xpQnq2YaDGJgQ8,100
-cobweb/launchers/distributor.py,sha256=
-cobweb/launchers/launcher.py,sha256=
-cobweb/launchers/uploader.py,sha256=
+cobweb/launchers/distributor.py,sha256=I5QBs2hFiyGGkqLLkMw9uzf4_oRW2JvahNW9yc866cc,6748
+cobweb/launchers/launcher.py,sha256=Shb6o6MAM38d32ybW2gY6qpGmhuiV7jo9TDh0f7rud8,5694
+cobweb/launchers/uploader.py,sha256=dDBv6Vfy1ciaTAJA3TebJV-2oM3OMrqTfzpNX8VGv-0,2766
 cobweb/pipelines/__init__.py,sha256=rtkaaCZ4u1XcxpkDLHztETQjEcLZ_6DXTHjdfcJlyxQ,97
 cobweb/pipelines/pipeline.py,sha256=OgSEZ2DdqofpZcer1Wj1tuBqn8OHVjrYQ5poqt75czQ,357
 cobweb/pipelines/pipeline_csv.py,sha256=TFqxqgVUqkBF6Jott4zd6fvCSxzG67lpafRQtXPw1eg,807
 cobweb/pipelines/pipeline_loghub.py,sha256=zwIa_pcWBB2UNGd32Cu-i1jKGNruTbo2STdxl1WGwZ0,1829
 cobweb/schedulers/__init__.py,sha256=LEya11fdAv0X28YzbQTeC1LQZ156Fj4cyEMGqQHUWW0,49
-cobweb/schedulers/scheduler.py,sha256=
-cobweb/schedulers/scheduler_with_redis.py,sha256=
+cobweb/schedulers/scheduler.py,sha256=Of-BjbBh679R6glc12Kc8iugeERCSusP7jolpCc1UMI,1740
+cobweb/schedulers/scheduler_with_redis.py,sha256=SUiEjYhzbbzc5kt_zpK8bXaEjIpwqC-JBk8ApHcVa18,7149
 cobweb/utils/__init__.py,sha256=TRFJyyBjaQH_sejU6G_msOeHpjc3ZXU0dUOO5GQfknM,171
 cobweb/utils/bloom.py,sha256=A8xqtHXp7jgRoBuUlpovmq8lhU5y7IEF0FOCjfQDb6s,1855
 cobweb/utils/decorators.py,sha256=ZwVQlz-lYHgXgKf9KRCp15EWPzTDdhoikYUNUCIqNeM,1140
 cobweb/utils/dotting.py,sha256=L-jGSApdnFIP4jUWH6p5qIme0aJ1vyDrxAx8wOJWvcs,1960
 cobweb/utils/oss.py,sha256=wmToIIVNO8nCQVRmreVaZejk01aCWS35e1NV6cr0yGI,4192
 cobweb/utils/tools.py,sha256=14TCedqt07m4z6bCnFAsITOFixeGr8V3aOKk--L7Cr0,879
-cobweb_launcher-3.1.
-cobweb_launcher-3.1.
-cobweb_launcher-3.1.
-cobweb_launcher-3.1.
-cobweb_launcher-3.1.
+cobweb_launcher-3.1.23.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
+cobweb_launcher-3.1.23.dist-info/METADATA,sha256=QwwqDS7cSVmiivRXj_Kgu2BZW527APBQ-Qe6frnjIls,5998
+cobweb_launcher-3.1.23.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
+cobweb_launcher-3.1.23.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
+cobweb_launcher-3.1.23.dist-info/RECORD,,
{cobweb_launcher-3.1.22.dist-info → cobweb_launcher-3.1.23.dist-info}/LICENSE
File without changes
{cobweb_launcher-3.1.22.dist-info → cobweb_launcher-3.1.23.dist-info}/WHEEL
File without changes
{cobweb_launcher-3.1.22.dist-info → cobweb_launcher-3.1.23.dist-info}/top_level.txt
File without changes