cobweb-launcher 3.1.22__py3-none-any.whl → 3.1.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cobweb/base/__init__.py CHANGED
@@ -4,4 +4,6 @@ from .response import Response
  from .request import Request
  from .logger import logger
  from .seed import Seed
+ from .task_queue import TaskQueue, Status
+
 
cobweb/base/task_queue.py ADDED
@@ -0,0 +1,179 @@
+ import time
+ import threading
+ from enum import Enum
+ from hashlib import md5
+ from dataclasses import dataclass
+ from typing import Dict, Any, Optional, List
+
+
+ class Status(Enum):
+     PENDING = 0     # awaiting processing
+     PROCESSING = 1  # being processed
+     FINISHED = 2    # completed
+     INSERT = 3      # new seed awaiting insertion
+     UPLOAD = 4      # item awaiting upload
+
+
+ @dataclass
+ class Task:
+     task_id: str                       # unique seed ID
+     data: Any                          # seed payload
+     status: Status                     # current status
+     priority: int                      # priority (lower value runs first)
+     created_at: float                  # creation timestamp
+     parent_id: Optional[str] = None    # parent seed ID
+     children_ids: List[str] = None     # child seed IDs
+     ttl_seconds: Optional[int] = None  # optional TTL in seconds
+
+     def __post_init__(self):
+         if self.children_ids is None:
+             self.children_ids = []
+
+
+ class TaskQueue:
+
+     def __init__(self, cleanup_interval=60):
+         self._tasks: Dict[str, Task] = {}
+         self._lock = threading.Lock()
+         # self.cleanup_interval = cleanup_interval
+         # self._start_cleanup_task()
+
+     # def _start_cleanup_task(self):
+     #     """Start a background thread that cleans up expired seeds."""
+     #     def run():
+     #         while True:
+     #             time.sleep(self.cleanup_interval)
+     #             self._cleanup_expired_seeds()
+     #     threading.Thread(target=run, daemon=True).start()
+
+     def length(self) -> int:
+         with self._lock:
+             return len(self._tasks)
+
+     def status_length(self, status) -> int:
+         with self._lock:
+             return len([it for it in self._tasks.values() if it.status == status])
+
+     def get_task(self, task_id) -> Task:
+         with self._lock:
+             if task_id in self._tasks:
+                 return self._tasks[task_id]
+
+     def get_task_by_status(self, status: list, limit: int = None) -> List[Task]:
+         with self._lock:
+             if not isinstance(status, list):
+                 status = [status]
+             task_list = [it for it in self._tasks.values() if it.status in status]
+             task_list.sort(key=lambda x: (x.priority, x.created_at))
+             return task_list[:limit] if limit else task_list
+
+     def get_pending_task(self) -> Task:
+         with self._lock:
+             if items := [it for it in self._tasks.values() if it.status == Status.PENDING]:
+                 items.sort(key=lambda x: (x.priority, x.created_at))
+                 task_item = items[0]
+                 task_item.status = Status.PROCESSING
+                 self._tasks[task_item.task_id] = task_item
+                 return task_item
+
+     def pop_task(self, status) -> Task:
+         with self._lock:
+             if items := [it for it in self._tasks.values() if it.status == status]:
+                 items.sort(key=lambda x: (x.priority, x.created_at))
+                 task_item = items[0]
+
+                 to_remove = set()
+                 queue = [task_item.task_id]
+
+                 while queue:
+                     current = queue.pop(0)
+                     if current in self._tasks:
+                         to_remove.add(current)
+                         queue.extend(self._tasks[current].children_ids)
+                         del self._tasks[current]
+
+                 for tid in to_remove:
+                     if task_item := self._tasks.get(tid):
+                         if task_item.parent_id in self._tasks:
+                             if tid in self._tasks[task_item.parent_id].children_ids:
+                                 self._tasks[task_item.parent_id].children_ids.remove(tid)
+
+     def add_task(
+         self,
+         task_id: str = None,
+         data: Any = None,
+         status=Status.PENDING,
+         priority: int = 500,
+         parent_id: Optional[str] = None,
+         ttl_seconds: Optional[int] = None
+     ) -> bool:
+         """Add a new seed; a parent seed may be specified."""
+         with self._lock:
+             if not task_id:
+                 task_id = md5(str(time.time()).encode()).hexdigest()
+
+             if task_id in self._tasks:
+                 return False  # guard against duplicate insertion
+
+             task_item = Task(
+                 task_id=task_id,
+                 data=data,
+                 status=status,
+                 priority=priority,
+                 created_at=int(time.time()),
+                 parent_id=parent_id,
+                 ttl_seconds=ttl_seconds
+             )
+             self._tasks[task_id] = task_item
+
+             if parent_id and parent_id in self._tasks:
+                 self._tasks[parent_id].children_ids.append(task_id)
+
+             return True
+
+     def update_task(self, task_id, status, data=None) -> Task:
+         with self._lock:
+             task_item = self._tasks[task_id]
+             task_item.status = status
+             if data:
+                 task_item.data = data
+
+             for tid in task_item.children_ids:
+                 if self._tasks[tid].status == Status.INSERT:
+                     del self._tasks[tid]
+
+             task_item.children_ids = []
+             self._tasks[task_id] = task_item
+
+             return task_item
+
+     def remove(self, task_ids: list) -> bool:
+         with self._lock:
+             for task_id in task_ids:
+                 if task_item := self._tasks.get(task_id):
+
+                     if task_item.children_ids:
+                         continue
+
+                     if task_item.parent_id in self._tasks:
+                         if task_id in self._tasks[task_item.parent_id].children_ids:
+                             self._tasks[task_item.parent_id].children_ids.remove(task_id)
+
+                     del self._tasks[task_id]
+
+     def count_children(self, task_id: str) -> int:
+         with self._lock:
+             if task_id in self._tasks:
+                 return len(self._tasks[task_id].children_ids)
+             return 0
+
+     # def _cleanup_expired_seeds(self):
+     #     now = time.time()
+     #     expired_ids = []
+     #     with self._lock:
+     #         for seed_id, seed in self._seeds.items():
+     #             if seed.ttl_seconds and now - seed.created_at > seed.ttl_seconds:
+     #                 expired_ids.append(seed_id)
+     #         for seed_id in expired_ids:
+     #             self._seeds[seed_id] = self._seeds[seed_id]._replace(status=SeedStatus.EXPIRED)
+     #         print(f"Cleaned up {len(expired_ids)} expired seeds")
cobweb/launchers/distributor.py CHANGED
@@ -2,15 +2,15 @@ import time
  import threading
  import traceback
 
- from typing import Callable, Type
  from inspect import isgenerator
+ from typing import Callable, Type
  from urllib.parse import urlparse
  from requests import Response as Res
 
  from cobweb.crawlers import Crawler
- from cobweb.constant import DealModel, LogTemplate
  from cobweb.utils import LoghubDot, check_pause
- from cobweb.base import Seed, Queue, BaseItem, Request, Response, logger
+ from cobweb.constant import DealModel, LogTemplate
+ from cobweb.base import Seed, Status, TaskQueue, BaseItem, Request, Response, logger
 
 
  class Distributor(threading.Thread):
@@ -19,10 +19,7 @@ class Distributor(threading.Thread):
          self,
          task: str,
          project: str,
-         new: Queue,
-         todo: Queue,
-         done: Queue,
-         upload: Queue,
+         task_queue: TaskQueue,
          stop: threading.Event,
          pause: threading.Event,
          callback_register: Callable,
@@ -34,10 +31,8 @@ class Distributor(threading.Thread):
          self.stop = stop
          self.pause = pause
 
-         self.new = new
-         self.todo = todo
-         self.done = done
-         self.upload = upload
+         self.task_queue = task_queue
+
          self.callback_register = callback_register
          self.Crawler = SpiderCrawler
 
@@ -46,69 +41,83 @@ class Distributor(threading.Thread):
          self.thread_num = setting.SPIDER_THREAD_NUM
          self.max_retries = setting.SPIDER_MAX_RETRIES
          self.record_failed = setting.RECORD_FAILED_SPIDER
-         self.loghub_dot = LoghubDot(stop=stop)  # todo: decouple
+         self.loghub_dot = LoghubDot(stop=stop)  # todo
 
          logger.debug(f"Distribute instance attrs: {self.__dict__}")
 
-     def distribute(self, item, seed, _id: int):
+     def distribute(self, task_id, item, _id: int):
          if isinstance(item, Request):
-             seed.params.start_time = time.time()
-             self.process(item=item, seed=seed, callback=self.Crawler.download, _id=1)
+             item.seed.params.start_time = time.time()
+             self.process(task_id=task_id, item=item, callback=self.Crawler.download, _id=1)
+
          elif isinstance(item, Response):
              if _id == 2:
                  raise TypeError("parse function can't yield a Response instance")
              dot = isinstance(item.response, Res)
-             # TODO: log successful requests
-             self.spider_logging(seed, item, dot=dot)
-             self.process(item=item, seed=seed, callback=self.Crawler.parse, _id=2)
+             self.spider_logging(item.seed, item, dot=dot)  # todo: update
+             self.process(task_id=task_id, item=item, callback=self.Crawler.parse, _id=2)
+
          elif isinstance(item, BaseItem):
-             self.upload.push(item)
+             self.task_queue.add_task(
+                 data=item,
+                 status=Status.UPLOAD,
+                 parent_id=task_id
+             )
+
          elif isinstance(item, Seed):
-             self.new.push((seed, item), direct_insertion=True)
-         elif isinstance(item, str) and item == DealModel.poll:
-             self.todo.push(seed)
-         elif isinstance(item, str) and item == DealModel.done:
-             self.done.push(seed)
-         elif isinstance(item, str) and item == DealModel.fail:
-             seed.params.retry += 1
-             if seed.params.retry < self.max_retries:
-                 self.todo.push(seed)
-             else:
-                 if record_failed := self.record_failed:
-                     try:
-                         response = Response(seed, "failed", max_retries=True)
-                         self.process(response, seed, self.Crawler.parse, _id=2)
-                     except Exception as e:
-                         msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
-                         logger.error(msg=msg)
-                         record_failed = False
-                 if not record_failed:
-                     self.done.push(seed)
-         else:
+             self.task_queue.add_task(
+                 task_id=item.sid,
+                 data=item,
+                 status=Status.INSERT,
+                 priority=item.params.priority,
+                 parent_id=task_id
+             )
+
+         elif isinstance(item, str) and item != DealModel.done:
              raise TypeError("yield value type error!")
 
-     def process(self, item, seed, callback, _id: int):
-         result_iterators = callback(item)
-         if not isgenerator(result_iterators):
+     def process(self, task_id, item, callback, _id: int):
+         iterators = callback(item)
+         if not isgenerator(iterators):
              raise TypeError(f"{callback.__name__} function isn't a generator!")
-         for result_item in result_iterators:
-             self.distribute(result_item, seed, _id)
+         for it in iterators:
+             self.distribute(task_id=task_id, item=it, _id=_id)
 
      @check_pause
      def spider(self):
-         # TODO: rate limiting
-         if seed := self.todo.pop():
+         if task_item := self.task_queue.get_pending_task():
+             seed = task_item.data
+             status = Status.FINISHED
+             task_id = task_item.task_id
+
              try:
-                 self.process(item=seed, seed=seed, callback=self.Crawler.request, _id=0)
+                 self.process(task_id=task_id, item=seed, callback=self.Crawler.request, _id=0)
+
              except Exception as e:
-                 url, status = seed.url, e.__class__.__name__
+
+                 seed.params.retry += 1
+                 url, _status = seed.url, e.__class__.__name__
+
                  msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
                  if getattr(e, "response", None) and isinstance(e.response, Res):
                      url = e.response.request.url
-                     status = e.response.status_code
-                 # TODO: log failed requests
-                 self.spider_logging(seed, None, error=True, url=url, status=status, msg=msg)
-                 self.distribute(DealModel.fail, seed, _id=-1)
+                     _status = e.response.status_code
+
+                 self.spider_logging(seed, None, error=True, url=url, status=_status, msg=msg)
+
+                 if seed.params.retry < self.max_retries:
+                     status = Status.PENDING
+
+                 elif self.record_failed:
+                     try:
+                         response = Response(seed, "failed", max_retries=True)
+                         self.process(task_id=task_id, item=response, callback=self.Crawler.parse, _id=2)
+                     except Exception as e:
+                         msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                         logger.error(msg=msg)
+
+             finally:
+                 self.task_queue.update_task(task_id, status=status, data=seed)
 
      def spider_logging(
          self, seed,
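
With DealModel.fail gone, retry handling lives in spider() itself: the claimed task's status defaults to FINISHED and is downgraded to PENDING on a retryable failure, with the finally block writing the outcome back. A condensed, self-contained rehearsal of that flow (illustrative only, assuming the TaskQueue API above):

    from cobweb.base import TaskQueue, Status

    tq = TaskQueue()
    tq.add_task(task_id="s1", data="seed", priority=1)

    task = tq.get_pending_task()   # s1 -> PROCESSING
    status = Status.FINISHED       # optimistic default, as in spider()
    retry, max_retries = 0, 5      # stand-ins for seed.params.retry / setting

    try:
        raise TimeoutError("simulated download failure")
    except Exception:
        retry += 1                 # spider() bumps seed.params.retry here
        if retry < max_retries:
            status = Status.PENDING  # requeue for another attempt
    finally:
        tq.update_task(task.task_id, status=status)

    assert tq.get_task("s1").status is Status.PENDING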
cobweb/launchers/launcher.py CHANGED
@@ -5,20 +5,15 @@ import threading
  import importlib
 
  from cobweb import setting
- from cobweb.base import Seed, Queue, logger
+ from cobweb.launchers.uploader import Uploader
  from cobweb.utils.tools import dynamic_load_class
  from cobweb.launchers.distributor import Distributor
- from cobweb.launchers.uploader import Uploader
+ from cobweb.base import Seed, logger, TaskQueue, Status
  from typing import Optional, Union, Dict, Any, Callable
 
 
  class Launcher:
 
-     _NEW_QUEUE_ = Queue()
-     _TODO_QUEUE_ = Queue()
-     _DONE_QUEUE_ = Queue()
-     _UPLOAD_QUEUE_ = Queue()
-
      __REGISTER_FUNC__: Dict[str, Callable] = {}
      __WORKER_THREAD__: Dict[str, threading.Thread] = {}
@@ -33,6 +28,7 @@ class Launcher:
          self._pause = threading.Event()  # pause event
 
          _setting = self._load_custom_settings(custom_setting)
+
          _setting.update(kwargs)
          for key, value in _setting.items():
              setattr(setting, key.upper(), value)
@@ -40,6 +36,8 @@ class Launcher:
          self._done_model = setting.DONE_MODEL
          self._task_model = setting.TASK_MODEL
 
+         self._task_queue = TaskQueue()
+
          self.Scheduler = dynamic_load_class(setting.SCHEDULER)
          self.SpiderCrawler = dynamic_load_class(setting.CRAWLER)
          self.SpiderPipeline = dynamic_load_class(setting.PIPELINE)
@@ -108,7 +106,15 @@ class Launcher:
 
      def start_seeds(self, seeds: list[Union[str, Dict]]) -> list[Seed]:
          seed_list = [Seed(seed) for seed in seeds]
-         self._TODO_QUEUE_.push(seed_list)
+         for seed in seed_list:
+             self._task_queue.add_task(
+                 task_id=seed.sid,
+                 data=seed,
+                 status=Status.PENDING,
+                 priority=seed.params.priority,
+                 parent_id=None,
+                 ttl_seconds=None
+             )
          return seed_list
 
      def _register(self, func: Callable, tag: str = "launcher"):
@@ -139,20 +145,14 @@ class Launcher:
              project=self.project,
              stop=self._stop,
              pause=self._pause,
-             new=self._NEW_QUEUE_,
-             todo=self._TODO_QUEUE_,
-             done=self._DONE_QUEUE_,
-             upload=self._UPLOAD_QUEUE_,
+             task_queue=self._task_queue,
              callback_register=self._register
          ).start()
 
          Distributor(
              task=self.task,
              project=self.project,
-             new=self._NEW_QUEUE_,
-             todo=self._TODO_QUEUE_,
-             done=self._DONE_QUEUE_,
-             upload=self._UPLOAD_QUEUE_,
+             task_queue=self._task_queue,
              callback_register=self._register,
              stop=self._stop, pause=self._pause,
              SpiderCrawler=self.SpiderCrawler
@@ -161,8 +161,7 @@ class Launcher:
          Uploader(
              task=self.task, project=self.project,
              stop=self._stop, pause=self._pause,
-             done=self._DONE_QUEUE_,
-             upload=self._UPLOAD_QUEUE_,
+             task_queue=self._task_queue,
              callback_register=self._register,
              SpiderPipeline=self.SpiderPipeline
          ).start()
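
Worth noting: the four Queue objects were class attributes, so every Launcher in a process shared them, while the TaskQueue is created per instance in __init__. A toy illustration of the difference (plain Python, not cobweb code):

    class Old:
        shared = []           # class attribute: one list for all instances

    class New:
        def __init__(self):
            self.own = []     # instance attribute: fresh state per instance

    a, b = Old(), Old()
    a.shared.append(1)
    assert b.shared == [1]    # state leaks between instances

    c, d = New(), New()
    c.own.append(1)
    assert d.own == []        # isolated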
cobweb/launchers/uploader.py CHANGED
@@ -2,7 +2,7 @@ import time
  import threading
  from typing import Callable, Type
  from cobweb.pipelines import Pipeline
- from cobweb.base import Queue, logger
+ from cobweb.base import TaskQueue, logger, Status
  from cobweb.utils import check_pause
 
 
@@ -14,7 +14,7 @@ class Uploader(threading.Thread):
          project: str,
          stop: threading.Event,
          pause: threading.Event,
-         upload: Queue, done: Queue,
+         task_queue: TaskQueue,
          callback_register: Callable,
          SpiderPipeline: Type[Pipeline]
      ):
@@ -25,8 +25,7 @@ class Uploader(threading.Thread):
          self.stop = stop
          self.pause = pause
 
-         self.done = done
-         self.upload = upload
+         self.task_queue = task_queue
          self.callback_register = callback_register
 
          from cobweb import setting
@@ -40,30 +39,45 @@ class Uploader(threading.Thread):
 
      @check_pause
      def upload_data(self):
-         if not self.upload.length:
-             time.sleep(self.wait_seconds)
-             return
-         if self.upload.length < self.upload_size:
-             time.sleep(self.wait_seconds)
-         data_info, seeds = {}, []
-         try:
-             for _ in range(self.upload_size):
-                 item = self.upload.pop()
-                 if not item:
-                     break
-                 seeds.append(item.seed)
-                 data = self.pipeline.build(item)
-                 data_info.setdefault(item.table, []).append(data)
-             for table, datas in data_info.items():
-                 try:
-                     self.pipeline.upload(table, datas)
-                     # TODO: upload metrics
-                 except Exception as e:
-                     logger.info(e)
-         except Exception as e:
-             logger.info(e)
-         if seeds:
-             self.done.push(seeds)
+         if task_list := self.task_queue.get_task_by_status(
+             status=Status.UPLOAD, limit=self.upload_size
+         ):
+             try:
+                 data_info, task_ids = dict(), set()
+                 for task_item in task_list:
+                     upload_data = self.pipeline.build(task_item.data)
+                     data_info.setdefault(task_item.data.table, []).append(upload_data)
+
+                 for table, datas in data_info.items():
+                     try:
+                         self.pipeline.upload(table, datas)
+                     except Exception as e:
+                         logger.info(e)
+             except Exception as e:
+                 logger.info(e)
+
+         if self.task_queue.status_length(status=Status.UPLOAD) < self.upload_size:
+             time.sleep(self.wait_seconds)
+
+         # data_info, seeds = {}, []
+         # try:
+         #     for _ in range(self.upload_size):
+         #         item = self.upload.pop()
+         #         if not item:
+         #             break
+         #         # seeds.append(item.seed)
+         #         data = self.pipeline.build(item)
+         #         data_info.setdefault(item.table, []).append(data)
+         #     for table, datas in data_info.items():
+         #         try:
+         #             self.pipeline.upload(table, datas)
+         #             # TODO: upload metrics
+         #         except Exception as e:
+         #             logger.info(e)
+         # except Exception as e:
+         #     logger.info(e)
+         # if self.upload.length < self.upload_size:
+         #     time.sleep(self.wait_seconds)
 
      def run(self):
          self.callback_register(self.upload_data, tag="Uploader")
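
upload_data() now drains up to upload_size UPLOAD tasks per pass and groups built rows by destination table before each pipeline.upload(table, datas) call. The grouping step in isolation (standard-library only, with hypothetical rows):

    rows = [
        {"table": "articles", "title": "a"},
        {"table": "articles", "title": "b"},
        {"table": "authors", "name": "c"},
    ]

    data_info = {}
    for row in rows:
        # mirrors data_info.setdefault(task_item.data.table, []).append(...)
        data_info.setdefault(row["table"], []).append(row)

    for table, datas in data_info.items():
        print(table, len(datas))   # the real code calls pipeline.upload(table, datas)
    # articles 2
    # authors 1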
cobweb/schedulers/scheduler.py CHANGED
@@ -1,15 +1,13 @@
  import threading
 
-
  from typing import Callable
- from cobweb.base import Queue
+ from cobweb.base import TaskQueue
  from abc import ABC, abstractmethod
 
 
  class Scheduler(ABC, threading.Thread):
 
-     __WORKING_ITEMS__ = {}
-     __LAUNCHER_FUNC__ = ["_reset", "_scheduler", "_insert", "_refresh", "_delete"]
+     # __LAUNCHER_FUNC__ = ["_reset", "_scheduler", "_insert", "_refresh", "_delete"]
 
      def __init__(
          self,
@@ -17,10 +15,7 @@ class Scheduler(ABC, threading.Thread):
          project,
          stop: threading.Event,
          pause: threading.Event,
-         new: Queue,
-         todo: Queue,
-         done: Queue,
-         upload: Queue,
+         task_queue: TaskQueue,
          callback_register: Callable
      ):
          super().__init__()
@@ -44,45 +39,9 @@ class Scheduler(ABC, threading.Thread):
          self.stop = stop
          self.pause = pause
 
-         self.new = new
-         self.todo = todo
-         self.done = done
-         self.upload = upload
+         self.task_queue = task_queue
 
          self.callback_register = callback_register
-         self.lock = threading.Lock()
-
-     def is_empty(self):
-         return all(queue.empty() for queue in (self.new, self.todo, self.done, self.upload))
-
-     def set_working_items(self, item_info: dict = None):
-         if not item_info:
-             return False
-         with self.lock:
-             self.__WORKING_ITEMS__.update(item_info)
-
-     def get_working_items(self) -> dict:
-         with self.lock:
-             return self.__WORKING_ITEMS__.copy()
-
-     def remove_working_items(self, items: list[str] = None) -> int:
-         if not items:
-             return 0
-         with self.lock:
-             deleted_count = 0
-             for item in items:
-                 if item in self.__WORKING_ITEMS__:
-                     del self.__WORKING_ITEMS__[item]
-                     deleted_count += 1
-             return deleted_count
-
-     def get_working_items_count(self) -> int:
-         with self.lock:
-             return len(self.__WORKING_ITEMS__)
-
-     def clear_working_items(self):
-         with self.lock:
-             self.__WORKING_ITEMS__.clear()
 
      @abstractmethod
      def reset(self):
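
The removed working-items helpers have no direct replacement in the base class; subclasses query the shared TaskQueue instead. For example, the old get_working_items() bookkeeping roughly corresponds to a status query like this (illustrative, using the API introduced above):

    from cobweb.base import TaskQueue, Status

    tq = TaskQueue()
    tq.add_task(task_id="s1", data="seed-a")
    tq.get_pending_task()                     # s1 -> PROCESSING
    tq.add_task(task_id="s2", data="seed-b")  # stays PENDING

    in_flight = tq.get_task_by_status(status=[Status.PENDING, Status.PROCESSING])
    print(len(in_flight))  # 2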
cobweb/schedulers/scheduler_with_redis.py CHANGED
@@ -4,7 +4,7 @@ import threading
  from typing import Callable
  from cobweb.db import RedisDB, ApiDB
  from cobweb.utils import check_pause
- from cobweb.base import Queue, Seed, logger
+ from cobweb.base import Seed, logger, TaskQueue, Status
  from cobweb.constant import LogTemplate
  from .scheduler import Scheduler
  use_api = bool(os.getenv("REDIS_API_HOST", 0))
@@ -18,13 +18,10 @@ class RedisScheduler(Scheduler):
          project,
          stop: threading.Event,
          pause: threading.Event,
-         new: Queue,
-         todo: Queue,
-         done: Queue,
-         upload: Queue,
+         task_queue: TaskQueue,
          callback_register: Callable
      ):
-         super().__init__(task, project, stop, pause, new, todo, done, upload, callback_register)
+         super().__init__(task, project, stop, pause, task_queue, callback_register)
          self.todo_key = f"{{{project}:{task}}}:todo"
          self.done_key = f"{{{project}:{task}}}:done"
          self.fail_key = f"{{{project}:{task}}}:fail"
@@ -38,14 +35,13 @@ class RedisScheduler(Scheduler):
          Check expired seeds and re-add them to the Redis cache.
          """
          while not self.stop.is_set():
-             if self.db.lock(self.reset_lock_key, t=60):
+             if self.db.lock(self.reset_lock_key, t=360):
 
-                 # _min = -int(time.time()) + self.seed_reset_seconds
-                 _min = -int(time.time()) + 3600
+                 _min = -int(time.time()) + self.seed_reset_seconds
                  self.db.members(self.todo_key, 0, _min=_min, _max="(0")
                  self.db.delete(self.reset_lock_key)
 
-             time.sleep(60)
+             time.sleep(self.seed_reset_seconds)
 
      @check_pause
      def schedule(self):
@@ -56,42 +52,43 @@ class RedisScheduler(Scheduler):
              time.sleep(self.scheduler_wait_seconds)
              return
 
-         if self.todo.length >= self.todo_queue_size:
+         if self.task_queue.status_length(Status.PENDING) >= self.todo_queue_size:
              time.sleep(self.todo_queue_full_wait_seconds)
              return
 
-         members = self.db.members(
+         if members := self.db.members(
              self.todo_key, int(time.time()),
              count=self.todo_queue_size,
              _min=0, _max="(1000"
-         )
-
-         logger.debug(f"Retrieved {len(members)} seeds from Redis.")
-
-         seeds, item_info = list(), dict()
-         for member, priority in members:
-             seed = Seed(member, priority=int(priority % 1000))
-             item_info[seed.to_string] = seed.params.priority
-             seeds.append(seed)
-
-         # self.set_working_items(item_info)
-         self.todo.push(seeds)
-         # time.sleep(1)
+         ):
+             for member, priority in members:
+                 seed = Seed(member, priority=int(priority % 1000))
+                 self.task_queue.add_task(
+                     task_id=seed.sid,
+                     data=seed,
+                     status=Status.PENDING,
+                     priority=seed.params.priority
+                 )
 
      @check_pause
      def insert(self):
          """
          Add new seeds to the Redis queue.
          """
-         seeds, delete_seeds = dict(), set()
-         for seed, new_seed in self.new.iter_items(limit=self.new_queue_max_size):
-             seeds[new_seed.to_string] = new_seed.params.priority
-             delete_seeds.add(seed)
+         if task_list := self.task_queue.get_task_by_status(
+             status=Status.INSERT, limit=self.new_queue_max_size
+         ):
+             seed_info, task_ids = dict(), set()
 
-         self.db.zadd(self.todo_key, seeds, nx=True)
-         self.done.push(delete_seeds)
+             for task_item in task_list:
+                 seed = task_item.data
+                 task_ids.add(task_item.task_id)
+                 seed_info[seed.to_string] = seed.params.priority
 
-         if self.new.length < self.new_queue_max_size:
+             self.db.zadd(self.todo_key, seed_info, nx=True)
+             self.task_queue.remove(task_ids)
+
+         if self.task_queue.status_length(status=Status.INSERT) < self.new_queue_max_size:
              time.sleep(self.scheduler_wait_seconds)
 
      @check_pause
@@ -99,27 +96,28 @@ class RedisScheduler(Scheduler):
          """
          Refresh the expiry time of in-flight (doing) seeds so reset() does not re-consume them.
          """
-         if item_info := self.get_working_items():
+         if task_list := self.task_queue.get_task_by_status(
+             status=[Status.PENDING, Status.PROCESSING, Status.FINISHED],
+         ):
              refresh_time = int(time.time())
-             seed_info = {k: -refresh_time - v / 1000 for k, v in item_info.items()}
+             seed_info = {it.data.to_string: -refresh_time - it.data.params.priority / 1000 for it in task_list}
              self.db.zadd(self.todo_key, seed_info, xx=True)
-             # self.set_working_items(seed_info)
-             time.sleep(20)
+         time.sleep(self.seed_reset_seconds // 3)
 
      @check_pause
      def delete(self):
          """
          Drop seeds from the queue: route them to the success or failure queue by
          status and remove their in-flight (doing) index entries.
          """
-         seeds = [seed for seed in self.done.iter_items(limit=self.done_queue_max_size)]
-         items = [seed.to_string for seed in seeds]
-
-         if self.remove_working_items(items):
-             self.db.zrem(self.todo_key, *items)
-         elif seeds:
-             self.done.push(seeds)
-
-         if self.done.length < self.done_queue_max_size:
+         if task_list := self.task_queue.get_task_by_status(
+             status=Status.FINISHED, limit=self.done_queue_max_size
+         ):
+             zrem_items = [it.data.to_string for it in task_list]
+             remove_task_ids = [it.task_id for it in task_list]
+             self.db.zrem(self.todo_key, *zrem_items)
+             self.task_queue.remove(remove_task_ids)
+
+         if self.task_queue.status_length(status=Status.FINISHED) < self.done_queue_max_size:
              time.sleep(self.done_queue_wait_seconds)
 
      def run(self):
@@ -129,32 +127,41 @@ class RedisScheduler(Scheduler):
              self.callback_register(func, tag="scheduler")
 
          while not self.stop.is_set():
-             working_count = self.get_working_items_count()
-             memory_count = self.db.zcount(self.todo_key, "-inf", "(0")
-             todo_count = self.db.zcount(self.todo_key, 0, "(1000")
-             all_count = self.db.zcard(self.todo_key)
+             todo_len = self.task_queue.status_length(status=Status.PENDING)
+             doing_len = self.task_queue.status_length(status=Status.PROCESSING)
+             done_len = self.task_queue.status_length(status=Status.FINISHED)
+             upload_len = self.task_queue.status_length(status=Status.UPLOAD)
+
+             redis_doing_count = self.db.zcount(self.todo_key, "-inf", "(0")
+             redis_todo_len = self.db.zcount(self.todo_key, 0, "(1000")
+             redis_seed_count = self.db.zcard(self.todo_key)
 
              if self.pause.is_set():
                  execute_time = int(time.time()) - start_time
                  if not self.task_model and execute_time > self.before_scheduler_wait_seconds:
                      logger.info("Done! ready to close thread...")
                      self.stop.set()
-                 elif todo_count:
+                 elif redis_todo_len:
                      logger.info(
-                         f"Recovery {self.task} task run!Todo seeds count: {todo_count}, queue length: {all_count}")
+                         f"Recovery {self.task} task run!"
+                         f"Todo seeds count: {redis_todo_len}"
+                         f", queue length: {redis_seed_count}"
+                     )
                      self.pause.clear()
                  else:
                      logger.info("Pause! waiting for resume...")
 
-             elif self.is_empty():
-
-                 if all_count:
-                     logger.info(f"Todo seeds count: {todo_count}, queue length: {all_count}")
+             elif self.task_queue.length() == 0:
+                 if redis_seed_count:
+                     logger.info(
+                         f"Todo seeds count: {redis_todo_len}"
+                         f", queue length: {redis_seed_count}"
+                     )
                      self.pause.clear()
                  else:
                      count = 0
                      for _ in range(3):
-                         if not all_count:
+                         if not redis_seed_count:
                              count += 1
                              time.sleep(5)
                              logger.info("Checking count...")
@@ -162,19 +169,18 @@ class RedisScheduler(Scheduler):
                              break
                      if count >= 3:
                          logger.info("Todo queue is empty! Pause set...")
-                         self.clear_working_items()
                          self.pause.set()
 
              else:
                  logger.info(LogTemplate.launcher_pro_polling.format(
                      task=self.task,
-                     doing_len=working_count,
-                     todo_len=self.todo.length,
-                     done_len=self.done.length,
-                     redis_seed_count=all_count,
-                     redis_todo_len=todo_count,
-                     redis_doing_len=memory_count,
-                     upload_len=self.upload.length,
+                     doing_len=doing_len,
+                     todo_len=todo_len,
+                     done_len=done_len,
+                     redis_seed_count=redis_seed_count,
+                     redis_todo_len=redis_todo_len,
+                     redis_doing_len=redis_doing_count,
+                     upload_len=upload_len,
                  ))
 
              time.sleep(30)
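
The refactor keeps the zset score convention: a schedulable seed's score is its priority (0 up to, but excluding, 1000), while refresh() rewrites claimed seeds to -(refresh_time + priority / 1000), so the sign separates todo from in-flight and reset() can recover anything refreshed more than seed_reset_seconds ago. A small arithmetic check (illustrative values; seed_reset_seconds actually comes from settings):

    import time

    priority = 500
    now = int(time.time())
    seed_reset_seconds = 360

    doing_score = -now - priority / 1000   # negative score: seed is in flight
    threshold = -now + seed_reset_seconds  # reset()'s _min; recovery window is [_min, 0)

    # Refreshed just now: not recoverable yet.
    assert not (threshold <= doing_score < 0)

    # Refreshed two windows ago: recoverable.
    stale_score = -(now - 2 * seed_reset_seconds) - priority / 1000
    assert threshold <= stale_score < 0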
{cobweb_launcher-3.1.22.dist-info → cobweb_launcher-3.1.23.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cobweb-launcher
- Version: 3.1.22
+ Version: 3.1.23
  Summary: spider_hole
  Home-page: https://github.com/Juannie-PP/cobweb
  Author: Juannie-PP
{cobweb_launcher-3.1.22.dist-info → cobweb_launcher-3.1.23.dist-info}/RECORD RENAMED
@@ -1,13 +1,14 @@
  cobweb/__init__.py,sha256=YdBi3uytEFRXan155xU1kKMpiUKUupO2RGeJyXmH0zk,129
  cobweb/constant.py,sha256=zy3XYsc1qp2B76_Fn_hVQ8eGHlPBd3OFlZK2cryE6FY,2839
  cobweb/setting.py,sha256=rHPQfc4a1xMTbkt3_KXBfUomhYcbTXogsz7ew-QsqHw,1670
- cobweb/base/__init__.py,sha256=c1qwQrpe5atW_OK_Qw9kaiZpVDey3t_nvRwlH7QRIqY,179
+ cobweb/base/__init__.py,sha256=NanSxJr0WsqjqCNOQAlxlkt-vQEsERHYBzacFC057oI,222
  cobweb/base/common_queue.py,sha256=hYdaM70KrWjvACuLKaGhkI2VqFCnd87NVvWzmnfIg8Q,1423
  cobweb/base/item.py,sha256=1bS4U_3vzI2jzSSeoEbLoLT_5CfgLPopWiEYtaahbvw,1674
  cobweb/base/logger.py,sha256=Vsg1bD4LXW91VgY-ANsmaUu-mD88hU_WS83f7jX3qF8,2011
  cobweb/base/request.py,sha256=MBYYjWpbRQRulPG0zPbK0DO3LKmScqQ4tBzFXekYkao,2652
  cobweb/base/response.py,sha256=g8e5H0hEiRfqseh3nD7t6a1rhIJYRMV7nI47kqNOd-U,446
  cobweb/base/seed.py,sha256=ddaWCq_KaWwpmPl1CToJlfCxEEnoJ16kjo6azJs9uls,5000
+ cobweb/base/task_queue.py,sha256=3ScPKnjlPEuuCzWyG9D2iHiND3L9lLM7fo1LNOkw8CY,6337
  cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
  cobweb/crawlers/crawler.py,sha256=ZZVZJ17RWuvzUFGLjqdvyVZpmuq-ynslJwXQzdm_UdQ,709
  cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
@@ -16,24 +17,24 @@ cobweb/db/redis_db.py,sha256=X7dUpW50QcmRPjYlYg7b-fXF_fcjuRRk3DBx2ggetXk,7687
  cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
  cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
  cobweb/launchers/__init__.py,sha256=6_v2jd2sgj6YnOB1nPKiYBskuXVb5xpQnq2YaDGJgQ8,100
- cobweb/launchers/distributor.py,sha256=ALvu7MVZLSQPmWJc_FR-UUIlTMv4PAu8q7tt-KzK1v8,6810
- cobweb/launchers/launcher.py,sha256=L75eYKemPVqT0cuwfBy_Vh0CObWilDpJ9ibD29g5L38,5742
- cobweb/launchers/uploader.py,sha256=5Hm1pmco8PsFrtBDRN9aw6IjAElfX2wdN1yaILtp03w,2059
+ cobweb/launchers/distributor.py,sha256=I5QBs2hFiyGGkqLLkMw9uzf4_oRW2JvahNW9yc866cc,6748
+ cobweb/launchers/launcher.py,sha256=Shb6o6MAM38d32ybW2gY6qpGmhuiV7jo9TDh0f7rud8,5694
+ cobweb/launchers/uploader.py,sha256=dDBv6Vfy1ciaTAJA3TebJV-2oM3OMrqTfzpNX8VGv-0,2766
  cobweb/pipelines/__init__.py,sha256=rtkaaCZ4u1XcxpkDLHztETQjEcLZ_6DXTHjdfcJlyxQ,97
  cobweb/pipelines/pipeline.py,sha256=OgSEZ2DdqofpZcer1Wj1tuBqn8OHVjrYQ5poqt75czQ,357
  cobweb/pipelines/pipeline_csv.py,sha256=TFqxqgVUqkBF6Jott4zd6fvCSxzG67lpafRQtXPw1eg,807
  cobweb/pipelines/pipeline_loghub.py,sha256=zwIa_pcWBB2UNGd32Cu-i1jKGNruTbo2STdxl1WGwZ0,1829
  cobweb/schedulers/__init__.py,sha256=LEya11fdAv0X28YzbQTeC1LQZ156Fj4cyEMGqQHUWW0,49
- cobweb/schedulers/scheduler.py,sha256=qb_u3tJELp4zRGOyT8OsNWndKDknhQslnKnjUlwEpiE,2943
- cobweb/schedulers/scheduler_with_redis.py,sha256=i7EfjXY4p5BJrZfCNxyUqBqIcyg1ZICjm7gbxrhHbN0,6448
+ cobweb/schedulers/scheduler.py,sha256=Of-BjbBh679R6glc12Kc8iugeERCSusP7jolpCc1UMI,1740
+ cobweb/schedulers/scheduler_with_redis.py,sha256=SUiEjYhzbbzc5kt_zpK8bXaEjIpwqC-JBk8ApHcVa18,7149
  cobweb/utils/__init__.py,sha256=TRFJyyBjaQH_sejU6G_msOeHpjc3ZXU0dUOO5GQfknM,171
  cobweb/utils/bloom.py,sha256=A8xqtHXp7jgRoBuUlpovmq8lhU5y7IEF0FOCjfQDb6s,1855
  cobweb/utils/decorators.py,sha256=ZwVQlz-lYHgXgKf9KRCp15EWPzTDdhoikYUNUCIqNeM,1140
  cobweb/utils/dotting.py,sha256=L-jGSApdnFIP4jUWH6p5qIme0aJ1vyDrxAx8wOJWvcs,1960
  cobweb/utils/oss.py,sha256=wmToIIVNO8nCQVRmreVaZejk01aCWS35e1NV6cr0yGI,4192
  cobweb/utils/tools.py,sha256=14TCedqt07m4z6bCnFAsITOFixeGr8V3aOKk--L7Cr0,879
- cobweb_launcher-3.1.22.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
- cobweb_launcher-3.1.22.dist-info/METADATA,sha256=L5ozmjklF83tH1LhHCgFp2U-rRBzEP_sj6x8WSZda5s,5998
- cobweb_launcher-3.1.22.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
- cobweb_launcher-3.1.22.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
- cobweb_launcher-3.1.22.dist-info/RECORD,,
+ cobweb_launcher-3.1.23.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
+ cobweb_launcher-3.1.23.dist-info/METADATA,sha256=QwwqDS7cSVmiivRXj_Kgu2BZW527APBQ-Qe6frnjIls,5998
+ cobweb_launcher-3.1.23.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
+ cobweb_launcher-3.1.23.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
+ cobweb_launcher-3.1.23.dist-info/RECORD,,