cobweb-launcher 1.0.5-py3-none-any.whl → 3.2.18-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. cobweb/__init__.py +5 -1
  2. cobweb/base/__init__.py +3 -3
  3. cobweb/base/common_queue.py +37 -16
  4. cobweb/base/item.py +40 -14
  5. cobweb/base/{log.py → logger.py} +3 -3
  6. cobweb/base/request.py +744 -47
  7. cobweb/base/response.py +381 -13
  8. cobweb/base/seed.py +98 -50
  9. cobweb/base/task_queue.py +180 -0
  10. cobweb/base/test.py +257 -0
  11. cobweb/constant.py +39 -2
  12. cobweb/crawlers/__init__.py +1 -2
  13. cobweb/crawlers/crawler.py +27 -0
  14. cobweb/db/__init__.py +1 -0
  15. cobweb/db/api_db.py +83 -0
  16. cobweb/db/redis_db.py +118 -27
  17. cobweb/launchers/__init__.py +3 -1
  18. cobweb/launchers/distributor.py +141 -0
  19. cobweb/launchers/launcher.py +103 -130
  20. cobweb/launchers/uploader.py +68 -0
  21. cobweb/log_dots/__init__.py +2 -0
  22. cobweb/log_dots/dot.py +258 -0
  23. cobweb/log_dots/loghub_dot.py +53 -0
  24. cobweb/pipelines/__init__.py +3 -2
  25. cobweb/pipelines/pipeline.py +19 -0
  26. cobweb/pipelines/pipeline_csv.py +25 -0
  27. cobweb/pipelines/pipeline_loghub.py +54 -0
  28. cobweb/schedulers/__init__.py +1 -0
  29. cobweb/schedulers/scheduler.py +66 -0
  30. cobweb/schedulers/scheduler_with_redis.py +189 -0
  31. cobweb/setting.py +37 -38
  32. cobweb/utils/__init__.py +5 -2
  33. cobweb/utils/bloom.py +58 -0
  34. cobweb/{base → utils}/decorators.py +14 -12
  35. cobweb/utils/dotting.py +300 -0
  36. cobweb/utils/oss.py +113 -86
  37. cobweb/utils/tools.py +3 -15
  38. cobweb_launcher-3.2.18.dist-info/METADATA +193 -0
  39. cobweb_launcher-3.2.18.dist-info/RECORD +44 -0
  40. {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/WHEEL +1 -1
  41. cobweb/crawlers/base_crawler.py +0 -121
  42. cobweb/crawlers/file_crawler.py +0 -181
  43. cobweb/launchers/launcher_pro.py +0 -174
  44. cobweb/pipelines/base_pipeline.py +0 -54
  45. cobweb/pipelines/loghub_pipeline.py +0 -34
  46. cobweb_launcher-1.0.5.dist-info/METADATA +0 -48
  47. cobweb_launcher-1.0.5.dist-info/RECORD +0 -32
  48. {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/LICENSE +0 -0
  49. {cobweb_launcher-1.0.5.dist-info → cobweb_launcher-3.2.18.dist-info}/top_level.txt +0 -0
cobweb/db/redis_db.py CHANGED
@@ -1,68 +1,161 @@
+import os
+
+import time
 import redis
-from cobweb import setting
+from redis.exceptions import ConnectionError, TimeoutError
 
 
 class RedisDB:
-
-    def __init__(self):
-        pool = redis.ConnectionPool(**setting.REDIS_CONFIG)
-        self._client = redis.Redis(connection_pool=pool)
+    def __init__(
+            self,
+            host=None,
+            password=None,
+            port=6379, db=0
+    ):
+        self.host = host or os.getenv("REDIS_HOST", "localhost")
+        self.password = password or os.getenv("REDIS_PASSWORD")
+        self.port = port or os.getenv("REDIS_PORT", 6379)
+        self.db = db or os.getenv("REDIS_DB", 0)
+
+        self.max_retries = 5
+        self.retry_delay = 5
+        self.client = None
+        self.connect()
+
+    def connect(self):
+        retries = 0
+        while retries < self.max_retries:
+            try:
+                self.client = redis.Redis(
+                    host=self.host,
+                    port=self.port,
+                    password=self.password,
+                    db=self.db,
+                    socket_timeout=5,
+                    socket_connect_timeout=5
+                )
+                self.client.ping()
+                return
+            except (ConnectionError, TimeoutError) as e:
+                retries += 1
+                if retries < self.max_retries:
+                    time.sleep(self.retry_delay)
+                else:
+                    raise Exception("Maximum retries reached; unable to connect to Redis")
+
+    def is_connected(self):
+        try:
+            self.client.ping()
+            return True
+        except (ConnectionError, TimeoutError):
+            return False
+
+    def reconnect(self):
+        self.connect()
+
+    def execute_command(self, command, *args, **kwargs):
+        retries = 0
+        while retries < self.max_retries:
+            try:
+                if not self.is_connected():
+                    self.reconnect()
+                return getattr(self.client, command)(*args, **kwargs)
+            except (ConnectionError, TimeoutError) as e:
+                retries += 1
+                if retries < self.max_retries:
+                    time.sleep(self.retry_delay)
+                else:
+                    raise Exception("Maximum retries reached; unable to execute command")
+
+    def get(self, name):
+        # with self.get_connection() as client:
+        #     return client.get(name)
+        return self.execute_command("get", name)
+
+    def incrby(self, name, value):
+        # with self.get_connection() as client:
+        #     client.incrby(name, value)
+        self.execute_command("incrby", name, value)
 
     def setnx(self, name, value=""):
-        return self._client.setnx(name, value)
+        # with self.get_connection() as client:
+        #     client.setnx(name, value)
+        self.execute_command("setnx", name, value)
 
     def setex(self, name, t, value=""):
-        return self._client.setex(name, t, value)
+        # with self.get_connection() as client:
+        #     client.setex(name, t, value)
+        self.execute_command("setex", name, t, value)
 
     def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
-        return self._client.expire(name, t, nx, xx, gt, lt)
+        # with self.get_connection() as client:
+        #     client.expire(name, t, nx, xx, gt, lt)
+        self.execute_command("expire", name, t, nx, xx, gt, lt)
 
     def ttl(self, name):
-        return self._client.ttl(name)
+        # with self.get_connection() as client:
+        #     return client.ttl(name)
+        return self.execute_command("ttl", name)
 
     def delete(self, name):
-        return self._client.delete(name)
+        # with self.get_connection() as client:
+        #     return client.delete(name)
+        return self.execute_command("delete", name)
 
     def exists(self, *name) -> bool:
-        return self._client.exists(*name)
+        # with self.get_connection() as client:
+        #     return client.exists(*name)
+        return self.execute_command("exists", *name)
 
     def sadd(self, name, value):
-        return self._client.sadd(name, value)
+        # with self.get_connection() as client:
+        #     return client.sadd(name, value)
+        return self.execute_command("sadd", name, value)
 
     def zcard(self, name) -> bool:
-        return self._client.zcard(name)
+        # with self.get_connection() as client:
+        #     return client.zcard(name)
+        return self.execute_command("zcard", name)
 
     def zadd(self, name, item: dict, **kwargs):
-        return self._client.zadd(name, item, **kwargs)
+        # with self.get_connection() as client:
+        #     return client.zadd(name, item, **kwargs)
+        if item:
+            return self.execute_command("zadd", name, item, **kwargs)
 
     def zrem(self, name, *value):
-        return self._client.zrem(name, *value)
+        # with self.get_connection() as client:
+        #     return client.zrem(name, *value)
+        return self.execute_command("zrem", name, *value)
 
     def zcount(self, name, _min, _max):
-        return self._client.zcount(name, _min, _max)
+        # with self.get_connection() as client:
+        #     return client.zcount(name, _min, _max)
+        return self.execute_command("zcount", name, _min, _max)
 
     # def zrangebyscore(self, name, _min, _max, start, num, withscores: bool = False, *args):
-    #     return self._client.zrangebyscore(name, _min, _max, start, num, withscores, *args)
+    #     with self.get_connection() as client:
+    #         return client.zrangebyscore(name, _min, _max, start, num, withscores, *args)
 
     def lua(self, script: str, keys: list = None, args: list = None):
         keys = keys or []
         args = args or []
         keys_count = len(keys)
-        return self._client.eval(script, keys_count, *keys, *args)
+        return self.execute_command("eval", script, keys_count, *keys, *args)
 
     def lua_sha(self, sha1: str, keys: list = None, args: list = None):
         keys = keys or []
         args = args or []
         keys_count = len(keys)
-        return self._client.evalsha(sha1, keys_count, *keys, *args)
+        return self.execute_command("evalsha", sha1, keys_count, *keys, *args)
 
     def execute_lua(self, lua_script: str, keys: list, *args):
-        execute = self._client.register_script(lua_script)
+        execute = self.execute_command("register_script", lua_script)
         return execute(keys=keys, args=args)
 
     def lock(self, key, t=15) -> bool:
         lua_script = """
-            local status = redis.call('setnx', KEYS[1], ARGV[1])
+            local status = redis.call('setnx', KEYS[1], 1)
             if ( status == 1 ) then
                 redis.call('expire', KEYS[1], ARGV[1])
             end
@@ -71,7 +164,7 @@ class RedisDB:
         status = self.execute_lua(lua_script, [key], t)
         return bool(status)
 
-    def members(self, key, score, start=0, count=5000, _min="-inf", _max="+inf") -> list:
+    def members(self, key, score, start=0, count=1000, _min="-inf", _max="+inf") -> list:
         lua_script = """
             local min = ARGV[1]
             local max = ARGV[2]
@@ -85,7 +178,7 @@ class RedisDB:
             else
                 members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES', 'limit', start, count)
             end
-
+
             local result = {}
 
             for i = 1, #members, 2 do
@@ -97,7 +190,7 @@ class RedisDB:
             else
                 originPriority = math.floor(members[i+1])
            end
-
+
            if ( score + 0 >= 1000 ) then
                priority = -score - originPriority / 1000
            elseif ( score + 0 == 0 ) then
@@ -116,7 +209,7 @@ class RedisDB:
         members = self.execute_lua(lua_script, [key], _min, _max, start, count, score)
         return [(members[i].decode(), int(members[i + 1])) for i in range(0, len(members), 2)]
 
-    def done(self, keys: list, *args) -> list:
+    def done(self, keys: list, *args):
         lua_script = """
             for i, member in ipairs(ARGV) do
                 redis.call("zrem", KEYS[1], member)
@@ -124,5 +217,3 @@ class RedisDB:
         end
         """
         self.execute_lua(lua_script, keys, *args)
-
-
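Every call above now funnels through execute_command(), which pings the connection, reconnects when the ping fails, and retries up to max_retries times before raising. A minimal usage sketch of the new surface, assuming cobweb.db re-exports RedisDB and a Redis instance is reachable; the key names are placeholders:

    import os

    # Hypothetical smoke test (not part of the package): exercises the
    # env-var-driven constructor and the retrying execute_command() wrapper.
    os.environ.setdefault("REDIS_HOST", "localhost")

    from cobweb.db import RedisDB  # assumed re-export; adjust to the real module path

    db = RedisDB()                            # connect() runs eagerly, retrying on failure
    db.setex("cobweb:demo:lock", 30)          # dispatched as execute_command("setex", ...)
    print(db.ttl("cobweb:demo:lock"))         # roughly 30
    print(db.lock("cobweb:demo:lock", t=15))  # False: setnx fails while the key exists

Note that the write-side wrappers (setnx, setex, expire, incrby) no longer return the client's reply in 3.2.18, so callers can only rely on return values from read-side wrappers such as get, ttl and exists.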
cobweb/launchers/__init__.py CHANGED
@@ -1,2 +1,4 @@
 from .launcher import Launcher
-from .launcher_pro import LauncherPro
+from .uploader import Uploader
+from .distributor import Distributor
+
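With LauncherPro gone, the launcher package now exposes exactly three entry points. A one-line check of the 3.2.18 import surface (assuming the wheel is installed):

    from cobweb.launchers import Launcher, Uploader, Distributor  # importing LauncherPro now raises ImportError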
cobweb/launchers/distributor.py ADDED
@@ -0,0 +1,141 @@
+import time
+import threading
+import traceback
+
+from inspect import isgenerator
+from typing import Callable, Type
+from requests import RequestException
+
+from cobweb.crawlers import Crawler
+from cobweb.utils import check_pause
+from cobweb.log_dots import LoghubDot
+from cobweb.constant import DealModel, LogTemplate
+from cobweb.base import Seed, Status, TaskQueue, BaseItem, Request, Response, logger
+
+
+class Distributor(threading.Thread):
+
+    def __init__(
+            self,
+            task: str,
+            project: str,
+            task_queue: TaskQueue,
+            stop: threading.Event,
+            pause: threading.Event,
+            callback_register: Callable,
+            SpiderCrawler: Type[Crawler]
+    ):
+        super().__init__()
+        self.task = task
+        self.project = project
+        self.pause = pause
+
+        self.task_queue = task_queue
+
+        self.callback_register = callback_register
+        self.Crawler = SpiderCrawler
+
+        from cobweb import setting
+        self.time_sleep = setting.SPIDER_TIME_SLEEP
+        self.thread_num = setting.SPIDER_THREAD_NUM
+        self.max_retries = setting.SPIDER_MAX_RETRIES
+        self.loghub_dot = LoghubDot(stop=stop, project=self.project, task=self.task)
+
+        logger.debug(f"Distribute instance attrs: {self.__dict__}")
+
+    def distribute(self, task_id, item, status: Status):
+        if isinstance(item, Request):
+            item.seed.params.request_time = time.time()
+            self.loghub_dot._build_request_log(item)
+            self.process(task_id=task_id, item=item, callback=self.Crawler.download, status=Status.PROCESSING)
+
+        elif isinstance(item, Response):
+            if status == Status.FINISHED:
+                raise TypeError("parse function can't yield a Response instance")
+            item.seed.params.download_time = time.time()
+            logger.debug(LogTemplate.download_info.format(
+                detail=LogTemplate.log_info(item.seed.to_dict),
+                retry=item.seed.params.retry,
+                priority=item.seed.params.priority,
+                seed_version=item.seed.params.seed_version,
+                identifier=item.seed.identifier or "",
+                status=item.response,
+                response=LogTemplate.log_info(item.to_dict)
+            ))
+            self.loghub_dot._build_download_log(item)
+            self.process(task_id=task_id, item=item, callback=self.Crawler.parse, status=Status.FINISHED)
+
+        elif isinstance(item, BaseItem):
+            item.seed.params.parse_time = time.time()
+            self.loghub_dot._build_parse_log(item)
+            self.task_queue.add_task(data=item, status=Status.UPLOAD, parent_id=task_id)
+
+        elif isinstance(item, Seed):
+            # todo: log newly generated seeds
+            item.seed.params.insert_time = time.time()
+            self.task_queue.add_task(
+                task_id=item.sid, data=item, status=Status.INSERT,
+                priority=item.params.priority, parent_id=task_id
+            )
+
+        elif isinstance(item, str) and item != DealModel.done:
+            raise TypeError("yield value type error!")
+
+    def process(self, task_id, item, callback, status: Status):
+        iterators = callback(item)
+        if not isgenerator(iterators):
+            raise TypeError(f"{callback.__name__} function isn't a generator!")
+        for it in iterators:
+            self.distribute(task_id=task_id, item=it, status=status)
+
+    @check_pause
+    def spider(self):
+        if task_item := self.task_queue.get_pending_task():
+            finsh_status = True
+            seed = task_item.data
+            status = Status.FINISHED
+            task_id = task_item.task_id
+            seed.params.start_time = time.time()
+
+            if seed.params.retry and isinstance(seed.params.retry, int):
+                time.sleep(self.time_sleep * seed.params.retry / 100)
+
+            try:
+                self.process(task_id=task_id, item=seed, callback=self.Crawler.request, status=Status.PENDING)
+            except Exception as e:
+                seed.params.retry += 1
+                seed.params.failed_time = time.time()
+                msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                if not seed.params.msg:
+                    seed.params.traceback = [msg]
+                elif isinstance(seed.params.msg, list):
+                    seed.params.traceback.append(msg)
+
+                if isinstance(e, RequestException):
+                    self.loghub_dot._build_http_error_log(seed, e)
+                else:
+                    self.loghub_dot._build_exception_log(seed, e)
+
+                if seed.params.retry < self.max_retries:
+                    status = Status.PENDING
+                    finsh_status = False
+
+                logger.info(LogTemplate.download_exception.format(
+                    detail=LogTemplate.log_info(seed.to_dict),
+                    retry=seed.params.retry,
+                    priority=seed.params.priority,
+                    seed_version=seed.params.seed_version,
+                    identifier=seed.identifier or "",
+                    exception=msg
+                ))
+
+            finally:
+                if finsh_status:
+                    seed.params.finsh_time = time.time()
+                self.loghub_dot._build_finish_log(seed, status=bool(seed.params.retry < self.max_retries))
+                self.task_queue.update_task(task_id, status=status, data=seed)
+
+    def run(self):
+        self.callback_register(self.loghub_dot._build_run, tag="LoghubDot")
+        for _ in range(self.thread_num):
+            self.callback_register(self.spider, tag="Distributor")
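Distributor.process() insists that every crawler callback is a generator, and distribute() re-dispatches each yielded object by type: a Request is routed into Crawler.download, a Response into Crawler.parse, a BaseItem is queued with Status.UPLOAD, and a Seed is inserted as a new task. A sketch of a crawler honoring that contract, assuming the import paths above; TitleItem and its keyword fields are hypothetical, and the callbacks are written as static methods because Distributor invokes them unbound off the class:

    import requests

    from cobweb.base import Seed, Request, Response, BaseItem
    from cobweb.crawlers import Crawler


    class TitleItem(BaseItem):
        # Hypothetical item type; the launcher docstrings show the
        # xxxItem(seed, **kwargs) construction convention.
        pass


    class MyCrawler(Crawler):

        @staticmethod
        def request(seed: Seed):
            # Seed -> Request: Distributor feeds this into download()
            yield Request(seed.url, seed)

        @staticmethod
        def download(item: Request):
            # Request -> Response: a plain requests call stands in for
            # whatever downloader Request itself provides
            yield Response(item.seed, requests.get(item.seed.url, timeout=10))

        @staticmethod
        def parse(item: Response):
            # Response -> BaseItem: yielded items are queued for the Uploader
            yield TitleItem(item.seed, url=item.seed.url, title=item.response.text[:60])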
cobweb/launchers/launcher.py CHANGED
@@ -1,84 +1,63 @@
 import time
+import uuid
 import inspect
 import threading
 import importlib
 
 from cobweb import setting
-from cobweb.base import Seed, Queue
+from cobweb.launchers.uploader import Uploader
 from cobweb.utils.tools import dynamic_load_class
+from cobweb.launchers.distributor import Distributor
+from cobweb.base import Seed, logger, TaskQueue, Status
+from typing import Optional, Union, Dict, Any, Callable
 
 
-class Launcher(threading.Thread):
+class Launcher:
 
-    SEEDS = []
+    __REGISTER_FUNC__: Dict[str, Callable] = {}
+    __WORKER_THREAD__: Dict[str, threading.Thread] = {}
 
-    __DOING__ = {}
-
-    __CUSTOM_FUNC__ = {
-        "download": None,
-        "download_midware": None,
-        "parse": None,
-    }
-
-    __LAUNCHER_QUEUE__ = {
-        "new": Queue(),
-        "todo": Queue(),
-        "done": Queue(),
-    }
-
-    __LAUNCHER_FUNC__ = [
-        "_reset",
-        "_scheduler",
-        "_insert",
-        "_refresh",
-        "_delete",
-    ]
-
-    def __init__(self, task, project, custom_setting=None):
+    def __init__(self, task, project, custom_setting=None, **kwargs):
         super().__init__()
+
         self.task = task
         self.project = project
 
+        self._app_time = int(time.time())
         self._stop = threading.Event()  # stop event
         self._pause = threading.Event()  # pause event
 
-        if custom_setting:
-            setting_ = dict()
-            if isinstance(custom_setting, dict):
-                setting_ = custom_setting
-            else:
-                if isinstance(custom_setting, str):
-                    custom_setting = importlib.import_module(custom_setting)
-                if not inspect.ismodule(custom_setting):
-                    raise Exception
-                for k, v in custom_setting.__dict__.items():
-                    if not k.startswith("__") and not inspect.ismodule(v):
-                        setting_[k] = v
-            for k, v in setting_.items():
-                setattr(setting, k, v)
-
-        self._Crawler = dynamic_load_class(setting.CRAWLER)
-        self._Pipeline = dynamic_load_class(setting.PIPELINE)
-
-        self._scheduler_wait_seconds = setting.SCHEDULER_WAIT_SECONDS
-        self._todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
-        self._new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
-        self._done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
-        self._upload_queue_wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
-        self._seed_reset_seconds = setting.SEED_RESET_SECONDS
-
-        self._todo_queue_size = setting.TODO_QUEUE_SIZE
-        self._new_queue_max_size = setting.NEW_QUEUE_MAX_SIZE
-        self._done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
-        self._upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE
+        _setting = self._load_custom_settings(custom_setting)
+
+        _setting.update(kwargs)
+        for key, value in _setting.items():
+            setattr(setting, key.upper(), value)
 
         self._done_model = setting.DONE_MODEL
+        self._task_model = setting.TASK_MODEL
 
-        self._upload_queue = Queue()
+        self._task_queue = TaskQueue()
 
-    @property
-    def start_seeds(self):
-        return [Seed(seed) for seed in self.SEEDS]
+        self.Scheduler = dynamic_load_class(setting.SCHEDULER)
+        self.SpiderCrawler = dynamic_load_class(setting.CRAWLER)
+        self.SpiderPipeline = dynamic_load_class(setting.PIPELINE)
+
+    @staticmethod
+    def _load_custom_settings(custom_setting: Optional[Union[str, Dict]]) -> Dict[str, Any]:
+        _setting = {}
+        if custom_setting:
+            if isinstance(custom_setting, dict):
+                _setting = custom_setting
+            elif isinstance(custom_setting, str):
+                module = importlib.import_module(custom_setting)
+                _setting = {
+                    k: v
+                    for k, v in module.__dict__.items()
+                    if not k.startswith("__") and not inspect.ismodule(v)
+                }
+            else:
+                raise ValueError("custom_setting must be a dictionary or a module path.")
+        return _setting
 
     @property
     def request(self):
@@ -89,10 +68,10 @@ class Launcher(threading.Thread):
         @launcher.request
         def request(seed: Seed) -> Union[Request, BaseItem]:
             ...
-            return Request(seed.url, seed)
+            yield Request(seed.url, seed)
         """
         def decorator(func):
-            self.__CUSTOM_FUNC__["request"] = func
+            self.SpiderCrawler.request = func
         return decorator
 
     @property
@@ -107,7 +86,7 @@ class Launcher(threading.Thread):
             yield Response(item.seed, response)
         """
         def decorator(func):
-            self.__CUSTOM_FUNC__["download"] = func
+            self.SpiderCrawler.download = func
         return decorator
 
     @property
@@ -116,83 +95,77 @@ class Launcher(threading.Thread):
        Custom parse function; xxxItem is a user-defined stored-data item type.
        use case:
            from cobweb.base import Request, Response
-            @launcher.download
-            def download(item: Response) -> BaseItem:
+            @launcher.parse
+            def parse(item: Response) -> BaseItem:
                ...
                yield xxxItem(seed, **kwargs)
        """
        def decorator(func):
-            self.__CUSTOM_FUNC__["parse"] = func
+            self.SpiderCrawler.parse = func
        return decorator
 
-    def _remove_doing_seeds(self, seeds):
-        for seed in seeds:
-            self.__DOING__.pop(seed, None)
-
-    def _execute_heartbeat(self):
-        pass
-
-    def _reset(self):
-        pass
-
-    def _scheduler(self):
-        pass
-
-    def _insert(self):
-        pass
-
-    def _refresh(self):
-        pass
-
-    def _delete(self):
-        pass
-
-    def _execute(self):
-        for func_name in self.__LAUNCHER_FUNC__:
-            threading.Thread(name=func_name, target=getattr(self, func_name)).start()
-            time.sleep(2)
-
-    def _polling(self):
-
-        check_emtpy_times = 0
-
+    def start_seeds(self, seeds: list[Union[str, Dict]]) -> list[Seed]:
+        seed_list = [Seed(seed) for seed in seeds]
+        for seed in seed_list:
+            self._task_queue.add_task(
+                task_id=seed.sid,
+                data=seed,
+                status=Status.PENDING,
+                priority=seed.params.priority,
+                parent_id=None,
+                ttl_seconds=None
+            )
+        return seed_list
+
+    def _register(self, func: Callable, tag: str = "launcher"):
+        name = f"{tag}:{func.__name__}_{uuid.uuid4()}"
+        self.__REGISTER_FUNC__[name] = func
+        if not self.__WORKER_THREAD__.get(name):
+            worker_thread = threading.Thread(name=name, target=func)
+            self.__WORKER_THREAD__[name] = worker_thread
+
+    def _monitor(self):
         while not self._stop.is_set():
+            if not self._pause.is_set():
+                for name, worker_thread in list(self.__WORKER_THREAD__.items()):
+                    if not worker_thread.is_alive():
+                        logger.debug(f"{name} thread is dead. Restarting...")
+                        func = self.__REGISTER_FUNC__[name]
+                        worker_thread = threading.Thread(name=name, target=func)
+                        self.__WORKER_THREAD__[name] = worker_thread
+                        worker_thread.start()
+            time.sleep(15)
+        logger.info("monitor thread close!")
+
+    def start(self):
+        self._pause.is_set()
+
+        self.Scheduler(
+            task=self.task,
+            project=self.project,
+            stop=self._stop,
+            pause=self._pause,
+            task_queue=self._task_queue,
+            callback_register=self._register
+        ).start()
 
-            queue_not_empty_count = 0
-
-            for q in self.__LAUNCHER_QUEUE__.values():
-                if q.length != 0:
-                    queue_not_empty_count += 1
-
-            if self._pause.is_set() and queue_not_empty_count != 0:
-                self._pause.clear()
-                self._execute()
-
-            elif queue_not_empty_count == 0:
-                check_emtpy_times += 1
-            else:
-                check_emtpy_times = 0
-
-            if check_emtpy_times > 2:
-                check_emtpy_times = 0
-                self.__DOING__ = {}
-                self._pause.set()
-
-    def run(self):
-        threading.Thread(target=self._execute_heartbeat).start()
-
-        self._Crawler(
-            upload_queue=self._upload_queue,
-            custom_func=self.__CUSTOM_FUNC__,
-            launcher_queue=self.__LAUNCHER_QUEUE__,
+        Distributor(
+            task=self.task,
+            project=self.project,
+            task_queue=self._task_queue,
+            callback_register=self._register,
+            stop=self._stop, pause=self._pause,
+            SpiderCrawler=self.SpiderCrawler
         ).start()
 
-        self._Pipeline(
-            upload_queue=self._upload_queue,
-            done_queue=self.__LAUNCHER_QUEUE__["done"],
-            upload_queue_size=self._upload_queue_max_size,
-            upload_wait_seconds=self._upload_queue_wait_seconds
+        Uploader(
+            task=self.task, project=self.project,
+            stop=self._stop, pause=self._pause,
+            task_queue=self._task_queue,
+            callback_register=self._register,
+            SpiderPipeline=self.SpiderPipeline
        ).start()
 
-        self._execute()
-        self._polling()
+        self._monitor()
+        logger.info("task done!")
+
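End to end, the 3.2.18 Launcher is no longer a Thread subclass: you construct it, optionally swap in callbacks through the request/download/parse decorator properties, queue seeds with start_seeds(), and call start(), which wires Scheduler, Distributor and Uploader to the shared TaskQueue and then blocks in _monitor(), restarting any registered worker thread that dies. A hedged wiring sketch; the task/project names and the custom_setting key are placeholders, and it assumes the default SCHEDULER, CRAWLER and PIPELINE settings resolve:

    import requests

    from cobweb.base import Seed, Request, Response
    from cobweb.constant import DealModel
    from cobweb.launchers import Launcher

    launcher = Launcher(
        task="demo_task",
        project="demo_project",
        custom_setting={"spider_thread_num": 4},  # keys are upper-cased onto cobweb.setting
    )

    @launcher.request
    def request(seed: Seed):
        yield Request(seed.url, seed)

    @launcher.download
    def download(item: Request):
        yield Response(item.seed, requests.get(item.seed.url, timeout=10))

    @launcher.parse
    def parse(item: Response):
        print(item.seed.url, len(item.response.text))
        yield DealModel.done  # the one string distribute() tolerates

    launcher.start_seeds(["https://example.com"])
    launcher.start()  # blocks in _monitor() until the stop event is set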