cobweb-launcher 1.2.25__py3-none-any.whl → 3.2.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. cobweb/__init__.py +4 -1
  2. cobweb/base/__init__.py +3 -3
  3. cobweb/base/common_queue.py +37 -16
  4. cobweb/base/item.py +35 -16
  5. cobweb/base/{log.py → logger.py} +3 -3
  6. cobweb/base/request.py +741 -54
  7. cobweb/base/response.py +380 -13
  8. cobweb/base/seed.py +96 -48
  9. cobweb/base/task_queue.py +180 -0
  10. cobweb/base/test.py +257 -0
  11. cobweb/constant.py +10 -1
  12. cobweb/crawlers/crawler.py +12 -155
  13. cobweb/db/api_db.py +3 -2
  14. cobweb/db/redis_db.py +117 -28
  15. cobweb/launchers/__init__.py +4 -3
  16. cobweb/launchers/distributor.py +141 -0
  17. cobweb/launchers/launcher.py +95 -157
  18. cobweb/launchers/uploader.py +68 -0
  19. cobweb/log_dots/__init__.py +2 -0
  20. cobweb/log_dots/dot.py +258 -0
  21. cobweb/log_dots/loghub_dot.py +53 -0
  22. cobweb/pipelines/__init__.py +1 -1
  23. cobweb/pipelines/pipeline.py +5 -55
  24. cobweb/pipelines/pipeline_csv.py +25 -0
  25. cobweb/pipelines/pipeline_loghub.py +32 -12
  26. cobweb/schedulers/__init__.py +1 -0
  27. cobweb/schedulers/scheduler.py +66 -0
  28. cobweb/schedulers/scheduler_with_redis.py +189 -0
  29. cobweb/setting.py +27 -40
  30. cobweb/utils/__init__.py +5 -3
  31. cobweb/utils/bloom.py +58 -58
  32. cobweb/{base → utils}/decorators.py +14 -12
  33. cobweb/utils/dotting.py +300 -0
  34. cobweb/utils/oss.py +113 -94
  35. cobweb/utils/tools.py +3 -15
  36. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.20.dist-info}/METADATA +31 -43
  37. cobweb_launcher-3.2.20.dist-info/RECORD +44 -0
  38. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.20.dist-info}/WHEEL +1 -1
  39. cobweb/crawlers/base_crawler.py +0 -144
  40. cobweb/crawlers/file_crawler.py +0 -98
  41. cobweb/launchers/launcher_air.py +0 -88
  42. cobweb/launchers/launcher_api.py +0 -221
  43. cobweb/launchers/launcher_pro.py +0 -222
  44. cobweb/pipelines/base_pipeline.py +0 -54
  45. cobweb/pipelines/loghub_pipeline.py +0 -34
  46. cobweb/pipelines/pipeline_console.py +0 -22
  47. cobweb_launcher-1.2.25.dist-info/RECORD +0 -40
  48. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.20.dist-info}/LICENSE +0 -0
  49. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.20.dist-info}/top_level.txt +0 -0
cobweb/db/redis_db.py CHANGED
@@ -1,64 +1,156 @@
+import os
+
+import time
 import redis
-from cobweb import setting
+from redis.exceptions import ConnectionError, TimeoutError
 
 
 class RedisDB:
-
-    def __init__(self, **kwargs):
-        redis_config = kwargs or setting.REDIS_CONFIG
-        pool = redis.ConnectionPool(**redis_config)
-        self._client = redis.Redis(connection_pool=pool)
+    def __init__(
+            self,
+            host=None,
+            password=None,
+            port=6379, db=0
+    ):
+        self.host = host or os.getenv("REDIS_HOST", "localhost")
+        self.password = password or os.getenv("REDIS_PASSWORD")
+        self.port = port or os.getenv("REDIS_PORT", 6379)
+        self.db = db or os.getenv("REDIS_DB", 0)
+
+        self.max_retries = 5
+        self.retry_delay = 5
+        self.client = None
+        self.connect()
+
+    def connect(self):
+        retries = 0
+        while retries < self.max_retries:
+            try:
+                self.client = redis.Redis(
+                    host=self.host,
+                    port=self.port,
+                    password=self.password,
+                    db=self.db,
+                    socket_timeout=5,
+                    socket_connect_timeout=5
+                )
+                self.client.ping()
+                return
+            except (ConnectionError, TimeoutError) as e:
+                retries += 1
+                if retries < self.max_retries:
+                    time.sleep(self.retry_delay)
+                else:
+                    raise Exception("Max retries reached; unable to connect to Redis")
+
+    def is_connected(self):
+        try:
+            self.client.ping()
+            return True
+        except (ConnectionError, TimeoutError):
+            return False
+
+    def reconnect(self):
+        self.connect()
+
+    def execute_command(self, command, *args, **kwargs):
+        retries = 0
+        while retries < self.max_retries:
+            try:
+                if not self.is_connected():
+                    self.reconnect()
+                return getattr(self.client, command)(*args, **kwargs)
+            except (ConnectionError, TimeoutError) as e:
+                retries += 1
+                if retries < self.max_retries:
+                    time.sleep(self.retry_delay)
+                else:
+                    raise Exception("Max retries reached; unable to execute command")
+
+    def get(self, name):
+        # with self.get_connection() as client:
+        #     return client.get(name)
+        return self.execute_command("get", name)
+
+    def incrby(self, name, value):
+        # with self.get_connection() as client:
+        #     client.incrby(name, value)
+        self.execute_command("incrby", name, value)
 
     def setnx(self, name, value=""):
-        return self._client.setnx(name, value)
+        # with self.get_connection() as client:
+        #     client.setnx(name, value)
+        self.execute_command("setnx", name, value)
 
     def setex(self, name, t, value=""):
-        return self._client.setex(name, t, value)
+        # with self.get_connection() as client:
+        #     client.setex(name, t, value)
+        self.execute_command("setex", name, t, value)
 
     def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
-        return self._client.expire(name, t, nx, xx, gt, lt)
+        # with self.get_connection() as client:
+        #     client.expire(name, t, nx, xx, gt, lt)
+        self.execute_command("expire", name, t, nx, xx, gt, lt)
 
     def ttl(self, name):
-        return self._client.ttl(name)
+        # with self.get_connection() as client:
+        #     return client.ttl(name)
+        return self.execute_command("ttl", name)
 
     def delete(self, name):
-        return self._client.delete(name)
+        # with self.get_connection() as client:
+        #     return client.delete(name)
+        return self.execute_command("delete", name)
 
     def exists(self, *name) -> bool:
-        return self._client.exists(*name)
+        # with self.get_connection() as client:
+        #     return client.exists(*name)
+        return self.execute_command("exists", *name)
 
     def sadd(self, name, value):
-        return self._client.sadd(name, value)
+        # with self.get_connection() as client:
+        #     return client.sadd(name, value)
+        return self.execute_command("sadd", name, value)
 
     def zcard(self, name) -> bool:
-        return self._client.zcard(name)
+        # with self.get_connection() as client:
+        #     return client.zcard(name)
+        return self.execute_command("zcard", name)
 
     def zadd(self, name, item: dict, **kwargs):
-        return self._client.zadd(name, item, **kwargs)
+        # with self.get_connection() as client:
+        #     return client.zadd(name, item, **kwargs)
+        if item:
+            return self.execute_command("zadd", name, item, **kwargs)
 
     def zrem(self, name, *value):
-        return self._client.zrem(name, *value)
+        # with self.get_connection() as client:
+        #     return client.zrem(name, *value)
+        return self.execute_command("zrem", name, *value)
 
     def zcount(self, name, _min, _max):
-        return self._client.zcount(name, _min, _max)
+        # with self.get_connection() as client:
+        #     return client.zcount(name, _min, _max)
+        return self.execute_command("zcount", name, _min, _max)
 
     # def zrangebyscore(self, name, _min, _max, start, num, withscores: bool = False, *args):
-    #     return self._client.zrangebyscore(name, _min, _max, start, num, withscores, *args)
+    #     with self.get_connection() as client:
+    #         return client.zrangebyscore(name, _min, _max, start, num, withscores, *args)
 
     def lua(self, script: str, keys: list = None, args: list = None):
         keys = keys or []
         args = args or []
         keys_count = len(keys)
-        return self._client.eval(script, keys_count, *keys, *args)
+        return self.execute_command("eval", script, keys_count, *keys, *args)
 
     def lua_sha(self, sha1: str, keys: list = None, args: list = None):
         keys = keys or []
         args = args or []
         keys_count = len(keys)
-        return self._client.evalsha(sha1, keys_count, *keys, *args)
+        return self.execute_command("evalsha", sha1, keys_count, *keys, *args)
 
     def execute_lua(self, lua_script: str, keys: list, *args):
-        execute = self._client.register_script(lua_script)
+        execute = self.execute_command("register_script", lua_script)
         return execute(keys=keys, args=args)
 
     def lock(self, key, t=15) -> bool:
@@ -72,7 +164,7 @@ class RedisDB:
         status = self.execute_lua(lua_script, [key], t)
         return bool(status)
 
-    def members(self, key, score, start=0, count=5000, _min="-inf", _max="+inf") -> list:
+    def members(self, key, score, start=0, count=1000, _min="-inf", _max="+inf") -> list:
         lua_script = """
         local min = ARGV[1]
         local max = ARGV[2]
@@ -86,7 +178,7 @@ class RedisDB:
         else
             members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES', 'limit', start, count)
         end
-
+
         local result = {}
 
         for i = 1, #members, 2 do
@@ -98,7 +190,7 @@ class RedisDB:
         else
             originPriority = math.floor(members[i+1])
         end
-
+
         if ( score + 0 >= 1000 ) then
             priority = -score - originPriority / 1000
         elseif ( score + 0 == 0 ) then
@@ -117,7 +209,7 @@ class RedisDB:
         members = self.execute_lua(lua_script, [key], _min, _max, start, count, score)
         return [(members[i].decode(), int(members[i + 1])) for i in range(0, len(members), 2)]
 
-    def done(self, keys: list, *args) -> list:
+    def done(self, keys: list, *args):
         lua_script = """
         for i, member in ipairs(ARGV) do
             redis.call("zrem", KEYS[1], member)
@@ -125,6 +217,3 @@ class RedisDB:
         end
         """
         self.execute_lua(lua_script, keys, *args)
-
-
-
cobweb/launchers/__init__.py CHANGED
@@ -1,3 +1,4 @@
-from .launcher_air import LauncherAir
-from .launcher_pro import LauncherPro
-from .launcher_api import LauncherApi
+from .launcher import Launcher
+from .uploader import Uploader
+from .distributor import Distributor
+
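
With the per-mode launchers gone, the package exports the three cooperating components directly; imports under the new layout look like:

    from cobweb.launchers import Launcher, Uploader, Distributor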
cobweb/launchers/distributor.py ADDED
@@ -0,0 +1,141 @@
+import time
+import threading
+import traceback
+
+from inspect import isgenerator
+from typing import Callable, Type
+from requests import RequestException
+
+from cobweb.crawlers import Crawler
+from cobweb.utils import check_pause
+from cobweb.log_dots import LoghubDot
+from cobweb.constant import DealModel, LogTemplate
+from cobweb.base import Seed, Status, TaskQueue, BaseItem, Request, Response, logger
+
+
+class Distributor(threading.Thread):
+
+    def __init__(
+            self,
+            task: str,
+            project: str,
+            task_queue: TaskQueue,
+            stop: threading.Event,
+            pause: threading.Event,
+            callback_register: Callable,
+            SpiderCrawler: Type[Crawler]
+    ):
+        super().__init__()
+        self.task = task
+        self.project = project
+        self.pause = pause
+
+        self.task_queue = task_queue
+
+        self.callback_register = callback_register
+        self.Crawler = SpiderCrawler
+
+        from cobweb import setting
+        self.time_sleep = setting.SPIDER_TIME_SLEEP
+        self.thread_num = setting.SPIDER_THREAD_NUM
+        self.max_retries = setting.SPIDER_MAX_RETRIES
+        self.loghub_dot = LoghubDot(stop=stop, project=self.project, task=self.task)
+
+        logger.debug(f"Distribute instance attrs: {self.__dict__}")
+
+    def distribute(self, task_id, item, status: Status):
+        if isinstance(item, Request):
+            item.seed.params.request_time = time.time()
+            self.loghub_dot._build_request_log(item)
+            self.process(task_id=task_id, item=item, callback=self.Crawler.download, status=Status.PROCESSING)
+
+        elif isinstance(item, Response):
+            if status == Status.FINISHED:
+                raise TypeError("parse function can't yield a Response instance")
+            item.seed.params.download_time = time.time()
+            logger.debug(LogTemplate.download_info.format(
+                detail=LogTemplate.log_info(item.seed.to_dict),
+                retry=item.seed.params.retry,
+                priority=item.seed.params.priority,
+                seed_version=item.seed.params.seed_version,
+                identifier=item.seed.identifier or "",
+                status=item.response,
+                response=LogTemplate.log_info(item.to_dict)
+            ))
+            self.loghub_dot._build_download_log(item)
+            self.process(task_id=task_id, item=item, callback=self.Crawler.parse, status=Status.FINISHED)
+
+        elif isinstance(item, BaseItem):
+            item.seed.params.parse_time = time.time()
+            self.loghub_dot._build_parse_log(item)
+            self.task_queue.add_task(data=item, status=Status.UPLOAD, parent_id=task_id)
+
+        elif isinstance(item, Seed):
+            # todo: log newly generated seeds
+            item.params.insert_time = time.time()
+            self.task_queue.add_task(
+                task_id=item.sid, data=item, status=Status.INSERT,
+                priority=item.params.priority, parent_id=task_id
+            )
+
+        elif isinstance(item, str) and item != DealModel.done:
+            raise TypeError("yield value type error!")
+
+    def process(self, task_id, item, callback, status: Status):
+        iterators = callback(item)
+        if not isgenerator(iterators):
+            raise TypeError(f"{callback.__name__} function isn't a generator!")
+        for it in iterators:
+            self.distribute(task_id=task_id, item=it, status=status)
+
+    @check_pause
+    def spider(self):
+        if task_item := self.task_queue.get_pending_task():
+            finsh_status = True
+            seed = task_item.data
+            status = Status.FINISHED
+            task_id = task_item.task_id
+            seed.params.start_time = time.time()
+
+            if seed.params.retry and isinstance(seed.params.retry, int):
+                time.sleep(self.time_sleep * seed.params.retry / 100)
+
+            try:
+                self.process(task_id=task_id, item=seed, callback=self.Crawler.request, status=Status.PENDING)
+            except Exception as e:
+                seed.params.retry += 1
+                seed.params.failed_time = time.time()
+                msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                if not seed.params.msg:
+                    seed.params.traceback = [msg]
+                elif isinstance(seed.params.msg, list):
+                    seed.params.traceback.append(msg)
+
+                if isinstance(e, RequestException):
+                    self.loghub_dot._build_http_error_log(seed, e)
+                else:
+                    self.loghub_dot._build_exception_log(seed, e)
+
+                if seed.params.retry < self.max_retries:
+                    status = Status.PENDING
+                    finsh_status = False
+
+                logger.info(LogTemplate.download_exception.format(
+                    detail=LogTemplate.log_info(seed.to_dict),
+                    retry=seed.params.retry,
+                    priority=seed.params.priority,
+                    seed_version=seed.params.seed_version,
+                    identifier=seed.identifier or "",
+                    exception=msg
+                ))
+
+            finally:
+                if finsh_status:
+                    seed.params.finsh_time = time.time()
+                    self.loghub_dot._build_finish_log(seed, status=bool(seed.params.retry < self.max_retries))
+                self.task_queue.update_task(task_id, status=status, data=seed)
+
+    def run(self):
+        self.callback_register(self.loghub_dot._build_run, tag="LoghubDot")
+        for _ in range(self.thread_num):
+            self.callback_register(self.spider, tag="Distributor")
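
Distributor.process rejects any Crawler hook that is not a generator, and distribute() routes each yielded Request, Response, BaseItem or Seed to the next stage, so hooks must yield rather than return. A minimal conforming crawler sketch: the static-method style and the Request(seed.url, seed) / Response(item.seed, response) signatures follow the docstrings in launcher.py, while the requests.get call and the follow-up seed are illustrative assumptions:

    import requests

    from cobweb.base import Seed, Request, Response
    from cobweb.crawlers import Crawler


    class MyCrawler(Crawler):

        @staticmethod
        def request(seed: Seed):
            # must be a generator: process() checks isgenerator()
            yield Request(seed.url, seed)

        @staticmethod
        def download(item: Request):
            # assumed plain-HTTP fetch; a RequestException raised here is
            # recorded by spider() as an http-error dot rather than an exception dot
            response = requests.get(item.seed.url, timeout=10)
            yield Response(item.seed, response)

        @staticmethod
        def parse(item: Response):
            # yield BaseItem subclasses (queued for upload) or follow-up Seed objects
            yield Seed({"url": "https://www.example.com/next"})  # placeholder seed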
cobweb/launchers/launcher.py CHANGED
@@ -1,57 +1,25 @@
 import time
+import uuid
 import inspect
 import threading
 import importlib
-from functools import wraps
 
 from cobweb import setting
-from cobweb.base import Seed, Queue, logger
+from cobweb.launchers.uploader import Uploader
 from cobweb.utils.tools import dynamic_load_class
+from cobweb.launchers.distributor import Distributor
+from cobweb.base import Seed, logger, TaskQueue, Status
+from typing import Optional, Union, Dict, Any, Callable
 
 
-def check_pause(func):
-    @wraps(func)
-    def wrapper(self, *args, **kwargs):
-        while not self._pause.is_set():
-            try:
-                func(self, *args, **kwargs)
-            except Exception as e:
-                logger.info(f"{func.__name__}: " + str(e))
-            finally:
-                time.sleep(0.1)
+class Launcher:
 
-    return wrapper
-
-
-class Launcher(threading.Thread):
-
-    SEEDS = []
-
-    __DOING__ = {}
-
-    __CUSTOM_FUNC__ = {
-        # "download": None,
-        # "request": None,
-        # "parse": None,
-    }
-
-    __LAUNCHER_QUEUE__ = {
-        "new": Queue(),
-        "todo": Queue(),
-        "done": Queue(),
-        "upload": Queue()
-    }
-
-    __LAUNCHER_FUNC__ = [
-        "_reset",
-        "_scheduler",
-        "_insert",
-        "_refresh",
-        "_delete",
-    ]
+    __REGISTER_FUNC__: Dict[str, Callable] = {}
+    __WORKER_THREAD__: Dict[str, threading.Thread] = {}
 
     def __init__(self, task, project, custom_setting=None, **kwargs):
         super().__init__()
+
         self.task = task
         self.project = project
 
@@ -59,51 +27,37 @@ class Launcher(threading.Thread):
         self._stop = threading.Event()   # stop event
         self._pause = threading.Event()  # pause event
 
-        _setting = dict()
+        _setting = self._load_custom_settings(custom_setting)
 
-        if custom_setting:
-            if isinstance(custom_setting, dict):
-                _setting = custom_setting
-            else:
-                if isinstance(custom_setting, str):
-                    custom_setting = importlib.import_module(custom_setting)
-                if not inspect.ismodule(custom_setting):
-                    raise Exception
-                for k, v in custom_setting.__dict__.items():
-                    if not k.startswith("__") and not inspect.ismodule(v):
-                        _setting[k] = v
-
-        _setting.update(**kwargs)
-
-        for k, v in _setting.items():
-            setattr(setting, k.upper(), v)
-
-        self._Crawler = dynamic_load_class(setting.CRAWLER)
-        self._Pipeline = dynamic_load_class(setting.PIPELINE)
-
-        self._before_scheduler_wait_seconds = setting.BEFORE_SCHEDULER_WAIT_SECONDS
-        self._scheduler_wait_seconds = setting.SCHEDULER_WAIT_SECONDS
-        self._todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
-        self._new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
-        self._done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
-        self._upload_queue_wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
-        self._seed_reset_seconds = setting.SEED_RESET_SECONDS
-
-        self._todo_queue_size = setting.TODO_QUEUE_SIZE
-        self._new_queue_max_size = setting.NEW_QUEUE_MAX_SIZE
-        self._done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
-        self._upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE
-
-        self._spider_max_retries = setting.SPIDER_MAX_RETRIES
-        self._spider_thread_num = setting.SPIDER_THREAD_NUM
-        self._spider_time_sleep = setting.SPIDER_TIME_SLEEP
-        self._spider_max_count = setting.SPIDER_MAX_COUNT
-        self._time_window = setting.TIME_WINDOW
+        _setting.update(kwargs)
+        for key, value in _setting.items():
+            setattr(setting, key.upper(), value)
 
         self._done_model = setting.DONE_MODEL
         self._task_model = setting.TASK_MODEL
 
-        self._filter_field = setting.FILTER_FIELD
+        self._task_queue = TaskQueue()
+
+        self.Scheduler = dynamic_load_class(setting.SCHEDULER)
+        self.SpiderCrawler = dynamic_load_class(setting.CRAWLER)
+        self.SpiderPipeline = dynamic_load_class(setting.PIPELINE)
+
+    @staticmethod
+    def _load_custom_settings(custom_setting: Optional[Union[str, Dict]]) -> Dict[str, Any]:
+        _setting = {}
+        if custom_setting:
+            if isinstance(custom_setting, dict):
+                _setting = custom_setting
+            elif isinstance(custom_setting, str):
+                module = importlib.import_module(custom_setting)
+                _setting = {
+                    k: v
+                    for k, v in module.__dict__.items()
+                    if not k.startswith("__") and not inspect.ismodule(v)
+                }
+            else:
+                raise ValueError("custom_setting must be a dictionary or a module path.")
+        return _setting
 
     @property
     def request(self):
@@ -117,7 +71,7 @@
             yield Request(seed.url, seed)
         """
         def decorator(func):
-            self.__CUSTOM_FUNC__["request"] = func
+            self.SpiderCrawler.request = func
        return decorator
 
     @property
@@ -132,7 +86,7 @@
             yield Response(item.seed, response)
         """
         def decorator(func):
-            self.__CUSTOM_FUNC__["download"] = func
+            self.SpiderCrawler.download = func
         return decorator
 
     @property
@@ -147,87 +101,71 @@
             yield xxxItem(seed, **kwargs)
         """
         def decorator(func):
-            self.__CUSTOM_FUNC__["parse"] = func
+            self.SpiderCrawler.parse = func
         return decorator
 
-    def start_seeds(self):
-        seeds = [Seed(seed) for seed in self.SEEDS]
-        self.__LAUNCHER_QUEUE__['todo'].push(seeds)
-        return seeds
-
-    def _remove_doing_seeds(self, seeds):
-        for seed in seeds:
-            self.__DOING__.pop(seed, None)
-        # logger.info("remove %s seeds from __DOING__" % len(seeds))
-
-    def _get_seed(self) -> Seed:
-        return self.__LAUNCHER_QUEUE__["todo"].pop()
-
-    def _set_seed(self, seed, **kwargs):
-        self.__LAUNCHER_QUEUE__["todo"].push(seed, **kwargs)
-
-    def _upload_data(self, data, **kwargs):
-        self.__LAUNCHER_QUEUE__["upload"].push(data, **kwargs)
-
-    def _add_seed(self, seed, **kwargs):
-        self.__LAUNCHER_QUEUE__["new"].push(seed, **kwargs)
-
-    def _delete_seed(self, seed, **kwargs):
-        self.__LAUNCHER_QUEUE__["done"].push(seed, **kwargs)
-
-    def _execute(self):
-        for func_name in self.__LAUNCHER_FUNC__:
-            threading.Thread(name=func_name, target=getattr(self, func_name)).start()
-            time.sleep(1)
-
-    def run(self):
-        threading.Thread(target=self._execute_heartbeat).start()
-
-        self.start_seeds()
+    def start_seeds(self, seeds: list[Union[str, Dict]]) -> list[Seed]:
+        seed_list = [Seed(seed) for seed in seeds]
+        for seed in seed_list:
+            self._task_queue.add_task(
+                task_id=seed.sid,
+                data=seed,
+                status=Status.PENDING,
+                priority=seed.params.priority,
+                parent_id=None,
+                ttl_seconds=None
+            )
+        return seed_list
+
+    def _register(self, func: Callable, tag: str = "launcher"):
+        name = f"{tag}:{func.__name__}_{uuid.uuid4()}"
+        self.__REGISTER_FUNC__[name] = func
+        if not self.__WORKER_THREAD__.get(name):
+            worker_thread = threading.Thread(name=name, target=func)
+            self.__WORKER_THREAD__[name] = worker_thread
+
+    def _monitor(self):
+        while not self._stop.is_set():
+            if not self._pause.is_set():
+                for name, worker_thread in list(self.__WORKER_THREAD__.items()):
+                    if not worker_thread.is_alive():
+                        logger.debug(f"{name} thread is dead. Restarting...")
+                        func = self.__REGISTER_FUNC__[name]
+                        worker_thread = threading.Thread(name=name, target=func)
+                        self.__WORKER_THREAD__[name] = worker_thread
+                        worker_thread.start()
+            time.sleep(15)
+        logger.info("monitor thread close!")
+
+    def start(self):
+        self._pause.is_set()
+
+        self.Scheduler(
+            task=self.task,
+            project=self.project,
+            stop=self._stop,
+            pause=self._pause,
+            task_queue=self._task_queue,
+            callback_register=self._register
+        ).start()
 
-        self._Crawler(
+        Distributor(
+            task=self.task,
+            project=self.project,
+            task_queue=self._task_queue,
+            callback_register=self._register,
             stop=self._stop, pause=self._pause,
-            # launcher_queue=self.__LAUNCHER_QUEUE__,
-            get_seed=self._get_seed,
-            set_seed=self._set_seed,
-            add_seed=self._add_seed,
-            delete_seed=self._delete_seed,
-            upload_data=self._upload_data,
-            custom_func=self.__CUSTOM_FUNC__,
-            thread_num = self._spider_thread_num,
-            max_retries = self._spider_max_retries,
-            time_sleep=self._spider_time_sleep
+            SpiderCrawler=self.SpiderCrawler
         ).start()
 
-        self._Pipeline(
+        Uploader(
+            task=self.task, project=self.project,
             stop=self._stop, pause=self._pause,
-            upload=self.__LAUNCHER_QUEUE__["upload"],
-            done=self.__LAUNCHER_QUEUE__["done"],
-            upload_size=self._upload_queue_max_size,
-            wait_seconds=self._upload_queue_wait_seconds
+            task_queue=self._task_queue,
+            callback_register=self._register,
+            SpiderPipeline=self.SpiderPipeline
         ).start()
 
-        self._execute()
-        self._polling()
-
-    def _execute_heartbeat(self):
-        pass
-
-    def _reset(self):
-        pass
-
-    def _scheduler(self):
-        pass
-
-    def _insert(self):
-        pass
-
-    def _refresh(self):
-        pass
-
-    def _delete(self):
-        pass
-
-    def _polling(self):
-        pass
+        self._monitor()
+        logger.info("task done!")
 
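
End to end, a job now builds a Launcher, attaches hooks through the request/download/parse decorator properties (which patch them onto SpiderCrawler), enqueues seeds with start_seeds, and calls start(), which spins up the Scheduler, Distributor and Uploader threads and then blocks in _monitor, restarting any registered worker that dies. A sketch, assuming the default SCHEDULER/CRAWLER/PIPELINE settings resolve to usable classes and using placeholder task/project names and seed URL:

    import requests

    from cobweb.base import Request, Response
    from cobweb.constant import DealModel
    from cobweb.launchers import Launcher

    app = Launcher(task="demo_task", project="demo_project")

    @app.request
    def request(seed):
        yield Request(seed.url, seed)

    @app.download
    def download(item):
        # assumed plain-HTTP fetch of the seed URL
        yield Response(item.seed, requests.get(item.seed.url, timeout=10))

    @app.parse
    def parse(item):
        # DealModel.done is the one string value distribute() accepts as a no-op
        yield DealModel.done

    app.start_seeds(["https://www.example.com"])
    app.start()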