cobweb-launcher 1.2.49__py3-none-any.whl → 1.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. cobweb/base/__init__.py +141 -4
  2. cobweb/base/basic.py +28 -82
  3. cobweb/base/common_queue.py +13 -0
  4. cobweb/base/dotting.py +1 -1
  5. cobweb/base/request.py +14 -2
  6. cobweb/base/seed.py +10 -6
  7. cobweb/constant.py +16 -0
  8. cobweb/crawlers/crawler.py +51 -181
  9. cobweb/db/redis_db.py +28 -0
  10. cobweb/launchers/__init__.py +2 -2
  11. cobweb/launchers/launcher.py +110 -141
  12. cobweb/launchers/launcher_api.py +66 -114
  13. cobweb/launchers/launcher_pro.py +76 -194
  14. cobweb/pipelines/base_pipeline.py +54 -0
  15. cobweb/pipelines/loghub_pipeline.py +34 -0
  16. cobweb/pipelines/pipeline.py +25 -49
  17. cobweb/schedulers/__init__.py +0 -2
  18. cobweb/schedulers/scheduler_redis.py +5 -8
  19. cobweb/setting.py +29 -6
  20. cobweb/utils/dotting.py +10 -42
  21. cobweb_/__init__.py +2 -0
  22. cobweb_/base/__init__.py +9 -0
  23. cobweb_/base/common_queue.py +30 -0
  24. cobweb_/base/decorators.py +40 -0
  25. cobweb_/base/item.py +46 -0
  26. cobweb_/base/log.py +94 -0
  27. cobweb_/base/request.py +82 -0
  28. cobweb_/base/response.py +23 -0
  29. cobweb_/base/seed.py +114 -0
  30. cobweb_/constant.py +94 -0
  31. cobweb_/crawlers/__init__.py +1 -0
  32. cobweb_/crawlers/crawler.py +184 -0
  33. cobweb_/db/__init__.py +2 -0
  34. cobweb_/db/api_db.py +82 -0
  35. cobweb_/db/redis_db.py +130 -0
  36. cobweb_/exceptions/__init__.py +1 -0
  37. cobweb_/exceptions/oss_db_exception.py +28 -0
  38. cobweb_/launchers/__init__.py +3 -0
  39. cobweb_/launchers/launcher.py +235 -0
  40. cobweb_/launchers/launcher_air.py +88 -0
  41. cobweb_/launchers/launcher_api.py +221 -0
  42. cobweb_/launchers/launcher_pro.py +222 -0
  43. cobweb_/pipelines/__init__.py +3 -0
  44. cobweb_/pipelines/pipeline.py +69 -0
  45. cobweb_/pipelines/pipeline_console.py +22 -0
  46. cobweb_/pipelines/pipeline_loghub.py +34 -0
  47. cobweb_/setting.py +74 -0
  48. cobweb_/utils/__init__.py +5 -0
  49. cobweb_/utils/bloom.py +58 -0
  50. cobweb_/utils/dotting.py +32 -0
  51. cobweb_/utils/oss.py +94 -0
  52. cobweb_/utils/tools.py +42 -0
  53. {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/METADATA +1 -1
  54. cobweb_launcher-1.3.2.dist-info/RECORD +110 -0
  55. cobweb_launcher-1.3.2.dist-info/top_level.txt +2 -0
  56. cobweb_new/__init__.py +2 -0
  57. cobweb_new/base/__init__.py +72 -0
  58. cobweb_new/base/common_queue.py +53 -0
  59. cobweb_new/base/decorators.py +72 -0
  60. cobweb_new/base/item.py +46 -0
  61. cobweb_new/base/log.py +94 -0
  62. cobweb_new/base/request.py +82 -0
  63. cobweb_new/base/response.py +23 -0
  64. cobweb_new/base/seed.py +118 -0
  65. cobweb_new/constant.py +105 -0
  66. cobweb_new/crawlers/__init__.py +1 -0
  67. cobweb_new/crawlers/crawler-new.py +85 -0
  68. cobweb_new/crawlers/crawler.py +170 -0
  69. cobweb_new/db/__init__.py +2 -0
  70. cobweb_new/db/api_db.py +82 -0
  71. cobweb_new/db/redis_db.py +158 -0
  72. cobweb_new/exceptions/__init__.py +1 -0
  73. cobweb_new/exceptions/oss_db_exception.py +28 -0
  74. cobweb_new/launchers/__init__.py +3 -0
  75. cobweb_new/launchers/launcher.py +237 -0
  76. cobweb_new/launchers/launcher_air.py +88 -0
  77. cobweb_new/launchers/launcher_api.py +161 -0
  78. cobweb_new/launchers/launcher_pro.py +96 -0
  79. cobweb_new/launchers/tesss.py +47 -0
  80. cobweb_new/pipelines/__init__.py +3 -0
  81. cobweb_new/pipelines/pipeline.py +68 -0
  82. cobweb_new/pipelines/pipeline_console.py +22 -0
  83. cobweb_new/pipelines/pipeline_loghub.py +34 -0
  84. cobweb_new/setting.py +95 -0
  85. cobweb_new/utils/__init__.py +5 -0
  86. cobweb_new/utils/bloom.py +58 -0
  87. cobweb_new/utils/oss.py +94 -0
  88. cobweb_new/utils/tools.py +42 -0
  89. cobweb/schedulers/scheduler_api.py +0 -72
  90. cobweb_launcher-1.2.49.dist-info/RECORD +0 -44
  91. cobweb_launcher-1.2.49.dist-info/top_level.txt +0 -1
  92. {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/LICENSE +0 -0
  93. {cobweb_launcher-1.2.49.dist-info → cobweb_launcher-1.3.2.dist-info}/WHEEL +0 -0
cobweb/crawlers/crawler.py CHANGED
@@ -1,66 +1,35 @@
  import json
- import os
- import threading
  import time
- import traceback
- from inspect import isgenerator
+ import threading
  from typing import Union, Callable, Mapping
- from urllib.parse import urlparse
-
- from requests import Response as Res
 
- from cobweb.constant import DealModel, LogTemplate
+ import setting
  from cobweb.base import (
      Seed,
-     BaseItem,
-     Request,
-     Response,
+     BaseItem,
+     Request,
+     Response,
      ConsoleItem,
+     Decorators,
+     TaskQueue,
      logger
  )
- from cobweb.utils import LoghubDot
- proxy_type = os.getenv("PROXY_TYPE", "")
+ from constant import DealModel
 
 
  class Crawler(threading.Thread):
 
-     def __init__(
-         self,
-         task: str,
-         project: str,
-         stop: threading.Event,
-         pause: threading.Event,
-         # launcher_queue: Union[Mapping[str, Queue]],
-         get_seed: Callable,
-         set_seed: Callable,
-         add_seed: Callable,
-         delete_seed: Callable,
-         upload_data: Callable,
-         custom_func: Union[Mapping[str, Callable]],
-         thread_num: int,
-         max_retries: int,
-         time_sleep: int,
-     ):
+     def __init__(self, pause, custom_func: Union[Mapping[str, Callable]]):
          super().__init__()
-         self.task = task
-         self.project = project
-         self._stop = stop
-         self._pause = pause
-         self._get_seed = get_seed
-         self._set_seed = set_seed
-         self._add_seed = add_seed
-         self._delete_seed = delete_seed
-         self._upload_data = upload_data
-
+         self.pause = pause
          for func_name, _callable in custom_func.items():
              if isinstance(_callable, Callable):
                  self.__setattr__(func_name, _callable)
 
-         self.thread_num = thread_num
-         self.time_sleep = time_sleep
-         self.max_retries = max_retries
-
-         self.loghub_dot = LoghubDot()
+         self.spider_max_retries = setting.SPIDER_MAX_RETRIES
+         self.request_queue_size = setting.REQUEST_QUEUE_SIZE
+         self.download_queue_size = setting.DOWNLOAD_QUEUE_SIZE
+         self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
 
      @staticmethod
      def request(seed: Seed) -> Union[Request, BaseItem]:
@@ -77,142 +46,43 @@ class Crawler(threading.Thread):
          upload_item["text"] = item.response.text
          yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
 
-     # def get_seed(self) -> Seed:
-     #     return self._todo.pop()
-
-     def distribute(self, item, seed):
-         if isinstance(item, BaseItem):
-             self._upload_data(item)
-         elif isinstance(item, Seed):
-             self._add_seed(item)
-         elif isinstance(item, str) and item == DealModel.poll:
-             self._set_seed(seed)
-         elif isinstance(item, str) and item == DealModel.done:
-             self._delete_seed(seed)
-         elif isinstance(item, str) and item == DealModel.fail:
-             seed.params.seed_status = DealModel.fail
-             self._delete_seed(seed)
-         else:
-             raise TypeError("yield value type error!")
-
-     def spider(self):
-         while not self._stop.is_set():
-
-             seed = self._get_seed()
-
-             if not seed:
-                 time.sleep(1)
-                 continue
-
-             elif seed.params.retry > self.max_retries:
+     # @decorators.add_thread()
+     @Decorators.pause
+     def build_request_item(self):
+         thread_sleep = 0.1
+         if TaskQueue.REQUEST.length >= self.request_queue_size:
+             thread_sleep = 5
+         elif seed_info := TaskQueue.TODO.pop():
+             member, priority = seed_info
+             seed = Seed(member, priority=priority)
+             if seed.params.retry > self.spider_max_retries:
                  seed.params.seed_status = DealModel.fail
-                 self._delete_seed(seed)
-                 continue
-
-             seed_detail_log_info = LogTemplate.log_info(seed.to_dict)
-
-             try:
-                 request_iterators = self.request(seed)
-
-                 if not isgenerator(request_iterators):
-                     raise TypeError("request function isn't a generator!")
-
-                 iterator_status = False
-
-                 for request_item in request_iterators:
-
-                     iterator_status = True
-
-                     if isinstance(request_item, Request):
-                         iterator_status = False
-                         start_time = time.time()
-                         download_iterators = self.download(request_item)
-                         if not isgenerator(download_iterators):
-                             raise TypeError("download function isn't a generator")
-
-                         for download_item in download_iterators:
-                             iterator_status = True
-                             if isinstance(download_item, Response):
-                                 iterator_status = False
-                                 logger.info(LogTemplate.download_info.format(
-                                     detail=seed_detail_log_info,
-                                     retry=seed.params.retry,
-                                     priority=seed.params.priority,
-                                     seed_version=seed.params.seed_version,
-                                     identifier=seed.identifier or "",
-                                     status=download_item.response,
-                                     response=LogTemplate.log_info(download_item.to_dict)
-                                 ))
-                                 if isinstance(download_item.response, Res):
-                                     end_time = time.time()
-                                     self.loghub_dot.build(
-                                         topic=urlparse(download_item.response.request.url).netloc,
-                                         data_size=int(download_item.response.headers.get("content-length", 0)),
-                                         cost_time=end_time - start_time, status = 200,
-                                         url=download_item.response.url, proxy_type=proxy_type,
-                                         project=self.project, task=self.task,
-                                     )
-                                 parse_iterators = self.parse(download_item)
-                                 if not isgenerator(parse_iterators):
-                                     raise TypeError("parse function isn't a generator")
-                                 for parse_item in parse_iterators:
-                                     iterator_status = True
-                                     if isinstance(parse_item, Response):
-                                         raise TypeError("upload_item can't be a Response instance")
-                                     self.distribute(parse_item, seed)
-                             else:
-                                 self.distribute(download_item, seed)
-                     else:
-                         self.distribute(request_item, seed)
-
-                 if not iterator_status:
-                     raise ValueError("request/download/parse function yield value error!")
-             except Exception as e:
-                 exception_msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
-                 url = seed.url
-                 status = e.__class__.__name__
-                 if getattr(e, "response", None) and isinstance(e.response, Res):
-                     url = e.response.request.url
-                     status = e.response.status_code
-                 self.loghub_dot.build(
-                     topic=urlparse(url).netloc,
-                     data_size=-1, cost_time=-1,
-                     status=status, url=url,
-                     proxy_type=proxy_type,
-                     project=self.project,
-                     task=self.task,
-                     msg=exception_msg,
-                 )
-                 logger.info(LogTemplate.download_exception.format(
-                     detail=seed_detail_log_info,
-                     retry=seed.params.retry,
-                     priority=seed.params.priority,
-                     seed_version=seed.params.seed_version,
-                     identifier=seed.identifier or "",
-                     exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
-                 ))
-                 seed.params.retry += 1
-                 self._set_seed(seed)
-                 # time.sleep(self.time_sleep * seed.params.retry)
-             # except Exception as e:
-             #     logger.info(LogTemplate.download_exception.format(
-             #         detail=seed_detail_log_info,
-             #         retry=seed.params.retry,
-             #         priority=seed.params.priority,
-             #         seed_version=seed.params.seed_version,
-             #         identifier=seed.identifier or "",
-             #         exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
-             #     ))
-             #     seed.params.retry += 1
-             #     # self._todo.push(seed)
-             #     self._set_seed(seed)
-             #     # time.sleep(self.time_sleep * seed.params.retry)
-             finally:
-                 time.sleep(0.1)
-         logger.info("spider thread close")
+             else:
+                 TaskQueue.process_task(seed, self.request)
+             TaskQueue.DELETE.push(member)
+         time.sleep(thread_sleep)
+
+     @Decorators.pause
+     def build_download_item(self):
+         thread_sleep = 0.1
+         if TaskQueue.RESPONSE.length >= self.download_queue_size:
+             logger.info(f"download queue is full, sleep {thread_sleep}s")
+             thread_sleep = 5
+         elif request_info := TaskQueue.DOWNLOAD.pop():
+             member, priority = request_info
+             request_setting = json.loads(member)
+             request_item = Request(seed=member, **request_setting)
+             TaskQueue.process_task(request_item, self.download)
+         time.sleep(thread_sleep)
+
+     @Decorators.pause
+     def build_parse_item(self):
+         thread_sleep = 0.1
+         if TaskQueue.UPLOAD.length >= self.upload_queue_size:
+             logger.info(f"upload queue is full, sleep {thread_sleep}s")
+             thread_sleep = 5
+         if response_item := TaskQueue.RESPONSE.pop():
+             TaskQueue.process_task(response_item, self.parse)
+         time.sleep(thread_sleep)
 
-     def run(self):
-         threading.Thread(name="loghub_dot", target=self.loghub_dot.build_run).start()
-         for index in range(self.thread_num):
-             threading.Thread(name=f"spider_{index}", target=self.spider).start()
 
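The three build_* stage workers above contain no loop of their own: looping, pause handling, and error swallowing live in Decorators.pause, whose implementation is not part of this diff. Below is a minimal sketch of that pattern, modeled on the check_pause decorator that the 1.2.49 launcher defined (its removal appears further down in this diff); the pause_aware and Worker names are illustrative, not the package's.

import time
import threading
from functools import wraps

def pause_aware(func):
    """Re-run `func` until the shared pause event is set (illustrative sketch)."""
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        while not self.pause.is_set():
            try:
                func(self, *args, **kwargs)   # one bounded unit of work per pass
            except Exception as e:
                print(f"{func.__name__}: {e}")
            finally:
                time.sleep(0.1)               # brief yield between passes
    return wrapper

class Worker:
    def __init__(self, pause: threading.Event):
        self.pause = pause

    @pause_aware
    def build_request_item(self):
        ...  # pop a seed, build a Request, push it to the download queue

Each decorated method does one small unit of queue work per call, so setting the pause event lets every stage thread drain out within one pass.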
cobweb/db/redis_db.py CHANGED
@@ -27,6 +27,9 @@ class RedisDB:
      def exists(self, *name) -> bool:
          return self._client.exists(*name)
 
+     def incrby(self, name, value):
+         return self._client.incrby(name, value)
+
      def sadd(self, name, value):
          return self._client.sadd(name, value)
 
@@ -72,6 +75,31 @@
          status = self.execute_lua(lua_script, [key], t)
          return bool(status)
 
+     def auto_incr(self, name, t=15, limit=1000):
+         lua_script = """
+             local count = 0
+             local status = false
+             local limit = ARGV[2]
+             local expire = redis.call('ttl', KEYS[1])
+
+             if ( expire == -2 ) then
+                 redis.call('setnx', KEYS[1], 1)
+             elseif ( expire == -1) then
+                 redis.call('expire', KEYS[1], ARGV[1])
+             else
+                 count = redis.call('get', KEYS[1])
+             end
+
+             if ( count + 0 < limit + 0 ) then
+                 status = true
+                 redis.call('incr', KEYS[1])
+             end
+
+             return status
+         """
+         status = self.execute_lua(lua_script, [name], t, limit)
+         return bool(status)
+
      def members(self, key, score, start=0, count=5000, _min="-inf", _max="+inf") -> list:
          lua_script = """
              local min = ARGV[1]
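The new auto_incr helper is a fixed-window counter: the Lua script creates the key on first use, attaches a TTL of t seconds, and then grants at most limit increments while the key lives; a falsy return tells the caller to back off. A standalone sketch of the same pattern against plain redis-py follows; the connection details and the key name are illustrative.

import redis

client = redis.Redis(host="localhost", port=6379, decode_responses=True)

AUTO_INCR_LUA = """
local count = 0
local status = false
local limit = ARGV[2]
local expire = redis.call('ttl', KEYS[1])

if ( expire == -2 ) then
    redis.call('setnx', KEYS[1], 1)           -- key absent: open a new window
elseif ( expire == -1 ) then
    redis.call('expire', KEYS[1], ARGV[1])    -- counter exists but has no TTL yet
else
    count = redis.call('get', KEYS[1])        -- window active: read current count
end

if ( count + 0 < limit + 0 ) then             -- '+ 0' coerces Redis strings to numbers
    status = true
    redis.call('incr', KEYS[1])
end

return status
"""

acquire = client.register_script(AUTO_INCR_LUA)

# Grant at most 1000 acquisitions per 15-second window (the method's defaults).
if acquire(keys=["spider:speed:control"], args=[15, 1000]):
    print("token granted")
else:
    print("window exhausted, back off")

Because a Lua false becomes a Redis nil (Python None) and true becomes 1, the script's return value can be tested directly for truthiness, which is exactly what auto_incr's bool(status) does.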
cobweb/launchers/__init__.py CHANGED
@@ -1,3 +1,3 @@
- from .launcher_air import LauncherAir
+ # from .launcher_air import LauncherAir
  from .launcher_pro import LauncherPro
- from .launcher_api import LauncherApi
+ # from .launcher_api import LauncherApi
cobweb/launchers/launcher.py CHANGED
@@ -2,63 +2,25 @@ import time
  import inspect
  import threading
  import importlib
- from functools import wraps
-
 
+ from cobweb.constant import LogTemplate
+ from cobweb.utils import dynamic_load_class
+ from cobweb.base import TaskQueue, Decorators, logger
  from cobweb import setting
- from cobweb.base import Seed, Queue, logger
- from cobweb.utils.tools import dynamic_load_class
-
-
- def check_pause(func):
-     @wraps(func)
-     def wrapper(self, *args, **kwargs):
-         while not self._pause.is_set():
-             try:
-                 func(self, *args, **kwargs)
-             except Exception as e:
-                 logger.info(f"{func.__name__}: " + str(e))
-             finally:
-                 time.sleep(0.1)
-
-     return wrapper
 
 
  class Launcher(threading.Thread):
 
-     SEEDS = []
-
-     __DOING__ = {}
-
-     __CUSTOM_FUNC__ = {
-         # "download": None,
-         # "request": None,
-         # "parse": None,
-     }
-
-     __LAUNCHER_QUEUE__ = {
-         "new": Queue(),
-         "todo": Queue(),
-         "done": Queue(),
-         "upload": Queue()
-     }
-
-     __LAUNCHER_FUNC__ = [
-         "_reset",
-         "_scheduler",
-         "_insert",
-         "_refresh",
-         "_delete",
-     ]
+     __CUSTOM_FUNC__ = {}
 
      def __init__(self, task, project, custom_setting=None, **kwargs):
          super().__init__()
          self.task = task
          self.project = project
+         self.custom_func = dict()
+         self.app_time = int(time.time())
 
-         self._app_time = int(time.time())
-         self._stop = threading.Event()  # stop event
-         self._pause = threading.Event()  # pause event
+         self.check_emtpy_times = 0
 
          _setting = dict()
 
@@ -79,32 +41,39 @@ class Launcher(threading.Thread):
          for k, v in _setting.items():
              setattr(setting, k.upper(), v)
 
-         self._Crawler = dynamic_load_class(setting.CRAWLER)
-         self._Pipeline = dynamic_load_class(setting.PIPELINE)
+         self.before_scheduler_wait_seconds = setting.BEFORE_SCHEDULER_WAIT_SECONDS
+
+         self.scheduling_wait_time = setting.SCHEDULING_WAIT_TIME
+         self.inserting_wait_time = setting.INSERTING_WAIT_TIME
+         self.removing_wait_time = setting.REMOVING_WAIT_TIME
+         self.seed_reset_seconds = setting.SEED_RESET_SECONDS
+
+         self.scheduling_size = setting.SCHEDULING_SIZE
+         self.inserting_size = setting.INSERTING_SIZE
+         self.removing_size = setting.REMOVING_SIZE
 
-         self._before_scheduler_wait_seconds = setting.BEFORE_SCHEDULER_WAIT_SECONDS
-         self._scheduler_wait_seconds = setting.SCHEDULER_WAIT_SECONDS
-         self._todo_queue_full_wait_seconds = setting.TODO_QUEUE_FULL_WAIT_SECONDS
-         self._new_queue_wait_seconds = setting.NEW_QUEUE_WAIT_SECONDS
-         self._done_queue_wait_seconds = setting.DONE_QUEUE_WAIT_SECONDS
-         self._upload_queue_wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
-         self._seed_reset_seconds = setting.SEED_RESET_SECONDS
+         self.todo_queue_size = setting.TODO_QUEUE_SIZE
+         self.seed_queue_size = setting.SEED_QUEUE_SIZE
+         self.request_queue_size = setting.REQUEST_QUEUE_SIZE
+         self.download_queue_size = setting.DOWNLOAD_QUEUE_SIZE
+         self.response_queue_size = setting.RESPONSE_QUEUE_SIZE
+         self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
+         self.delete_queue_size = setting.DELETE_QUEUE_SIZE
+         self.done_queue_size = setting.DONE_QUEUE_SIZE
 
-         self._todo_queue_size = setting.TODO_QUEUE_SIZE
-         self._new_queue_max_size = setting.NEW_QUEUE_MAX_SIZE
-         self._done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
-         self._upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE
+         self.spider_thread_num = setting.SPIDER_THREAD_NUM
 
-         self._spider_max_retries = setting.SPIDER_MAX_RETRIES
-         self._spider_thread_num = setting.SPIDER_THREAD_NUM
-         self._spider_time_sleep = setting.SPIDER_TIME_SLEEP
-         self._spider_max_count = setting.SPIDER_MAX_COUNT
-         self._time_window = setting.TIME_WINDOW
+         self.task_model = setting.TASK_MODEL
 
-         self._done_model = setting.DONE_MODEL
-         self._task_model = setting.TASK_MODEL
+         self.stop = threading.Event()  # stop event
+         self.pause = threading.Event()  # pause event
 
-         self._filter_field = setting.FILTER_FIELD
+         self.crawler_path = setting.CRAWLER
+         self.pipeline_path = setting.PIPELINE
+
+         self._threads = []
+
+         self._task_info = dict(todo={}, download={})
 
      @property
      def request(self):
@@ -118,7 +87,7 @@
              yield Request(seed.url, seed)
          """
          def decorator(func):
-             self.__CUSTOM_FUNC__["request"] = func
+             self.custom_func['request'] = func
          return decorator
 
      @property
@@ -133,7 +102,7 @@
              yield Response(item.seed, response)
          """
          def decorator(func):
-             self.__CUSTOM_FUNC__["download"] = func
+             self.custom_func['download'] = func
          return decorator
 
      @property
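The request and download hunks above (and the parse hunk just below) rename the decorator registry from the class-level __CUSTOM_FUNC__ to the per-instance custom_func dict. Each property is a decorator factory: user code registers stage overrides on a launcher instance, and run() later hands them to the Crawler. A hypothetical usage sketch follows; the task and project names and the requests-based fetch are invented, only the decorator shape and the Request/Response signatures come from the docstrings in this diff.

import requests

from cobweb.base import Seed, Request, Response
from cobweb.launchers import LauncherPro

app = LauncherPro(task="demo_task", project="demo_project")

@app.request
def make_request(seed: Seed):
    # Turn a scheduled seed into a download request.
    yield Request(seed.url, seed)

@app.download
def fetch(item: Request):
    # Any fetch strategy works, as long as a Response wrapper is yielded.
    response = requests.get(item.seed.url, timeout=10)
    yield Response(item.seed, response)

app.start()  # Launcher subclasses threading.Thread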
@@ -148,88 +117,88 @@
              yield xxxItem(seed, **kwargs)
          """
          def decorator(func):
-             self.__CUSTOM_FUNC__["parse"] = func
+             self.custom_func['parse'] = func
          return decorator
 
-     def start_seeds(self):
-         seeds = [Seed(seed) for seed in self.SEEDS]
-         self.__LAUNCHER_QUEUE__['todo'].push(seeds)
-         return seeds
+     def remove_working_items(self, key, items):
+         for item in items:
+             self._task_info[key].pop(item, None)
 
-     def _remove_doing_seeds(self, seeds):
-         for seed in seeds:
-             self.__DOING__.pop(seed, None)
-         # logger.info("remove %s seeds from __DOING__" % len(seeds))
+     def add_working_item(self, key, member, priority):
+         self._task_info[key][member] = priority
 
-     def _get_seed(self) -> Seed:
-         return self.__LAUNCHER_QUEUE__["todo"].pop()
+     def check_alive(self):
+         while not self.stop.is_set():
+             if not self.pause.is_set():
+                 for thread in self._threads:
+                     if not thread.is_alive():
+                         thread.start()
+             time.sleep(1)
 
-     def _set_seed(self, seed, **kwargs):
-         self.__LAUNCHER_QUEUE__["todo"].push(seed, **kwargs)
+     def _add_thread(self, func, num=1, obj=None, name=None, args=()):
+         obj = obj or self
+         name = obj.__class__.__name__ + name or func.__name__
+         for i in range(num):
+             func_name = name + "_" + str(i) if num > 1 else name
+             self._threads.append(threading.Thread(name=func_name, target=func, args=(obj,) + args))
+
+     @Decorators.stop
+     def _polling(self):
+         if TaskQueue.is_empty():
+             if self.pause.is_set():
+                 run_time = int(time.time()) - self.app_time
+                 if not self.task_model and run_time > self.before_scheduler_wait_seconds:
+                     logger.info("Done! ready to close thread...")
+                     self.stop.set()
+                 else:
+                     logger.info("pause! waiting for resume...")
+             elif self.check_emtpy_times > 2:
+                 logger.info("pause! waiting for resume...")
+                 self.doing_seeds = {}
+                 self._task_info['todo'] = {}
+                 self._task_info['download'] = {}
+                 self.pause.set()
+             else:
+                 logger.info(
+                     "check whether the task is complete, "
+                     f"reset times {3 - self.check_emtpy_times}"
+                 )
+                 self.check_emtpy_times += 1
+         elif TaskQueue.TODO.length:
+             logger.info(f"Recovery {self.task} task run!")
+             self.check_emtpy_times = 0
+             self.pause.clear()
+         else:
+             logger.info(LogTemplate.launcher_polling.format(
+                 task=self.task,
+                 memory_todo_count=len(self._task_info["todo"]),
+                 memory_download_count=len(self._task_info["download"]),
+                 todo_queue_len=TaskQueue.TODO.length,
+                 delete_queue_len=TaskQueue.DELETE.length,
+                 request_queue_len=TaskQueue.REQUEST.length,
+                 response_queue_len=TaskQueue.RESPONSE.length,
+                 done_queue_len=TaskQueue.DONE.length,
+                 upload_queue_len=TaskQueue.UPLOAD.length,
+             ))
+         time.sleep(10)
 
-     def _upload_data(self, data, **kwargs):
-         self.__LAUNCHER_QUEUE__["upload"].push(data, **kwargs)
+     def run(self):
+         Crawler = dynamic_load_class(self.crawler_path)
+         Pipeline = dynamic_load_class(self.pipeline_path)
 
-     def _add_seed(self, seed, **kwargs):
-         self.__LAUNCHER_QUEUE__["new"].push(seed, **kwargs)
+         crawler = Crawler(pause=self.pause, custom_func=self.custom_func)
+         pipeline = Pipeline(pause=self.pause)
 
-     def _delete_seed(self, seed, **kwargs):
-         self.__LAUNCHER_QUEUE__["done"].push(seed, **kwargs)
+         self._add_thread(obj=crawler, func=crawler.build_request_item)
+         self._add_thread(obj=crawler, func=crawler.build_download_item, num=self.spider_thread_num)
+         self._add_thread(obj=crawler, func=crawler.build_parse_item)
+         self._add_thread(obj=pipeline, func=pipeline.run)
 
-     def _execute(self):
-         for func_name in self.__LAUNCHER_FUNC__:
-             threading.Thread(name=func_name, target=getattr(self, func_name)).start()
-             time.sleep(1)
+         self._add_thread(func=self._polling)
 
-     def run(self):
-         threading.Thread(target=self._execute_heartbeat).start()
-
-         self.start_seeds()
-
-         self._Crawler(
-             task=self.task, project=self.project,
-             stop=self._stop, pause=self._pause,
-             # launcher_queue=self.__LAUNCHER_QUEUE__,
-             get_seed=self._get_seed,
-             set_seed=self._set_seed,
-             add_seed=self._add_seed,
-             delete_seed=self._delete_seed,
-             upload_data=self._upload_data,
-             custom_func=self.__CUSTOM_FUNC__,
-             thread_num = self._spider_thread_num,
-             max_retries = self._spider_max_retries,
-             time_sleep=self._spider_time_sleep
-         ).start()
-
-         self._Pipeline(
-             stop=self._stop, pause=self._pause,
-             upload=self.__LAUNCHER_QUEUE__["upload"],
-             done=self.__LAUNCHER_QUEUE__["done"],
-             upload_size=self._upload_queue_max_size,
-             wait_seconds=self._upload_queue_wait_seconds
-         ).start()
-
-         self._execute()
-         self._polling()
-
-     def _execute_heartbeat(self):
-         pass
-
-     def _reset(self):
-         pass
-
-     def _scheduler(self):
-         pass
-
-     def _insert(self):
-         pass
-
-     def _refresh(self):
-         pass
-
-     def _delete(self):
-         pass
+         self._init_schedule_thread()
+         self.check_alive()
 
-     def _polling(self):
-         pass
+     def _init_schedule_thread(self):
+         ...
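After this refactor run() no longer starts stage threads itself: _add_thread() only registers them, check_alive() starts whatever is not yet running, and _polling() flips the shared pause/stop events that every @Decorators.pause worker observes. A condensed, self-contained sketch of that supervision shape follows, with one deliberate deviation: the in-package version stores threading.Thread objects and calls start() on them again, but a finished Thread cannot be restarted, so this sketch recreates dead workers from registered specs.

import threading
import time

class Supervisor:
    """Illustrative sketch of the launcher's register/supervise pattern."""

    def __init__(self):
        self.stop = threading.Event()
        self.pause = threading.Event()
        self._specs = []      # (name, target) pairs registered up front
        self._running = {}    # name -> live Thread

    def add_thread(self, func, num=1, name=None):
        # Registration only; nothing runs until check_alive() starts it.
        name = name or func.__name__
        for i in range(num):
            self._specs.append((f"{name}_{i}" if num > 1 else name, func))

    def check_alive(self):
        # Supervision loop: while unpaused, ensure every registered worker
        # has a live thread, replacing any that have exited.
        while not self.stop.is_set():
            if not self.pause.is_set():
                for name, func in self._specs:
                    thread = self._running.get(name)
                    if thread is None or not thread.is_alive():
                        thread = threading.Thread(name=name, target=func, daemon=True)
                        thread.start()
                        self._running[name] = thread
            time.sleep(1)

# Usage sketch: register workers, then let check_alive() block and supervise.
# sup = Supervisor()
# sup.add_thread(lambda: time.sleep(0.5), num=4, name="downloader")
# sup.check_alive()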