cobweb-launcher 1.2.6 (py3-none-any.whl) → 1.2.8 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cobweb-launcher might be problematic. Click here for more details.

@@ -2,12 +2,25 @@ import time
2
2
  import inspect
3
3
  import threading
4
4
  import importlib
5
+ from functools import wraps
5
6
 
6
7
  from cobweb import setting
7
- from cobweb.base import Seed, Queue
8
+ from cobweb.base import Seed, Queue, logger
8
9
  from cobweb.utils.tools import dynamic_load_class
9
10
 
10
11
 
12
def check_pause(func):
    """Turn a single-step launcher method into a loop that runs until paused.

    The decorated method is called repeatedly for as long as
    ``self._pause.is_set()`` is false. Any ``Exception`` raised by one
    iteration is logged at info level (prefixed with the method name) and the
    loop carries on, so one failing step does not end the worker.

    :param func: an instance method; it is invoked as ``func(self, *args, **kwargs)``.
    :return: the looping wrapper. It returns ``None`` once the pause event is set.
    """
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        while not self._pause.is_set():
            try:
                # ``func`` is the plain (unbound) function captured at class
                # definition time, so ``self`` has to be forwarded explicitly.
                # Without it every call fails with a missing-argument TypeError.
                func(self, *args, **kwargs)
            except Exception as e:
                logger.info(f"{func.__name__}: " + str(e))
                # Back off briefly so a step that keeps failing cannot spin in
                # a tight loop. Waiting on the pause event keeps the loop
                # responsive to a pause request.
                # NOTE(review): assumes ``_pause`` is a threading.Event-like
                # object exposing ``wait(timeout)`` — confirm.
                self._pause.wait(1)

    return wrapper
22
+
23
+
11
24
  class Launcher(threading.Thread):
12
25
 
13
26
  SEEDS = []
@@ -85,11 +98,6 @@ class Launcher(threading.Thread):
85
98
  self._done_model = setting.DONE_MODEL
86
99
  self._task_model = setting.TASK_MODEL
87
100
 
88
-
89
- @property
90
- def start_seeds(self):
91
- return [Seed(seed) for seed in self.SEEDS]
92
-
93
101
  @property
94
102
  def request(self):
95
103
  """
@@ -135,9 +143,15 @@ class Launcher(threading.Thread):
135
143
  self.__CUSTOM_FUNC__["parse"] = func
136
144
  return decorator
137
145
 
146
def start_seeds(self):
    """Wrap every entry of ``self.SEEDS`` in a ``Seed`` and queue them as todo.

    The seeds are pushed to ``self.__LAUNCHER_QUEUE__['todo']`` in one call.
    When ``SEEDS`` is empty nothing is pushed, matching the
    ``if self.start_seeds:`` guard that callers applied before this became a
    method.

    :return: the list of ``Seed`` objects that was built (possibly empty).
    """
    seeds = [Seed(seed) for seed in self.SEEDS]
    if seeds:
        self.__LAUNCHER_QUEUE__['todo'].push(seeds)
    return seeds
150
+
138
151
  def _remove_doing_seeds(self, seeds):
139
152
  for seed in seeds:
140
153
  self.__DOING__.pop(seed, None)
154
+ logger.info("remove %s seeds from __DOING__" % len(seeds))
141
155
 
142
156
  def _execute(self):
143
157
  for func_name in self.__LAUNCHER_FUNC__:
@@ -147,6 +161,8 @@ class Launcher(threading.Thread):
147
161
  def run(self):
148
162
  threading.Thread(target=self._execute_heartbeat).start()
149
163
 
164
+ self.start_seeds()
165
+
150
166
  self._Crawler(
151
167
  stop=self._stop, pause=self._pause,
152
168
  launcher_queue=self.__LAUNCHER_QUEUE__,
@@ -1,46 +1,46 @@
1
1
  import time
2
2
 
3
- from cobweb.constant import LogTemplate
4
3
  from cobweb.base import logger
5
- from .launcher import Launcher
4
+ from cobweb.constant import LogTemplate
5
+ from .launcher import Launcher, check_pause
6
6
 
7
7
 
8
8
  class LauncherAir(Launcher):
9
9
 
10
- def _scheduler(self):
11
- if self.start_seeds:
12
- self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
10
+ # def _scheduler(self):
11
+ # if self.start_seeds:
12
+ # self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
13
13
 
14
+ @check_pause
14
15
  def _insert(self):
15
- while not self._pause.is_set():
16
- seeds = {}
17
- status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
18
- for _ in range(self._new_queue_max_size):
19
- seed = self.__LAUNCHER_QUEUE__['new'].pop()
20
- if not seed:
21
- break
22
- seeds[seed.to_string] = seed.params.priority
23
- if seeds:
24
- self.__LAUNCHER_QUEUE__['todo'].push(seeds)
25
- if status:
26
- time.sleep(self._new_queue_wait_seconds)
27
-
16
+ seeds = {}
17
+ status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
18
+ for _ in range(self._new_queue_max_size):
19
+ seed = self.__LAUNCHER_QUEUE__['new'].pop()
20
+ if not seed:
21
+ break
22
+ seeds[seed.to_string] = seed.params.priority
23
+ if seeds:
24
+ self.__LAUNCHER_QUEUE__['todo'].push(seeds)
25
+ if status:
26
+ time.sleep(self._new_queue_wait_seconds)
27
+
28
+ @check_pause
28
29
  def _delete(self):
29
- while not self._pause.is_set():
30
- seeds = []
31
- status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
30
+ seeds = []
31
+ status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
32
32
 
33
- for _ in range(self._done_queue_max_size):
34
- seed = self.__LAUNCHER_QUEUE__['done'].pop()
35
- if not seed:
36
- break
37
- seeds.append(seed.to_string)
33
+ for _ in range(self._done_queue_max_size):
34
+ seed = self.__LAUNCHER_QUEUE__['done'].pop()
35
+ if not seed:
36
+ break
37
+ seeds.append(seed.to_string)
38
38
 
39
- if seeds:
40
- self._remove_doing_seeds(seeds)
39
+ if seeds:
40
+ self._remove_doing_seeds(seeds)
41
41
 
42
- if status:
43
- time.sleep(self._done_queue_wait_seconds)
42
+ if status:
43
+ time.sleep(self._done_queue_wait_seconds)
44
44
 
45
45
  def _polling(self):
46
46
 
@@ -4,7 +4,7 @@ import threading
4
4
  from cobweb.db import RedisDB
5
5
  from cobweb.base import Seed, logger
6
6
  from cobweb.constant import DealModel, LogTemplate
7
- from .launcher import Launcher
7
+ from .launcher import Launcher, check_pause
8
8
 
9
9
 
10
10
  class LauncherPro(Launcher):
@@ -33,44 +33,41 @@ class LauncherPro(Launcher):
33
33
  else:
34
34
  self._db._client.incrby(key, count)
35
35
 
36
+ @check_pause
36
37
  def _execute_heartbeat(self):
37
- while not self._stop.is_set():
38
- if self._heartbeat_start_event.is_set():
39
- self._db.setex(self._heartbeat_key, 5)
40
- time.sleep(3)
38
+ if self._heartbeat_start_event.is_set():
39
+ self._db.setex(self._heartbeat_key, 5)
40
+ time.sleep(3)
41
41
 
42
+ @check_pause
42
43
  def _reset(self):
43
44
  """
44
45
  检查过期种子,重新添加到redis缓存中
45
46
  """
46
- while not self._pause.is_set():
47
- reset_wait_seconds = 30
48
- if self._db.lock(self._reset_lock_key, t=120):
47
+ reset_wait_seconds = 30
48
+ if self._db.lock(self._reset_lock_key, t=120):
49
49
 
50
- _min = -int(time.time()) + self._seed_reset_seconds \
51
- if self.heartbeat else "-inf"
50
+ _min = -int(time.time()) + self._seed_reset_seconds \
51
+ if self.heartbeat else "-inf"
52
52
 
53
- self._db.members(self._todo_key, 0, _min=_min, _max="(0")
54
- self._db.delete(self._reset_lock_key)
53
+ self._db.members(self._todo_key, 0, _min=_min, _max="(0")
54
+ self._db.delete(self._reset_lock_key)
55
55
 
56
- if not self.heartbeat:
57
- self._heartbeat_start_event.set()
56
+ if not self.heartbeat:
57
+ self._heartbeat_start_event.set()
58
58
 
59
- time.sleep(reset_wait_seconds)
59
+ time.sleep(reset_wait_seconds)
60
60
 
61
+ @check_pause
61
62
  def _scheduler(self):
62
63
  """
63
64
  调度任务,获取redis队列种子,同时添加到doing字典中
64
65
  """
65
- if self.start_seeds:
66
- self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
67
- while not self._pause.is_set():
68
- if not self._db.zcount(self._todo_key, 0, "(1000"):
69
- time.sleep(self._scheduler_wait_seconds)
70
- continue
71
- if self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
72
- time.sleep(self._todo_queue_full_wait_seconds)
73
- continue
66
+ if not self._db.zcount(self._todo_key, 0, "(1000"):
67
+ time.sleep(self._scheduler_wait_seconds)
68
+ elif self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
69
+ time.sleep(self._todo_queue_full_wait_seconds)
70
+ else:
74
71
  members = self._db.members(
75
72
  self._todo_key, int(time.time()),
76
73
  count=self._todo_queue_size,
@@ -81,67 +78,66 @@ class LauncherPro(Launcher):
81
78
  self.__LAUNCHER_QUEUE__['todo'].push(seed)
82
79
  self.__DOING__[seed.to_string] = seed.params.priority
83
80
 
81
+ @check_pause
84
82
  def _insert(self):
85
83
  """
86
84
  添加新种子到redis队列中
87
85
  """
88
- while not self._pause.is_set():
89
- seeds = {}
90
- status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
91
- for _ in range(self._new_queue_max_size):
92
- seed = self.__LAUNCHER_QUEUE__['new'].pop()
93
- if not seed:
94
- break
86
+ seeds = {}
87
+ status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
88
+ for _ in range(self._new_queue_max_size):
89
+ seed = self.__LAUNCHER_QUEUE__['new'].pop()
90
+ if seed:
95
91
  seeds[seed.to_string] = seed.params.priority
96
- if seeds:
97
- self._db.zadd(self._todo_key, seeds, nx=True)
98
- if status:
99
- time.sleep(self._new_queue_wait_seconds)
92
+ if seeds:
93
+ self._db.zadd(self._todo_key, seeds, nx=True)
94
+ if status:
95
+ time.sleep(self._new_queue_wait_seconds)
100
96
 
97
+ @check_pause
101
98
  def _refresh(self):
102
99
  """
103
100
  刷新doing种子过期时间,防止reset重新消费
104
101
  """
105
- while not self._pause.is_set():
106
- if self.__DOING__:
107
- refresh_time = int(time.time())
108
- seeds = {k:-refresh_time - v / 1000 for k, v in self.__DOING__.items()}
109
- self._db.zadd(self._todo_key, item=seeds, xx=True)
110
- time.sleep(15)
102
+ if self.__DOING__:
103
+ refresh_time = int(time.time())
104
+ seeds = {k:-refresh_time - v / 1000 for k, v in self.__DOING__.items()}
105
+ self._db.zadd(self._todo_key, item=seeds, xx=True)
106
+ time.sleep(15)
111
107
 
108
+ @check_pause
112
109
  def _delete(self):
113
110
  """
114
111
  删除队列种子,根据状态添加至成功或失败队列,移除doing字典种子索引
115
112
  """
116
- while not self._pause.is_set():
117
- seeds, s_seeds, f_seeds = [], [], []
118
- status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
119
-
120
- for _ in range(self._done_queue_max_size):
121
- seed = self.__LAUNCHER_QUEUE__['done'].pop()
122
- if not seed:
123
- break
124
- if seed.params.seed_status == DealModel.fail:
125
- f_seeds.append(seed.to_string)
126
- elif self._done_model == 1:
127
- s_seeds.append(seed.to_string)
128
- else:
129
- seeds.append(seed.to_string)
130
- if seeds:
131
- count = self._db.zrem(self._todo_key, *seeds)
132
- self.statistics(self._statistics_done_key, count)
133
- self._remove_doing_seeds(seeds)
134
- if s_seeds:
135
- count = self._db.done([self._todo_key, self._done_key], *s_seeds)
136
- self.statistics(self._statistics_done_key, count)
137
- self._remove_doing_seeds(s_seeds)
138
- if f_seeds:
139
- count = self._db.done([self._todo_key, self._fail_key], *f_seeds)
140
- self.statistics(self._statistics_fail_key, count)
141
- self._remove_doing_seeds(f_seeds)
142
-
143
- if status:
144
- time.sleep(self._done_queue_wait_seconds)
113
+ seeds, s_seeds, f_seeds = [], [], []
114
+ status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
115
+
116
+ for _ in range(self._done_queue_max_size):
117
+ seed = self.__LAUNCHER_QUEUE__['done'].pop()
118
+ if not seed:
119
+ break
120
+ if seed.params.seed_status == DealModel.fail:
121
+ f_seeds.append(seed.to_string)
122
+ elif self._done_model == 1:
123
+ s_seeds.append(seed.to_string)
124
+ else:
125
+ seeds.append(seed.to_string)
126
+ if seeds:
127
+ count = self._db.zrem(self._todo_key, *seeds)
128
+ self.statistics(self._statistics_done_key, count)
129
+ self._remove_doing_seeds(seeds)
130
+ if s_seeds:
131
+ count = self._db.done([self._todo_key, self._done_key], *s_seeds)
132
+ self.statistics(self._statistics_done_key, count)
133
+ self._remove_doing_seeds(s_seeds)
134
+ if f_seeds:
135
+ count = self._db.done([self._todo_key, self._fail_key], *f_seeds)
136
+ self.statistics(self._statistics_fail_key, count)
137
+ self._remove_doing_seeds(f_seeds)
138
+
139
+ if status:
140
+ time.sleep(self._done_queue_wait_seconds)
145
141
 
146
142
  def _polling(self):
147
143
  wait_scheduler_execute = True
cobweb/setting.py CHANGED
@@ -26,9 +26,6 @@ OSS_SECRET_KEY = os.getenv("OSS_SECRET_KEY")
26
26
  OSS_CHUNK_SIZE = 10 * 1024 ** 2
27
27
  OSS_MIN_UPLOAD_SIZE = 1024
28
28
 
29
- # message
30
- MESSAGE = ""
31
-
32
29
 
33
30
  # 采集器选择
34
31
  CRAWLER = "cobweb.crawlers.Crawler"
@@ -43,7 +40,7 @@ BEFORE_SCHEDULER_WAIT_SECONDS = 60 # 调度前等待时间,只作用于单次
43
40
  SCHEDULER_WAIT_SECONDS = 15 # 调度等待时间
44
41
  TODO_QUEUE_FULL_WAIT_SECONDS = 5 # todo队列已满时等待时间
45
42
  NEW_QUEUE_WAIT_SECONDS = 30 # new队列等待时间
46
- DONE_QUEUE_WAIT_SECONDS = 15 # done队列等待时间
43
+ DONE_QUEUE_WAIT_SECONDS = 5 # done队列等待时间
47
44
  UPLOAD_QUEUE_WAIT_SECONDS = 15 # upload队列等待时间
48
45
  SEED_RESET_SECONDS = 30 # 种子重制时间
49
46
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 1.2.6
3
+ Version: 1.2.8
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -1,6 +1,6 @@
1
1
  cobweb/__init__.py,sha256=uMHyf4Fekbyw2xBCbkA8R0LwCpBJf5p_7pWbh60ZWYk,83
2
2
  cobweb/constant.py,sha256=zy3XYsc1qp2B76_Fn_hVQ8eGHlPBd3OFlZK2cryE6FY,2839
3
- cobweb/setting.py,sha256=bKM2J-yjGHeQn1Yf-yZ3cay_FTCPaokgOYSC6GPWwM4,2015
3
+ cobweb/setting.py,sha256=Wev0clo4ZETI5cRvBnzTWnJWo0Nowv_uvNCqlzYPSiE,1990
4
4
  cobweb/base/__init__.py,sha256=4gwWWQ0Q8cYG9cD7Lwf4XMqRGc5M_mapS3IczR6zeCE,222
5
5
  cobweb/base/common_queue.py,sha256=W7PPZZFl52j3Mc916T0imHj7oAUelA6aKJwW-FecDPE,872
6
6
  cobweb/base/decorators.py,sha256=wDCaQ94aAZGxks9Ljc0aXq6omDXT1_yzFy83ZW6VbVI,930
@@ -18,9 +18,9 @@ cobweb/db/redis_db.py,sha256=NNI2QkRV1hEZI-z-COEncXt88z3pZN6wusKlcQzc8V4,4304
18
18
  cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
19
19
  cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
20
20
  cobweb/launchers/__init__.py,sha256=af0Y6wrGX8SQZ7w7XL2sOtREjCT3dwad-uCc3nIontY,76
21
- cobweb/launchers/launcher.py,sha256=0YhJtmN048MgTZ7SAUWvaKJEPXrNx0p4dkoTopsyjzc,5466
22
- cobweb/launchers/launcher_air.py,sha256=zHVEJqQCxYU1WDnqQzzEHbEXasR1GmKevujQkCfFt5o,2947
23
- cobweb/launchers/launcher_pro.py,sha256=qCmobbi1jhI09cVRjaYGxyTHwBSvARiHxF1p37b5iaE,8107
21
+ cobweb/launchers/launcher.py,sha256=H0MEL8mTcNgNbuZBKlHH0GlQ3IIFvbJkHaG42mZ2vW8,5947
22
+ cobweb/launchers/launcher_air.py,sha256=KAk_M8F3029cXYe7m4nn3Nzyi89lbxJ2cqZjqW8iZ0E,2832
23
+ cobweb/launchers/launcher_pro.py,sha256=IpckN0Xl5NAPVjuq-pWbCx0RerDmYG9gZ182EDURixo,7599
24
24
  cobweb/pipelines/__init__.py,sha256=zSUsGtx6smbs2iXBXvYynReKSgky-3gjqaAtKVnA_OU,105
25
25
  cobweb/pipelines/base_pipeline.py,sha256=fYnWf79GmhufXpcnMa3te18SbmnVeYLwxfyo-zLd9CY,1577
26
26
  cobweb/pipelines/loghub_pipeline.py,sha256=cjPO6w6UJ0jNw2fVvdX0BCdlm58T7dmYXlxzXOBpvfY,1027
@@ -30,8 +30,8 @@ cobweb/pipelines/pipeline_loghub.py,sha256=xZ6D55BGdiM71WUv83jyLGbEyUwhBHLJRZoXt
30
30
  cobweb/utils/__init__.py,sha256=JTE4sBfHnKHhD6w9Auk0MIT7O9BMOamCeryhlHNx3Zg,47
31
31
  cobweb/utils/oss.py,sha256=gyt8-UB07tVphZLQXMOf-JTJwU-mWq8KZkOXKkAf3uk,3513
32
32
  cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
33
- cobweb_launcher-1.2.6.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
34
- cobweb_launcher-1.2.6.dist-info/METADATA,sha256=q_ihYp0LYgZRMdvzJ2gmTYG0HgeoOAnRjOEj5FAUp34,6489
35
- cobweb_launcher-1.2.6.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
36
- cobweb_launcher-1.2.6.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
37
- cobweb_launcher-1.2.6.dist-info/RECORD,,
33
+ cobweb_launcher-1.2.8.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
34
+ cobweb_launcher-1.2.8.dist-info/METADATA,sha256=VCstetX4haZKrB1Y5KWd9D9WjrmxnZORL7ZRX3WiADA,6489
35
+ cobweb_launcher-1.2.8.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
36
+ cobweb_launcher-1.2.8.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
37
+ cobweb_launcher-1.2.8.dist-info/RECORD,,