cobweb-launcher 1.3.11__py3-none-any.whl → 1.3.13__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
cobweb/base/__init__.py CHANGED
@@ -100,14 +100,15 @@ class Decorators:
100
100
  def pause(func):
101
101
  @wraps(func)
102
102
  def wrapper(self, *args, **kwargs):
103
- while not self.pause.is_set():
104
- try:
105
- func(self)
106
- except Exception as e:
107
- logger.info(f"{func.__name__}: " + str(e))
108
- finally:
109
- time.sleep(0.1)
110
- logger.info(f"{func.__name__}: close!")
103
+ while not self.stop.is_set():
104
+ while not self.pause.is_set():
105
+ try:
106
+ func(self)
107
+ except Exception as e:
108
+ logger.info(f"{func.__name__}: " + str(e))
109
+ finally:
110
+ time.sleep(0.1)
111
+ # logger.info(f"{func.__name__}: close!")
111
112
 
112
113
  return wrapper
113
114
 
cobweb/base/basic.py CHANGED
@@ -142,7 +142,7 @@ class Request:
142
142
 
143
143
  def __init__(
144
144
  self,
145
- url,
145
+ # url,
146
146
  seed,
147
147
  random_ua=True,
148
148
  check_status_code=True,
@@ -152,7 +152,7 @@ class Request:
152
152
  status=None,
153
153
  **kwargs
154
154
  ):
155
- self.url = url
155
+ # self.url = url
156
156
  self.check_status_code = check_status_code
157
157
  self.request_setting = {}
158
158
 
@@ -1,4 +1,5 @@
1
1
  import json
2
+ import os
2
3
  import time
3
4
  import threading
4
5
  from typing import Union, Callable, Mapping
@@ -12,15 +13,15 @@ from cobweb.base import (
12
13
  ConsoleItem,
13
14
  Decorators,
14
15
  TaskQueue,
15
- logger
16
16
  )
17
17
  from cobweb.constant import DealModel
18
18
 
19
19
 
20
20
  class Crawler(threading.Thread):
21
21
 
22
- def __init__(self, pause, custom_func: Union[Mapping[str, Callable]]):
22
+ def __init__(self, stop, pause, custom_func: Union[Mapping[str, Callable]]):
23
23
  super().__init__()
24
+ self.stop = stop
24
25
  self.pause = pause
25
26
  for func_name, _callable in custom_func.items():
26
27
  if isinstance(_callable, Callable):
@@ -52,14 +53,20 @@ class Crawler(threading.Thread):
52
53
  thread_sleep = 0.1
53
54
  if TaskQueue.REQUEST.length >= self.request_queue_size:
54
55
  thread_sleep = 5
55
- elif seed_info := TaskQueue.TODO.pop():
56
- member, priority = seed_info
57
- seed = Seed(member, priority=priority)
56
+ elif seed := TaskQueue.TODO.pop():
57
+ # member, priority = seed_info
58
+ # seed = Seed(member, priority=priority)
58
59
  if seed.params.retry > self.spider_max_retries:
59
- seed.params.seed_status = DealModel.fail
60
+ TaskQueue.DOT.build(
61
+ topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
62
+ process_task_type=seed.__class__.__name__,
63
+ seed_status=DealModel.fail,
64
+ retries=seed.params.retry,
65
+ **seed.to_dict
66
+ )
60
67
  else:
61
68
  TaskQueue.process_task(seed, self.request)
62
- TaskQueue.DELETE.push(member)
69
+ TaskQueue.DELETE.push(seed.seed)
63
70
  time.sleep(thread_sleep)
64
71
 
65
72
  @Decorators.pause
@@ -67,22 +74,37 @@ class Crawler(threading.Thread):
67
74
  thread_sleep = 0.1
68
75
  if TaskQueue.RESPONSE.length >= self.download_queue_size:
69
76
  thread_sleep = 5
70
- # logger.info(f"download queue is full, sleep {thread_sleep}s")
71
- elif request_info := TaskQueue.DOWNLOAD.pop():
72
- member, priority = request_info
73
- request_setting = json.loads(member)
74
- request_item = Request(seed=member, **request_setting)
75
- TaskQueue.process_task(request_item, self.download)
77
+ elif request_item := TaskQueue.DOWNLOAD.pop():
78
+ if request_item.params.retry > self.spider_max_retries:
79
+ TaskQueue.DOT.build(
80
+ topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
81
+ process_task_type=request_item.__class__.__name__,
82
+ retries=request_item.params.retry,
83
+ seed_status=DealModel.fail,
84
+ **request_item.to_dict
85
+ )
86
+ TaskQueue.DONE.push(request_item.seed)
87
+ else:
88
+ TaskQueue.process_task(request_item, self.download)
76
89
  time.sleep(thread_sleep)
77
90
 
78
91
  @Decorators.pause
79
92
  def build_parse_item(self):
80
93
  thread_sleep = 0.1
81
94
  if TaskQueue.UPLOAD.length >= self.upload_queue_size:
82
- # logger.info(f"upload queue is full, sleep {thread_sleep}s")
83
95
  thread_sleep = 5
84
- if response_item := TaskQueue.RESPONSE.pop():
85
- TaskQueue.process_task(response_item, self.parse)
96
+ elif response_item := TaskQueue.RESPONSE.pop():
97
+ if response_item.params.retry > self.spider_max_retries:
98
+ TaskQueue.DOT.build(
99
+ topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
100
+ process_task_type=response_item.__class__.__name__,
101
+ seed_status=DealModel.fail,
102
+ retries=response_item.params.retry,
103
+ **response_item.to_dict
104
+ )
105
+ TaskQueue.DONE.push(response_item.seed)
106
+ else:
107
+ TaskQueue.process_task(response_item, self.parse)
86
108
  time.sleep(thread_sleep)
87
109
 
88
110
 
@@ -60,6 +60,7 @@ class Launcher(threading.Thread):
60
60
  self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
61
61
  self.delete_queue_size = setting.DELETE_QUEUE_SIZE
62
62
  self.done_queue_size = setting.DONE_QUEUE_SIZE
63
+ self.spider_max_retries = setting.SPIDER_MAX_RETRIES
63
64
 
64
65
  self.spider_thread_num = setting.SPIDER_THREAD_NUM
65
66
 
@@ -71,7 +72,7 @@ class Launcher(threading.Thread):
71
72
  self.crawler_path = setting.CRAWLER
72
73
  self.pipeline_path = setting.PIPELINE
73
74
 
74
- self._threads = []
75
+ self._thread_info = {}
75
76
 
76
77
  self._task_info = dict(todo={}, download={})
77
78
 
@@ -130,9 +131,12 @@ class Launcher(threading.Thread):
130
131
  def check_alive(self):
131
132
  while not self.stop.is_set():
132
133
  if not self.pause.is_set():
133
- for thread in self._threads:
134
- if not thread.is_alive():
135
- thread.start()
134
+ for name, thread_info in self._thread_info.items():
135
+ instance = thread_info['instance']
136
+ if not instance.is_alive():
137
+ instance = threading.Thread(name=name, target=thread_info['func'], args=())
138
+ self._thread_info[name] = dict(instance=instance, func=thread_info['func'])
139
+ instance.start()
136
140
  time.sleep(1)
137
141
 
138
142
  def _add_thread(self, func, num=1, obj=None, name=None, args=()):
@@ -140,7 +144,9 @@ class Launcher(threading.Thread):
140
144
  name = obj.__class__.__name__ + ":" + (name or func.__name__)
141
145
  for i in range(num):
142
146
  func_name = name + "_" + str(i) if num > 1 else name
143
- self._threads.append(threading.Thread(name=func_name, target=func, args=()))
147
+ instance = threading.Thread(name=func_name, target=func, args=())
148
+ self._thread_info[func_name] = dict(instance=instance, func=func)
149
+ instance.start()
144
150
 
145
151
  @Decorators.stop
146
152
  def _polling(self):
@@ -187,8 +193,8 @@ class Launcher(threading.Thread):
187
193
  Crawler = dynamic_load_class(self.crawler_path)
188
194
  Pipeline = dynamic_load_class(self.pipeline_path)
189
195
 
190
- crawler = Crawler(pause=self.pause, custom_func=self.custom_func)
191
- pipeline = Pipeline(pause=self.pause)
196
+ crawler = Crawler(stop=self.stop, pause=self.pause, custom_func=self.custom_func)
197
+ pipeline = Pipeline(stop=self.stop, pause=self.pause)
192
198
 
193
199
  self._add_thread(obj=crawler, func=crawler.build_request_item)
194
200
  self._add_thread(obj=crawler, func=crawler.build_download_item, num=self.spider_thread_num)
@@ -1,6 +1,6 @@
1
1
  import time
2
2
 
3
- from cobweb.base import TaskQueue, Decorators
3
+ from cobweb.base import TaskQueue, Decorators, Seed, Request
4
4
  from cobweb.schedulers import ApiScheduler
5
5
  from .launcher import Launcher
6
6
 
@@ -16,13 +16,13 @@ class LauncherApi(Launcher):
16
16
  @Decorators.stop
17
17
  def _schedule(self):
18
18
  thread_sleep = self.scheduling_wait_time
19
- for q, key, size, item_info in [
20
- (TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"]),
21
- (TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
19
+ for q, key, size, item_info, Cls in [
20
+ (TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"], Seed),
21
+ (TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"], Request),
22
22
  ]:
23
23
  if q.length < size:
24
24
  for member, priority in self._scheduler.schedule(key, self.scheduling_size):
25
- q.push((member, priority), direct_insertion=True)
25
+ q.push(Cls(member, priority=priority))
26
26
  self.add_working_item(key.split(":")[-1], member, priority)
27
27
  thread_sleep = 0.1
28
28
  time.sleep(thread_sleep)
@@ -1,6 +1,6 @@
1
1
  import time
2
2
 
3
- from cobweb.base import TaskQueue, Decorators
3
+ from cobweb.base import TaskQueue, Decorators, Seed, Request
4
4
  from cobweb.schedulers import RedisScheduler
5
5
  from .launcher import Launcher
6
6
 
@@ -16,13 +16,13 @@ class LauncherPro(Launcher):
16
16
  @Decorators.stop
17
17
  def _schedule(self):
18
18
  thread_sleep = self.scheduling_wait_time
19
- for q, key, size, item_info in [
20
- (TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"]),
21
- (TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
19
+ for q, key, size, item_info, Cls in [
20
+ (TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"], Seed),
21
+ (TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"], Request),
22
22
  ]:
23
23
  if q.length < size:
24
24
  for member, priority in self._scheduler.schedule(key, self.scheduling_size):
25
- q.push((member, priority), direct_insertion=True)
25
+ q.push(Cls(member, priority=priority))
26
26
  self.add_working_item(key.split(":")[-1], member, priority)
27
27
  thread_sleep = 0.1
28
28
  time.sleep(thread_sleep)
@@ -10,9 +10,11 @@ class Pipeline(ABC):
10
10
 
11
11
  def __init__(
12
12
  self,
13
+ stop: threading.Event,
13
14
  pause: threading.Event,
14
15
  ):
15
16
  super().__init__()
17
+ self.stop = stop
16
18
  self.pause = pause
17
19
  self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
18
20
  self.upload_wait_time = setting.UPLOAD_WAIT_TIME
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 1.3.11
3
+ Version: 1.3.13
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -1,8 +1,8 @@
1
1
  cobweb/__init__.py,sha256=oaEfsGUuGP0s39UbFRwrnsjMUeuB6QvQIAwStKFyUTk,83
2
2
  cobweb/constant.py,sha256=eofONAntk9O6S-cb4KbYGYHL_u7nBlOqqFOw_HzJHAU,3588
3
3
  cobweb/setting.py,sha256=pY6LKsgWI3164GiGA1z_y26LVf5-3mpiEgmm86mKRdY,3135
4
- cobweb/base/__init__.py,sha256=T8raRjxGJW27SdRa04KcqvCVlUvRjv1jMLB4wNP_7xc,5169
5
- cobweb/base/basic.py,sha256=s5G4LBZiLUfoymV-gLSIqeH-OJ7q7-L35sBa6xEH3EI,7666
4
+ cobweb/base/__init__.py,sha256=CgNg7BK8uPICSWrLI9Bi6vNQaquBNY2H31TrDo9-fTI,5245
5
+ cobweb/base/basic.py,sha256=eOSHnZT2xR-sOND8J4M3iCJJJUV51QiFi8Yn8JxV7s4,7670
6
6
  cobweb/base/common_queue.py,sha256=Gor7sR3h1hlZWaI0XcNAbf0S15Ftjr3DFRWNTGL13uU,1137
7
7
  cobweb/base/dotting.py,sha256=lfFXXqnVP__hxlW3qH5Bnuq69KtnFaQLbcz1M8e2Ajg,1239
8
8
  cobweb/base/item.py,sha256=hYheVTV2Bozp4iciJpE2ZwBIXkaqBg4QQkRccP8yoVk,1049
@@ -11,19 +11,19 @@ cobweb/base/request.py,sha256=acGm3OzxsPed5VUTk7D9eeHZPMh7KUNQRUv44G5znZg,2659
11
11
  cobweb/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406
12
12
  cobweb/base/seed.py,sha256=PN5J4gKPEXylwyQeSGOBfauxHktxFr7RJe8nVX1hBw4,2987
13
13
  cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
14
- cobweb/crawlers/crawler.py,sha256=ZQ6yVA1EaQRdKJEY3DNqShzp9HPMwlSXapnsRW9E5Wc,2987
14
+ cobweb/crawlers/crawler.py,sha256=hmnvQWKC9EdVrn3bGcV4VlwX8KDpVFylCPfuoV307tc,3960
15
15
  cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
16
16
  cobweb/db/api_db.py,sha256=bDc5dJQxq4z04h70KUTHd0OqUOEY7Cm3wcNJZtTvJIM,3015
17
17
  cobweb/db/redis_db.py,sha256=FvMzckJtmhwKhZqKoS23iXmJti5P2dnMVD5rJ__5LUw,5139
18
18
  cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
19
19
  cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
20
20
  cobweb/launchers/__init__.py,sha256=m_XNG2bWuMbirPt3d0_s-Ezl1xycfUxeqZnwq_kkfuo,116
21
- cobweb/launchers/launcher.py,sha256=O7HjZ2lewsZ5IIiZ2vhCAqGidJ1JKMxS5fs4KlslBkA,7332
21
+ cobweb/launchers/launcher.py,sha256=KB7aL38T3uMh1s78HyHLcS0DS8ovx7VuW6JHn5ooec8,7807
22
22
  cobweb/launchers/launcher_air.py,sha256=yPr395HVIIHAq6lqRcYJu7c0KkfO9V8O-2sn0hC96p0,2990
23
- cobweb/launchers/launcher_api.py,sha256=TfLrLXazFWsOJLI7caMGfZozCttL1WTwTo3uUpN_FV0,3370
24
- cobweb/launchers/launcher_pro.py,sha256=2H-TcvQx-ga78GLNTa-GXMLYAj9nEeCJSWf8xl-1ISQ,3374
23
+ cobweb/launchers/launcher_api.py,sha256=vpwFxB1azgBk1bS7VhX3jOprQS8fl6Iu_5M-Y3QT67A,3394
24
+ cobweb/launchers/launcher_pro.py,sha256=Kl64CQLcUwW9FeaKFg0GIK51fLZaWkoQlQ346zEvJEE,3398
25
25
  cobweb/pipelines/__init__.py,sha256=zSUsGtx6smbs2iXBXvYynReKSgky-3gjqaAtKVnA_OU,105
26
- cobweb/pipelines/pipeline.py,sha256=Pycm22bHId9a3gdP81D5y7SsuMndYooTb5n4zQxP7dM,1321
26
+ cobweb/pipelines/pipeline.py,sha256=FHY7ZHNZgx-AdbHt1MKHm-w0aigxvyXFV10T0NMdrOE,1381
27
27
  cobweb/pipelines/pipeline_console.py,sha256=NEh-4zhuVAQOqwXLsqeb-rcNZ9_KXFUpL3otUTL5qBs,754
28
28
  cobweb/pipelines/pipeline_loghub.py,sha256=xZ6D55BGdiM71WUv83jyLGbEyUwhBHLJRZoXthBxxTs,1019
29
29
  cobweb/schedulers/__init__.py,sha256=y7Lv_7b0zfTl0OhIONb_8u1K1C9gVlBA-xz_XG_kI9g,85
@@ -33,8 +33,8 @@ cobweb/utils/__init__.py,sha256=YvD4mIDBd9jmGA6WJBcwkgDU2jRFNBCEbarZCSUBAHE,114
33
33
  cobweb/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
34
34
  cobweb/utils/oss.py,sha256=6Qlhdde7CcwD69bBe2rGWHY3-aptG9NXB_DZLhjgDRQ,3553
35
35
  cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
36
- cobweb_launcher-1.3.11.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
37
- cobweb_launcher-1.3.11.dist-info/METADATA,sha256=5GKTN8jXVIcQY-kAcbc2-o8NPPImj_9KFrwaBXKhXMY,6510
38
- cobweb_launcher-1.3.11.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
39
- cobweb_launcher-1.3.11.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
40
- cobweb_launcher-1.3.11.dist-info/RECORD,,
36
+ cobweb_launcher-1.3.13.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
37
+ cobweb_launcher-1.3.13.dist-info/METADATA,sha256=_2gVLBf6ZRO9NA1zwD4kLFHKkbLFo8uEP7EYL7WdjaQ,6510
38
+ cobweb_launcher-1.3.13.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
39
+ cobweb_launcher-1.3.13.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
40
+ cobweb_launcher-1.3.13.dist-info/RECORD,,