cobweb-launcher 1.3.10__py3-none-any.whl → 1.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cobweb/base/__init__.py CHANGED
@@ -59,8 +59,9 @@ class TaskQueue:
             else:
                 raise TypeError(f"{crawler_func.__name__} function return type isn't supported")
             TaskQueue.DOT.build(
-                topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}:{tk.__class__.__name__}",
+                topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
                 cost_time=round(time.time() - start_time, 2),
+                process_task_type=tk.__class__.__name__,
                 **tk.to_dict
             )
         except Exception as e:
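The change above stops encoding the task class in the dotting topic and reports it as a field instead, so a single PROJECT:TASK topic now carries all task types. A minimal sketch of the new call shape, assuming PROJECT and TASK are set in the environment and tk is the processed task object from the surrounding method (the rest of that method is omitted):

    import os
    import time

    start_time = time.time()
    # ... crawler_func(tk) runs here ...
    TaskQueue.DOT.build(
        topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",   # class name no longer embedded in the topic
        cost_time=round(time.time() - start_time, 2),
        process_task_type=tk.__class__.__name__,               # reported as its own field
        **tk.to_dict
    )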
cobweb/crawlers/crawler.py CHANGED
@@ -1,4 +1,5 @@
 import json
+import os
 import time
 import threading
 from typing import Union, Callable, Mapping
@@ -52,14 +53,21 @@ class Crawler(threading.Thread):
         thread_sleep = 0.1
         if TaskQueue.REQUEST.length >= self.request_queue_size:
             thread_sleep = 5
-        elif seed_info := TaskQueue.TODO.pop():
-            member, priority = seed_info
-            seed = Seed(member, priority=priority)
+        elif seed := TaskQueue.TODO.pop():
+            # member, priority = seed_info
+            # seed = Seed(member, priority=priority)
             if seed.params.retry > self.spider_max_retries:
-                seed.params.seed_status = DealModel.fail
+                # seed.params.seed_status = DealModel.fail
+                TaskQueue.DOT.build(
+                    topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
+                    # cost_time=round(time.time() - start_time, 2),
+                    process_task_type=seed.__class__.__name__,
+                    seed_status=DealModel.fail,
+                    **seed.to_dict
+                )
             else:
                 TaskQueue.process_task(seed, self.request)
-            TaskQueue.DELETE.push(member)
+            TaskQueue.DELETE.push(seed.seed)
         time.sleep(thread_sleep)

     @Decorators.pause
@@ -68,21 +76,40 @@ class Crawler(threading.Thread):
         if TaskQueue.RESPONSE.length >= self.download_queue_size:
             thread_sleep = 5
             # logger.info(f"download queue is full, sleep {thread_sleep}s")
-        elif request_info := TaskQueue.DOWNLOAD.pop():
-            member, priority = request_info
-            request_setting = json.loads(member)
-            request_item = Request(seed=member, **request_setting)
-            TaskQueue.process_task(request_item, self.download)
+        elif request_item := TaskQueue.DOWNLOAD.pop():
+            # member, priority = request_info
+            #
+            # request_setting = json.loads(member)
+            # request_item = Request(seed=member, **request_setting)
+            if request_item.params.retry > self.spider_max_retries:
+                TaskQueue.DOT.build(
+                    topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
+                    # cost_time=round(time.time() - start_time, 2),
+                    process_task_type=request_item.__class__.__name__,
+                    seed_status=DealModel.fail,
+                    **request_item.to_dict
+                )
+                TaskQueue.DONE.push(request_item.seed)
+            else:
+                TaskQueue.process_task(request_item, self.download)
         time.sleep(thread_sleep)

     @Decorators.pause
     def build_parse_item(self):
         thread_sleep = 0.1
         if TaskQueue.UPLOAD.length >= self.upload_queue_size:
-            # logger.info(f"upload queue is full, sleep {thread_sleep}s")
             thread_sleep = 5
-        if response_item := TaskQueue.RESPONSE.pop():
-            TaskQueue.process_task(response_item, self.parse)
+        elif response_item := TaskQueue.RESPONSE.pop():
+            if response_item.params.retry > self.spider_max_retries:
+                TaskQueue.DOT.build(
+                    topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
+                    process_task_type=response_item.__class__.__name__,
+                    seed_status=DealModel.fail,
+                    **response_item.to_dict
+                )
+                TaskQueue.DONE.push(response_item.seed)
+            else:
+                TaskQueue.process_task(response_item, self.parse)
         time.sleep(thread_sleep)

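Both hunks above, and the TODO branch before them, apply the same retry-exhaustion pattern: once an item's params.retry exceeds spider_max_retries, the crawler emits a failure dot and acknowledges the item (DELETE for seeds, DONE for requests and responses) instead of processing it again. A condensed sketch of that branch, assuming item is any popped object exposing params.retry, seed and to_dict; the helper name drop_exhausted_item and the fail_status parameter are illustrative, not part of the package:

    import os
    from cobweb.base import TaskQueue

    def drop_exhausted_item(item, max_retries, fail_status):
        # Hypothetical helper mirroring the repeated block in build_request_item,
        # build_download_item and build_parse_item.
        if item.params.retry <= max_retries:
            return False                          # caller should process the item normally
        TaskQueue.DOT.build(
            topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
            process_task_type=item.__class__.__name__,
            seed_status=fail_status,              # DealModel.fail in the real code
            **item.to_dict
        )
        TaskQueue.DONE.push(item.seed)            # the TODO branch uses TaskQueue.DELETE.push(seed.seed)
        return True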
cobweb/launchers/launcher.py CHANGED
@@ -60,6 +60,7 @@ class Launcher(threading.Thread):
         self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
         self.delete_queue_size = setting.DELETE_QUEUE_SIZE
         self.done_queue_size = setting.DONE_QUEUE_SIZE
+        self.spider_max_retries = setting.SPIDER_MAX_RETRIES

         self.spider_thread_num = setting.SPIDER_THREAD_NUM

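The retry ceiling used by the new crawler branches is now read from settings alongside the queue sizes. An illustrative excerpt, assuming SPIDER_MAX_RETRIES is defined in cobweb/setting.py next to the other values referenced here (the value 5 is a placeholder, not taken from the package):

    # cobweb/setting.py (illustrative)
    SPIDER_MAX_RETRIES = 5   # placeholder default; items whose params.retry exceeds this are dotted as failed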
cobweb/launchers/launcher_api.py CHANGED
@@ -1,6 +1,6 @@
 import time

-from cobweb.base import TaskQueue, Decorators
+from cobweb.base import TaskQueue, Decorators, Seed, Request
 from cobweb.schedulers import ApiScheduler
 from .launcher import Launcher

@@ -16,13 +16,13 @@ class LauncherApi(Launcher):
     @Decorators.stop
     def _schedule(self):
         thread_sleep = self.scheduling_wait_time
-        for q, key, size, item_info in [
-            (TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"]),
-            (TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
+        for q, key, size, item_info, Cls in [
+            (TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"], Seed),
+            (TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"], Request),
         ]:
             if q.length < size:
                 for member, priority in self._scheduler.schedule(key, self.scheduling_size):
-                    q.push((member, priority), direct_insertion=True)
+                    q.push(Cls(member, priority=priority))
                     self.add_working_item(key.split(":")[-1], member, priority)
                     thread_sleep = 0.1
         time.sleep(thread_sleep)
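With this change the API scheduler pushes typed items rather than raw (member, priority) tuples, matching the crawler threads that now pop ready-made Seed and Request objects. A before/after sketch using the names from the hunk:

    from cobweb.base import Seed, Request, TaskQueue

    # member and priority come from self._scheduler.schedule(key, self.scheduling_size)

    # 1.3.10: raw tuples went straight into the queues
    # TaskQueue.TODO.push((member, priority), direct_insertion=True)

    # 1.3.12: each member is wrapped in its queue's item class first
    TaskQueue.TODO.push(Seed(member, priority=priority))
    TaskQueue.DOWNLOAD.push(Request(member, priority=priority))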
cobweb_launcher-1.3.10.dist-info/METADATA → cobweb_launcher-1.3.12.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cobweb-launcher
-Version: 1.3.10
+Version: 1.3.12
 Summary: spider_hole
 Home-page: https://github.com/Juannie-PP/cobweb
 Author: Juannie-PP
cobweb_launcher-1.3.10.dist-info/RECORD → cobweb_launcher-1.3.12.dist-info/RECORD RENAMED
@@ -1,7 +1,7 @@
 cobweb/__init__.py,sha256=oaEfsGUuGP0s39UbFRwrnsjMUeuB6QvQIAwStKFyUTk,83
 cobweb/constant.py,sha256=eofONAntk9O6S-cb4KbYGYHL_u7nBlOqqFOw_HzJHAU,3588
 cobweb/setting.py,sha256=pY6LKsgWI3164GiGA1z_y26LVf5-3mpiEgmm86mKRdY,3135
-cobweb/base/__init__.py,sha256=HWcjTGKYXCnbmeb70QMqyLwUGxPuxx9IxUJG6PxKpF8,5132
+cobweb/base/__init__.py,sha256=T8raRjxGJW27SdRa04KcqvCVlUvRjv1jMLB4wNP_7xc,5169
 cobweb/base/basic.py,sha256=s5G4LBZiLUfoymV-gLSIqeH-OJ7q7-L35sBa6xEH3EI,7666
 cobweb/base/common_queue.py,sha256=Gor7sR3h1hlZWaI0XcNAbf0S15Ftjr3DFRWNTGL13uU,1137
 cobweb/base/dotting.py,sha256=lfFXXqnVP__hxlW3qH5Bnuq69KtnFaQLbcz1M8e2Ajg,1239
@@ -11,16 +11,16 @@ cobweb/base/request.py,sha256=acGm3OzxsPed5VUTk7D9eeHZPMh7KUNQRUv44G5znZg,2659
 cobweb/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406
 cobweb/base/seed.py,sha256=PN5J4gKPEXylwyQeSGOBfauxHktxFr7RJe8nVX1hBw4,2987
 cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
-cobweb/crawlers/crawler.py,sha256=ZQ6yVA1EaQRdKJEY3DNqShzp9HPMwlSXapnsRW9E5Wc,2987
+cobweb/crawlers/crawler.py,sha256=G81gh_Rm0kypjnMKq0S5lrV39zu0zijtinJrQhwdWgI,4233
 cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
 cobweb/db/api_db.py,sha256=bDc5dJQxq4z04h70KUTHd0OqUOEY7Cm3wcNJZtTvJIM,3015
 cobweb/db/redis_db.py,sha256=FvMzckJtmhwKhZqKoS23iXmJti5P2dnMVD5rJ__5LUw,5139
 cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
 cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
 cobweb/launchers/__init__.py,sha256=m_XNG2bWuMbirPt3d0_s-Ezl1xycfUxeqZnwq_kkfuo,116
-cobweb/launchers/launcher.py,sha256=O7HjZ2lewsZ5IIiZ2vhCAqGidJ1JKMxS5fs4KlslBkA,7332
+cobweb/launchers/launcher.py,sha256=dBHbm8Hj87CMCx5F9ZVo1oDPeW3L89d5Lu3PP9utYho,7393
 cobweb/launchers/launcher_air.py,sha256=yPr395HVIIHAq6lqRcYJu7c0KkfO9V8O-2sn0hC96p0,2990
-cobweb/launchers/launcher_api.py,sha256=TfLrLXazFWsOJLI7caMGfZozCttL1WTwTo3uUpN_FV0,3370
+cobweb/launchers/launcher_api.py,sha256=vpwFxB1azgBk1bS7VhX3jOprQS8fl6Iu_5M-Y3QT67A,3394
 cobweb/launchers/launcher_pro.py,sha256=2H-TcvQx-ga78GLNTa-GXMLYAj9nEeCJSWf8xl-1ISQ,3374
 cobweb/pipelines/__init__.py,sha256=zSUsGtx6smbs2iXBXvYynReKSgky-3gjqaAtKVnA_OU,105
 cobweb/pipelines/pipeline.py,sha256=Pycm22bHId9a3gdP81D5y7SsuMndYooTb5n4zQxP7dM,1321
@@ -33,8 +33,8 @@ cobweb/utils/__init__.py,sha256=YvD4mIDBd9jmGA6WJBcwkgDU2jRFNBCEbarZCSUBAHE,114
 cobweb/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
 cobweb/utils/oss.py,sha256=6Qlhdde7CcwD69bBe2rGWHY3-aptG9NXB_DZLhjgDRQ,3553
 cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
-cobweb_launcher-1.3.10.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
-cobweb_launcher-1.3.10.dist-info/METADATA,sha256=_OKrcjWsAAL5b8PmzTHSTMo0xj3gpxv0aHFXSbVdAgc,6510
-cobweb_launcher-1.3.10.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
-cobweb_launcher-1.3.10.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
-cobweb_launcher-1.3.10.dist-info/RECORD,,
+cobweb_launcher-1.3.12.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
+cobweb_launcher-1.3.12.dist-info/METADATA,sha256=NJlheF7Uwt4i6XKbgI6v-mSuyIk5PNSmU-H7fy3VCC8,6510
+cobweb_launcher-1.3.12.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
+cobweb_launcher-1.3.12.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
+cobweb_launcher-1.3.12.dist-info/RECORD,,