cobweb-launcher 1.3.11__py3-none-any.whl → 1.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/crawlers/crawler.py +40 -13
- cobweb/launchers/launcher.py +1 -0
- cobweb/launchers/launcher_api.py +5 -5
- {cobweb_launcher-1.3.11.dist-info → cobweb_launcher-1.3.12.dist-info}/METADATA +1 -1
- {cobweb_launcher-1.3.11.dist-info → cobweb_launcher-1.3.12.dist-info}/RECORD +8 -8
- {cobweb_launcher-1.3.11.dist-info → cobweb_launcher-1.3.12.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.3.11.dist-info → cobweb_launcher-1.3.12.dist-info}/WHEEL +0 -0
- {cobweb_launcher-1.3.11.dist-info → cobweb_launcher-1.3.12.dist-info}/top_level.txt +0 -0
cobweb/crawlers/crawler.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import json
|
2
|
+
import os
|
2
3
|
import time
|
3
4
|
import threading
|
4
5
|
from typing import Union, Callable, Mapping
|
@@ -52,14 +53,21 @@ class Crawler(threading.Thread):
|
|
52
53
|
thread_sleep = 0.1
|
53
54
|
if TaskQueue.REQUEST.length >= self.request_queue_size:
|
54
55
|
thread_sleep = 5
|
55
|
-
elif seed_info := TaskQueue.TODO.pop():
|
56
|
-
member, priority = seed_info
|
57
|
-
seed = Seed(member, priority=priority)
|
56
|
+
elif seed := TaskQueue.TODO.pop():
|
57
|
+
# member, priority = seed_info
|
58
|
+
# seed = Seed(member, priority=priority)
|
58
59
|
if seed.params.retry > self.spider_max_retries:
|
59
|
-
seed.params.seed_status = DealModel.fail
|
60
|
+
# seed.params.seed_status = DealModel.fail
|
61
|
+
TaskQueue.DOT.build(
|
62
|
+
topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
|
63
|
+
# cost_time=round(time.time() - start_time, 2),
|
64
|
+
process_task_type=seed.__class__.__name__,
|
65
|
+
seed_status=DealModel.fail,
|
66
|
+
**seed.to_dict
|
67
|
+
)
|
60
68
|
else:
|
61
69
|
TaskQueue.process_task(seed, self.request)
|
62
|
-
TaskQueue.DELETE.push(
|
70
|
+
TaskQueue.DELETE.push(seed.seed)
|
63
71
|
time.sleep(thread_sleep)
|
64
72
|
|
65
73
|
@Decorators.pause
|
@@ -68,21 +76,40 @@ class Crawler(threading.Thread):
|
|
68
76
|
if TaskQueue.RESPONSE.length >= self.download_queue_size:
|
69
77
|
thread_sleep = 5
|
70
78
|
# logger.info(f"download queue is full, sleep {thread_sleep}s")
|
71
|
-
elif request_info := TaskQueue.DOWNLOAD.pop():
|
72
|
-
member, priority = request_info
|
73
|
-
|
74
|
-
|
75
|
-
|
79
|
+
elif request_item := TaskQueue.DOWNLOAD.pop():
|
80
|
+
# member, priority = request_info
|
81
|
+
#
|
82
|
+
# request_setting = json.loads(member)
|
83
|
+
# request_item = Request(seed=member, **request_setting)
|
84
|
+
if request_item.params.retry > self.spider_max_retries:
|
85
|
+
TaskQueue.DOT.build(
|
86
|
+
topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
|
87
|
+
# cost_time=round(time.time() - start_time, 2),
|
88
|
+
process_task_type=request_item.__class__.__name__,
|
89
|
+
seed_status=DealModel.fail,
|
90
|
+
**request_item.to_dict
|
91
|
+
)
|
92
|
+
TaskQueue.DONE.push(request_item.seed)
|
93
|
+
else:
|
94
|
+
TaskQueue.process_task(request_item, self.download)
|
76
95
|
time.sleep(thread_sleep)
|
77
96
|
|
78
97
|
@Decorators.pause
|
79
98
|
def build_parse_item(self):
|
80
99
|
thread_sleep = 0.1
|
81
100
|
if TaskQueue.UPLOAD.length >= self.upload_queue_size:
|
82
|
-
# logger.info(f"upload queue is full, sleep {thread_sleep}s")
|
83
101
|
thread_sleep = 5
|
84
|
-
|
85
|
-
|
102
|
+
elif response_item := TaskQueue.RESPONSE.pop():
|
103
|
+
if response_item.params.retry > self.spider_max_retries:
|
104
|
+
TaskQueue.DOT.build(
|
105
|
+
topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
|
106
|
+
process_task_type=response_item.__class__.__name__,
|
107
|
+
seed_status=DealModel.fail,
|
108
|
+
**response_item.to_dict
|
109
|
+
)
|
110
|
+
TaskQueue.DONE.push(response_item.seed)
|
111
|
+
else:
|
112
|
+
TaskQueue.process_task(response_item, self.parse)
|
86
113
|
time.sleep(thread_sleep)
|
87
114
|
|
88
115
|
|
cobweb/launchers/launcher.py
CHANGED
@@ -60,6 +60,7 @@ class Launcher(threading.Thread):
|
|
60
60
|
self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
|
61
61
|
self.delete_queue_size = setting.DELETE_QUEUE_SIZE
|
62
62
|
self.done_queue_size = setting.DONE_QUEUE_SIZE
|
63
|
+
self.spider_max_retries = setting.SPIDER_MAX_RETRIES
|
63
64
|
|
64
65
|
self.spider_thread_num = setting.SPIDER_THREAD_NUM
|
65
66
|
|
cobweb/launchers/launcher_api.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
import time
|
2
2
|
|
3
|
-
from cobweb.base import TaskQueue, Decorators
|
3
|
+
from cobweb.base import TaskQueue, Decorators, Seed, Request
|
4
4
|
from cobweb.schedulers import ApiScheduler
|
5
5
|
from .launcher import Launcher
|
6
6
|
|
@@ -16,13 +16,13 @@ class LauncherApi(Launcher):
|
|
16
16
|
@Decorators.stop
|
17
17
|
def _schedule(self):
|
18
18
|
thread_sleep = self.scheduling_wait_time
|
19
|
-
for q, key, size, item_info in [
|
20
|
-
(TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"]),
|
21
|
-
(TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
|
19
|
+
for q, key, size, item_info, Cls in [
|
20
|
+
(TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"], Seed),
|
21
|
+
(TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"], Request),
|
22
22
|
]:
|
23
23
|
if q.length < size:
|
24
24
|
for member, priority in self._scheduler.schedule(key, self.scheduling_size):
|
25
|
-
q.push((member, priority))
|
25
|
+
q.push(Cls(member, priority=priority))
|
26
26
|
self.add_working_item(key.split(":")[-1], member, priority)
|
27
27
|
thread_sleep = 0.1
|
28
28
|
time.sleep(thread_sleep)
|
@@ -11,16 +11,16 @@ cobweb/base/request.py,sha256=acGm3OzxsPed5VUTk7D9eeHZPMh7KUNQRUv44G5znZg,2659
|
|
11
11
|
cobweb/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406
|
12
12
|
cobweb/base/seed.py,sha256=PN5J4gKPEXylwyQeSGOBfauxHktxFr7RJe8nVX1hBw4,2987
|
13
13
|
cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
|
14
|
-
cobweb/crawlers/crawler.py,sha256=
|
14
|
+
cobweb/crawlers/crawler.py,sha256=G81gh_Rm0kypjnMKq0S5lrV39zu0zijtinJrQhwdWgI,4233
|
15
15
|
cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
|
16
16
|
cobweb/db/api_db.py,sha256=bDc5dJQxq4z04h70KUTHd0OqUOEY7Cm3wcNJZtTvJIM,3015
|
17
17
|
cobweb/db/redis_db.py,sha256=FvMzckJtmhwKhZqKoS23iXmJti5P2dnMVD5rJ__5LUw,5139
|
18
18
|
cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
|
19
19
|
cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
|
20
20
|
cobweb/launchers/__init__.py,sha256=m_XNG2bWuMbirPt3d0_s-Ezl1xycfUxeqZnwq_kkfuo,116
|
21
|
-
cobweb/launchers/launcher.py,sha256=
|
21
|
+
cobweb/launchers/launcher.py,sha256=dBHbm8Hj87CMCx5F9ZVo1oDPeW3L89d5Lu3PP9utYho,7393
|
22
22
|
cobweb/launchers/launcher_air.py,sha256=yPr395HVIIHAq6lqRcYJu7c0KkfO9V8O-2sn0hC96p0,2990
|
23
|
-
cobweb/launchers/launcher_api.py,sha256=
|
23
|
+
cobweb/launchers/launcher_api.py,sha256=vpwFxB1azgBk1bS7VhX3jOprQS8fl6Iu_5M-Y3QT67A,3394
|
24
24
|
cobweb/launchers/launcher_pro.py,sha256=2H-TcvQx-ga78GLNTa-GXMLYAj9nEeCJSWf8xl-1ISQ,3374
|
25
25
|
cobweb/pipelines/__init__.py,sha256=zSUsGtx6smbs2iXBXvYynReKSgky-3gjqaAtKVnA_OU,105
|
26
26
|
cobweb/pipelines/pipeline.py,sha256=Pycm22bHId9a3gdP81D5y7SsuMndYooTb5n4zQxP7dM,1321
|
@@ -33,8 +33,8 @@ cobweb/utils/__init__.py,sha256=YvD4mIDBd9jmGA6WJBcwkgDU2jRFNBCEbarZCSUBAHE,114
|
|
33
33
|
cobweb/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
|
34
34
|
cobweb/utils/oss.py,sha256=6Qlhdde7CcwD69bBe2rGWHY3-aptG9NXB_DZLhjgDRQ,3553
|
35
35
|
cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
|
36
|
-
cobweb_launcher-1.3.
|
37
|
-
cobweb_launcher-1.3.
|
38
|
-
cobweb_launcher-1.3.
|
39
|
-
cobweb_launcher-1.3.
|
40
|
-
cobweb_launcher-1.3.
|
36
|
+
cobweb_launcher-1.3.12.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
|
37
|
+
cobweb_launcher-1.3.12.dist-info/METADATA,sha256=NJlheF7Uwt4i6XKbgI6v-mSuyIk5PNSmU-H7fy3VCC8,6510
|
38
|
+
cobweb_launcher-1.3.12.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
|
39
|
+
cobweb_launcher-1.3.12.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
|
40
|
+
cobweb_launcher-1.3.12.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|