cobweb-launcher 1.3.10__tar.gz → 1.3.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cobweb-launcher-1.3.10/cobweb_launcher.egg-info → cobweb-launcher-1.3.12}/PKG-INFO +1 -1
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/base/__init__.py +2 -1
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/crawlers/crawler.py +40 -13
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/launchers/launcher.py +1 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/launchers/launcher_api.py +5 -5
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12/cobweb_launcher.egg-info}/PKG-INFO +1 -1
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/setup.py +1 -1
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/LICENSE +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/README.md +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/__init__.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/base/basic.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/base/common_queue.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/base/dotting.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/base/item.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/base/log.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/base/request.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/base/response.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/base/seed.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/constant.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/crawlers/__init__.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/db/api_db.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/db/redis_db.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/exceptions/__init__.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/exceptions/oss_db_exception.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/launchers/__init__.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/launchers/launcher_air.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/launchers/launcher_pro.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/pipelines/__init__.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/pipelines/pipeline.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/pipelines/pipeline_console.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/pipelines/pipeline_loghub.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/schedulers/__init__.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/schedulers/scheduler_api.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/schedulers/scheduler_redis.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/setting.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/utils/__init__.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/utils/bloom.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/utils/oss.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb/utils/tools.py +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb_launcher.egg-info/SOURCES.txt +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/setup.cfg +0 -0
- {cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/test/test.py +0 -0
@@ -59,8 +59,9 @@ class TaskQueue:
|
|
59
59
|
else:
|
60
60
|
raise TypeError(f"{crawler_func.__name__} function return type isn't supported")
|
61
61
|
TaskQueue.DOT.build(
|
62
|
-
topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}
|
62
|
+
topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
|
63
63
|
cost_time=round(time.time() - start_time, 2),
|
64
|
+
process_task_type=tk.__class__.__name__,
|
64
65
|
**tk.to_dict
|
65
66
|
)
|
66
67
|
except Exception as e:
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import json
|
2
|
+
import os
|
2
3
|
import time
|
3
4
|
import threading
|
4
5
|
from typing import Union, Callable, Mapping
|
@@ -52,14 +53,21 @@ class Crawler(threading.Thread):
|
|
52
53
|
thread_sleep = 0.1
|
53
54
|
if TaskQueue.REQUEST.length >= self.request_queue_size:
|
54
55
|
thread_sleep = 5
|
55
|
-
elif seed_info := TaskQueue.TODO.pop():
|
56
|
-
member, priority = seed_info
|
57
|
-
seed = Seed(member, priority=priority)
|
56
|
+
elif seed := TaskQueue.TODO.pop():
|
57
|
+
# member, priority = seed_info
|
58
|
+
# seed = Seed(member, priority=priority)
|
58
59
|
if seed.params.retry > self.spider_max_retries:
|
59
|
-
seed.params.seed_status = DealModel.fail
|
60
|
+
# seed.params.seed_status = DealModel.fail
|
61
|
+
TaskQueue.DOT.build(
|
62
|
+
topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
|
63
|
+
# cost_time=round(time.time() - start_time, 2),
|
64
|
+
process_task_type=seed.__class__.__name__,
|
65
|
+
seed_status=DealModel.fail,
|
66
|
+
**seed.to_dict
|
67
|
+
)
|
60
68
|
else:
|
61
69
|
TaskQueue.process_task(seed, self.request)
|
62
|
-
TaskQueue.DELETE.push(
|
70
|
+
TaskQueue.DELETE.push(seed.seed)
|
63
71
|
time.sleep(thread_sleep)
|
64
72
|
|
65
73
|
@Decorators.pause
|
@@ -68,21 +76,40 @@ class Crawler(threading.Thread):
|
|
68
76
|
if TaskQueue.RESPONSE.length >= self.download_queue_size:
|
69
77
|
thread_sleep = 5
|
70
78
|
# logger.info(f"download queue is full, sleep {thread_sleep}s")
|
71
|
-
elif request_info := TaskQueue.DOWNLOAD.pop():
|
72
|
-
member, priority = request_info
|
73
|
-
|
74
|
-
|
75
|
-
|
79
|
+
elif request_item := TaskQueue.DOWNLOAD.pop():
|
80
|
+
# member, priority = request_info
|
81
|
+
#
|
82
|
+
# request_setting = json.loads(member)
|
83
|
+
# request_item = Request(seed=member, **request_setting)
|
84
|
+
if request_item.params.retry > self.spider_max_retries:
|
85
|
+
TaskQueue.DOT.build(
|
86
|
+
topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
|
87
|
+
# cost_time=round(time.time() - start_time, 2),
|
88
|
+
process_task_type=request_item.__class__.__name__,
|
89
|
+
seed_status=DealModel.fail,
|
90
|
+
**request_item.to_dict
|
91
|
+
)
|
92
|
+
TaskQueue.DONE.push(request_item.seed)
|
93
|
+
else:
|
94
|
+
TaskQueue.process_task(request_item, self.download)
|
76
95
|
time.sleep(thread_sleep)
|
77
96
|
|
78
97
|
@Decorators.pause
|
79
98
|
def build_parse_item(self):
|
80
99
|
thread_sleep = 0.1
|
81
100
|
if TaskQueue.UPLOAD.length >= self.upload_queue_size:
|
82
|
-
# logger.info(f"upload queue is full, sleep {thread_sleep}s")
|
83
101
|
thread_sleep = 5
|
84
|
-
|
85
|
-
|
102
|
+
elif response_item := TaskQueue.RESPONSE.pop():
|
103
|
+
if response_item.params.retry > self.spider_max_retries:
|
104
|
+
TaskQueue.DOT.build(
|
105
|
+
topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
|
106
|
+
process_task_type=response_item.__class__.__name__,
|
107
|
+
seed_status=DealModel.fail,
|
108
|
+
**response_item.to_dict
|
109
|
+
)
|
110
|
+
TaskQueue.DONE.push(response_item.seed)
|
111
|
+
else:
|
112
|
+
TaskQueue.process_task(response_item, self.parse)
|
86
113
|
time.sleep(thread_sleep)
|
87
114
|
|
88
115
|
|
@@ -60,6 +60,7 @@ class Launcher(threading.Thread):
|
|
60
60
|
self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
|
61
61
|
self.delete_queue_size = setting.DELETE_QUEUE_SIZE
|
62
62
|
self.done_queue_size = setting.DONE_QUEUE_SIZE
|
63
|
+
self.spider_max_retries = setting.SPIDER_MAX_RETRIES
|
63
64
|
|
64
65
|
self.spider_thread_num = setting.SPIDER_THREAD_NUM
|
65
66
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import time
|
2
2
|
|
3
|
-
from cobweb.base import TaskQueue, Decorators
|
3
|
+
from cobweb.base import TaskQueue, Decorators, Seed, Request
|
4
4
|
from cobweb.schedulers import ApiScheduler
|
5
5
|
from .launcher import Launcher
|
6
6
|
|
@@ -16,13 +16,13 @@ class LauncherApi(Launcher):
|
|
16
16
|
@Decorators.stop
|
17
17
|
def _schedule(self):
|
18
18
|
thread_sleep = self.scheduling_wait_time
|
19
|
-
for q, key, size, item_info in [
|
20
|
-
(TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"]),
|
21
|
-
(TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
|
19
|
+
for q, key, size, item_info, Cls in [
|
20
|
+
(TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"], Seed),
|
21
|
+
(TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"], Request),
|
22
22
|
]:
|
23
23
|
if q.length < size:
|
24
24
|
for member, priority in self._scheduler.schedule(key, self.scheduling_size):
|
25
|
-
q.push((member, priority))
|
25
|
+
q.push(Cls(member, priority=priority))
|
26
26
|
self.add_working_item(key.split(":")[-1], member, priority)
|
27
27
|
thread_sleep = 0.1
|
28
28
|
time.sleep(thread_sleep)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{cobweb-launcher-1.3.10 → cobweb-launcher-1.3.12}/cobweb_launcher.egg-info/dependency_links.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|