cobweb-launcher 1.3.12__tar.gz → 1.3.14__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {cobweb-launcher-1.3.12/cobweb_launcher.egg-info → cobweb-launcher-1.3.14}/PKG-INFO +1 -1
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/base/__init__.py +9 -8
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/base/basic.py +2 -2
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/crawlers/crawler.py +6 -11
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/launchers/launcher.py +12 -7
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/launchers/launcher_pro.py +5 -5
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/pipelines/pipeline.py +2 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14/cobweb_launcher.egg-info}/PKG-INFO +1 -1
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/setup.py +1 -1
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/LICENSE +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/README.md +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/__init__.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/base/common_queue.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/base/dotting.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/base/item.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/base/log.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/base/request.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/base/response.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/base/seed.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/constant.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/crawlers/__init__.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/db/api_db.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/db/redis_db.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/exceptions/__init__.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/exceptions/oss_db_exception.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/launchers/__init__.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/launchers/launcher_air.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/launchers/launcher_api.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/pipelines/__init__.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/pipelines/pipeline_console.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/pipelines/pipeline_loghub.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/schedulers/__init__.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/schedulers/scheduler_api.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/schedulers/scheduler_redis.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/setting.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/utils/__init__.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/utils/bloom.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/utils/oss.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb/utils/tools.py +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb_launcher.egg-info/SOURCES.txt +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/setup.cfg +0 -0
- {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/test/test.py +0 -0
@@ -100,14 +100,15 @@ class Decorators:
|
|
100
100
|
def pause(func):
|
101
101
|
@wraps(func)
|
102
102
|
def wrapper(self, *args, **kwargs):
|
103
|
-
while not self.
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
103
|
+
while not self.stop.is_set():
|
104
|
+
while not self.pause.is_set():
|
105
|
+
try:
|
106
|
+
func(self)
|
107
|
+
except Exception as e:
|
108
|
+
logger.info(f"{func.__name__}: " + str(e))
|
109
|
+
finally:
|
110
|
+
time.sleep(0.1)
|
111
|
+
# logger.info(f"{func.__name__}: close!")
|
111
112
|
|
112
113
|
return wrapper
|
113
114
|
|
@@ -142,7 +142,7 @@ class Request:
|
|
142
142
|
|
143
143
|
def __init__(
|
144
144
|
self,
|
145
|
-
url,
|
145
|
+
# url,
|
146
146
|
seed,
|
147
147
|
random_ua=True,
|
148
148
|
check_status_code=True,
|
@@ -152,7 +152,7 @@ class Request:
|
|
152
152
|
status=None,
|
153
153
|
**kwargs
|
154
154
|
):
|
155
|
-
self.url = url
|
155
|
+
# self.url = url
|
156
156
|
self.check_status_code = check_status_code
|
157
157
|
self.request_setting = {}
|
158
158
|
|
@@ -13,15 +13,15 @@ from cobweb.base import (
|
|
13
13
|
ConsoleItem,
|
14
14
|
Decorators,
|
15
15
|
TaskQueue,
|
16
|
-
logger
|
17
16
|
)
|
18
17
|
from cobweb.constant import DealModel
|
19
18
|
|
20
19
|
|
21
20
|
class Crawler(threading.Thread):
|
22
21
|
|
23
|
-
def __init__(self, pause, custom_func: Union[Mapping[str, Callable]]):
|
22
|
+
def __init__(self, stop, pause, custom_func: Union[Mapping[str, Callable]]):
|
24
23
|
super().__init__()
|
24
|
+
self.stop = stop
|
25
25
|
self.pause = pause
|
26
26
|
for func_name, _callable in custom_func.items():
|
27
27
|
if isinstance(_callable, Callable):
|
@@ -34,7 +34,7 @@ class Crawler(threading.Thread):
|
|
34
34
|
|
35
35
|
@staticmethod
|
36
36
|
def request(seed: Seed) -> Union[Request, BaseItem]:
|
37
|
-
yield Request(seed
|
37
|
+
yield Request(seed, timeout=5)
|
38
38
|
|
39
39
|
@staticmethod
|
40
40
|
def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
|
@@ -57,12 +57,11 @@ class Crawler(threading.Thread):
|
|
57
57
|
# member, priority = seed_info
|
58
58
|
# seed = Seed(member, priority=priority)
|
59
59
|
if seed.params.retry > self.spider_max_retries:
|
60
|
-
# seed.params.seed_status = DealModel.fail
|
61
60
|
TaskQueue.DOT.build(
|
62
61
|
topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
|
63
|
-
# cost_time=round(time.time() - start_time, 2),
|
64
62
|
process_task_type=seed.__class__.__name__,
|
65
63
|
seed_status=DealModel.fail,
|
64
|
+
retries=seed.params.retry,
|
66
65
|
**seed.to_dict
|
67
66
|
)
|
68
67
|
else:
|
@@ -75,17 +74,12 @@ class Crawler(threading.Thread):
|
|
75
74
|
thread_sleep = 0.1
|
76
75
|
if TaskQueue.RESPONSE.length >= self.download_queue_size:
|
77
76
|
thread_sleep = 5
|
78
|
-
# logger.info(f"download queue is full, sleep {thread_sleep}s")
|
79
77
|
elif request_item := TaskQueue.DOWNLOAD.pop():
|
80
|
-
# member, priority = request_info
|
81
|
-
#
|
82
|
-
# request_setting = json.loads(member)
|
83
|
-
# request_item = Request(seed=member, **request_setting)
|
84
78
|
if request_item.params.retry > self.spider_max_retries:
|
85
79
|
TaskQueue.DOT.build(
|
86
80
|
topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
|
87
|
-
# cost_time=round(time.time() - start_time, 2),
|
88
81
|
process_task_type=request_item.__class__.__name__,
|
82
|
+
retries=request_item.params.retry,
|
89
83
|
seed_status=DealModel.fail,
|
90
84
|
**request_item.to_dict
|
91
85
|
)
|
@@ -105,6 +99,7 @@ class Crawler(threading.Thread):
|
|
105
99
|
topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
|
106
100
|
process_task_type=response_item.__class__.__name__,
|
107
101
|
seed_status=DealModel.fail,
|
102
|
+
retries=response_item.params.retry,
|
108
103
|
**response_item.to_dict
|
109
104
|
)
|
110
105
|
TaskQueue.DONE.push(response_item.seed)
|
@@ -72,7 +72,7 @@ class Launcher(threading.Thread):
|
|
72
72
|
self.crawler_path = setting.CRAWLER
|
73
73
|
self.pipeline_path = setting.PIPELINE
|
74
74
|
|
75
|
-
self.
|
75
|
+
self._thread_info = {}
|
76
76
|
|
77
77
|
self._task_info = dict(todo={}, download={})
|
78
78
|
|
@@ -131,9 +131,12 @@ class Launcher(threading.Thread):
|
|
131
131
|
def check_alive(self):
|
132
132
|
while not self.stop.is_set():
|
133
133
|
if not self.pause.is_set():
|
134
|
-
for
|
135
|
-
|
136
|
-
|
134
|
+
for name, thread_info in self._thread_info.items():
|
135
|
+
instance = thread_info['instance']
|
136
|
+
if not instance.is_alive():
|
137
|
+
instance = threading.Thread(name=name, target=thread_info['func'], args=())
|
138
|
+
self._thread_info[name] = dict(instance=instance, func=thread_info['func'])
|
139
|
+
instance.start()
|
137
140
|
time.sleep(1)
|
138
141
|
|
139
142
|
def _add_thread(self, func, num=1, obj=None, name=None, args=()):
|
@@ -141,7 +144,9 @@ class Launcher(threading.Thread):
|
|
141
144
|
name = obj.__class__.__name__ + ":" + (name or func.__name__)
|
142
145
|
for i in range(num):
|
143
146
|
func_name = name + "_" + str(i) if num > 1 else name
|
144
|
-
|
147
|
+
instance = threading.Thread(name=func_name, target=func, args=())
|
148
|
+
self._thread_info[func_name] = dict(instance=instance, func=func)
|
149
|
+
instance.start()
|
145
150
|
|
146
151
|
@Decorators.stop
|
147
152
|
def _polling(self):
|
@@ -188,8 +193,8 @@ class Launcher(threading.Thread):
|
|
188
193
|
Crawler = dynamic_load_class(self.crawler_path)
|
189
194
|
Pipeline = dynamic_load_class(self.pipeline_path)
|
190
195
|
|
191
|
-
crawler = Crawler(pause=self.pause, custom_func=self.custom_func)
|
192
|
-
pipeline = Pipeline(pause=self.pause)
|
196
|
+
crawler = Crawler(stop=self.stop, pause=self.pause, custom_func=self.custom_func)
|
197
|
+
pipeline = Pipeline(stop=self.stop, pause=self.pause)
|
193
198
|
|
194
199
|
self._add_thread(obj=crawler, func=crawler.build_request_item)
|
195
200
|
self._add_thread(obj=crawler, func=crawler.build_download_item, num=self.spider_thread_num)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import time
|
2
2
|
|
3
|
-
from cobweb.base import TaskQueue, Decorators
|
3
|
+
from cobweb.base import TaskQueue, Decorators, Seed, Request
|
4
4
|
from cobweb.schedulers import RedisScheduler
|
5
5
|
from .launcher import Launcher
|
6
6
|
|
@@ -16,13 +16,13 @@ class LauncherPro(Launcher):
|
|
16
16
|
@Decorators.stop
|
17
17
|
def _schedule(self):
|
18
18
|
thread_sleep = self.scheduling_wait_time
|
19
|
-
for q, key, size, item_info in [
|
20
|
-
(TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"]),
|
21
|
-
(TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
|
19
|
+
for q, key, size, item_info, Cls in [
|
20
|
+
(TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"], Seed),
|
21
|
+
(TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"], Request),
|
22
22
|
]:
|
23
23
|
if q.length < size:
|
24
24
|
for member, priority in self._scheduler.schedule(key, self.scheduling_size):
|
25
|
-
q.push((member, priority
|
25
|
+
q.push(Cls(member, priority=priority))
|
26
26
|
self.add_working_item(key.split(":")[-1], member, priority)
|
27
27
|
thread_sleep = 0.1
|
28
28
|
time.sleep(thread_sleep)
|
@@ -10,9 +10,11 @@ class Pipeline(ABC):
|
|
10
10
|
|
11
11
|
def __init__(
|
12
12
|
self,
|
13
|
+
stop: threading.Event,
|
13
14
|
pause: threading.Event,
|
14
15
|
):
|
15
16
|
super().__init__()
|
17
|
+
self.stop = stop
|
16
18
|
self.pause = pause
|
17
19
|
self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
|
18
20
|
self.upload_wait_time = setting.UPLOAD_WAIT_TIME
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{cobweb-launcher-1.3.12 → cobweb-launcher-1.3.14}/cobweb_launcher.egg-info/dependency_links.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|