cobweb-launcher 1.3.11__tar.gz → 1.3.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cobweb-launcher-1.3.11/cobweb_launcher.egg-info → cobweb-launcher-1.3.13}/PKG-INFO +1 -1
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/base/__init__.py +9 -8
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/base/basic.py +2 -2
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/crawlers/crawler.py +38 -16
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/launchers/launcher.py +13 -7
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/launchers/launcher_api.py +5 -5
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/launchers/launcher_pro.py +5 -5
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/pipelines/pipeline.py +2 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13/cobweb_launcher.egg-info}/PKG-INFO +1 -1
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/setup.py +1 -1
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/LICENSE +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/README.md +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/__init__.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/base/common_queue.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/base/dotting.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/base/item.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/base/log.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/base/request.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/base/response.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/base/seed.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/constant.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/crawlers/__init__.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/db/api_db.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/db/redis_db.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/exceptions/__init__.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/exceptions/oss_db_exception.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/launchers/__init__.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/launchers/launcher_air.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/pipelines/__init__.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/pipelines/pipeline_console.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/pipelines/pipeline_loghub.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/schedulers/__init__.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/schedulers/scheduler_api.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/schedulers/scheduler_redis.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/setting.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/utils/__init__.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/utils/bloom.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/utils/oss.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb/utils/tools.py +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb_launcher.egg-info/SOURCES.txt +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/setup.cfg +0 -0
- {cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/test/test.py +0 -0
@@ -100,14 +100,15 @@ class Decorators:
|
|
100
100
|
def pause(func):
|
101
101
|
@wraps(func)
|
102
102
|
def wrapper(self, *args, **kwargs):
|
103
|
-
while not self.
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
103
|
+
while not self.stop.is_set():
|
104
|
+
while not self.pause.is_set():
|
105
|
+
try:
|
106
|
+
func(self)
|
107
|
+
except Exception as e:
|
108
|
+
logger.info(f"{func.__name__}: " + str(e))
|
109
|
+
finally:
|
110
|
+
time.sleep(0.1)
|
111
|
+
# logger.info(f"{func.__name__}: close!")
|
111
112
|
|
112
113
|
return wrapper
|
113
114
|
|
@@ -142,7 +142,7 @@ class Request:
|
|
142
142
|
|
143
143
|
def __init__(
|
144
144
|
self,
|
145
|
-
url,
|
145
|
+
# url,
|
146
146
|
seed,
|
147
147
|
random_ua=True,
|
148
148
|
check_status_code=True,
|
@@ -152,7 +152,7 @@ class Request:
|
|
152
152
|
status=None,
|
153
153
|
**kwargs
|
154
154
|
):
|
155
|
-
self.url = url
|
155
|
+
# self.url = url
|
156
156
|
self.check_status_code = check_status_code
|
157
157
|
self.request_setting = {}
|
158
158
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import json
|
2
|
+
import os
|
2
3
|
import time
|
3
4
|
import threading
|
4
5
|
from typing import Union, Callable, Mapping
|
@@ -12,15 +13,15 @@ from cobweb.base import (
|
|
12
13
|
ConsoleItem,
|
13
14
|
Decorators,
|
14
15
|
TaskQueue,
|
15
|
-
logger
|
16
16
|
)
|
17
17
|
from cobweb.constant import DealModel
|
18
18
|
|
19
19
|
|
20
20
|
class Crawler(threading.Thread):
|
21
21
|
|
22
|
-
def __init__(self, pause, custom_func: Union[Mapping[str, Callable]]):
|
22
|
+
def __init__(self, stop, pause, custom_func: Union[Mapping[str, Callable]]):
|
23
23
|
super().__init__()
|
24
|
+
self.stop = stop
|
24
25
|
self.pause = pause
|
25
26
|
for func_name, _callable in custom_func.items():
|
26
27
|
if isinstance(_callable, Callable):
|
@@ -52,14 +53,20 @@ class Crawler(threading.Thread):
|
|
52
53
|
thread_sleep = 0.1
|
53
54
|
if TaskQueue.REQUEST.length >= self.request_queue_size:
|
54
55
|
thread_sleep = 5
|
55
|
-
elif
|
56
|
-
member, priority = seed_info
|
57
|
-
seed = Seed(member, priority=priority)
|
56
|
+
elif seed := TaskQueue.TODO.pop():
|
57
|
+
# member, priority = seed_info
|
58
|
+
# seed = Seed(member, priority=priority)
|
58
59
|
if seed.params.retry > self.spider_max_retries:
|
59
|
-
|
60
|
+
TaskQueue.DOT.build(
|
61
|
+
topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
|
62
|
+
process_task_type=seed.__class__.__name__,
|
63
|
+
seed_status=DealModel.fail,
|
64
|
+
retries=seed.params.retry,
|
65
|
+
**seed.to_dict
|
66
|
+
)
|
60
67
|
else:
|
61
68
|
TaskQueue.process_task(seed, self.request)
|
62
|
-
TaskQueue.DELETE.push(
|
69
|
+
TaskQueue.DELETE.push(seed.seed)
|
63
70
|
time.sleep(thread_sleep)
|
64
71
|
|
65
72
|
@Decorators.pause
|
@@ -67,22 +74,37 @@ class Crawler(threading.Thread):
|
|
67
74
|
thread_sleep = 0.1
|
68
75
|
if TaskQueue.RESPONSE.length >= self.download_queue_size:
|
69
76
|
thread_sleep = 5
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
77
|
+
elif request_item := TaskQueue.DOWNLOAD.pop():
|
78
|
+
if request_item.params.retry > self.spider_max_retries:
|
79
|
+
TaskQueue.DOT.build(
|
80
|
+
topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
|
81
|
+
process_task_type=request_item.__class__.__name__,
|
82
|
+
retries=request_item.params.retry,
|
83
|
+
seed_status=DealModel.fail,
|
84
|
+
**request_item.to_dict
|
85
|
+
)
|
86
|
+
TaskQueue.DONE.push(request_item.seed)
|
87
|
+
else:
|
88
|
+
TaskQueue.process_task(request_item, self.download)
|
76
89
|
time.sleep(thread_sleep)
|
77
90
|
|
78
91
|
@Decorators.pause
|
79
92
|
def build_parse_item(self):
|
80
93
|
thread_sleep = 0.1
|
81
94
|
if TaskQueue.UPLOAD.length >= self.upload_queue_size:
|
82
|
-
# logger.info(f"upload queue is full, sleep {thread_sleep}s")
|
83
95
|
thread_sleep = 5
|
84
|
-
|
85
|
-
|
96
|
+
elif response_item := TaskQueue.RESPONSE.pop():
|
97
|
+
if response_item.params.retry > self.spider_max_retries:
|
98
|
+
TaskQueue.DOT.build(
|
99
|
+
topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
|
100
|
+
process_task_type=response_item.__class__.__name__,
|
101
|
+
seed_status=DealModel.fail,
|
102
|
+
retries=response_item.params.retry,
|
103
|
+
**response_item.to_dict
|
104
|
+
)
|
105
|
+
TaskQueue.DONE.push(response_item.seed)
|
106
|
+
else:
|
107
|
+
TaskQueue.process_task(response_item, self.parse)
|
86
108
|
time.sleep(thread_sleep)
|
87
109
|
|
88
110
|
|
@@ -60,6 +60,7 @@ class Launcher(threading.Thread):
|
|
60
60
|
self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
|
61
61
|
self.delete_queue_size = setting.DELETE_QUEUE_SIZE
|
62
62
|
self.done_queue_size = setting.DONE_QUEUE_SIZE
|
63
|
+
self.spider_max_retries = setting.SPIDER_MAX_RETRIES
|
63
64
|
|
64
65
|
self.spider_thread_num = setting.SPIDER_THREAD_NUM
|
65
66
|
|
@@ -71,7 +72,7 @@ class Launcher(threading.Thread):
|
|
71
72
|
self.crawler_path = setting.CRAWLER
|
72
73
|
self.pipeline_path = setting.PIPELINE
|
73
74
|
|
74
|
-
self.
|
75
|
+
self._thread_info = {}
|
75
76
|
|
76
77
|
self._task_info = dict(todo={}, download={})
|
77
78
|
|
@@ -130,9 +131,12 @@ class Launcher(threading.Thread):
|
|
130
131
|
def check_alive(self):
|
131
132
|
while not self.stop.is_set():
|
132
133
|
if not self.pause.is_set():
|
133
|
-
for
|
134
|
-
|
135
|
-
|
134
|
+
for name, thread_info in self._thread_info.items():
|
135
|
+
instance = thread_info['instance']
|
136
|
+
if not instance.is_alive():
|
137
|
+
instance = threading.Thread(name=name, target=thread_info['func'], args=())
|
138
|
+
self._thread_info[name] = dict(instance=instance, func=thread_info['func'])
|
139
|
+
instance.start()
|
136
140
|
time.sleep(1)
|
137
141
|
|
138
142
|
def _add_thread(self, func, num=1, obj=None, name=None, args=()):
|
@@ -140,7 +144,9 @@ class Launcher(threading.Thread):
|
|
140
144
|
name = obj.__class__.__name__ + ":" + (name or func.__name__)
|
141
145
|
for i in range(num):
|
142
146
|
func_name = name + "_" + str(i) if num > 1 else name
|
143
|
-
|
147
|
+
instance = threading.Thread(name=func_name, target=func, args=())
|
148
|
+
self._thread_info[func_name] = dict(instance=instance, func=func)
|
149
|
+
instance.start()
|
144
150
|
|
145
151
|
@Decorators.stop
|
146
152
|
def _polling(self):
|
@@ -187,8 +193,8 @@ class Launcher(threading.Thread):
|
|
187
193
|
Crawler = dynamic_load_class(self.crawler_path)
|
188
194
|
Pipeline = dynamic_load_class(self.pipeline_path)
|
189
195
|
|
190
|
-
crawler = Crawler(pause=self.pause, custom_func=self.custom_func)
|
191
|
-
pipeline = Pipeline(pause=self.pause)
|
196
|
+
crawler = Crawler(stop=self.stop, pause=self.pause, custom_func=self.custom_func)
|
197
|
+
pipeline = Pipeline(stop=self.stop, pause=self.pause)
|
192
198
|
|
193
199
|
self._add_thread(obj=crawler, func=crawler.build_request_item)
|
194
200
|
self._add_thread(obj=crawler, func=crawler.build_download_item, num=self.spider_thread_num)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import time
|
2
2
|
|
3
|
-
from cobweb.base import TaskQueue, Decorators
|
3
|
+
from cobweb.base import TaskQueue, Decorators, Seed, Request
|
4
4
|
from cobweb.schedulers import ApiScheduler
|
5
5
|
from .launcher import Launcher
|
6
6
|
|
@@ -16,13 +16,13 @@ class LauncherApi(Launcher):
|
|
16
16
|
@Decorators.stop
|
17
17
|
def _schedule(self):
|
18
18
|
thread_sleep = self.scheduling_wait_time
|
19
|
-
for q, key, size, item_info in [
|
20
|
-
(TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"]),
|
21
|
-
(TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
|
19
|
+
for q, key, size, item_info, Cls in [
|
20
|
+
(TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"], Seed),
|
21
|
+
(TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"], Request),
|
22
22
|
]:
|
23
23
|
if q.length < size:
|
24
24
|
for member, priority in self._scheduler.schedule(key, self.scheduling_size):
|
25
|
-
q.push((member, priority
|
25
|
+
q.push(Cls(member, priority=priority))
|
26
26
|
self.add_working_item(key.split(":")[-1], member, priority)
|
27
27
|
thread_sleep = 0.1
|
28
28
|
time.sleep(thread_sleep)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import time
|
2
2
|
|
3
|
-
from cobweb.base import TaskQueue, Decorators
|
3
|
+
from cobweb.base import TaskQueue, Decorators, Seed, Request
|
4
4
|
from cobweb.schedulers import RedisScheduler
|
5
5
|
from .launcher import Launcher
|
6
6
|
|
@@ -16,13 +16,13 @@ class LauncherPro(Launcher):
|
|
16
16
|
@Decorators.stop
|
17
17
|
def _schedule(self):
|
18
18
|
thread_sleep = self.scheduling_wait_time
|
19
|
-
for q, key, size, item_info in [
|
20
|
-
(TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"]),
|
21
|
-
(TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
|
19
|
+
for q, key, size, item_info, Cls in [
|
20
|
+
(TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"], Seed),
|
21
|
+
(TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"], Request),
|
22
22
|
]:
|
23
23
|
if q.length < size:
|
24
24
|
for member, priority in self._scheduler.schedule(key, self.scheduling_size):
|
25
|
-
q.push((member, priority
|
25
|
+
q.push(Cls(member, priority=priority))
|
26
26
|
self.add_working_item(key.split(":")[-1], member, priority)
|
27
27
|
thread_sleep = 0.1
|
28
28
|
time.sleep(thread_sleep)
|
@@ -10,9 +10,11 @@ class Pipeline(ABC):
|
|
10
10
|
|
11
11
|
def __init__(
|
12
12
|
self,
|
13
|
+
stop: threading.Event,
|
13
14
|
pause: threading.Event,
|
14
15
|
):
|
15
16
|
super().__init__()
|
17
|
+
self.stop = stop
|
16
18
|
self.pause = pause
|
17
19
|
self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
|
18
20
|
self.upload_wait_time = setting.UPLOAD_WAIT_TIME
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{cobweb-launcher-1.3.11 → cobweb-launcher-1.3.13}/cobweb_launcher.egg-info/dependency_links.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|