cobweb-launcher 1.2.11__tar.gz → 1.2.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cobweb-launcher may be problematic.
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/PKG-INFO +1 -1
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/crawlers/crawler.py +24 -14
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/db/redis_db.py +1 -1
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/launchers/launcher.py +30 -3
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/launchers/launcher_pro.py +43 -20
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/setting.py +7 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/utils/__init__.py +1 -0
- cobweb-launcher-1.2.14/cobweb/utils/bloom.py +76 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/PKG-INFO +1 -1
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/SOURCES.txt +3 -1
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/requires.txt +1 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/setup.py +2 -2
- cobweb-launcher-1.2.14/test/test.py +38 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/LICENSE +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/README.md +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/__init__.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/__init__.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/common_queue.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/decorators.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/item.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/log.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/request.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/response.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/seed.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/constant.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/crawlers/__init__.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/exceptions/__init__.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/exceptions/oss_db_exception.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/launchers/__init__.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/launchers/launcher_air.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/pipelines/__init__.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/pipelines/pipeline.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/pipelines/pipeline_console.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/pipelines/pipeline_loghub.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/utils/oss.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/utils/tools.py +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/setup.cfg +0 -0
{cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/crawlers/crawler.py
@@ -23,25 +23,33 @@ class Crawler(threading.Thread):
         self,
         stop: threading.Event,
         pause: threading.Event,
-        launcher_queue: Union[Mapping[str, Queue]],
+        # launcher_queue: Union[Mapping[str, Queue]],
+        get_seed: Callable,
+        set_seed: Callable,
+        add_seed: Callable,
+        delete_seed: Callable,
+        upload_data: Callable,
         custom_func: Union[Mapping[str, Callable]],
         thread_num: int,
-        max_retries: int
+        max_retries: int,
+        time_sleep: int,
     ):
         super().__init__()

         self._stop = stop
         self._pause = pause
-        self.…
-        self.…
-        self.…
-        self.…
+        self._get_seed = get_seed
+        self._set_seed = set_seed
+        self._add_seed = add_seed
+        self._delete_seed = delete_seed
+        self._upload_data = upload_data

         for func_name, _callable in custom_func.items():
             if isinstance(_callable, Callable):
                 self.__setattr__(func_name, _callable)

         self.thread_num = thread_num
+        self.time_sleep = time_sleep
         self.max_retries = max_retries

     @staticmethod
@@ -64,23 +72,23 @@ class Crawler(threading.Thread):

     def distribute(self, item, seed):
         if isinstance(item, BaseItem):
-            self.…
+            self._upload_data(item)
         elif isinstance(item, Seed):
-            self.…
+            self._add_seed(item)
         elif isinstance(item, str) and item == DealModel.poll:
-            self.…
+            self._set_seed(seed)
         elif isinstance(item, str) and item == DealModel.done:
-            self.…
+            self._delete_seed(seed)
         elif isinstance(item, str) and item == DealModel.fail:
             seed.params.seed_status = DealModel.fail
-            self.…
+            self._delete_seed(seed)
         else:
             raise TypeError("yield value type error!")

     def spider(self):
         while not self._stop.is_set():

-            seed = self.…
+            seed = self._get_seed()

             if not seed:
                 time.sleep(1)
@@ -88,7 +96,7 @@ class Crawler(threading.Thread):

             elif seed.params.retry > self.max_retries:
                 seed.params.seed_status = DealModel.fail
-                self.…
+                self._delete_seed(seed)
                 continue

             seed_detail_log_info = LogTemplate.log_info(seed.to_dict)
@@ -149,7 +157,9 @@ class Crawler(threading.Thread):
                     exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
                 ))
                 seed.params.retry += 1
-                self._todo.push(seed)
+                # self._todo.push(seed)
+                self._set_seed(seed)
+                time.sleep(self.time_sleep * seed.params.retry)
             finally:
                 time.sleep(0.1)
         logger.info("spider thread close")
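Note: taken together, this hunk is a dependency-injection refactor. Crawler no longer holds the launcher's queue mapping; it receives five callables (get_seed, set_seed, add_seed, delete_seed, upload_data) plus a linear retry backoff of time_sleep * retry seconds (10 s, 20 s, 30 s, … with the new SPIDER_TIME_SLEEP = 10 default). A minimal runnable sketch of the pattern, with stand-in names (MiniCrawler and the list-backed queue are illustrative, not cobweb's API):

    import time
    from types import SimpleNamespace
    from typing import Callable

    class MiniCrawler:
        # Sketch: the crawler sees only callables, never the launcher's queues.
        def __init__(self, get_seed: Callable, set_seed: Callable, time_sleep: int):
            self._get_seed = get_seed
            self._set_seed = set_seed
            self.time_sleep = time_sleep

        def retry(self, seed):
            seed.params.retry += 1
            self._set_seed(seed)  # requeue through the callable, not todo.push
            time.sleep(self.time_sleep * seed.params.retry)  # linear backoff

    todo = []
    crawler = MiniCrawler(get_seed=lambda: todo.pop() if todo else None,
                          set_seed=todo.append, time_sleep=0)  # 0 keeps the demo fast
    seed = SimpleNamespace(params=SimpleNamespace(retry=0))
    crawler.retry(seed)
    print(seed.params.retry, len(todo))  # -> 1 1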
{cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/launchers/launcher.py
@@ -17,6 +17,8 @@ def check_pause(func):
             func(self, *args, **kwargs)
         except Exception as e:
             logger.info(f"{func.__name__}: " + str(e))
+        finally:
+            time.sleep(0.1)

     return wrapper

@@ -94,10 +96,14 @@ class Launcher(threading.Thread):

         self._spider_max_retries = setting.SPIDER_MAX_RETRIES
         self._spider_thread_num = setting.SPIDER_THREAD_NUM
+        self._spider_time_sleep = setting.SPIDER_TIME_SLEEP
+        self._spider_max_speed = setting.SPIDER_MAX_SPEED

         self._done_model = setting.DONE_MODEL
         self._task_model = setting.TASK_MODEL

+        self._filter_field = setting.FILTER_FIELD
+
     @property
     def request(self):
         """
@@ -151,7 +157,22 @@ class Launcher(threading.Thread):
     def _remove_doing_seeds(self, seeds):
         for seed in seeds:
             self.__DOING__.pop(seed, None)
-        logger.info("remove %s seeds from __DOING__" % len(seeds))
+        # logger.info("remove %s seeds from __DOING__" % len(seeds))
+
+    def _get_seed(self) -> Seed:
+        return self.__LAUNCHER_QUEUE__["todo"].pop()
+
+    def _set_seed(self, seed, **kwargs):
+        self.__LAUNCHER_QUEUE__["todo"].push(seed, **kwargs)
+
+    def _upload_data(self, data, **kwargs):
+        self.__LAUNCHER_QUEUE__["upload"].push(data, **kwargs)
+
+    def _add_seed(self, seed, **kwargs):
+        self.__LAUNCHER_QUEUE__["new"].push(seed, **kwargs)
+
+    def _delete_seed(self, seed, **kwargs):
+        self.__LAUNCHER_QUEUE__["done"].push(seed, **kwargs)

     def _execute(self):
         for func_name in self.__LAUNCHER_FUNC__:
@@ -165,10 +186,16 @@ class Launcher(threading.Thread):

         self._Crawler(
             stop=self._stop, pause=self._pause,
-            launcher_queue=self.__LAUNCHER_QUEUE__,
+            # launcher_queue=self.__LAUNCHER_QUEUE__,
+            get_seed=self._get_seed,
+            set_seed=self._set_seed,
+            add_seed=self._add_seed,
+            delete_seed=self._delete_seed,
+            upload_data=self._upload_data,
             custom_func=self.__CUSTOM_FUNC__,
             thread_num = self._spider_thread_num,
-            max_retries = self._spider_max_retries
+            max_retries = self._spider_max_retries,
+            time_sleep=self._spider_time_sleep
         ).start()

         self._Pipeline(
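Note: these five methods give the base Launcher a default in-memory seed flow over __LAUNCHER_QUEUE__, and because Crawler now reaches them only through the injected callables, a subclass just overrides a method to change behaviour, which is what LauncherPro's rate-limited _get_seed below does. A toy illustration of that override point (Base and RateLimited are hypothetical names, not cobweb classes):

    class Base:
        def __init__(self):
            self.todo = ["seed-1", "seed-2"]

        def _get_seed(self):
            # default: pop straight from the in-memory todo queue
            return self.todo.pop() if self.todo else None

    class RateLimited(Base):
        def __init__(self, budget: int):
            super().__init__()
            self.budget = budget  # stand-in for the Redis speed_control counter

        def _get_seed(self):
            if self.budget <= 0:
                return None  # throttled: behave as if the queue were empty
            self.budget -= 1
            return super()._get_seed()

    print(RateLimited(budget=1)._get_seed())  # -> 'seed-2'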
{cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/launchers/launcher_pro.py
@@ -3,6 +3,7 @@ import threading

 from cobweb.db import RedisDB
 from cobweb.base import Seed, logger
+from cobweb.utils import BloomFilter
 from cobweb.constant import DealModel, LogTemplate
 from .launcher import Launcher, check_pause

@@ -15,11 +16,19 @@ class LauncherPro(Launcher):
         self._done_key = "{%s:%s}:done" % (project, task)
         self._fail_key = "{%s:%s}:fail" % (project, task)
         self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
-
+
         self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
         self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
+        self._speed_control_key = "speed_control:%s_%s" % (project, task)
+
+        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
+
+        self._bf_key = "bloom_%s_%s" % (project, task)
+
         self._db = RedisDB()

+        self._bf = BloomFilter(self._bf_key)
+
         self._heartbeat_start_event = threading.Event()
         self._redis_queue_empty_event = threading.Event()

@@ -33,6 +42,21 @@ class LauncherPro(Launcher):
         else:
             self._db._client.incrby(key, count)

+    def _get_seed(self) -> Seed:
+        spider_speed = self._db._client.get(self._speed_control_key)
+        if int(spider_speed or 0) > self._spider_max_speed:
+            expire_time = self._db.ttl(self._speed_control_key)
+            if expire_time == -1:
+                self._db.delete(self._speed_control_key)
+            else:
+                logger.info(f"Too fast! Please wait {expire_time} seconds...")
+                time.sleep(expire_time / 2)
+            return None
+        seed = self.__LAUNCHER_QUEUE__["todo"].pop()
+        if seed and not self._db.lock(self._speed_control_key, t=60):
+            self._db._client.incrby(self._speed_control_key, 1)
+        return seed
+
     @check_pause
     def _execute_heartbeat(self):
         if self._heartbeat_start_event.is_set():
@@ -110,7 +134,7 @@
         """
        Delete seeds from the queue, add them to the succeed or fail queue according to status, and remove the seed index from the doing dict
         """
-        …
+        seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}
         status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size

         for _ in range(self._done_queue_max_size):
@@ -118,26 +142,25 @@
             if not seed:
                 break
             if seed.params.seed_status == DealModel.fail:
-                …
+                seed_info["failed"].append(seed.to_string)
             elif self._done_model == 1:
-                …
+                seed_info["succeed"].append(seed.to_string)
             else:
-                …
-                self.…
-                …
-                self._remove_doing_seeds(f_seeds)
+                seed_info["common"].append(seed.to_string)
+            seed_info['count'] += 1
+
+        if seed_info["count"]:
+
+            succeed_count = self._db.zrem(self._todo_key, *seed_info["common"])
+            succeed_count += self._db.done([self._todo_key, self._done_key], *seed_info["succeed"])
+            failed_count = self._db.done([self._todo_key, self._fail_key], *seed_info["failed"])
+
+            if failed_count:
+                self.statistics(self._statistics_fail_key, failed_count)
+            if succeed_count:
+                self.statistics(self._statistics_done_key, succeed_count)
+
+            self._remove_doing_seeds(seed_info["common"] + seed_info["succeed"] + seed_info["failed"])

         if status:
             time.sleep(self._done_queue_wait_seconds)
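Note: the new _get_seed is a fixed-window rate limiter on top of Redis. The first pop of a window creates speed_control:{project}_{task} with a 60-second TTL (that appears to be what self._db.lock(..., t=60) does), every later pop INCRBYs it, and once the counter passes SPIDER_MAX_SPEED the getter sleeps for half the remaining TTL and returns None; a counter whose TTL was lost (ttl == -1) is deleted so the next window can start cleanly. A standalone sketch of the same idea in plain redis-py (assumes a reachable local Redis; cobweb reads setting.REDIS_CONFIG instead, and lock() is emulated here with SET NX EX):

    import redis

    r = redis.Redis()  # assumption: local Redis for the sketch

    def pop_allowed(key: str, max_per_minute: int) -> bool:
        if int(r.get(key) or 0) > max_per_minute:
            if r.ttl(key) == -1:   # counter somehow lost its TTL
                r.delete(key)      # drop it so a fresh window can begin
            return False           # over budget for this window
        if not r.set(key, 1, nx=True, ex=60):  # first hit opens a 60 s window
            r.incrby(key, 1)                   # later hits just count
        return True

Like any fixed-window limiter, this can admit bursts of up to roughly twice SPIDER_MAX_SPEED across a window boundary, which is presumably acceptable for crawl throttling.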
{cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/setting.py
@@ -57,9 +57,16 @@ DONE_MODEL = 0  # 0: on success the seed is removed from the queue directly; on failure it is added…
 # spider
 SPIDER_THREAD_NUM = 10
 SPIDER_MAX_RETRIES = 5
+SPIDER_TIME_SLEEP = 10
+SPIDER_MAX_SPEED = 1000  # maximum crawl count per minute

 # task mode
 TASK_MODEL = 0  # 0: one-off, 1: resident

+
+# bloom filter
+CAPACITY = 100000000
+ERROR_RATE = 0.001
+FILTER_FIELD = "url"
 # file download response content-type filter
 # FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
cobweb-launcher-1.2.14/cobweb/utils/bloom.py (new file)
@@ -0,0 +1,76 @@
+import math
+import time
+
+import mmh3
+import redis
+from cobweb import setting
+
+
+class BloomFilter:
+
+    def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
+        redis_config = redis_config or setting.REDIS_CONFIG
+        capacity = capacity or setting.CAPACITY
+        error_rate = error_rate or setting.ERROR_RATE
+        redis_config['db'] = 3
+
+        self.key = key
+
+        pool = redis.ConnectionPool(**redis_config)
+        self._client = redis.Redis(connection_pool=pool)
+        self.bit_size = self.get_bit_size(capacity, error_rate)
+        self.hash_count = self.get_hash_count(self.bit_size, capacity)
+        self._init_bloom_key()
+
+    def add(self, value):
+        for seed in range(self.hash_count):
+            result = mmh3.hash(value, seed) % self.bit_size
+            self._client.setbit(self.key, result, 1)
+        return True
+
+    def exists(self, value):
+        if not self._client.exists(self.key):
+            return False
+        for seed in range(self.hash_count):
+            result = mmh3.hash(value, seed) % self.bit_size
+            if not self._client.getbit(self.key, result):
+                return False
+        return True
+
+    def _init_bloom_key(self):
+        lua_script = """
+        redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
+        redis.call("EXPIRE", KEYS[1], 604800)
+        """
+        if self._client.exists(self.key):
+            return True
+        execute = self._client.register_script(lua_script)
+        execute(keys=[self.key], args=[self.bit_size-1, 1])
+
+    @classmethod
+    def get_bit_size(cls, n, p):
+        return int(-(n * math.log(p)) / (math.log(2) ** 2))
+
+    @classmethod
+    def get_hash_count(cls, m, n):
+        return int((m / n) * math.log(2))
+
+
+if __name__ == '__main__':
+    testBLF = BloomFilter("test", {
+        "host": "r-j6c1t3etiefpmz7cwdpd.redis.rds.aliyuncs.com",
+        "password": "SpiderLinux666",
+    })
+
+    print("start")
+    start_time = time.time()
+    testBLF.add("test")
+    add_time = time.time()
+    print("add time::: ")
+    print(add_time - start_time)
+    print("get::: ")
+    print(testBLF.exists("test"))
+    exist_time = time.time()
+    print("get time::: ")
+    print(exist_time - add_time)
+
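Note: get_bit_size and get_hash_count are the standard Bloom-filter sizing formulas m = -n·ln(p) / (ln 2)² and k = (m / n)·ln 2. With the new defaults (CAPACITY = 100000000, ERROR_RATE = 0.001) the numbers work out as follows:

    import math

    n, p = 100_000_000, 0.001
    m = int(-(n * math.log(p)) / (math.log(2) ** 2))  # bit_size
    k = int((m / n) * math.log(2))                    # hash_count

    print(m)              # ≈ 1.44e9 bits
    print(m / 8 / 2**20)  # ≈ 171 MiB of Redis bitmap
    print(k)              # 9 mmh3 rounds per add/exists

The _init_bloom_key Lua script presizes that bitmap by setting its last bit and puts a 7-day TTL (604800 s) on the key, so the whole filter silently expires and restarts a week after it is first created.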
{cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/SOURCES.txt
@@ -27,10 +27,12 @@ cobweb/pipelines/pipeline.py
 cobweb/pipelines/pipeline_console.py
 cobweb/pipelines/pipeline_loghub.py
 cobweb/utils/__init__.py
+cobweb/utils/bloom.py
 cobweb/utils/oss.py
 cobweb/utils/tools.py
 cobweb_launcher.egg-info/PKG-INFO
 cobweb_launcher.egg-info/SOURCES.txt
 cobweb_launcher.egg-info/dependency_links.txt
 cobweb_launcher.egg-info/requires.txt
-cobweb_launcher.egg-info/top_level.txt
+cobweb_launcher.egg-info/top_level.txt
+test/test.py
{cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:

 setup(
     name="cobweb-launcher",
-    version="1.2.11",
+    version="1.2.14",
     packages=find_packages(),
     url="https://github.com/Juannie-PP/cobweb",
     license="MIT",
@@ -14,7 +14,7 @@ setup(
     description="spider_hole",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    install_requires=["requests>=2.19.1", "oss2>=2.18.1", "redis>=4.4.4", "aliyun-log-python-sdk"],
+    install_requires=["requests>=2.19.1", "oss2>=2.18.1", "redis>=4.4.4", "aliyun-log-python-sdk", "mmh3"],
     classifiers=[
         "Programming Language :: Python :: 3",
     ],
cobweb-launcher-1.2.14/test/test.py (new file)
@@ -0,0 +1,38 @@
+import os
+
+
+os.environ["SPIDER_NUM"] = "1"
+os.environ["REDIS_HOST"] = "r-j6cc5zw8m3pqom4chmpd.redis.rds.aliyuncs.com"
+os.environ["REDIS_PASSWORD"] = "SpiderLinux666"
+os.environ["REDIS_PORT"] = "6379"
+os.environ["REDIS_DB"] = "0"
+os.environ["SCHEDULER_QUEUE_LENGTH"] = "100"
+os.environ["STORER_QUEUE_LENGTH"] = "10"
+os.environ["LOGSTORE"] = "download_meta"
+os.environ["LOGHUB_ACCESS_KEY"] = "LTAI5tH2FLYsBkxcbiLdoYZT"
+os.environ["LOGHUB_ACCESS_SECRET"] = "4oD1NVvYfaWiqxkmGx6c2xtpPWEq17"
+os.environ["BUCKET"] = "databee-video"
+os.environ["ENDPOINT"] = "http://oss-cn-hangzhou.aliyuncs.com"
+os.environ["OSS_ACCESS_KEY"] = "LTAI5tH2FLYsBkxcbiLdoYZT"
+os.environ["OSS_ACCESS_SECRET"] = "4oD1NVvYfaWiqxkmGx6c2xtpPWEq17"
+os.environ["CHUNK_SIZE"] = "1048576"
+os.environ["MIN_SIZE"] = "1024"
+
+
+from cobweb import LauncherPro
+
+app = LauncherPro("test", "test", SPIDER_THREAD_NUM=1)
+
+app.SEEDS = [
+    {"url": "https://www.baidu.com"},
+    {"url": "https://www.google.com"},
+    {"url": "https://www.bing.com"},
+    {"url": "https://www.baidu.com?v=1"},
+    {"url": "https://www.baidu.com?v=2"},
+    {"url": "https://www.baidu.com?v=3"},
+    {"url": "https://www.baidu.com?v=4"},
+    {"url": "https://www.baidu.com?v=5"}
+]
+
+if __name__ == '__main__':
+    app.start()