cobweb-launcher 1.2.10__py3-none-any.whl → 1.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cobweb-launcher might be problematic.
- cobweb/crawlers/crawler.py +4 -1
- cobweb/launchers/launcher.py +6 -2
- cobweb/launchers/launcher_pro.py +10 -3
- cobweb/pipelines/pipeline.py +2 -1
- cobweb/setting.py +6 -0
- cobweb/utils/__init__.py +1 -0
- cobweb/utils/bloom.py +98 -0
- {cobweb_launcher-1.2.10.dist-info → cobweb_launcher-1.2.13.dist-info}/METADATA +2 -1
- {cobweb_launcher-1.2.10.dist-info → cobweb_launcher-1.2.13.dist-info}/RECORD +12 -11
- {cobweb_launcher-1.2.10.dist-info → cobweb_launcher-1.2.13.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.2.10.dist-info → cobweb_launcher-1.2.13.dist-info}/WHEEL +0 -0
- {cobweb_launcher-1.2.10.dist-info → cobweb_launcher-1.2.13.dist-info}/top_level.txt +0 -0
cobweb/crawlers/crawler.py
CHANGED
@@ -26,7 +26,8 @@ class Crawler(threading.Thread):
            launcher_queue: Union[Mapping[str, Queue]],
            custom_func: Union[Mapping[str, Callable]],
            thread_num: int,
-            max_retries: int
+            max_retries: int,
+            time_sleep: int
    ):
        super().__init__()

@@ -43,6 +44,7 @@ class Crawler(threading.Thread):

        self.thread_num = thread_num
        self.max_retries = max_retries
+        self.time_sleep = time_sleep

    @staticmethod
    def request(seed: Seed) -> Union[Request, BaseItem]:
@@ -150,6 +152,7 @@ class Crawler(threading.Thread):
            ))
            seed.params.retry += 1
            self._todo.push(seed)
+            time.sleep(self.time_sleep * seed.params.retry)
        finally:
            time.sleep(0.1)
        logger.info("spider thread close")
cobweb/launchers/launcher.py
CHANGED
@@ -94,10 +94,13 @@ class Launcher(threading.Thread):

        self._spider_max_retries = setting.SPIDER_MAX_RETRIES
        self._spider_thread_num = setting.SPIDER_THREAD_NUM
+        self._spider_time_sleep = setting.SPIDER_TIME_SLEEP

        self._done_model = setting.DONE_MODEL
        self._task_model = setting.TASK_MODEL

+        self._filter_field = setting.FILTER_FIELD
+
    @property
    def request(self):
        """
@@ -151,7 +154,7 @@ class Launcher(threading.Thread):
    def _remove_doing_seeds(self, seeds):
        for seed in seeds:
            self.__DOING__.pop(seed, None)
-        logger.info("remove %s seeds from __DOING__" % len(seeds))
+        # logger.info("remove %s seeds from __DOING__" % len(seeds))

    def _execute(self):
        for func_name in self.__LAUNCHER_FUNC__:
@@ -168,7 +171,8 @@ class Launcher(threading.Thread):
            launcher_queue=self.__LAUNCHER_QUEUE__,
            custom_func=self.__CUSTOM_FUNC__,
            thread_num = self._spider_thread_num,
-            max_retries = self._spider_max_retries
+            max_retries = self._spider_max_retries,
+            time_sleep=self._spider_time_sleep
        ).start()

        self._Pipeline(
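The launcher now also stores setting.FILTER_FIELD, but this diff does not show where _filter_field is consumed; presumably it names the seed attribute ("url" by default) whose value feeds the new Bloom filter. A purely hypothetical sketch of that kind of lookup (the fingerprint helper and the Seed field access are illustrative, not taken from the package):

# Hypothetical illustration only: picking the dedupe value by configured field name.
def fingerprint(seed, filter_field: str = "url") -> str:
    # the real cobweb Seed API may expose this differently
    value = getattr(seed, filter_field, None)
    if value is None:
        raise ValueError("seed has no field %r" % filter_field)
    return str(value)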
cobweb/launchers/launcher_pro.py
CHANGED
@@ -3,6 +3,7 @@ import threading

from cobweb.db import RedisDB
from cobweb.base import Seed, logger
+from cobweb.utils import BloomFilter
from cobweb.constant import DealModel, LogTemplate
from .launcher import Launcher, check_pause

@@ -18,8 +19,11 @@ class LauncherPro(Launcher):
        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
        self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
        self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
+        self._bf_key = "bloom_%s_%s" % (project, task)
        self._db = RedisDB()

+        self._bf = BloomFilter(self._bf_key)
+
        self._heartbeat_start_event = threading.Event()
        self._redis_queue_empty_event = threading.Event()

@@ -125,15 +129,18 @@ class LauncherPro(Launcher):
            seeds.append(seed.to_string)
        if seeds:
            count = self._db.zrem(self._todo_key, *seeds)
-
+            if count:
+                self.statistics(self._statistics_done_key, count)
            self._remove_doing_seeds(seeds)
        if s_seeds:
            count = self._db.done([self._todo_key, self._done_key], *s_seeds)
-
+            if count:
+                self.statistics(self._statistics_done_key, count)
            self._remove_doing_seeds(s_seeds)
        if f_seeds:
            count = self._db.done([self._todo_key, self._fail_key], *f_seeds)
-
+            if count:
+                self.statistics(self._statistics_fail_key, count)
            self._remove_doing_seeds(f_seeds)

        if status:
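Besides guarding the done/fail counters behind a non-zero Redis count, LauncherPro now builds a per-project/task key ("bloom_<project>_<task>") and holds a BloomFilter instance, although the hunks above do not show where it is consulted. A hedged sketch of how such a filter is typically used before scheduling seeds (push_to_todo and the URL list are made up; only the BloomFilter class and key scheme come from this release):

# Illustrative only: consult a per-task Bloom filter before queueing seeds.
from cobweb.utils import BloomFilter

bf = BloomFilter("bloom_demo_project_demo_task")  # same scheme as self._bf_key

def schedule(urls: list, push_to_todo):
    seen_flags = bf.mexists(urls)                  # BF.MEXISTS -> list of 0/1
    fresh = [u for u, seen in zip(urls, seen_flags) if not seen]
    if fresh:
        bf.madd(fresh)                             # mark them as seen
        push_to_todo(fresh)                        # hand off to the todo queue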
cobweb/pipelines/pipeline.py
CHANGED
cobweb/setting.py
CHANGED
@@ -57,9 +57,15 @@ DONE_MODEL = 0  # 0:种子消费成功直接从队列移除,失败则添加
# spider
SPIDER_THREAD_NUM = 10
SPIDER_MAX_RETRIES = 5
+SPIDER_TIME_SLEEP = 10

# 任务模式
TASK_MODEL = 0  # 0:单次,1:常驻

+
+# bloom过滤器
+CAPACITY = 100000000
+ERROR_RATE = 0.001
+FILTER_FIELD = "url"
# 文件下载响应类型过滤
# FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
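For scale, the new defaults imply a fairly large filter if sized with the classic Bloom formulas (the same ones appearing in the commented-out implementation in cobweb/utils/bloom.py below); the RedisBloom-backed class added in this release lets the Redis server do the sizing, so this is only a rough sanity check:

import math

CAPACITY = 100_000_000   # defaults added in cobweb/setting.py
ERROR_RATE = 0.001

# m = -n * ln(p) / (ln 2)^2 bits, k = (m / n) * ln 2 hash functions
bit_size = int(-(CAPACITY * math.log(ERROR_RATE)) / (math.log(2) ** 2))
hash_count = int((bit_size / CAPACITY) * math.log(2))

print(bit_size)                 # ~1.44e9 bits
print(bit_size / 8 / 1024 ** 2) # ~171 MiB of bit array
print(hash_count)               # ~9 hash functions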
cobweb/utils/__init__.py
CHANGED
cobweb/utils/bloom.py
ADDED
@@ -0,0 +1,98 @@
+import time
+
+
+from redis import Redis
+from cobweb import setting
+
+# class BloomFilter:
+#
+#     def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
+#         redis_config = redis_config or setting.REDIS_CONFIG
+#         capacity = capacity or setting.CAPACITY
+#         error_rate = error_rate or setting.ERROR_RATE
+#         redis_config['db'] = 3
+#
+#         self.key = key
+#
+#         pool = redis.ConnectionPool(**redis_config)
+#         self.bit_size = self.get_bit_size(capacity, error_rate)
+#         self.hash_count = self.get_hash_count(self.bit_size, capacity)
+#         self._init_bloom_key()
+#
+#     def add(self, value):
+#         for seed in range(self.hash_count):
+#             result = mmh3.hash(value, seed) % self.bit_size
+#             self._client.setbit(self.key, result, 1)
+#         return True
+#
+#     def exists(self, value):
+#         if not self._client.exists(self.key):
+#             return False
+#         for seed in range(self.hash_count):
+#             result = mmh3.hash(value, seed) % self.bit_size
+#             if not self._client.getbit(self.key, result):
+#                 return False
+#         return True
+#
+#     def _init_bloom_key(self):
+#         lua_script = """
+#             redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
+#             redis.call("EXPIRE", KEYS[1], 604800)
+#         """
+#         if self._client.exists(self.key):
+#             return True
+#         execute = self._client.register_script(lua_script)
+#         execute(keys=[self.key], args=[self.bit_size-1, 1])
+#
+#     @classmethod
+#     def get_bit_size(cls, n, p):
+#         return int(-(n * math.log(p)) / (math.log(2) ** 2))
+#
+#     @classmethod
+#     def get_hash_count(cls, m, n):
+#         return int((m / n) * math.log(2))
+
+class BloomFilter:
+
+    def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
+        redis_config = redis_config or setting.REDIS_CONFIG
+        capacity = capacity or setting.CAPACITY
+        error_rate = error_rate or setting.ERROR_RATE
+        redis_config['db'] = 3
+
+        self.key = key
+
+        self._client = Redis(**redis_config).bf()
+        self._client.create(key=self.key, capacity=capacity, errorRate=error_rate)
+
+    def add(self, value):
+        return self._client.add(self.key, value)
+
+    def madd(self, items: list):
+        return self._client.madd(self.key, *items)
+
+    def exists(self, value):
+        return self._client.exists(self.key, value)
+
+    def mexists(self, items: list):
+        return self._client.mexists(self.key, *items)
+
+
+if __name__ == '__main__':
+    testBLF = BloomFilter("test", {
+        "host": "r-j6c1t3etiefpmz7cwdpd.redis.rds.aliyuncs.com",
+        "password": "SpiderLinux666",
+    })
+
+    print("start")
+    start_time = time.time()
+    testBLF.add("test")
+    add_time = time.time()
+    print("add time::: ")
+    print(add_time - start_time)
+    print("get::: ")
+    print(testBLF.exists("test"))
+    exist_time = time.time()
+    print("get time::: ")
+    print(exist_time - add_time)
+
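The new class delegates to the Redis Bloom filter module through redis-py's bf() command group (BF.RESERVE, BF.ADD, BF.EXISTS and their multi variants), so the server must have RedisBloom loaded; it also hard-codes db 3 and calls create() unconditionally in __init__, even though BF.RESERVE normally errors if the key already exists. A minimal usage sketch against a local Redis Stack instance (the host, port and try/except guard are assumptions, not part of the package):

# Minimal sketch, assuming a local Redis Stack (RedisBloom loaded) on localhost:6379.
from redis import Redis
from redis.exceptions import ResponseError

bf = Redis(host="localhost", port=6379, db=3).bf()
try:
    # BF.RESERVE fails if the key already exists, hence the guard;
    # BloomFilter.__init__ in bloom.py calls create() without one.
    bf.create("bloom_demo", errorRate=0.001, capacity=100_000_000)
except ResponseError:
    pass

bf.add("bloom_demo", "https://example.com/page/1")
print(bf.exists("bloom_demo", "https://example.com/page/1"))   # 1
print(bf.mexists("bloom_demo", "https://example.com/page/1",
                 "https://example.com/page/2"))                # [1, 0]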
{cobweb_launcher-1.2.10.dist-info → cobweb_launcher-1.2.13.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: cobweb-launcher
-Version: 1.2.10
+Version: 1.2.13
Summary: spider_hole
Home-page: https://github.com/Juannie-PP/cobweb
Author: Juannie-PP
@@ -16,6 +16,7 @@ Requires-Dist: requests (>=2.19.1)
Requires-Dist: oss2 (>=2.18.1)
Requires-Dist: redis (>=4.4.4)
Requires-Dist: aliyun-log-python-sdk
+Requires-Dist: mmh3

# cobweb
cobweb是一个基于python的分布式爬虫调度框架,目前支持分布式爬虫,单机爬虫,支持自定义数据库,支持自定义数据存储,支持自定义数据处理等操作。
{cobweb_launcher-1.2.10.dist-info → cobweb_launcher-1.2.13.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
cobweb/__init__.py,sha256=uMHyf4Fekbyw2xBCbkA8R0LwCpBJf5p_7pWbh60ZWYk,83
cobweb/constant.py,sha256=zy3XYsc1qp2B76_Fn_hVQ8eGHlPBd3OFlZK2cryE6FY,2839
-cobweb/setting.py,sha256=
+cobweb/setting.py,sha256=S33EYK5hrrXn8GFkOmwt9Qn7OdNv8E4h8a0ZyeNydCM,2092
cobweb/base/__init__.py,sha256=4gwWWQ0Q8cYG9cD7Lwf4XMqRGc5M_mapS3IczR6zeCE,222
cobweb/base/common_queue.py,sha256=W7PPZZFl52j3Mc916T0imHj7oAUelA6aKJwW-FecDPE,872
cobweb/base/decorators.py,sha256=wDCaQ94aAZGxks9Ljc0aXq6omDXT1_yzFy83ZW6VbVI,930
@@ -11,27 +11,28 @@ cobweb/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406
cobweb/base/seed.py,sha256=Uz_VBRlAxNYQcFHk3tsZFMlU96yPOedHaWGTvk-zKd8,2908
cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
cobweb/crawlers/base_crawler.py,sha256=ee_WSDnPQpPTk6wlFuY2UEx5L3hcsAZFcr6i3GLSry8,5751
-cobweb/crawlers/crawler.py,sha256=
+cobweb/crawlers/crawler.py,sha256=5mex-ENuzZSME0EIdwS9fnWkAX6LQuEyoDNcFm0emqs,6132
cobweb/crawlers/file_crawler.py,sha256=2Sjbdgxzqd41WykKUQE3QQlGai3T8k-pmHNmPlTchjQ,4454
cobweb/db/__init__.py,sha256=ut0iEyBLjcJL06WNG_5_d4hO5PJWvDrKWMkDOdmgh2M,30
cobweb/db/redis_db.py,sha256=NNI2QkRV1hEZI-z-COEncXt88z3pZN6wusKlcQzc8V4,4304
cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
cobweb/launchers/__init__.py,sha256=af0Y6wrGX8SQZ7w7XL2sOtREjCT3dwad-uCc3nIontY,76
-cobweb/launchers/launcher.py,sha256=
+cobweb/launchers/launcher.py,sha256=sH9bj5TIPX5RjpNFvHmqTtlhyBxQqbTDhuvYSd2LspI,6114
cobweb/launchers/launcher_air.py,sha256=KAk_M8F3029cXYe7m4nn3Nzyi89lbxJ2cqZjqW8iZ0E,2832
-cobweb/launchers/launcher_pro.py,sha256=
+cobweb/launchers/launcher_pro.py,sha256=4hi8xviIJr8HqTERShfdZjp767KXBeIyqyHaW9rYOhE,7815
cobweb/pipelines/__init__.py,sha256=zSUsGtx6smbs2iXBXvYynReKSgky-3gjqaAtKVnA_OU,105
cobweb/pipelines/base_pipeline.py,sha256=fYnWf79GmhufXpcnMa3te18SbmnVeYLwxfyo-zLd9CY,1577
cobweb/pipelines/loghub_pipeline.py,sha256=cjPO6w6UJ0jNw2fVvdX0BCdlm58T7dmYXlxzXOBpvfY,1027
-cobweb/pipelines/pipeline.py,sha256=
+cobweb/pipelines/pipeline.py,sha256=4TJLX0sUHRxYndF5A4Vs5btUGI-wigkOcFvhTW1hLXI,2009
cobweb/pipelines/pipeline_console.py,sha256=NEh-4zhuVAQOqwXLsqeb-rcNZ9_KXFUpL3otUTL5qBs,754
cobweb/pipelines/pipeline_loghub.py,sha256=xZ6D55BGdiM71WUv83jyLGbEyUwhBHLJRZoXthBxxTs,1019
-cobweb/utils/__init__.py,sha256=
+cobweb/utils/__init__.py,sha256=vBtZTy3EfRE0MmH43URhmr7nw6_oOWTEbGOM9xR_9o8,78
+cobweb/utils/bloom.py,sha256=G0PlaMVTz6KCmhcNToi28bAHj1YJjDVqwJAQ_DUBWGk,3030
cobweb/utils/oss.py,sha256=gyt8-UB07tVphZLQXMOf-JTJwU-mWq8KZkOXKkAf3uk,3513
cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
-cobweb_launcher-1.2.
-cobweb_launcher-1.2.
-cobweb_launcher-1.2.
-cobweb_launcher-1.2.
-cobweb_launcher-1.2.
+cobweb_launcher-1.2.13.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
+cobweb_launcher-1.2.13.dist-info/METADATA,sha256=-HRZ-uipPBajhDyuDF2TTtQQAectd1gsjk3KwvWtHpo,6510
+cobweb_launcher-1.2.13.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
+cobweb_launcher-1.2.13.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
+cobweb_launcher-1.2.13.dist-info/RECORD,,
{cobweb_launcher-1.2.10.dist-info → cobweb_launcher-1.2.13.dist-info}/LICENSE
File without changes

{cobweb_launcher-1.2.10.dist-info → cobweb_launcher-1.2.13.dist-info}/WHEEL
File without changes

{cobweb_launcher-1.2.10.dist-info → cobweb_launcher-1.2.13.dist-info}/top_level.txt
File without changes