cobweb-launcher 1.2.13__tar.gz → 1.2.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of cobweb-launcher may be problematic: the new test/test.py and the demo block in cobweb/utils/bloom.py appear to ship hardcoded Redis passwords and Aliyun access keys (visible in the diffs below).
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/PKG-INFO +1 -1
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/crawlers/crawler.py +22 -15
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/db/redis_db.py +1 -1
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/launchers/launcher.py +24 -1
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/launchers/launcher_pro.py +39 -20
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/setting.py +1 -0
- cobweb-launcher-1.2.14/cobweb/utils/bloom.py +76 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/PKG-INFO +1 -1
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/SOURCES.txt +2 -1
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/setup.py +1 -1
- cobweb-launcher-1.2.14/test/test.py +38 -0
- cobweb-launcher-1.2.13/cobweb/utils/bloom.py +0 -98
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/LICENSE +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/README.md +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/__init__.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/base/__init__.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/base/common_queue.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/base/decorators.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/base/item.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/base/log.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/base/request.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/base/response.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/base/seed.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/constant.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/crawlers/__init__.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/exceptions/__init__.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/exceptions/oss_db_exception.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/launchers/__init__.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/launchers/launcher_air.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/pipelines/__init__.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/pipelines/pipeline.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/pipelines/pipeline_console.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/pipelines/pipeline_loghub.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/utils/__init__.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/utils/oss.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb/utils/tools.py +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-1.2.13 → cobweb-launcher-1.2.14}/setup.cfg +0 -0
cobweb/crawlers/crawler.py CHANGED

@@ -23,28 +23,34 @@ class Crawler(threading.Thread):
             self,
             stop: threading.Event,
             pause: threading.Event,
-            launcher_queue: Union[Mapping[str, Queue]],
+            # launcher_queue: Union[Mapping[str, Queue]],
+            get_seed: Callable,
+            set_seed: Callable,
+            add_seed: Callable,
+            delete_seed: Callable,
+            upload_data: Callable,
             custom_func: Union[Mapping[str, Callable]],
             thread_num: int,
             max_retries: int,
-            time_sleep: int
+            time_sleep: int,
     ):
         super().__init__()
 
         self._stop = stop
         self._pause = pause
-        self.
-        self.
-        self.
-        self.
+        self._get_seed = get_seed
+        self._set_seed = set_seed
+        self._add_seed = add_seed
+        self._delete_seed = delete_seed
+        self._upload_data = upload_data
 
         for func_name, _callable in custom_func.items():
             if isinstance(_callable, Callable):
                 self.__setattr__(func_name, _callable)
 
         self.thread_num = thread_num
-        self.max_retries = max_retries
         self.time_sleep = time_sleep
+        self.max_retries = max_retries
 
     @staticmethod
     def request(seed: Seed) -> Union[Request, BaseItem]:
@@ -66,23 +72,23 @@ class Crawler(threading.Thread):
 
     def distribute(self, item, seed):
         if isinstance(item, BaseItem):
-            self.
+            self._upload_data(item)
         elif isinstance(item, Seed):
-            self.
+            self._add_seed(item)
         elif isinstance(item, str) and item == DealModel.poll:
-            self.
+            self._set_seed(seed)
         elif isinstance(item, str) and item == DealModel.done:
-            self.
+            self._delete_seed(seed)
         elif isinstance(item, str) and item == DealModel.fail:
             seed.params.seed_status = DealModel.fail
-            self.
+            self._delete_seed(seed)
         else:
             raise TypeError("yield value type error!")
 
     def spider(self):
         while not self._stop.is_set():
 
-            seed = self.
+            seed = self._get_seed()
 
             if not seed:
                 time.sleep(1)
@@ -90,7 +96,7 @@ class Crawler(threading.Thread):
 
             elif seed.params.retry > self.max_retries:
                 seed.params.seed_status = DealModel.fail
-                self.
+                self._delete_seed(seed)
                 continue
 
             seed_detail_log_info = LogTemplate.log_info(seed.to_dict)
@@ -151,7 +157,8 @@ class Crawler(threading.Thread):
                         exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
                     ))
                     seed.params.retry += 1
-                    self._todo.push(seed)
+                    # self._todo.push(seed)
+                    self._set_seed(seed)
                     time.sleep(self.time_sleep * seed.params.retry)
                 finally:
                     time.sleep(0.1)
cobweb/launchers/launcher.py CHANGED

@@ -17,6 +17,8 @@ def check_pause(func):
             func(self, *args, **kwargs)
         except Exception as e:
             logger.info(f"{func.__name__}: " + str(e))
+        finally:
+            time.sleep(0.1)
 
     return wrapper
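The added finally clause means every pass through a check_pause-wrapped worker now yields for 0.1 s, whether it returned normally or raised. A sketch of the decorator shape these lines imply (the surrounding pause loop is an assumption based on how the launcher threads appear to use it):

import time
from functools import wraps


def check_pause(func):
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        # Assumed loop: keep running the worker while the launcher
        # is not paused; details may differ in the real module.
        while not self._pause.is_set():
            try:
                func(self, *args, **kwargs)
            except Exception as e:
                print(f"{func.__name__}: {e}")  # stand-in for logger.info
            finally:
                time.sleep(0.1)  # new in 1.2.14: always back off briefly
    return wrapper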
@@ -95,6 +97,7 @@ class Launcher(threading.Thread):
         self._spider_max_retries = setting.SPIDER_MAX_RETRIES
         self._spider_thread_num = setting.SPIDER_THREAD_NUM
         self._spider_time_sleep = setting.SPIDER_TIME_SLEEP
+        self._spider_max_speed = setting.SPIDER_MAX_SPEED
 
         self._done_model = setting.DONE_MODEL
         self._task_model = setting.TASK_MODEL
@@ -156,6 +159,21 @@ class Launcher(threading.Thread):
             self.__DOING__.pop(seed, None)
         # logger.info("remove %s seeds from __DOING__" % len(seeds))
 
+    def _get_seed(self) -> Seed:
+        return self.__LAUNCHER_QUEUE__["todo"].pop()
+
+    def _set_seed(self, seed, **kwargs):
+        self.__LAUNCHER_QUEUE__["todo"].push(seed, **kwargs)
+
+    def _upload_data(self, data, **kwargs):
+        self.__LAUNCHER_QUEUE__["upload"].push(data, **kwargs)
+
+    def _add_seed(self, seed, **kwargs):
+        self.__LAUNCHER_QUEUE__["new"].push(seed, **kwargs)
+
+    def _delete_seed(self, seed, **kwargs):
+        self.__LAUNCHER_QUEUE__["done"].push(seed, **kwargs)
+
     def _execute(self):
         for func_name in self.__LAUNCHER_FUNC__:
             threading.Thread(name=func_name, target=getattr(self, func_name)).start()
@@ -168,7 +186,12 @@ class Launcher(threading.Thread):
 
         self._Crawler(
             stop=self._stop, pause=self._pause,
-            launcher_queue=self.__LAUNCHER_QUEUE__,
+            # launcher_queue=self.__LAUNCHER_QUEUE__,
+            get_seed=self._get_seed,
+            set_seed=self._set_seed,
+            add_seed=self._add_seed,
+            delete_seed=self._delete_seed,
+            upload_data=self._upload_data,
             custom_func=self.__CUSTOM_FUNC__,
             thread_num = self._spider_thread_num,
             max_retries = self._spider_max_retries,
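Taken together, the crawler.py and launcher.py hunks above replace the shared launcher_queue mapping with five injected callables, so Crawler no longer needs to know which queue backs each operation. A minimal sketch of the pattern, with plain queue.Queue objects standing in for cobweb's internal queues (the hook names mirror the diff; everything else is illustrative):

from queue import Queue
from typing import Callable


class MiniCrawler:
    # Toy stand-in for cobweb's Crawler: it only ever sees callables,
    # never the launcher's queue objects themselves.
    def __init__(self, get_seed: Callable, set_seed: Callable,
                 delete_seed: Callable, upload_data: Callable):
        self._get_seed = get_seed        # pop the next seed to crawl
        self._set_seed = set_seed        # push a seed back for retry
        self._delete_seed = delete_seed  # mark a seed finished
        self._upload_data = upload_data  # hand results to the pipeline

    def step(self):
        seed = self._get_seed()
        if seed is None:
            return
        try:
            self._upload_data({"seed": seed, "status": "ok"})
            self._delete_seed(seed)
        except Exception:
            self._set_seed(seed)  # requeue on failure


# Launcher side: the queues stay private, only bound methods cross over.
todo, done, upload = Queue(), Queue(), Queue()
crawler = MiniCrawler(
    get_seed=lambda: None if todo.empty() else todo.get_nowait(),
    set_seed=todo.put,
    delete_seed=done.put,
    upload_data=upload.put,
)
todo.put("https://www.baidu.com")
crawler.step()
print(done.get_nowait(), upload.get_nowait())

This indirection is also what lets LauncherPro override _get_seed alone to add speed control, as the launcher_pro.py hunks below show.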
cobweb/launchers/launcher_pro.py CHANGED

@@ -16,10 +16,15 @@ class LauncherPro(Launcher):
         self._done_key = "{%s:%s}:done" % (project, task)
         self._fail_key = "{%s:%s}:fail" % (project, task)
         self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
-
+
         self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
         self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
+        self._speed_control_key = "speed_control:%s_%s" % (project, task)
+
+        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
+
         self._bf_key = "bloom_%s_%s" % (project, task)
+
         self._db = RedisDB()
 
         self._bf = BloomFilter(self._bf_key)
@@ -37,6 +42,21 @@ class LauncherPro(Launcher):
         else:
             self._db._client.incrby(key, count)
 
+    def _get_seed(self) -> Seed:
+        spider_speed = self._db._client.get(self._speed_control_key)
+        if int(spider_speed or 0) > self._spider_max_speed:
+            expire_time = self._db.ttl(self._speed_control_key)
+            if expire_time == -1:
+                self._db.delete(self._speed_control_key)
+            else:
+                logger.info(f"Too fast! Please wait {expire_time} seconds...")
+                time.sleep(expire_time / 2)
+                return None
+        seed = self.__LAUNCHER_QUEUE__["todo"].pop()
+        if seed and not self._db.lock(self._speed_control_key, t=60):
+            self._db._client.incrby(self._speed_control_key, 1)
+        return seed
+
     @check_pause
     def _execute_heartbeat(self):
         if self._heartbeat_start_event.is_set():
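The new _get_seed override rate-limits seed consumption with a plain Redis counter: RedisDB.lock() (not shown in this diff, presumably a SET NX EX wrapper) opens a 60-second window, INCRBY counts seeds taken inside it, and once the count exceeds SPIDER_MAX_SPEED the workers sleep out part of the remaining TTL. The same pattern, sketched standalone with redis-py (the key name and window length mirror the diff; the rest is illustrative):

import time

import redis

r = redis.Redis()
SPEED_KEY = "speed_control:demo_project_demo_task"  # illustrative key
MAX_SPEED = 100  # max seeds per 60-second window


def take_seed(pop_seed):
    count = int(r.get(SPEED_KEY) or 0)
    if count > MAX_SPEED:
        ttl = r.ttl(SPEED_KEY)
        if ttl == -1:  # counter somehow lost its expiry: reset it
            r.delete(SPEED_KEY)
        else:
            time.sleep(ttl / 2)  # back off until the window rolls over
            return None
    seed = pop_seed()
    if seed is not None:
        # SET NX EX opens a fresh window; if one is open, bump the counter.
        if not r.set(SPEED_KEY, 1, nx=True, ex=60):
            r.incrby(SPEED_KEY, 1)
    return seed

Because the count is checked before the pop and nothing is atomic across threads, a burst of workers can still overshoot the limit slightly; the cap is a soft one.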
@@ -114,7 +134,7 @@ class LauncherPro(Launcher):
         """
         Remove seeds from the queue, add them to the succeed or fail queue according to their status, and drop their entries from the doing-dict index.
         """
-
+        seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}
         status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
 
         for _ in range(self._done_queue_max_size):
@@ -122,26 +142,25 @@ class LauncherPro(Launcher):
             if not seed:
                 break
             if seed.params.seed_status == DealModel.fail:
-
+                seed_info["failed"].append(seed.to_string)
             elif self._done_model == 1:
-
+                seed_info["succeed"].append(seed.to_string)
             else:
-
-
-
-
-
-                self.
-
-
-
-
-
-
-
-                self._remove_doing_seeds(f_seeds)
+                seed_info["common"].append(seed.to_string)
+            seed_info['count'] += 1
+
+        if seed_info["count"]:
+
+            succeed_count = self._db.zrem(self._todo_key, *seed_info["common"])
+            succeed_count += self._db.done([self._todo_key, self._done_key], *seed_info["succeed"])
+            failed_count = self._db.done([self._todo_key, self._fail_key], *seed_info["failed"])
+
+            if failed_count:
+                self.statistics(self._statistics_fail_key, failed_count)
+            if succeed_count:
+                self.statistics(self._statistics_done_key, succeed_count)
+
+            self._remove_doing_seeds(seed_info["common"] + seed_info["succeed"] + seed_info["failed"])
 
         if status:
             time.sleep(self._done_queue_wait_seconds)
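The rewritten accounting batches seeds into failed/succeed/common buckets and settles them against Redis in one pass. RedisDB.done() is not shown in this diff; a plausible reading, sketched here with redis-py primitives, is "remove from the source zset, record in the destination set", whose removal count then feeds the statistics keys:

import redis

r = redis.Redis()


def done(keys, *seeds):
    # Hedged sketch of a RedisDB.done([src_zset, dst_set], *seeds) helper;
    # the real cobweb implementation may differ.
    src_zset, dst_set = keys
    if not seeds:
        return 0
    pipe = r.pipeline()
    pipe.zrem(src_zset, *seeds)  # drop from the pending sorted set
    pipe.sadd(dst_set, *seeds)   # record the final status
    removed, _ = pipe.execute()
    return removed               # feeds statistics:<project>:<task>:done/fail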
cobweb-launcher-1.2.14/cobweb/utils/bloom.py ADDED

@@ -0,0 +1,76 @@
+import math
+import time
+
+import mmh3
+import redis
+from cobweb import setting
+
+
+class BloomFilter:
+
+    def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
+        redis_config = redis_config or setting.REDIS_CONFIG
+        capacity = capacity or setting.CAPACITY
+        error_rate = error_rate or setting.ERROR_RATE
+        redis_config['db'] = 3
+
+        self.key = key
+
+        pool = redis.ConnectionPool(**redis_config)
+        self._client = redis.Redis(connection_pool=pool)
+        self.bit_size = self.get_bit_size(capacity, error_rate)
+        self.hash_count = self.get_hash_count(self.bit_size, capacity)
+        self._init_bloom_key()
+
+    def add(self, value):
+        for seed in range(self.hash_count):
+            result = mmh3.hash(value, seed) % self.bit_size
+            self._client.setbit(self.key, result, 1)
+        return True
+
+    def exists(self, value):
+        if not self._client.exists(self.key):
+            return False
+        for seed in range(self.hash_count):
+            result = mmh3.hash(value, seed) % self.bit_size
+            if not self._client.getbit(self.key, result):
+                return False
+        return True
+
+    def _init_bloom_key(self):
+        lua_script = """
+        redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
+        redis.call("EXPIRE", KEYS[1], 604800)
+        """
+        if self._client.exists(self.key):
+            return True
+        execute = self._client.register_script(lua_script)
+        execute(keys=[self.key], args=[self.bit_size-1, 1])
+
+    @classmethod
+    def get_bit_size(cls, n, p):
+        return int(-(n * math.log(p)) / (math.log(2) ** 2))
+
+    @classmethod
+    def get_hash_count(cls, m, n):
+        return int((m / n) * math.log(2))
+
+
+if __name__ == '__main__':
+    testBLF = BloomFilter("test", {
+        "host": "r-j6c1t3etiefpmz7cwdpd.redis.rds.aliyuncs.com",
+        "password": "SpiderLinux666",
+    })
+
+    print("start")
+    start_time = time.time()
+    testBLF.add("test")
+    add_time = time.time()
+    print("add time::: ")
+    print(add_time - start_time)
+    print("get::: ")
+    print(testBLF.exists("test"))
+    exist_time = time.time()
+    print("get time::: ")
+    print(exist_time - add_time)
+
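get_bit_size and get_hash_count are the standard Bloom-filter sizing formulas, m = -n * ln(p) / ln(2)^2 bits and k = (m / n) * ln(2) hash rounds. A quick worked check of what they imply (the capacity and error rate below are illustrative, not cobweb's actual setting.CAPACITY / setting.ERROR_RATE defaults):

import math


def get_bit_size(n, p):
    # m = -n * ln(p) / ln(2)^2
    return int(-(n * math.log(p)) / (math.log(2) ** 2))


def get_hash_count(m, n):
    # k = (m / n) * ln(2)
    return int((m / n) * math.log(2))


n, p = 1_000_000, 0.001      # one million seeds, 0.1% false-positive rate
m = get_bit_size(n, p)       # ~14.4 million bits, ~1.8 MB of Redis bitmap
k = get_hash_count(m, n)     # 9 mmh3 rounds per add()/exists()
print(m, k)

Note that _init_bloom_key sets the highest bit through a Lua script so Redis allocates the full bitmap up front, and attaches a 604800-second (7-day) expiry, after which the filter's dedup history silently starts over.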
cobweb-launcher-1.2.14/test/test.py ADDED

@@ -0,0 +1,38 @@
+import os
+
+
+os.environ["SPIDER_NUM"] = "1"
+os.environ["REDIS_HOST"] = "r-j6cc5zw8m3pqom4chmpd.redis.rds.aliyuncs.com"
+os.environ["REDIS_PASSWORD"] = "SpiderLinux666"
+os.environ["REDIS_PORT"] = "6379"
+os.environ["REDIS_DB"] = "0"
+os.environ["SCHEDULER_QUEUE_LENGTH"] = "100"
+os.environ["STORER_QUEUE_LENGTH"] = "10"
+os.environ["LOGSTORE"] = "download_meta"
+os.environ["LOGHUB_ACCESS_KEY"] = "LTAI5tH2FLYsBkxcbiLdoYZT"
+os.environ["LOGHUB_ACCESS_SECRET"] = "4oD1NVvYfaWiqxkmGx6c2xtpPWEq17"
+os.environ["BUCKET"] = "databee-video"
+os.environ["ENDPOINT"] = "http://oss-cn-hangzhou.aliyuncs.com"
+os.environ["OSS_ACCESS_KEY"] = "LTAI5tH2FLYsBkxcbiLdoYZT"
+os.environ["OSS_ACCESS_SECRET"] = "4oD1NVvYfaWiqxkmGx6c2xtpPWEq17"
+os.environ["CHUNK_SIZE"] = "1048576"
+os.environ["MIN_SIZE"] = "1024"
+
+
+from cobweb import LauncherPro
+
+app = LauncherPro("test", "test", SPIDER_THREAD_NUM=1)
+
+app.SEEDS = [
+    {"url": "https://www.baidu.com"},
+    {"url": "https://www.google.com"},
+    {"url": "https://www.bing.com"},
+    {"url": "https://www.baidu.com?v=1"},
+    {"url": "https://www.baidu.com?v=2"},
+    {"url": "https://www.baidu.com?v=3"},
+    {"url": "https://www.baidu.com?v=4"},
+    {"url": "https://www.baidu.com?v=5"}
+]
+
+if __name__ == '__main__':
+    app.start()
cobweb-launcher-1.2.13/cobweb/utils/bloom.py DELETED

@@ -1,98 +0,0 @@
-import time
-
-
-from redis import Redis
-from cobweb import setting
-
-# class BloomFilter:
-#
-#     def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
-#         redis_config = redis_config or setting.REDIS_CONFIG
-#         capacity = capacity or setting.CAPACITY
-#         error_rate = error_rate or setting.ERROR_RATE
-#         redis_config['db'] = 3
-#
-#         self.key = key
-#
-#         pool = redis.ConnectionPool(**redis_config)
-#         self.bit_size = self.get_bit_size(capacity, error_rate)
-#         self.hash_count = self.get_hash_count(self.bit_size, capacity)
-#         self._init_bloom_key()
-#
-#     def add(self, value):
-#         for seed in range(self.hash_count):
-#             result = mmh3.hash(value, seed) % self.bit_size
-#             self._client.setbit(self.key, result, 1)
-#         return True
-#
-#     def exists(self, value):
-#         if not self._client.exists(self.key):
-#             return False
-#         for seed in range(self.hash_count):
-#             result = mmh3.hash(value, seed) % self.bit_size
-#             if not self._client.getbit(self.key, result):
-#                 return False
-#         return True
-#
-#     def _init_bloom_key(self):
-#         lua_script = """
-#         redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
-#         redis.call("EXPIRE", KEYS[1], 604800)
-#         """
-#         if self._client.exists(self.key):
-#             return True
-#         execute = self._client.register_script(lua_script)
-#         execute(keys=[self.key], args=[self.bit_size-1, 1])
-#
-#     @classmethod
-#     def get_bit_size(cls, n, p):
-#         return int(-(n * math.log(p)) / (math.log(2) ** 2))
-#
-#     @classmethod
-#     def get_hash_count(cls, m, n):
-#         return int((m / n) * math.log(2))
-
-class BloomFilter:
-
-    def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
-        redis_config = redis_config or setting.REDIS_CONFIG
-        capacity = capacity or setting.CAPACITY
-        error_rate = error_rate or setting.ERROR_RATE
-        redis_config['db'] = 3
-
-        self.key = key
-
-        self._client = Redis(**redis_config).bf()
-        self._client.create(key=self.key, capacity=capacity, errorRate=error_rate)
-
-    def add(self, value):
-        return self._client.add(self.key, value)
-
-    def madd(self, items: list):
-        return self._client.madd(self.key, *items)
-
-    def exists(self, value):
-        return self._client.exists(self.key, value)
-
-    def mexists(self, items: list):
-        return self._client.mexists(self.key, *items)
-
-
-if __name__ == '__main__':
-    testBLF = BloomFilter("test", {
-        "host": "r-j6c1t3etiefpmz7cwdpd.redis.rds.aliyuncs.com",
-        "password": "SpiderLinux666",
-    })
-
-    print("start")
-    start_time = time.time()
-    testBLF.add("test")
-    add_time = time.time()
-    print("add time::: ")
-    print(add_time - start_time)
-    print("get::: ")
-    print(testBLF.exists("test"))
-    exist_time = time.time()
-    print("get time::: ")
-    print(exist_time - add_time)
-
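The deleted 1.2.13 module is the counterpart of the new cobweb/utils/bloom.py above: it delegated to the server-side RedisBloom module through redis-py's bf() command interface (BF.RESERVE via create(), BF.ADD/BF.EXISTS, plus the madd/mexists batch variants), which requires RedisBloom to be loaded on the Redis server. The 1.2.14 rewrite gives up the batch helpers but runs against any stock Redis, using only SETBIT/GETBIT over mmh3 hashes; the commented-out block at the top of the old file appears to be an earlier draft of that replacement.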