cobweb-launcher 3.1.11__tar.gz → 3.1.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/PKG-INFO +1 -1
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/base/__init__.py +2 -4
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/base/item.py +16 -2
- cobweb-launcher-3.1.11/cobweb/base/log.py → cobweb-launcher-3.1.12/cobweb/base/logger.py +3 -3
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/base/request.py +1 -1
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/base/response.py +1 -0
- cobweb-launcher-3.1.12/cobweb/crawlers/crawler.py +27 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/db/redis_db.py +16 -13
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/launchers/distributor.py +15 -10
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/launchers/launcher.py +16 -16
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/launchers/uploader.py +15 -7
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/pipelines/__init__.py +1 -1
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/pipelines/pipeline.py +4 -0
- cobweb-launcher-3.1.12/cobweb/pipelines/pipeline_csv.py +25 -0
- cobweb-launcher-3.1.12/cobweb/pipelines/pipeline_loghub.py +54 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/schedulers/scheduler.py +4 -3
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/schedulers/scheduler_with_redis.py +13 -17
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/setting.py +20 -41
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/utils/__init__.py +2 -2
- cobweb-launcher-3.1.12/cobweb/utils/bloom.py +58 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/utils/dotting.py +6 -4
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/utils/tools.py +3 -15
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb_launcher.egg-info/PKG-INFO +1 -1
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb_launcher.egg-info/SOURCES.txt +2 -2
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb_launcher.egg-info/requires.txt +0 -2
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/setup.py +2 -2
- cobweb-launcher-3.1.11/cobweb/crawlers/crawler.py +0 -28
- cobweb-launcher-3.1.11/cobweb/pipelines/pipeline_console.py +0 -22
- cobweb-launcher-3.1.11/cobweb/pipelines/pipeline_loghub.py +0 -34
- cobweb-launcher-3.1.11/cobweb/utils/bloom.py +0 -58
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/LICENSE +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/README.md +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/__init__.py +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/base/common_queue.py +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/base/seed.py +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/constant.py +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/crawlers/__init__.py +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/db/api_db.py +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/exceptions/__init__.py +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/exceptions/oss_db_exception.py +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/launchers/__init__.py +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/schedulers/__init__.py +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/utils/decorators.py +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb/utils/oss.py +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-3.1.11 → cobweb-launcher-3.1.12}/setup.cfg +0 -0

cobweb/base/__init__.py

@@ -1,9 +1,7 @@
+from .item import BaseItem, CSVItem
 from .common_queue import Queue
 from .response import Response
 from .request import Request
-from .
+from .logger import logger
 from .seed import Seed
 
-from .log import logger
-# from .decorators import decorator_oss_db
-

cobweb/base/item.py

@@ -12,6 +12,15 @@ class Item(type):
         new_class_instance.Data = namedtuple(table, fields)
         return new_class_instance
 
+    def __getattr__(self, name):
+        return None
+
+    def __setitem__(self, key, value):
+        setattr(self, key, value)
+
+    def __getitem__(self, item):
+        return getattr(self, item)
+
 
 class BaseItem(metaclass=Item):
 
@@ -38,9 +47,14 @@ class BaseItem(metaclass=Item):
     def table(self):
         return self.Data.__name__
 
+    @property
+    def fields(self):
+        return self.__FIELDS__
+
+
 
-class
+class CSVItem(BaseItem):
 
-    __TABLE__ = "
+    __TABLE__ = "cobweb"
     __FIELDS__ = "data"
 
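
The new metaclass hooks give item classes a permissive, dict-like surface: a missing class attribute resolves to None instead of raising AttributeError, and class attributes can be read and written by key. A minimal sketch of the resulting behavior (the attribute name is illustrative, not part of the package):

    from cobweb.base import CSVItem

    # Item.__getattr__ turns a missing class attribute into None,
    # so optional per-item settings can be probed safely.
    assert CSVItem.baseitem_topic is None

    # Item.__setitem__ / Item.__getitem__ allow dict-style access.
    CSVItem["baseitem_topic"] = "my-topic"
    assert CSVItem["baseitem_topic"] == "my-topic"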

cobweb/base/log.py → cobweb/base/logger.py (renamed)

@@ -51,8 +51,8 @@ class ColorCodes:
     HIDDEN = "\033[8m"
 
 
-class
-    logging.getLogger('oss2.api').setLevel(logging.WARNING)
+class Logger:
+    # logging.getLogger('oss2.api').setLevel(logging.WARNING)
     logging.basicConfig(
         level=logging.INFO,
         format=f'%(asctime)s %(name)s [%(filename)s:%(lineno)d %(funcName)s]'
@@ -88,7 +88,7 @@ class Log:
         return self.__class__.log.critical
 
 
-logger =
+logger = Logger()
 
 
 
|
@@ -58,7 +58,7 @@ class Request:
|
|
58
58
|
f"(KHTML, like Gecko) Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}")
|
59
59
|
return user_agent
|
60
60
|
|
61
|
-
def _build_header(self)
|
61
|
+
def _build_header(self):
|
62
62
|
if not self.request_setting.get("headers"):
|
63
63
|
self.request_setting["headers"] = {"accept": "*/*", "user-agent": self._random_ua}
|
64
64
|
elif "user-agent" not in [key.lower() for key in self.request_setting["headers"].keys()]:
|
@@ -0,0 +1,27 @@
|
|
1
|
+
from typing import Any, Generator
|
2
|
+
from cobweb.base import (
|
3
|
+
Seed,
|
4
|
+
BaseItem,
|
5
|
+
Request,
|
6
|
+
Response,
|
7
|
+
CSVItem,
|
8
|
+
)
|
9
|
+
|
10
|
+
|
11
|
+
class Crawler:
|
12
|
+
|
13
|
+
@staticmethod
|
14
|
+
def request(seed: Seed) -> Generator[Request, Response, None]:
|
15
|
+
yield Request(seed.url, seed, timeout=5)
|
16
|
+
|
17
|
+
@staticmethod
|
18
|
+
def download(item: Request) -> Generator[Response, Any, None]:
|
19
|
+
response = item.download()
|
20
|
+
yield Response(item.seed, response, **item.to_dict)
|
21
|
+
|
22
|
+
@staticmethod
|
23
|
+
def parse(item: Response) -> Generator[BaseItem, Any, None]:
|
24
|
+
upload_item = item.to_dict
|
25
|
+
upload_item["content"] = getattr(item.response, "text", item.response)
|
26
|
+
yield CSVItem(item.seed, data=upload_item)
|
27
|
+
|
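
The default crawler now emits CSVItem instead of the removed ConsoleItem. A custom spider would typically subclass it and override parse; a minimal sketch, assuming the default request/download stages are kept (field names are illustrative):

    from typing import Any, Generator

    from cobweb.base import BaseItem, Response, CSVItem
    from cobweb.crawlers import Crawler


    class MyCrawler(Crawler):

        @staticmethod
        def parse(item: Response) -> Generator[BaseItem, Any, None]:
            # Emit only the fields we care about; the "data" keyword
            # matches CSVItem.__FIELDS__.
            text = getattr(item.response, "text", "")
            yield CSVItem(item.seed, data={"url": item.seed.url, "content": text})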

cobweb/db/redis_db.py

@@ -1,16 +1,21 @@
-import
+import os
+
 import time
-
+import redis
 from redis.exceptions import ConnectionError, TimeoutError
 
 
 class RedisDB:
-    def __init__(
-
-
-
-
-
+    def __init__(
+            self,
+            host=None,
+            password=None,
+            port=6379, db=0
+    ):
+        self.host = host or os.getenv("REDIS_HOST", "localhost")
+        self.password = password or os.getenv("REDIS_PASSWORD")
+        self.port = port or os.getenv("REDIS_PORT", 6379)
+        self.db = db or os.getenv("REDIS_DB", 0)
 
         self.max_retries = 5
         self.retry_delay = 5
@@ -18,7 +23,6 @@ class RedisDB:
         self.connect()
 
     def connect(self):
-        """尝试连接 Redis"""
         retries = 0
         while retries < self.max_retries:
             try:
@@ -27,10 +31,9 @@
                     port=self.port,
                     password=self.password,
                     db=self.db,
-                    socket_timeout=5,
-                    socket_connect_timeout=5
+                    socket_timeout=5,
+                    socket_connect_timeout=5
                 )
-                # 测试连接是否成功
                 self.client.ping()
                 return
             except (ConnectionError, TimeoutError) as e:
@@ -205,7 +208,7 @@
         members = self.execute_lua(lua_script, [key], _min, _max, start, count, score)
         return [(members[i].decode(), int(members[i + 1])) for i in range(0, len(members), 2)]
 
-    def done(self, keys: list, *args)
+    def done(self, keys: list, *args):
        lua_script = """
            for i, member in ipairs(ARGV) do
                redis.call("zrem", KEYS[1], member)
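
RedisDB now resolves its own connection settings instead of relying on the removed setting.REDIS_CONFIG: explicit arguments win, otherwise the REDIS_* environment variables and built-in defaults apply. A hedged usage sketch (host and password values are illustrative):

    from cobweb.db.redis_db import RedisDB

    db = RedisDB(host="redis.internal.example", password="s3cret")
    db.client.ping()  # connect() already ran in __init__; client is the underlying redis.Redis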

cobweb/launchers/distributor.py

@@ -2,12 +2,12 @@ import time
 import threading
 import traceback
 
-from typing import Callable
+from typing import Callable, Type
 from inspect import isgenerator
 from urllib.parse import urlparse
 from requests import Response as Res
 
-from cobweb import
+from cobweb.crawlers import Crawler
 from cobweb.constant import DealModel, LogTemplate
 from cobweb.utils import LoghubDot, check_pause
 from cobweb.base import Seed, Queue, BaseItem, Request, Response, logger
@@ -23,10 +23,10 @@ class Distributor(threading.Thread):
         todo: Queue,
         done: Queue,
         upload: Queue,
-        register: Callable,
         stop: threading.Event,
         pause: threading.Event,
-
+        callback_register: Callable,
+        SpiderCrawler: Type[Crawler]
     ):
         super().__init__()
         self.task = task
@@ -38,16 +38,16 @@ class Distributor(threading.Thread):
         self.todo = todo
         self.done = done
         self.upload = upload
-        self.
+        self.callback_register = callback_register
+        self.Crawler = SpiderCrawler
 
+        from cobweb import setting
         self.time_sleep = setting.SPIDER_TIME_SLEEP
         self.thread_num = setting.SPIDER_THREAD_NUM
         self.max_retries = setting.SPIDER_MAX_RETRIES
         self.record_failed = setting.RECORD_FAILED_SPIDER
         self.loghub_dot = LoghubDot(stop=stop)  # todo: 解偶
 
-        self.Crawler = SpiderCrawler
-
         logger.debug(f"Distribute instance attrs: {self.__dict__}")
 
     def distribute(self, item, seed, _id: int):
@@ -58,6 +58,7 @@ class Distributor(threading.Thread):
             if _id == 2:
                 raise TypeError("parse function can't yield a Response instance")
             dot = isinstance(item.response, Res)
+            # TODO: 请求成功打点
             self.spider_logging(seed, item, dot=dot)
             self.process(item=item, seed=seed, callback=self.Crawler.parse, _id=2)
         elif isinstance(item, BaseItem):
@@ -77,7 +78,9 @@ class Distributor(threading.Thread):
             try:
                 response = Response(seed, "failed", max_retries=True)
                 self.process(response, seed, self.Crawler.parse, _id=2)
-            except:
+            except Exception as e:
+                msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+                logger.error(msg = msg)
                 record_failed = False
             if not record_failed:
                 self.done.push(seed)
@@ -93,6 +96,7 @@ class Distributor(threading.Thread):
 
     @check_pause
     def spider(self):
+        # TODO: 限流措施
        if seed := self.todo.pop():
            try:
                self.process(item=seed, seed=seed, callback=self.Crawler.request, _id=0)
@@ -102,6 +106,7 @@ class Distributor(threading.Thread):
             if getattr(e, "response", None) and isinstance(e.response, Res):
                 url = e.response.request.url
                 status = e.response.status_code
+            # TODO:失败请求打点
             self.spider_logging(seed, None, error=True, url=url, status=status, msg=msg)
             self.distribute(DealModel.fail, seed, _id=-1)
 
@@ -160,6 +165,6 @@ class Distributor(threading.Thread):
         )
 
     def run(self):
-        self.
+        self.callback_register(self.loghub_dot.build_run, tag="LoghubDot")
         for _ in range(self.thread_num):
-            self.
+            self.callback_register(self.spider, tag="Distributor")

cobweb/launchers/launcher.py

@@ -113,22 +113,20 @@ class Launcher:
         if not self.__WORKER_THREAD__.get(name):
             worker_thread = threading.Thread(name=name, target=func)
             self.__WORKER_THREAD__[name] = worker_thread
-            worker_thread.start()
+            # worker_thread.start()
 
     def _monitor(self):
         while not self._stop.is_set():
-            if self._pause.is_set():
-
-
-
-
-
-
-
-
-
-            time.sleep(3)
-        logger.info("main thread close!")
+            if not self._pause.is_set():
+                for name, worker_thread in list(self.__WORKER_THREAD__.items()):
+                    if not worker_thread.is_alive():
+                        logger.debug(f"{name} thread is dead. Restarting...")
+                        func = self.__REGISTER_FUNC__[name]
+                        worker_thread = threading.Thread(name=name, target=func)
+                        self.__WORKER_THREAD__[name] = worker_thread
+                        worker_thread.start()
+            time.sleep(15)
+        logger.info("monitor thread close!")
 
     def start(self):
         self._pause.is_set()
@@ -142,7 +140,7 @@ class Launcher:
             todo=self._TODO_QUEUE_,
             done=self._DONE_QUEUE_,
             upload=self._UPLOAD_QUEUE_,
-
+            callback_register=self._register
         ).start()
 
         Distributor(
@@ -152,18 +150,20 @@ class Launcher:
             todo=self._TODO_QUEUE_,
             done=self._DONE_QUEUE_,
             upload=self._UPLOAD_QUEUE_,
-
+            callback_register=self._register,
             stop=self._stop, pause=self._pause,
             SpiderCrawler=self.SpiderCrawler
         ).start()
 
         Uploader(
+            task=self.task, project=self.project,
             stop=self._stop, pause=self._pause,
             done=self._DONE_QUEUE_,
             upload=self._UPLOAD_QUEUE_,
-
+            callback_register=self._register,
            SpiderPipeline=self.SpiderPipeline
         ).start()
 
         self._monitor()
+        logger.info("task done!")
 
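
_register no longer starts worker threads itself; _monitor now owns their lifecycle, starting registered workers and replacing any that die. A condensed, self-contained sketch of the pattern (simplified: no pause handling, illustrative names):

    import threading
    import time

    REGISTER_FUNC = {}
    WORKER_THREAD = {}

    def register(func, tag="worker"):
        # Record the worker; the monitor loop below is what starts it.
        name = f"{tag}:{func.__name__}"
        REGISTER_FUNC[name] = func
        WORKER_THREAD[name] = threading.Thread(name=name, target=func, daemon=True)

    def monitor(stop: threading.Event):
        while not stop.is_set():
            for name, thread in list(WORKER_THREAD.items()):
                if not thread.is_alive():  # unstarted or dead: (re)create and start
                    thread = threading.Thread(name=name, target=REGISTER_FUNC[name], daemon=True)
                    WORKER_THREAD[name] = thread
                    thread.start()
            time.sleep(15)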

cobweb/launchers/uploader.py

@@ -1,7 +1,7 @@
 import time
 import threading
-from typing import Callable
-from cobweb import
+from typing import Callable, Type
+from cobweb.pipelines import Pipeline
 from cobweb.base import Queue, logger
 from cobweb.utils import check_pause
 
@@ -10,24 +10,31 @@ class Uploader(threading.Thread):
 
     def __init__(
         self,
+        task: str,
+        project: str,
         stop: threading.Event,
         pause: threading.Event,
         upload: Queue, done: Queue,
-
-        SpiderPipeline
+        callback_register: Callable,
+        SpiderPipeline: Type[Pipeline]
     ):
         super().__init__()
+        self.task = task
+        self.project = project
+
         self.stop = stop
         self.pause = pause
 
         self.done = done
         self.upload = upload
-        self.
+        self.callback_register = callback_register
+
+        from cobweb import setting
 
         self.upload_size = setting.UPLOAD_QUEUE_MAX_SIZE
         self.wait_seconds = setting.UPLOAD_QUEUE_WAIT_SECONDS
 
-        self.pipeline = SpiderPipeline()
+        self.pipeline = SpiderPipeline(task=self.task, project=self.project)
 
         logger.debug(f"Uploader instance attrs: {self.__dict__}")
 
@@ -50,6 +57,7 @@ class Uploader(threading.Thread):
         for table, datas in data_info.items():
             try:
                 self.pipeline.upload(table, datas)
+                # TODO: 上传打点
             except Exception as e:
                 logger.info(e)
         except Exception as e:
@@ -58,6 +66,6 @@ class Uploader(threading.Thread):
         self.done.push(seeds)
 
     def run(self):
-        self.
+        self.callback_register(self.upload_data, tag="Uploader")
 
 

cobweb-launcher-3.1.12/cobweb/pipelines/pipeline_csv.py (new file)

@@ -0,0 +1,25 @@
+import os
+import csv
+
+from cobweb.base import BaseItem
+from cobweb.pipelines import Pipeline
+
+
+class CSV(Pipeline):
+
+    def __init__(self, *args, **kwargs):
+        super(CSV, self).__init__(*args, **kwargs)
+        self.log_path = rf"{os.getcwd()}\{self.project}\{self.task}\%s.csv"
+
+    def build(self, item: BaseItem):
+        return item.to_dict
+
+    def upload(self, table, datas):
+        fieldnames = datas[0].keys()
+        file_path = self.log_path % table
+        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+        with open(file_path, mode='a', encoding='utf-8', newline="") as file:
+            writer = csv.DictWriter(file, fieldnames=fieldnames)
+            if file.tell() == 0:  # 判断文件是否为空
+                writer.writeheader()
+            writer.writerows(datas)

cobweb-launcher-3.1.12/cobweb/pipelines/pipeline_loghub.py (new file)

@@ -0,0 +1,54 @@
+import os
+import json
+
+from cobweb.base import BaseItem
+from cobweb.pipelines import Pipeline
+from aliyun.log import LogClient, LogItem, PutLogsRequest
+from collections import defaultdict
+
+
+class Loghub(Pipeline):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.client = LogClient(
+            endpoint=os.getenv("LOGHUB_ENDPOINT"),
+            accessKeyId=os.getenv("LOGHUB_ACCESS_KEY"),
+            accessKey=os.getenv("LOGHUB_SECRET_KEY")
+        )
+        self.project = os.getenv("LOGHUB_PROJECT")
+        self.source = os.getenv("LOGHUB_SOURCE")
+        self.topic = os.getenv("LOGHUB_TOPIC")
+
+    def build(self, item: BaseItem):
+        log_item = LogItem()
+        temp = item.to_dict
+        for key, value in temp.items():
+            if not isinstance(value, str):
+                temp[key] = json.dumps(value, ensure_ascii=False)
+        contents = sorted(temp.items())
+        log_item.set_contents(contents)
+        return (
+            log_item,
+            item.baseitem_topic or self.topic,
+            item.baseitem_source or self.source,
+            item.baseitem_project or self.project,
+        )
+
+    def upload(self, table, datas):
+
+        upload_items = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+
+        for log_item, topic, source, project in datas:
+            upload_items[project][source][topic].append(log_item)
+
+        for request in [
+            PutLogsRequest(
+                logstore=table, project=project,
+                topic=topic, source=source,
+                logitems=log_items, compress=True
+            ) for project, sources in upload_items.items()
+            for source, topics in sources.items()
+            for topic, log_items in topics.items()
+        ]:
+            self.client.put_logs(request=request)

cobweb/schedulers/scheduler.py

@@ -1,6 +1,6 @@
 import threading
 
-
+
 from typing import Callable
 from cobweb.base import Queue
 from abc import ABC, abstractmethod
@@ -20,11 +20,12 @@ class Scheduler(ABC, threading.Thread):
         todo: Queue,
         done: Queue,
         upload: Queue,
-
+        callback_register: Callable
     ):
         super().__init__()
         self.task = task
         self.project = project
+        from cobweb import setting
 
         self.task_model = setting.TASK_MODEL
         self.seed_reset_seconds = setting.SEED_RESET_SECONDS
@@ -47,7 +48,7 @@ class Scheduler(ABC, threading.Thread):
         self.done = done
         self.upload = upload
 
-        self.
+        self.callback_register = callback_register
 
         self.working_seeds = dict()
 

cobweb/schedulers/scheduler_with_redis.py

@@ -22,16 +22,16 @@ class RedisScheduler(Scheduler):
         todo: Queue,
         done: Queue,
         upload: Queue,
-
+        callback_register: Callable
     ):
-        super().__init__(task, project, stop, pause, new, todo, done, upload,
+        super().__init__(task, project, stop, pause, new, todo, done, upload, callback_register)
         self.todo_key = "{%s:%s}:todo" % (project, task)
         self.done_key = "{%s:%s}:done" % (project, task)
         self.fail_key = "{%s:%s}:fail" % (project, task)
         self.heartbeat_key = "heartbeat:%s_%s" % (project, task)
         self.speed_control_key = "speed_control:%s_%s" % (project, task)
         self.reset_lock_key = "lock:reset:%s_%s" % (project, task)
-        self.redis_queue_empty_event = threading.Event()
+        # self.redis_queue_empty_event = threading.Event()
         self.db = ApiDB() if use_api else RedisDB()
 
     @check_pause
@@ -126,11 +126,11 @@ class RedisScheduler(Scheduler):
     def run(self):
         start_time = int(time.time())
 
-        self.
-        self.
-        self.
-        self.
-        self.
+        self.callback_register(self.reset, tag="scheduler")
+        self.callback_register(self.insert, tag="scheduler")
+        self.callback_register(self.delete, tag="scheduler")
+        self.callback_register(self.refresh, tag="scheduler")
+        self.callback_register(self.schedule, tag="scheduler")
 
         while not self.stop.is_set():
             working_count = len(self.working_seeds.keys())
@@ -149,13 +149,12 @@ class RedisScheduler(Scheduler):
                     self.pause.clear()
                 else:
                     logger.info("pause! waiting for resume...")
+            elif all_count:
+                logger.info(f"todo seeds count: {todo_count}, queue length: {all_count}")
+                self.pause.clear()
             else:
-
-
-                self.pause.clear()
-            else:
-                logger.info("Done! pause set...")
-                self.pause.set()
+                logger.info("TODO queue is empty! pause set...")
+                self.pause.set()
         else:
             if self.pause.is_set():
                 self.pause.clear()
@@ -171,6 +170,3 @@ class RedisScheduler(Scheduler):
             ))
 
         time.sleep(30)
-
-        logger.info("Scheduler Done!")
-

cobweb/setting.py

@@ -1,37 +1,8 @@
-import os
-
-# redis db config
-REDIS_CONFIG = {
-    "host": os.getenv("REDIS_HOST"),
-    "password": os.getenv("REDIS_PASSWORD"),
-    "port": int(os.getenv("REDIS_PORT", 6379)),
-    "db": int(os.getenv("REDIS_DB", 0)),
-}
-
-# loghub db config
-LOGHUB_TOPIC = os.getenv("LOGHUB_TOPIC")
-LOGHUB_SOURCE = os.getenv("LOGHUB_SOURCE")
-LOGHUB_PROJECT = os.getenv("LOGHUB_PROJECT")
-LOGHUB_CONFIG = {
-    "endpoint": os.getenv("LOGHUB_ENDPOINT"),
-    "accessKeyId": os.getenv("LOGHUB_ACCESS_KEY"),
-    "accessKey": os.getenv("LOGHUB_SECRET_KEY")
-}
-
-# # oss util config
-OSS_BUCKET = os.getenv("OSS_BUCKET")
-OSS_ENDPOINT = os.getenv("OSS_ENDPOINT")
-OSS_ACCESS_KEY = os.getenv("OSS_ACCESS_KEY")
-OSS_SECRET_KEY = os.getenv("OSS_SECRET_KEY")
-OSS_CHUNK_SIZE = 10 * 1024 ** 2
-OSS_MIN_UPLOAD_SIZE = 1024
-
-
 # 采集器选择
 CRAWLER = "cobweb.crawlers.Crawler"
 
 # 数据管道
-PIPELINE = "cobweb.pipelines.
+PIPELINE = "cobweb.pipelines.CSV"
 
 # 调度器
 SCHEDULER = "cobweb.schedulers.RedisScheduler"
@@ -61,20 +32,28 @@ DONE_MODEL = 0 # 0:种子消费成功直接从队列移除,失败则添加
 SPIDER_THREAD_NUM = 10
 SPIDER_MAX_RETRIES = 5
 SPIDER_TIME_SLEEP = 10
-RECORD_FAILED_SPIDER =
+RECORD_FAILED_SPIDER = True
 
 SPIDER_MAX_COUNT = 1000  # 在规定时间窗口内最大采集数
 TIME_WINDOW = 60  # 频控固定时间窗口(秒)
 
-#
-TASK_MODEL = 0
+# 任务模式, 0:单次,1:常驻
+TASK_MODEL = 0
+
+# 流控措施, 0:关闭,1:开启
+SPEED_CONTROL = 1
+
 
-#
-
+# scheduler redis config
+# os.getenv("REDIS_HOST", "127.0.0.1")
+# os.getenv("REDIS_PASSWORD")
+# os.getenv("REDIS_PORT", 6379)
+# os.getenv("REDIS_DB", 0)
 
-#
-#
-#
-#
-#
-#
+# loghub pipeline config
+# os.getenv("LOGHUB_ENDPOINT"),
+# os.getenv("LOGHUB_ACCESS_KEY"),
+# os.getenv("LOGHUB_SECRET_KEY")
+# os.getenv("LOGHUB_PROJECT")
+# os.getenv("LOGHUB_SOURCE")
+# os.getenv("LOGHUB_TOPIC")

cobweb-launcher-3.1.12/cobweb/utils/bloom.py (new file; the BloomFilter is carried over fully commented out)

@@ -0,0 +1,58 @@
+# import math
+# import time
+#
+# import mmh3
+# import redis
+# from cobweb import setting
+#
+#
+# class BloomFilter:
+#
+#     def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
+#         redis_config = redis_config or setting.REDIS_CONFIG
+#         capacity = capacity or setting.CAPACITY
+#         error_rate = error_rate or setting.ERROR_RATE
+#         redis_config['db'] = 3
+#
+#         self.key = key
+#
+#         pool = redis.ConnectionPool(**redis_config)
+#         self._client = redis.Redis(connection_pool=pool)
+#         self.bit_size = self.get_bit_size(capacity, error_rate)
+#         self.hash_count = self.get_hash_count(self.bit_size, capacity)
+#         self._init_bloom_key()
+#
+#     def add(self, value):
+#         for seed in range(self.hash_count):
+#             result = mmh3.hash(value, seed) % self.bit_size
+#             self._client.setbit(self.key, result, 1)
+#         return True
+#
+#     def exists(self, value):
+#         if not self._client.exists(self.key):
+#             return False
+#         for seed in range(self.hash_count):
+#             result = mmh3.hash(value, seed) % self.bit_size
+#             if not self._client.getbit(self.key, result):
+#                 return False
+#         return True
+#
+#     def _init_bloom_key(self):
+#         lua_script = """
+#             redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
+#             redis.call("EXPIRE", KEYS[1], 604800)
+#         """
+#         if self._client.exists(self.key):
+#             return True
+#         execute = self._client.register_script(lua_script)
+#         execute(keys=[self.key], args=[self.bit_size-1, 1])
+#
+#     @classmethod
+#     def get_bit_size(cls, n, p):
+#         return int(-(n * math.log(p)) / (math.log(2) ** 2))
+#
+#     @classmethod
+#     def get_hash_count(cls, m, n):
+#         return int((m / n) * math.log(2))
+#
+#

cobweb/utils/dotting.py

@@ -1,21 +1,23 @@
+import os
 import json
 import time
 
 from aliyun.log import LogClient, LogItem, PutLogsRequest
-
 from cobweb.base import Queue, logger
-from cobweb import setting
 
 
 class LoghubDot:
 
     def __init__(self, stop):
         self.stop = stop
-        self.client = LogClient(**setting.LOGHUB_CONFIG)
         self.queue = Queue()
+        self.client = LogClient(
+            endpoint=os.getenv("LOGHUB_ENDPOINT"),
+            accessKeyId=os.getenv("LOGHUB_ACCESS_KEY"),
+            accessKey=os.getenv("LOGHUB_SECRET_KEY")
+        )
 
     def build(self, topic, **kwargs):
-
         temp = {}
         log_item = LogItem()
         for key, value in kwargs.items():

cobweb/utils/tools.py

@@ -1,5 +1,6 @@
 import re
 import hashlib
+import inspect
 from typing import Union
 from importlib import import_module
 
@@ -10,18 +11,6 @@ def md5(text: Union[str, bytes]) -> str:
     return hashlib.md5(text).hexdigest()
 
 
-def build_path(site, url, file_type):
-    return f"{site}/{md5(url)}.{file_type}"
-
-
-def format_size(content_length: int) -> str:
-    units = ["KB", "MB", "GB", "TB"]
-    for i in range(4):
-        num = content_length / (1024 ** (i + 1))
-        if num < 1024:
-            return f"{round(num, 2)} {units[i]}"
-
-
 def dynamic_load_class(model_info):
     if isinstance(model_info, str):
         if "import" in model_info:
@@ -35,8 +24,7 @@ def dynamic_load_class(model_info):
             model = import_module(model_path)
             class_object = getattr(model, class_name)
             return class_object
+    elif inspect.isclass(model_info):
+        return model_info
     raise TypeError()
 
-
-# def download_log_info(item:dict) -> str:
-#     return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])

cobweb_launcher.egg-info/SOURCES.txt

@@ -7,7 +7,7 @@ cobweb/setting.py
 cobweb/base/__init__.py
 cobweb/base/common_queue.py
 cobweb/base/item.py
-cobweb/base/
+cobweb/base/logger.py
 cobweb/base/request.py
 cobweb/base/response.py
 cobweb/base/seed.py
@@ -24,7 +24,7 @@ cobweb/launchers/launcher.py
 cobweb/launchers/uploader.py
 cobweb/pipelines/__init__.py
 cobweb/pipelines/pipeline.py
-cobweb/pipelines/
+cobweb/pipelines/pipeline_csv.py
 cobweb/pipelines/pipeline_loghub.py
 cobweb/schedulers/__init__.py
 cobweb/schedulers/scheduler.py

setup.py

@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="cobweb-launcher",
-    version="3.1.
+    version="3.1.12",
     packages=find_packages(),
     url="https://github.com/Juannie-PP/cobweb",
     license="MIT",
@@ -14,7 +14,7 @@ setup(
     description="spider_hole",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    install_requires=["requests>=2.19.1", "
+    install_requires=["requests>=2.19.1", "redis>=4.4.4", "aliyun-log-python-sdk"],
     classifiers=[
         "Programming Language :: Python :: 3",
     ],

cobweb-launcher-3.1.11/cobweb/crawlers/crawler.py (deleted; replaced by the CSVItem-based version above)

@@ -1,28 +0,0 @@
-import json
-from typing import Union
-from cobweb.base import (
-    Seed,
-    BaseItem,
-    Request,
-    Response,
-    ConsoleItem,
-)
-
-
-class Crawler:
-
-    @staticmethod
-    def request(seed: Seed) -> Union[Request, BaseItem]:
-        yield Request(seed.url, seed, timeout=5)
-
-    @staticmethod
-    def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
-        response = item.download()
-        yield Response(item.seed, response, **item.to_dict)
-
-    @staticmethod
-    def parse(item: Response) -> BaseItem:
-        upload_item = item.to_dict
-        upload_item["text"] = item.response.text
-        yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
-
@@ -1,22 +0,0 @@
|
|
1
|
-
from cobweb.base import ConsoleItem, logger
|
2
|
-
from cobweb.constant import LogTemplate
|
3
|
-
from cobweb.pipelines import Pipeline
|
4
|
-
|
5
|
-
|
6
|
-
class Console(Pipeline):
|
7
|
-
|
8
|
-
def build(self, item: ConsoleItem):
|
9
|
-
return {
|
10
|
-
"seed": item.seed.to_dict,
|
11
|
-
"data": item.to_dict
|
12
|
-
}
|
13
|
-
|
14
|
-
def upload(self, table, datas):
|
15
|
-
for data in datas:
|
16
|
-
parse_detail = LogTemplate.log_info(data["data"])
|
17
|
-
if len(parse_detail) > 500:
|
18
|
-
parse_detail = parse_detail[:500] + " ...\n" + " " * 12 + "-- Text is too long and details are omitted!"
|
19
|
-
logger.info(LogTemplate.console_item.format(
|
20
|
-
seed_detail=LogTemplate.log_info(data["seed"]),
|
21
|
-
parse_detail=parse_detail
|
22
|
-
))
|

cobweb-launcher-3.1.11/cobweb/pipelines/pipeline_loghub.py (deleted; replaced by the env-configured version above)

@@ -1,34 +0,0 @@
-import json
-
-from cobweb import setting
-from cobweb.base import BaseItem
-from cobweb.pipelines import Pipeline
-from aliyun.log import LogClient, LogItem, PutLogsRequest
-
-
-class Loghub(Pipeline):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.client = LogClient(**setting.LOGHUB_CONFIG)
-
-    def build(self, item: BaseItem):
-        log_item = LogItem()
-        temp = item.to_dict
-        for key, value in temp.items():
-            if not isinstance(value, str):
-                temp[key] = json.dumps(value, ensure_ascii=False)
-        contents = sorted(temp.items())
-        log_item.set_contents(contents)
-        return log_item
-
-    def upload(self, table, datas):
-        request = PutLogsRequest(
-            project=setting.LOGHUB_PROJECT,
-            logstore=table,
-            topic=setting.LOGHUB_TOPIC,
-            source=setting.LOGHUB_SOURCE,
-            logitems=datas,
-            compress=True
-        )
-        self.client.put_logs(request=request)

cobweb-launcher-3.1.11/cobweb/utils/bloom.py (deleted; kept in 3.1.12 only as the commented-out copy above)

@@ -1,58 +0,0 @@
-import math
-import time
-
-import mmh3
-import redis
-from cobweb import setting
-
-
-class BloomFilter:
-
-    def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
-        redis_config = redis_config or setting.REDIS_CONFIG
-        capacity = capacity or setting.CAPACITY
-        error_rate = error_rate or setting.ERROR_RATE
-        redis_config['db'] = 3
-
-        self.key = key
-
-        pool = redis.ConnectionPool(**redis_config)
-        self._client = redis.Redis(connection_pool=pool)
-        self.bit_size = self.get_bit_size(capacity, error_rate)
-        self.hash_count = self.get_hash_count(self.bit_size, capacity)
-        self._init_bloom_key()
-
-    def add(self, value):
-        for seed in range(self.hash_count):
-            result = mmh3.hash(value, seed) % self.bit_size
-            self._client.setbit(self.key, result, 1)
-        return True
-
-    def exists(self, value):
-        if not self._client.exists(self.key):
-            return False
-        for seed in range(self.hash_count):
-            result = mmh3.hash(value, seed) % self.bit_size
-            if not self._client.getbit(self.key, result):
-                return False
-        return True
-
-    def _init_bloom_key(self):
-        lua_script = """
-            redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
-            redis.call("EXPIRE", KEYS[1], 604800)
-        """
-        if self._client.exists(self.key):
-            return True
-        execute = self._client.register_script(lua_script)
-        execute(keys=[self.key], args=[self.bit_size-1, 1])
-
-    @classmethod
-    def get_bit_size(cls, n, p):
-        return int(-(n * math.log(p)) / (math.log(2) ** 2))
-
-    @classmethod
-    def get_hash_count(cls, m, n):
-        return int((m / n) * math.log(2))
-
-