cobweb-launcher 1.2.11__py3-none-any.whl → 1.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cobweb-launcher might be problematic. Click here for more details.

@@ -26,7 +26,8 @@ class Crawler(threading.Thread):
26
26
  launcher_queue: Union[Mapping[str, Queue]],
27
27
  custom_func: Union[Mapping[str, Callable]],
28
28
  thread_num: int,
29
- max_retries: int
29
+ max_retries: int,
30
+ time_sleep: int
30
31
  ):
31
32
  super().__init__()
32
33
 
@@ -43,6 +44,7 @@ class Crawler(threading.Thread):
43
44
 
44
45
  self.thread_num = thread_num
45
46
  self.max_retries = max_retries
47
+ self.time_sleep = time_sleep
46
48
 
47
49
  @staticmethod
48
50
  def request(seed: Seed) -> Union[Request, BaseItem]:
@@ -150,6 +152,7 @@ class Crawler(threading.Thread):
150
152
  ))
151
153
  seed.params.retry += 1
152
154
  self._todo.push(seed)
155
+ time.sleep(self.time_sleep * seed.params.retry)
153
156
  finally:
154
157
  time.sleep(0.1)
155
158
  logger.info("spider thread close")
@@ -94,10 +94,13 @@ class Launcher(threading.Thread):
94
94
 
95
95
  self._spider_max_retries = setting.SPIDER_MAX_RETRIES
96
96
  self._spider_thread_num = setting.SPIDER_THREAD_NUM
97
+ self._spider_time_sleep = setting.SPIDER_TIME_SLEEP
97
98
 
98
99
  self._done_model = setting.DONE_MODEL
99
100
  self._task_model = setting.TASK_MODEL
100
101
 
102
+ self._filter_field = setting.FILTER_FIELD
103
+
101
104
  @property
102
105
  def request(self):
103
106
  """
@@ -151,7 +154,7 @@ class Launcher(threading.Thread):
151
154
  def _remove_doing_seeds(self, seeds):
152
155
  for seed in seeds:
153
156
  self.__DOING__.pop(seed, None)
154
- logger.info("remove %s seeds from __DOING__" % len(seeds))
157
+ # logger.info("remove %s seeds from __DOING__" % len(seeds))
155
158
 
156
159
  def _execute(self):
157
160
  for func_name in self.__LAUNCHER_FUNC__:
@@ -168,7 +171,8 @@ class Launcher(threading.Thread):
168
171
  launcher_queue=self.__LAUNCHER_QUEUE__,
169
172
  custom_func=self.__CUSTOM_FUNC__,
170
173
  thread_num = self._spider_thread_num,
171
- max_retries = self._spider_max_retries
174
+ max_retries = self._spider_max_retries,
175
+ time_sleep=self._spider_time_sleep
172
176
  ).start()
173
177
 
174
178
  self._Pipeline(
@@ -3,6 +3,7 @@ import threading
3
3
 
4
4
  from cobweb.db import RedisDB
5
5
  from cobweb.base import Seed, logger
6
+ from cobweb.utils import BloomFilter
6
7
  from cobweb.constant import DealModel, LogTemplate
7
8
  from .launcher import Launcher, check_pause
8
9
 
@@ -18,8 +19,11 @@ class LauncherPro(Launcher):
18
19
  self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
19
20
  self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
20
21
  self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
22
+ self._bf_key = "bloom_%s_%s" % (project, task)
21
23
  self._db = RedisDB()
22
24
 
25
+ self._bf = BloomFilter(self._bf_key)
26
+
23
27
  self._heartbeat_start_event = threading.Event()
24
28
  self._redis_queue_empty_event = threading.Event()
25
29
 
cobweb/setting.py CHANGED
@@ -57,9 +57,15 @@ DONE_MODEL = 0 # 0:种子消费成功直接从队列移除,失败则添加
57
57
  # spider
58
58
  SPIDER_THREAD_NUM = 10
59
59
  SPIDER_MAX_RETRIES = 5
60
+ SPIDER_TIME_SLEEP = 10
60
61
 
61
62
  # 任务模式
62
63
  TASK_MODEL = 0 # 0:单次,1:常驻
63
64
 
65
+
66
+ # bloom过滤器
67
+ CAPACITY = 100000000
68
+ ERROR_RATE = 0.001
69
+ FILTER_FIELD = "url"
64
70
  # 文件下载响应类型过滤
65
71
  # FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
cobweb/utils/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
1
  from .oss import OssUtil
2
2
  from .tools import *
3
+ from .bloom import BloomFilter
3
4
 
cobweb/utils/bloom.py ADDED
@@ -0,0 +1,98 @@
1
+ import time
2
+
3
+
4
+ from redis import Redis
5
+ from cobweb import setting
6
+
7
+ # class BloomFilter:
8
+ #
9
+ # def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
10
+ # redis_config = redis_config or setting.REDIS_CONFIG
11
+ # capacity = capacity or setting.CAPACITY
12
+ # error_rate = error_rate or setting.ERROR_RATE
13
+ # redis_config['db'] = 3
14
+ #
15
+ # self.key = key
16
+ #
17
+ # pool = redis.ConnectionPool(**redis_config)
18
+ # self.bit_size = self.get_bit_size(capacity, error_rate)
19
+ # self.hash_count = self.get_hash_count(self.bit_size, capacity)
20
+ # self._init_bloom_key()
21
+ #
22
+ # def add(self, value):
23
+ # for seed in range(self.hash_count):
24
+ # result = mmh3.hash(value, seed) % self.bit_size
25
+ # self._client.setbit(self.key, result, 1)
26
+ # return True
27
+ #
28
+ # def exists(self, value):
29
+ # if not self._client.exists(self.key):
30
+ # return False
31
+ # for seed in range(self.hash_count):
32
+ # result = mmh3.hash(value, seed) % self.bit_size
33
+ # if not self._client.getbit(self.key, result):
34
+ # return False
35
+ # return True
36
+ #
37
+ # def _init_bloom_key(self):
38
+ # lua_script = """
39
+ # redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
40
+ # redis.call("EXPIRE", KEYS[1], 604800)
41
+ # """
42
+ # if self._client.exists(self.key):
43
+ # return True
44
+ # execute = self._client.register_script(lua_script)
45
+ # execute(keys=[self.key], args=[self.bit_size-1, 1])
46
+ #
47
+ # @classmethod
48
+ # def get_bit_size(cls, n, p):
49
+ # return int(-(n * math.log(p)) / (math.log(2) ** 2))
50
+ #
51
+ # @classmethod
52
+ # def get_hash_count(cls, m, n):
53
+ # return int((m / n) * math.log(2))
54
+
55
class BloomFilter:
    """Thin wrapper around a bloom filter stored under a single Redis key.

    Every operation is delegated to the command group returned by
    ``Redis(...).bf()``; this class only binds those calls to ``self.key``.
    """

    def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
        """Connect to Redis and make sure the filter for ``key`` exists.

        :param key: Redis key holding the bloom filter.
        :param redis_config: keyword arguments for ``redis.Redis``; falls back
            to ``setting.REDIS_CONFIG``. The ``db`` entry is always forced to 3.
        :param capacity: capacity used when creating the filter; falls back to
            ``setting.CAPACITY``.
        :param error_rate: error rate used when creating the filter; falls
            back to ``setting.ERROR_RATE``.
        :raises redis.exceptions.ResponseError: when filter creation fails for
            a reason other than the key already existing.
        """
        # Imported locally so the block does not depend on a new file-level import.
        from redis.exceptions import ResponseError

        # Copy before overriding ``db``: the original assigned into the dict it
        # was handed, which silently rewrote the caller's config (or the shared
        # ``setting.REDIS_CONFIG``) for everything else using it.
        redis_config = dict(redis_config or setting.REDIS_CONFIG)
        redis_config['db'] = 3
        capacity = capacity or setting.CAPACITY
        error_rate = error_rate or setting.ERROR_RATE

        self.key = key

        self._client = Redis(**redis_config).bf()
        try:
            self._client.create(key=self.key, capacity=capacity, errorRate=error_rate)
        except ResponseError as exc:
            # Creating a filter on a key that already exists is rejected by the
            # server ("item exists"). That is the normal case on every start
            # after the first, so reuse the existing filter instead of crashing.
            if "exists" not in str(exc).lower():
                raise

    def add(self, value):
        """Add ``value`` to the filter and return the client's reply."""
        return self._client.add(self.key, value)

    def madd(self, items: list):
        """Add every element of ``items`` to the filter.

        Returns the client's reply, or an empty list when ``items`` is empty
        (no command is sent, since a multi-add without items is invalid).
        """
        if not items:
            return []
        return self._client.madd(self.key, *items)

    def exists(self, value):
        """Return the client's reply for a membership check of ``value``."""
        return self._client.exists(self.key, value)

    def mexists(self, items: list):
        """Check membership for every element of ``items``.

        Returns the client's reply, or an empty list when ``items`` is empty
        (no command is sent, since a multi-check without items is invalid).
        """
        if not items:
            return []
        return self._client.mexists(self.key, *items)
79
+
80
+
81
if __name__ == '__main__':
    # Manual smoke test: add one value, look it up, and print how long each
    # round trip took.
    #
    # Connection details are read from the environment so that no host name or
    # password is shipped inside the published package.
    # NOTE(review): any credential that was ever committed in this block must
    # be treated as leaked and rotated.
    import os

    testBLF = BloomFilter("test", {
        "host": os.environ["BLOOM_TEST_REDIS_HOST"],
        "password": os.environ.get("BLOOM_TEST_REDIS_PASSWORD"),
    })

    print("start")
    start_time = time.time()
    testBLF.add("test")
    add_time = time.time()
    print("add time::: ")
    print(add_time - start_time)
    print("get::: ")
    print(testBLF.exists("test"))
    exist_time = time.time()
    print("get time::: ")
    print(exist_time - add_time)
98
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 1.2.11
3
+ Version: 1.2.13
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -16,6 +16,7 @@ Requires-Dist: requests (>=2.19.1)
16
16
  Requires-Dist: oss2 (>=2.18.1)
17
17
  Requires-Dist: redis (>=4.4.4)
18
18
  Requires-Dist: aliyun-log-python-sdk
19
+ Requires-Dist: mmh3
19
20
 
20
21
  # cobweb
21
22
  cobweb是一个基于python的分布式爬虫调度框架,目前支持分布式爬虫,单机爬虫,支持自定义数据库,支持自定义数据存储,支持自定义数据处理等操作。
@@ -1,6 +1,6 @@
1
1
  cobweb/__init__.py,sha256=uMHyf4Fekbyw2xBCbkA8R0LwCpBJf5p_7pWbh60ZWYk,83
2
2
  cobweb/constant.py,sha256=zy3XYsc1qp2B76_Fn_hVQ8eGHlPBd3OFlZK2cryE6FY,2839
3
- cobweb/setting.py,sha256=Wev0clo4ZETI5cRvBnzTWnJWo0Nowv_uvNCqlzYPSiE,1990
3
+ cobweb/setting.py,sha256=S33EYK5hrrXn8GFkOmwt9Qn7OdNv8E4h8a0ZyeNydCM,2092
4
4
  cobweb/base/__init__.py,sha256=4gwWWQ0Q8cYG9cD7Lwf4XMqRGc5M_mapS3IczR6zeCE,222
5
5
  cobweb/base/common_queue.py,sha256=W7PPZZFl52j3Mc916T0imHj7oAUelA6aKJwW-FecDPE,872
6
6
  cobweb/base/decorators.py,sha256=wDCaQ94aAZGxks9Ljc0aXq6omDXT1_yzFy83ZW6VbVI,930
@@ -11,27 +11,28 @@ cobweb/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406
11
11
  cobweb/base/seed.py,sha256=Uz_VBRlAxNYQcFHk3tsZFMlU96yPOedHaWGTvk-zKd8,2908
12
12
  cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
13
13
  cobweb/crawlers/base_crawler.py,sha256=ee_WSDnPQpPTk6wlFuY2UEx5L3hcsAZFcr6i3GLSry8,5751
14
- cobweb/crawlers/crawler.py,sha256=MJ2rFTrVjQ4Q9TmX0dpbOvT4tdokQwh7dKeFY_W7pPE,6002
14
+ cobweb/crawlers/crawler.py,sha256=5mex-ENuzZSME0EIdwS9fnWkAX6LQuEyoDNcFm0emqs,6132
15
15
  cobweb/crawlers/file_crawler.py,sha256=2Sjbdgxzqd41WykKUQE3QQlGai3T8k-pmHNmPlTchjQ,4454
16
16
  cobweb/db/__init__.py,sha256=ut0iEyBLjcJL06WNG_5_d4hO5PJWvDrKWMkDOdmgh2M,30
17
17
  cobweb/db/redis_db.py,sha256=NNI2QkRV1hEZI-z-COEncXt88z3pZN6wusKlcQzc8V4,4304
18
18
  cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
19
19
  cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
20
20
  cobweb/launchers/__init__.py,sha256=af0Y6wrGX8SQZ7w7XL2sOtREjCT3dwad-uCc3nIontY,76
21
- cobweb/launchers/launcher.py,sha256=mBpq0CmxXXv-KdXQ2x7vwOkkAvaKECLkZLGhraafkQA,5953
21
+ cobweb/launchers/launcher.py,sha256=sH9bj5TIPX5RjpNFvHmqTtlhyBxQqbTDhuvYSd2LspI,6114
22
22
  cobweb/launchers/launcher_air.py,sha256=KAk_M8F3029cXYe7m4nn3Nzyi89lbxJ2cqZjqW8iZ0E,2832
23
- cobweb/launchers/launcher_pro.py,sha256=OmlC5o3BdcsTFwOVAKNJdnwdlTl-fUiXC2kViWEpcoU,7677
23
+ cobweb/launchers/launcher_pro.py,sha256=4hi8xviIJr8HqTERShfdZjp767KXBeIyqyHaW9rYOhE,7815
24
24
  cobweb/pipelines/__init__.py,sha256=zSUsGtx6smbs2iXBXvYynReKSgky-3gjqaAtKVnA_OU,105
25
25
  cobweb/pipelines/base_pipeline.py,sha256=fYnWf79GmhufXpcnMa3te18SbmnVeYLwxfyo-zLd9CY,1577
26
26
  cobweb/pipelines/loghub_pipeline.py,sha256=cjPO6w6UJ0jNw2fVvdX0BCdlm58T7dmYXlxzXOBpvfY,1027
27
27
  cobweb/pipelines/pipeline.py,sha256=4TJLX0sUHRxYndF5A4Vs5btUGI-wigkOcFvhTW1hLXI,2009
28
28
  cobweb/pipelines/pipeline_console.py,sha256=NEh-4zhuVAQOqwXLsqeb-rcNZ9_KXFUpL3otUTL5qBs,754
29
29
  cobweb/pipelines/pipeline_loghub.py,sha256=xZ6D55BGdiM71WUv83jyLGbEyUwhBHLJRZoXthBxxTs,1019
30
- cobweb/utils/__init__.py,sha256=JTE4sBfHnKHhD6w9Auk0MIT7O9BMOamCeryhlHNx3Zg,47
30
+ cobweb/utils/__init__.py,sha256=vBtZTy3EfRE0MmH43URhmr7nw6_oOWTEbGOM9xR_9o8,78
31
+ cobweb/utils/bloom.py,sha256=G0PlaMVTz6KCmhcNToi28bAHj1YJjDVqwJAQ_DUBWGk,3030
31
32
  cobweb/utils/oss.py,sha256=gyt8-UB07tVphZLQXMOf-JTJwU-mWq8KZkOXKkAf3uk,3513
32
33
  cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
33
- cobweb_launcher-1.2.11.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
34
- cobweb_launcher-1.2.11.dist-info/METADATA,sha256=GSDZ3aWvattuW3Md-faK0cSBPyGDA8bJcVZRlSVp4ag,6490
35
- cobweb_launcher-1.2.11.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
36
- cobweb_launcher-1.2.11.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
37
- cobweb_launcher-1.2.11.dist-info/RECORD,,
34
+ cobweb_launcher-1.2.13.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
35
+ cobweb_launcher-1.2.13.dist-info/METADATA,sha256=-HRZ-uipPBajhDyuDF2TTtQQAectd1gsjk3KwvWtHpo,6510
36
+ cobweb_launcher-1.2.13.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
37
+ cobweb_launcher-1.2.13.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
38
+ cobweb_launcher-1.2.13.dist-info/RECORD,,