cobweb-launcher 1.2.11__tar.gz → 1.2.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cobweb-launcher might be problematic. Click here for more details.

Files changed (39)
  1. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/PKG-INFO +1 -1
  2. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/crawlers/crawler.py +4 -1
  3. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/launchers/launcher.py +6 -2
  4. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/launchers/launcher_pro.py +4 -0
  5. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/setting.py +6 -0
  6. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/utils/__init__.py +1 -0
  7. cobweb-launcher-1.2.13/cobweb/utils/bloom.py +98 -0
  8. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb_launcher.egg-info/PKG-INFO +1 -1
  9. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb_launcher.egg-info/SOURCES.txt +1 -0
  10. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb_launcher.egg-info/requires.txt +1 -0
  11. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/setup.py +2 -2
  12. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/LICENSE +0 -0
  13. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/README.md +0 -0
  14. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/__init__.py +0 -0
  15. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/base/__init__.py +0 -0
  16. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/base/common_queue.py +0 -0
  17. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/base/decorators.py +0 -0
  18. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/base/item.py +0 -0
  19. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/base/log.py +0 -0
  20. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/base/request.py +0 -0
  21. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/base/response.py +0 -0
  22. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/base/seed.py +0 -0
  23. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/constant.py +0 -0
  24. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/crawlers/__init__.py +0 -0
  25. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/db/__init__.py +0 -0
  26. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/db/redis_db.py +0 -0
  27. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/exceptions/__init__.py +0 -0
  28. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/exceptions/oss_db_exception.py +0 -0
  29. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/launchers/__init__.py +0 -0
  30. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/launchers/launcher_air.py +0 -0
  31. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/pipelines/__init__.py +0 -0
  32. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/pipelines/pipeline.py +0 -0
  33. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/pipelines/pipeline_console.py +0 -0
  34. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/pipelines/pipeline_loghub.py +0 -0
  35. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/utils/oss.py +0 -0
  36. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb/utils/tools.py +0 -0
  37. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  38. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/cobweb_launcher.egg-info/top_level.txt +0 -0
  39. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.13}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 1.2.11
3
+ Version: 1.2.13
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -26,7 +26,8 @@ class Crawler(threading.Thread):
26
26
  launcher_queue: Union[Mapping[str, Queue]],
27
27
  custom_func: Union[Mapping[str, Callable]],
28
28
  thread_num: int,
29
- max_retries: int
29
+ max_retries: int,
30
+ time_sleep: int
30
31
  ):
31
32
  super().__init__()
32
33
 
@@ -43,6 +44,7 @@ class Crawler(threading.Thread):
43
44
 
44
45
  self.thread_num = thread_num
45
46
  self.max_retries = max_retries
47
+ self.time_sleep = time_sleep
46
48
 
47
49
  @staticmethod
48
50
  def request(seed: Seed) -> Union[Request, BaseItem]:
@@ -150,6 +152,7 @@ class Crawler(threading.Thread):
150
152
  ))
151
153
  seed.params.retry += 1
152
154
  self._todo.push(seed)
155
+ time.sleep(self.time_sleep * seed.params.retry)
153
156
  finally:
154
157
  time.sleep(0.1)
155
158
  logger.info("spider thread close")
@@ -94,10 +94,13 @@ class Launcher(threading.Thread):
94
94
 
95
95
  self._spider_max_retries = setting.SPIDER_MAX_RETRIES
96
96
  self._spider_thread_num = setting.SPIDER_THREAD_NUM
97
+ self._spider_time_sleep = setting.SPIDER_TIME_SLEEP
97
98
 
98
99
  self._done_model = setting.DONE_MODEL
99
100
  self._task_model = setting.TASK_MODEL
100
101
 
102
+ self._filter_field = setting.FILTER_FIELD
103
+
101
104
  @property
102
105
  def request(self):
103
106
  """
@@ -151,7 +154,7 @@ class Launcher(threading.Thread):
151
154
  def _remove_doing_seeds(self, seeds):
152
155
  for seed in seeds:
153
156
  self.__DOING__.pop(seed, None)
154
- logger.info("remove %s seeds from __DOING__" % len(seeds))
157
+ # logger.info("remove %s seeds from __DOING__" % len(seeds))
155
158
 
156
159
  def _execute(self):
157
160
  for func_name in self.__LAUNCHER_FUNC__:
@@ -168,7 +171,8 @@ class Launcher(threading.Thread):
168
171
  launcher_queue=self.__LAUNCHER_QUEUE__,
169
172
  custom_func=self.__CUSTOM_FUNC__,
170
173
  thread_num = self._spider_thread_num,
171
- max_retries = self._spider_max_retries
174
+ max_retries = self._spider_max_retries,
175
+ time_sleep=self._spider_time_sleep
172
176
  ).start()
173
177
 
174
178
  self._Pipeline(
@@ -3,6 +3,7 @@ import threading
3
3
 
4
4
  from cobweb.db import RedisDB
5
5
  from cobweb.base import Seed, logger
6
+ from cobweb.utils import BloomFilter
6
7
  from cobweb.constant import DealModel, LogTemplate
7
8
  from .launcher import Launcher, check_pause
8
9
 
@@ -18,8 +19,11 @@ class LauncherPro(Launcher):
18
19
  self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
19
20
  self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
20
21
  self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
22
+ self._bf_key = "bloom_%s_%s" % (project, task)
21
23
  self._db = RedisDB()
22
24
 
25
+ self._bf = BloomFilter(self._bf_key)
26
+
23
27
  self._heartbeat_start_event = threading.Event()
24
28
  self._redis_queue_empty_event = threading.Event()
25
29
 
@@ -57,9 +57,15 @@ DONE_MODEL = 0 # 0:种子消费成功直接从队列移除,失败则添加
57
57
  # spider
58
58
  SPIDER_THREAD_NUM = 10
59
59
  SPIDER_MAX_RETRIES = 5
60
+ SPIDER_TIME_SLEEP = 10
60
61
 
61
62
  # 任务模式
62
63
  TASK_MODEL = 0 # 0:单次,1:常驻
63
64
 
65
+
66
+ # bloom过滤器
67
+ CAPACITY = 100000000
68
+ ERROR_RATE = 0.001
69
+ FILTER_FIELD = "url"
64
70
  # 文件下载响应类型过滤
65
71
  # FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
@@ -1,3 +1,4 @@
1
1
  from .oss import OssUtil
2
2
  from .tools import *
3
+ from .bloom import BloomFilter
3
4
 
@@ -0,0 +1,98 @@
1
+ import time
2
+
3
+
4
+ from redis import Redis
5
+ from cobweb import setting
6
+
7
+ # class BloomFilter:
8
+ #
9
+ # def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
10
+ # redis_config = redis_config or setting.REDIS_CONFIG
11
+ # capacity = capacity or setting.CAPACITY
12
+ # error_rate = error_rate or setting.ERROR_RATE
13
+ # redis_config['db'] = 3
14
+ #
15
+ # self.key = key
16
+ #
17
+ # pool = redis.ConnectionPool(**redis_config)
18
+ # self.bit_size = self.get_bit_size(capacity, error_rate)
19
+ # self.hash_count = self.get_hash_count(self.bit_size, capacity)
20
+ # self._init_bloom_key()
21
+ #
22
+ # def add(self, value):
23
+ # for seed in range(self.hash_count):
24
+ # result = mmh3.hash(value, seed) % self.bit_size
25
+ # self._client.setbit(self.key, result, 1)
26
+ # return True
27
+ #
28
+ # def exists(self, value):
29
+ # if not self._client.exists(self.key):
30
+ # return False
31
+ # for seed in range(self.hash_count):
32
+ # result = mmh3.hash(value, seed) % self.bit_size
33
+ # if not self._client.getbit(self.key, result):
34
+ # return False
35
+ # return True
36
+ #
37
+ # def _init_bloom_key(self):
38
+ # lua_script = """
39
+ # redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
40
+ # redis.call("EXPIRE", KEYS[1], 604800)
41
+ # """
42
+ # if self._client.exists(self.key):
43
+ # return True
44
+ # execute = self._client.register_script(lua_script)
45
+ # execute(keys=[self.key], args=[self.bit_size-1, 1])
46
+ #
47
+ # @classmethod
48
+ # def get_bit_size(cls, n, p):
49
+ # return int(-(n * math.log(p)) / (math.log(2) ** 2))
50
+ #
51
+ # @classmethod
52
+ # def get_hash_count(cls, m, n):
53
+ # return int((m / n) * math.log(2))
54
+
55
class BloomFilter:
    """Redis-backed Bloom filter using the RedisBloom module (BF.* commands).

    All filter state lives server-side under *key* in redis db 3, so multiple
    launcher processes for the same project/task share one filter.

    NOTE(review): requires a Redis server with the RedisBloom module loaded;
    plain Redis will reject the BF.* commands.
    """

    def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
        """
        :param key: redis key that names the bloom filter
        :param redis_config: kwargs for redis.Redis; defaults to setting.REDIS_CONFIG
        :param capacity: expected number of items; defaults to setting.CAPACITY
        :param error_rate: desired false-positive rate; defaults to setting.ERROR_RATE
        """
        from redis.exceptions import ResponseError

        # Copy before mutating: the previous code did `redis_config['db'] = 3`
        # directly on the caller's dict, which permanently injected db=3 into
        # the shared setting.REDIS_CONFIG object for every other consumer.
        redis_config = dict(redis_config or setting.REDIS_CONFIG)
        capacity = capacity or setting.CAPACITY
        error_rate = error_rate or setting.ERROR_RATE
        redis_config['db'] = 3  # bloom filters live in a dedicated db

        self.key = key

        self._client = Redis(**redis_config).bf()
        try:
            self._client.create(key=self.key, capacity=capacity, errorRate=error_rate)
        except ResponseError as e:
            # BF.RESERVE raises "item exists" if the filter was already created
            # (e.g. a relaunched task); that is not an error for our purposes.
            if "exists" not in str(e).lower():
                raise

    def add(self, value):
        """Add *value*; returns 1 if newly added, 0 if it may already exist."""
        return self._client.add(self.key, value)

    def madd(self, items: list):
        """Add many items in one round trip; returns per-item 0/1 flags."""
        return self._client.madd(self.key, *items)

    def exists(self, value):
        """Return whether *value* is possibly present (may false-positive)."""
        return self._client.exists(self.key, value)

    def mexists(self, items: list):
        """Membership test for many items in one round trip; returns flags."""
        return self._client.mexists(self.key, *items)
79
+
80
+
81
if __name__ == '__main__':
    import os

    # Manual smoke test against a live Redis+RedisBloom instance.
    # Connection details come from the environment: the published 1.2.13
    # tarball shipped a real Aliyun host and password here, which is a
    # credential leak — never hard-code secrets in packaged code.
    testBLF = BloomFilter("test", {
        "host": os.environ.get("REDIS_HOST", "localhost"),
        "password": os.environ.get("REDIS_PASSWORD"),
    })

    print("start")
    start_time = time.time()
    testBLF.add("test")
    add_time = time.time()
    print("add time::: ")
    print(add_time - start_time)
    print("get::: ")
    print(testBLF.exists("test"))
    exist_time = time.time()
    print("get time::: ")
    print(exist_time - add_time)
98
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 1.2.11
3
+ Version: 1.2.13
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -27,6 +27,7 @@ cobweb/pipelines/pipeline.py
27
27
  cobweb/pipelines/pipeline_console.py
28
28
  cobweb/pipelines/pipeline_loghub.py
29
29
  cobweb/utils/__init__.py
30
+ cobweb/utils/bloom.py
30
31
  cobweb/utils/oss.py
31
32
  cobweb/utils/tools.py
32
33
  cobweb_launcher.egg-info/PKG-INFO
@@ -2,3 +2,4 @@ requests>=2.19.1
2
2
  oss2>=2.18.1
3
3
  redis>=4.4.4
4
4
  aliyun-log-python-sdk
5
+ mmh3
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
5
5
 
6
6
  setup(
7
7
  name="cobweb-launcher",
8
- version="1.2.11",
8
+ version="1.2.13",
9
9
  packages=find_packages(),
10
10
  url="https://github.com/Juannie-PP/cobweb",
11
11
  license="MIT",
@@ -14,7 +14,7 @@ setup(
14
14
  description="spider_hole",
15
15
  long_description=long_description,
16
16
  long_description_content_type="text/markdown",
17
- install_requires=["requests>=2.19.1", "oss2>=2.18.1", "redis>=4.4.4", "aliyun-log-python-sdk"],
17
+ install_requires=["requests>=2.19.1", "oss2>=2.18.1", "redis>=4.4.4", "aliyun-log-python-sdk", "mmh3"],
18
18
  classifiers=[
19
19
  "Programming Language :: Python :: 3",
20
20
  ],