cobweb-launcher 1.2.11.tar.gz → 1.2.14.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cobweb-launcher might be problematic.

Files changed (40)
  1. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/PKG-INFO +1 -1
  2. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/crawlers/crawler.py +24 -14
  3. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/db/redis_db.py +1 -1
  4. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/launchers/launcher.py +30 -3
  5. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/launchers/launcher_pro.py +43 -20
  6. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/setting.py +7 -0
  7. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/utils/__init__.py +1 -0
  8. cobweb-launcher-1.2.14/cobweb/utils/bloom.py +76 -0
  9. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/PKG-INFO +1 -1
  10. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/SOURCES.txt +3 -1
  11. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/requires.txt +1 -0
  12. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/setup.py +2 -2
  13. cobweb-launcher-1.2.14/test/test.py +38 -0
  14. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/LICENSE +0 -0
  15. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/README.md +0 -0
  16. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/__init__.py +0 -0
  17. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/__init__.py +0 -0
  18. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/common_queue.py +0 -0
  19. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/decorators.py +0 -0
  20. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/item.py +0 -0
  21. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/log.py +0 -0
  22. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/request.py +0 -0
  23. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/response.py +0 -0
  24. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/base/seed.py +0 -0
  25. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/constant.py +0 -0
  26. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/crawlers/__init__.py +0 -0
  27. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/db/__init__.py +0 -0
  28. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/exceptions/__init__.py +0 -0
  29. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/exceptions/oss_db_exception.py +0 -0
  30. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/launchers/__init__.py +0 -0
  31. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/launchers/launcher_air.py +0 -0
  32. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/pipelines/__init__.py +0 -0
  33. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/pipelines/pipeline.py +0 -0
  34. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/pipelines/pipeline_console.py +0 -0
  35. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/pipelines/pipeline_loghub.py +0 -0
  36. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/utils/oss.py +0 -0
  37. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb/utils/tools.py +0 -0
  38. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  39. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/cobweb_launcher.egg-info/top_level.txt +0 -0
  40. {cobweb-launcher-1.2.11 → cobweb-launcher-1.2.14}/setup.cfg +0 -0
--- cobweb-launcher-1.2.11/PKG-INFO
+++ cobweb-launcher-1.2.14/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cobweb-launcher
-Version: 1.2.11
+Version: 1.2.14
 Summary: spider_hole
 Home-page: https://github.com/Juannie-PP/cobweb
 Author: Juannie-PP
--- cobweb-launcher-1.2.11/cobweb/crawlers/crawler.py
+++ cobweb-launcher-1.2.14/cobweb/crawlers/crawler.py
@@ -23,25 +23,33 @@ class Crawler(threading.Thread):
         self,
         stop: threading.Event,
         pause: threading.Event,
-        launcher_queue: Union[Mapping[str, Queue]],
+        # launcher_queue: Union[Mapping[str, Queue]],
+        get_seed: Callable,
+        set_seed: Callable,
+        add_seed: Callable,
+        delete_seed: Callable,
+        upload_data: Callable,
         custom_func: Union[Mapping[str, Callable]],
         thread_num: int,
-        max_retries: int
+        max_retries: int,
+        time_sleep: int,
     ):
         super().__init__()
 
         self._stop = stop
         self._pause = pause
-        self._new = launcher_queue["new"]
-        self._todo = launcher_queue["todo"]
-        self._done = launcher_queue["done"]
-        self._upload = launcher_queue["upload"]
+        self._get_seed = get_seed
+        self._set_seed = set_seed
+        self._add_seed = add_seed
+        self._delete_seed = delete_seed
+        self._upload_data = upload_data
 
         for func_name, _callable in custom_func.items():
             if isinstance(_callable, Callable):
                 self.__setattr__(func_name, _callable)
 
         self.thread_num = thread_num
+        self.time_sleep = time_sleep
         self.max_retries = max_retries
 
     @staticmethod
@@ -64,23 +72,23 @@ class Crawler(threading.Thread):
 
     def distribute(self, item, seed):
         if isinstance(item, BaseItem):
-            self._upload.push(item)
+            self._upload_data(item)
         elif isinstance(item, Seed):
-            self._new.push(item)
+            self._add_seed(item)
         elif isinstance(item, str) and item == DealModel.poll:
-            self._todo.push(seed)
+            self._set_seed(seed)
         elif isinstance(item, str) and item == DealModel.done:
-            self._done.push(seed)
+            self._delete_seed(seed)
         elif isinstance(item, str) and item == DealModel.fail:
             seed.params.seed_status = DealModel.fail
-            self._done.push(seed)
+            self._delete_seed(seed)
         else:
             raise TypeError("yield value type error!")
 
     def spider(self):
         while not self._stop.is_set():
 
-            seed = self._todo.pop()
+            seed = self._get_seed()
 
             if not seed:
                 time.sleep(1)
@@ -88,7 +96,7 @@ class Crawler(threading.Thread):
 
             elif seed.params.retry > self.max_retries:
                 seed.params.seed_status = DealModel.fail
-                self._done.push(seed)
+                self._delete_seed(seed)
                 continue
 
             seed_detail_log_info = LogTemplate.log_info(seed.to_dict)
@@ -149,7 +157,9 @@ class Crawler(threading.Thread):
                     exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
                 ))
                 seed.params.retry += 1
-                self._todo.push(seed)
+                # self._todo.push(seed)
+                self._set_seed(seed)
+                time.sleep(self.time_sleep * seed.params.retry)
             finally:
                 time.sleep(0.1)
         logger.info("spider thread close")
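Besides swapping direct queue access for injected callables, the retry path now backs off linearly: each failed attempt requeues the seed and then sleeps time_sleep × retry inside the worker thread, blocking that thread for the duration. A minimal sketch of the schedule (helper name is hypothetical; the real code inlines the sleep in spider()):

    # With the new default SPIDER_TIME_SLEEP = 10, retries 1..5 wait 10..50 s
    # before the thread picks up the next seed.
    def backoff_seconds(time_sleep: int, retry: int) -> int:
        return time_sleep * retry

    print([backoff_seconds(10, r) for r in range(1, 6)])  # [10, 20, 30, 40, 50]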
--- cobweb-launcher-1.2.11/cobweb/db/redis_db.py
+++ cobweb-launcher-1.2.14/cobweb/db/redis_db.py
@@ -63,7 +63,7 @@ class RedisDB:
 
     def lock(self, key, t=15) -> bool:
         lua_script = """
-        local status = redis.call('setnx', KEYS[1], ARGV[1])
+        local status = redis.call('setnx', KEYS[1], 1)
         if ( status == 1 ) then
             redis.call('expire', KEYS[1], ARGV[1])
         end
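The one-line Lua change is a cleanup: SETNX previously stored ARGV[1] (the TTL) as the lock's value, which was misleading since the value is never read; the key now holds the constant 1 and ARGV[1] is used only for EXPIRE. For reference, the same acquire-with-TTL semantics can be had from redis-py's atomic SET options (a sketch, assuming a redis.Redis client named r; the hunk does not show the script's return, only the method's -> bool signature):

    import redis

    def lock(r: redis.Redis, key: str, t: int = 15) -> bool:
        # SET key 1 NX EX t — one atomic command: True for the caller that
        # created the key, False while it already exists.
        return bool(r.set(key, 1, nx=True, ex=t))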
--- cobweb-launcher-1.2.11/cobweb/launchers/launcher.py
+++ cobweb-launcher-1.2.14/cobweb/launchers/launcher.py
@@ -17,6 +17,8 @@ def check_pause(func):
             func(self, *args, **kwargs)
         except Exception as e:
             logger.info(f"{func.__name__}: " + str(e))
+        finally:
+            time.sleep(0.1)
 
     return wrapper
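The new finally clause throttles every wrapped scheduler function: each invocation now yields for 100 ms whether it succeeds or raises, so a function that fails persistently can no longer busy-loop. Reconstructed in full as a sketch (the loop around the try block is an assumption inferred from how @check_pause is used; logger is the module's cobweb logger):

    import time
    from functools import wraps

    def check_pause(func):
        @wraps(func)
        def wrapper(self, *args, **kwargs):
            while not self._pause.is_set():   # assumed loop; the hunk shows only the body
                try:
                    func(self, *args, **kwargs)
                except Exception as e:
                    logger.info(f"{func.__name__}: " + str(e))
                finally:
                    time.sleep(0.1)           # new in 1.2.14: throttle even on failure
        return wrapper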
@@ -94,10 +96,14 @@ class Launcher(threading.Thread):
 
         self._spider_max_retries = setting.SPIDER_MAX_RETRIES
         self._spider_thread_num = setting.SPIDER_THREAD_NUM
+        self._spider_time_sleep = setting.SPIDER_TIME_SLEEP
+        self._spider_max_speed = setting.SPIDER_MAX_SPEED
 
         self._done_model = setting.DONE_MODEL
         self._task_model = setting.TASK_MODEL
 
+        self._filter_field = setting.FILTER_FIELD
+
     @property
     def request(self):
         """
@@ -151,7 +157,22 @@ class Launcher(threading.Thread):
     def _remove_doing_seeds(self, seeds):
         for seed in seeds:
             self.__DOING__.pop(seed, None)
-        logger.info("remove %s seeds from __DOING__" % len(seeds))
+        # logger.info("remove %s seeds from __DOING__" % len(seeds))
+
+    def _get_seed(self) -> Seed:
+        return self.__LAUNCHER_QUEUE__["todo"].pop()
+
+    def _set_seed(self, seed, **kwargs):
+        self.__LAUNCHER_QUEUE__["todo"].push(seed, **kwargs)
+
+    def _upload_data(self, data, **kwargs):
+        self.__LAUNCHER_QUEUE__["upload"].push(data, **kwargs)
+
+    def _add_seed(self, seed, **kwargs):
+        self.__LAUNCHER_QUEUE__["new"].push(seed, **kwargs)
+
+    def _delete_seed(self, seed, **kwargs):
+        self.__LAUNCHER_QUEUE__["done"].push(seed, **kwargs)
 
     def _execute(self):
         for func_name in self.__LAUNCHER_FUNC__:
@@ -165,10 +186,16 @@ class Launcher(threading.Thread):
 
         self._Crawler(
             stop=self._stop, pause=self._pause,
-            launcher_queue=self.__LAUNCHER_QUEUE__,
+            # launcher_queue=self.__LAUNCHER_QUEUE__,
+            get_seed=self._get_seed,
+            set_seed=self._set_seed,
+            add_seed=self._add_seed,
+            delete_seed=self._delete_seed,
+            upload_data=self._upload_data,
             custom_func=self.__CUSTOM_FUNC__,
             thread_num = self._spider_thread_num,
-            max_retries = self._spider_max_retries
+            max_retries = self._spider_max_retries,
+            time_sleep=self._spider_time_sleep
         ).start()
 
         self._Pipeline(
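Taken together, the launcher.py changes invert the dependency between launcher and crawler: Crawler no longer receives the queue mapping, only five callables, and the base Launcher's new _get_seed/_set_seed/_add_seed/_delete_seed/_upload_data methods wrap the in-memory queues. A subclass can then override any hook without touching Crawler, which is what LauncherPro does with _get_seed below. A minimal sketch of the pattern, with hypothetical class names:

    from queue import SimpleQueue
    from typing import Callable, Optional

    class MiniLauncher:
        """Owns the queues; exposes them only through overridable hooks."""
        def __init__(self) -> None:
            self._todo: SimpleQueue = SimpleQueue()

        def _get_seed(self) -> Optional[str]:
            return None if self._todo.empty() else self._todo.get()

    class MiniCrawler:
        """Receives callables, never the queue objects themselves."""
        def __init__(self, get_seed: Callable[[], Optional[str]]) -> None:
            self._get_seed = get_seed

        def step(self) -> Optional[str]:
            return self._get_seed()

    launcher = MiniLauncher()
    crawler = MiniCrawler(get_seed=launcher._get_seed)  # bound method as the hook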
--- cobweb-launcher-1.2.11/cobweb/launchers/launcher_pro.py
+++ cobweb-launcher-1.2.14/cobweb/launchers/launcher_pro.py
@@ -3,6 +3,7 @@ import threading
 
 from cobweb.db import RedisDB
 from cobweb.base import Seed, logger
+from cobweb.utils import BloomFilter
 from cobweb.constant import DealModel, LogTemplate
 from .launcher import Launcher, check_pause
 
@@ -15,11 +16,19 @@ class LauncherPro(Launcher):
         self._done_key = "{%s:%s}:done" % (project, task)
         self._fail_key = "{%s:%s}:fail" % (project, task)
         self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
-        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
+
         self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
         self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
+        self._speed_control_key = "speed_control:%s_%s" % (project, task)
+
+        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
+
+        self._bf_key = "bloom_%s_%s" % (project, task)
+
         self._db = RedisDB()
 
+        self._bf = BloomFilter(self._bf_key)
+
         self._heartbeat_start_event = threading.Event()
         self._redis_queue_empty_event = threading.Event()
@@ -33,6 +42,21 @@ class LauncherPro(Launcher):
         else:
             self._db._client.incrby(key, count)
 
+    def _get_seed(self) -> Seed:
+        spider_speed = self._db._client.get(self._speed_control_key)
+        if int(spider_speed or 0) > self._spider_max_speed:
+            expire_time = self._db.ttl(self._speed_control_key)
+            if expire_time == -1:
+                self._db.delete(self._speed_control_key)
+            else:
+                logger.info(f"Too fast! Please wait {expire_time} seconds...")
+                time.sleep(expire_time / 2)
+            return None
+        seed = self.__LAUNCHER_QUEUE__["todo"].pop()
+        if seed and not self._db.lock(self._speed_control_key, t=60):
+            self._db._client.incrby(self._speed_control_key, 1)
+        return seed
+
     @check_pause
     def _execute_heartbeat(self):
         if self._heartbeat_start_event.is_set():
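The override implements a fixed-window rate limiter on top of RedisDB.lock: the first fetch opens a 60-second window (lock creates speed_control_key with a TTL), each further fetch increments the counter, and once the count passes SPIDER_MAX_SPEED the method sleeps and returns None until the key expires; the ttl == -1 branch cleans up a counter that has somehow lost its TTL. The same idea as a standalone sketch (assuming a redis.Redis client named r; function name is hypothetical):

    import redis

    def allow_fetch(r: redis.Redis, key: str, max_speed: int, window: int = 60) -> bool:
        count = int(r.get(key) or 0)
        if count > max_speed:
            return False                       # budget spent; caller should back off
        if not r.set(key, 1, nx=True, ex=window):
            r.incrby(key, 1)                   # window already open: count this fetch
        return True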
@@ -110,7 +134,7 @@
         """
        Delete queue seeds; add each to the succeed or fail queue by status; remove the seed index from the doing dict.
         """
-        seeds, s_seeds, f_seeds = [], [], []
+        seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}
         status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
 
         for _ in range(self._done_queue_max_size):
@@ -118,26 +142,25 @@
             if not seed:
                 break
             if seed.params.seed_status == DealModel.fail:
-                f_seeds.append(seed.to_string)
+                seed_info["failed"].append(seed.to_string)
             elif self._done_model == 1:
-                s_seeds.append(seed.to_string)
+                seed_info["succeed"].append(seed.to_string)
             else:
-                seeds.append(seed.to_string)
-        if seeds:
-            count = self._db.zrem(self._todo_key, *seeds)
-            if count:
-                self.statistics(self._statistics_done_key, count)
-            self._remove_doing_seeds(seeds)
-        if s_seeds:
-            count = self._db.done([self._todo_key, self._done_key], *s_seeds)
-            if count:
-                self.statistics(self._statistics_done_key, count)
-            self._remove_doing_seeds(s_seeds)
-        if f_seeds:
-            count = self._db.done([self._todo_key, self._fail_key], *f_seeds)
-            if count:
-                self.statistics(self._statistics_fail_key, count)
-            self._remove_doing_seeds(f_seeds)
+                seed_info["common"].append(seed.to_string)
+            seed_info['count'] += 1
+
+        if seed_info["count"]:
+
+            succeed_count = self._db.zrem(self._todo_key, *seed_info["common"])
+            succeed_count += self._db.done([self._todo_key, self._done_key], *seed_info["succeed"])
+            failed_count = self._db.done([self._todo_key, self._fail_key], *seed_info["failed"])
+
+            if failed_count:
+                self.statistics(self._statistics_fail_key, failed_count)
+            if succeed_count:
+                self.statistics(self._statistics_done_key, succeed_count)
+
+            self._remove_doing_seeds(seed_info["common"] + seed_info["succeed"] + seed_info["failed"])
 
         if status:
             time.sleep(self._done_queue_wait_seconds)
--- cobweb-launcher-1.2.11/cobweb/setting.py
+++ cobweb-launcher-1.2.14/cobweb/setting.py
@@ -57,9 +57,16 @@ DONE_MODEL = 0  # 0: remove seed from queue directly on success; on failure add
 # spider
 SPIDER_THREAD_NUM = 10
 SPIDER_MAX_RETRIES = 5
+SPIDER_TIME_SLEEP = 10
+SPIDER_MAX_SPEED = 1000  # max fetches per minute
 
 # task mode
 TASK_MODEL = 0  # 0: one-shot, 1: long-running
 
+
+# bloom filter
+CAPACITY = 100000000
+ERROR_RATE = 0.001
+FILTER_FIELD = "url"
 # content-type filter for file downloads
 # FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
--- cobweb-launcher-1.2.11/cobweb/utils/__init__.py
+++ cobweb-launcher-1.2.14/cobweb/utils/__init__.py
@@ -1,3 +1,4 @@
 from .oss import OssUtil
 from .tools import *
+from .bloom import BloomFilter
 
--- /dev/null
+++ cobweb-launcher-1.2.14/cobweb/utils/bloom.py
@@ -0,0 +1,76 @@
+import math
+import time
+
+import mmh3
+import redis
+from cobweb import setting
+
+
+class BloomFilter:
+
+    def __init__(self, key, redis_config=None, capacity=None, error_rate=None):
+        redis_config = redis_config or setting.REDIS_CONFIG
+        capacity = capacity or setting.CAPACITY
+        error_rate = error_rate or setting.ERROR_RATE
+        redis_config['db'] = 3
+
+        self.key = key
+
+        pool = redis.ConnectionPool(**redis_config)
+        self._client = redis.Redis(connection_pool=pool)
+        self.bit_size = self.get_bit_size(capacity, error_rate)
+        self.hash_count = self.get_hash_count(self.bit_size, capacity)
+        self._init_bloom_key()
+
+    def add(self, value):
+        for seed in range(self.hash_count):
+            result = mmh3.hash(value, seed) % self.bit_size
+            self._client.setbit(self.key, result, 1)
+        return True
+
+    def exists(self, value):
+        if not self._client.exists(self.key):
+            return False
+        for seed in range(self.hash_count):
+            result = mmh3.hash(value, seed) % self.bit_size
+            if not self._client.getbit(self.key, result):
+                return False
+        return True
+
+    def _init_bloom_key(self):
+        lua_script = """
+        redis.call("SETBIT", KEYS[1], ARGV[1], ARGV[2])
+        redis.call("EXPIRE", KEYS[1], 604800)
+        """
+        if self._client.exists(self.key):
+            return True
+        execute = self._client.register_script(lua_script)
+        execute(keys=[self.key], args=[self.bit_size-1, 1])
+
+    @classmethod
+    def get_bit_size(cls, n, p):
+        return int(-(n * math.log(p)) / (math.log(2) ** 2))
+
+    @classmethod
+    def get_hash_count(cls, m, n):
+        return int((m / n) * math.log(2))
+
+
+if __name__ == '__main__':
+    testBLF = BloomFilter("test", {
+        "host": "r-j6c1t3etiefpmz7cwdpd.redis.rds.aliyuncs.com",
+        "password": "SpiderLinux666",
+    })
+
+    print("start")
+    start_time = time.time()
+    testBLF.add("test")
+    add_time = time.time()
+    print("add time::: ")
+    print(add_time - start_time)
+    print("get::: ")
+    print(testBLF.exists("test"))
+    exist_time = time.time()
+    print("get time::: ")
+    print(exist_time - add_time)
+
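The new BloomFilter uses the standard sizing formulas m = -n·ln(p)/ln(2)² and k = (m/n)·ln(2), with k seeded mmh3 hashes over a Redis bitmap that _init_bloom_key pre-extends to bit_size and expires after 7 days (604800 s). Note that the hunks shown only construct the filter (self._bf) and read FILTER_FIELD; no displayed hunk consults it yet, and the __main__ demo embeds a live Redis endpoint and password. Plugging the new defaults (CAPACITY = 100000000, ERROR_RATE = 0.001) into the formulas gives a feel for the footprint:

    import math

    n, p = 100_000_000, 0.001
    m = int(-(n * math.log(p)) / (math.log(2) ** 2))   # bit_size
    k = int((m / n) * math.log(2))                     # hash_count
    print(m)  # ~1.44e9 bits, i.e. roughly 171 MiB of Redis memory
    print(k)  # 9 hash functions (int() truncates 9.97; ceil would give 10)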
--- cobweb-launcher-1.2.11/cobweb_launcher.egg-info/PKG-INFO
+++ cobweb-launcher-1.2.14/cobweb_launcher.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cobweb-launcher
-Version: 1.2.11
+Version: 1.2.14
 Summary: spider_hole
 Home-page: https://github.com/Juannie-PP/cobweb
 Author: Juannie-PP
--- cobweb-launcher-1.2.11/cobweb_launcher.egg-info/SOURCES.txt
+++ cobweb-launcher-1.2.14/cobweb_launcher.egg-info/SOURCES.txt
@@ -27,10 +27,12 @@ cobweb/pipelines/pipeline.py
 cobweb/pipelines/pipeline_console.py
 cobweb/pipelines/pipeline_loghub.py
 cobweb/utils/__init__.py
+cobweb/utils/bloom.py
 cobweb/utils/oss.py
 cobweb/utils/tools.py
 cobweb_launcher.egg-info/PKG-INFO
 cobweb_launcher.egg-info/SOURCES.txt
 cobweb_launcher.egg-info/dependency_links.txt
 cobweb_launcher.egg-info/requires.txt
-cobweb_launcher.egg-info/top_level.txt
+cobweb_launcher.egg-info/top_level.txt
+test/test.py
--- cobweb-launcher-1.2.11/cobweb_launcher.egg-info/requires.txt
+++ cobweb-launcher-1.2.14/cobweb_launcher.egg-info/requires.txt
@@ -2,3 +2,4 @@ requests>=2.19.1
 oss2>=2.18.1
 redis>=4.4.4
 aliyun-log-python-sdk
+mmh3
--- cobweb-launcher-1.2.11/setup.py
+++ cobweb-launcher-1.2.14/setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="cobweb-launcher",
-    version="1.2.11",
+    version="1.2.14",
     packages=find_packages(),
     url="https://github.com/Juannie-PP/cobweb",
     license="MIT",
@@ -14,7 +14,7 @@ setup(
     description="spider_hole",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    install_requires=["requests>=2.19.1", "oss2>=2.18.1", "redis>=4.4.4", "aliyun-log-python-sdk"],
+    install_requires=["requests>=2.19.1", "oss2>=2.18.1", "redis>=4.4.4", "aliyun-log-python-sdk", "mmh3"],
     classifiers=[
         "Programming Language :: Python :: 3",
     ],
--- /dev/null
+++ cobweb-launcher-1.2.14/test/test.py
@@ -0,0 +1,38 @@
+import os
+
+
+os.environ["SPIDER_NUM"] = "1"
+os.environ["REDIS_HOST"] = "r-j6cc5zw8m3pqom4chmpd.redis.rds.aliyuncs.com"
+os.environ["REDIS_PASSWORD"] = "SpiderLinux666"
+os.environ["REDIS_PORT"] = "6379"
+os.environ["REDIS_DB"] = "0"
+os.environ["SCHEDULER_QUEUE_LENGTH"] = "100"
+os.environ["STORER_QUEUE_LENGTH"] = "10"
+os.environ["LOGSTORE"] = "download_meta"
+os.environ["LOGHUB_ACCESS_KEY"] = "LTAI5tH2FLYsBkxcbiLdoYZT"
+os.environ["LOGHUB_ACCESS_SECRET"] = "4oD1NVvYfaWiqxkmGx6c2xtpPWEq17"
+os.environ["BUCKET"] = "databee-video"
+os.environ["ENDPOINT"] = "http://oss-cn-hangzhou.aliyuncs.com"
+os.environ["OSS_ACCESS_KEY"] = "LTAI5tH2FLYsBkxcbiLdoYZT"
+os.environ["OSS_ACCESS_SECRET"] = "4oD1NVvYfaWiqxkmGx6c2xtpPWEq17"
+os.environ["CHUNK_SIZE"] = "1048576"
+os.environ["MIN_SIZE"] = "1024"
+
+
+from cobweb import LauncherPro
+
+app = LauncherPro("test", "test", SPIDER_THREAD_NUM=1)
+
+app.SEEDS = [
+    {"url": "https://www.baidu.com"},
+    {"url": "https://www.google.com"},
+    {"url": "https://www.bing.com"},
+    {"url": "https://www.baidu.com?v=1"},
+    {"url": "https://www.baidu.com?v=2"},
+    {"url": "https://www.baidu.com?v=3"},
+    {"url": "https://www.baidu.com?v=4"},
+    {"url": "https://www.baidu.com?v=5"}
+]
+
+if __name__ == '__main__':
+    app.start()