cobweb-launcher 0.1.8__py3-none-any.whl → 1.2.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. cobweb/__init__.py +2 -11
  2. cobweb/base/__init__.py +9 -0
  3. cobweb/base/basic.py +297 -0
  4. cobweb/base/common_queue.py +30 -0
  5. cobweb/base/decorators.py +40 -0
  6. cobweb/base/dotting.py +35 -0
  7. cobweb/base/item.py +46 -0
  8. cobweb/{log.py → base/log.py} +4 -6
  9. cobweb/base/request.py +82 -0
  10. cobweb/base/response.py +23 -0
  11. cobweb/base/seed.py +114 -0
  12. cobweb/constant.py +94 -0
  13. cobweb/crawlers/__init__.py +1 -0
  14. cobweb/crawlers/base_crawler.py +144 -0
  15. cobweb/crawlers/crawler.py +209 -0
  16. cobweb/crawlers/file_crawler.py +98 -0
  17. cobweb/db/__init__.py +2 -2
  18. cobweb/db/api_db.py +82 -0
  19. cobweb/db/redis_db.py +125 -218
  20. cobweb/exceptions/__init__.py +1 -0
  21. cobweb/exceptions/oss_db_exception.py +28 -0
  22. cobweb/launchers/__init__.py +3 -0
  23. cobweb/launchers/launcher.py +235 -0
  24. cobweb/launchers/launcher_air.py +88 -0
  25. cobweb/launchers/launcher_api.py +209 -0
  26. cobweb/launchers/launcher_pro.py +208 -0
  27. cobweb/pipelines/__init__.py +3 -0
  28. cobweb/pipelines/pipeline.py +69 -0
  29. cobweb/pipelines/pipeline_console.py +22 -0
  30. cobweb/pipelines/pipeline_loghub.py +34 -0
  31. cobweb/schedulers/__init__.py +3 -0
  32. cobweb/schedulers/scheduler_api.py +72 -0
  33. cobweb/schedulers/scheduler_redis.py +72 -0
  34. cobweb/setting.py +67 -6
  35. cobweb/utils/__init__.py +5 -0
  36. cobweb/utils/bloom.py +58 -0
  37. cobweb/utils/dotting.py +32 -0
  38. cobweb/utils/oss.py +94 -0
  39. cobweb/utils/tools.py +42 -0
  40. cobweb_launcher-1.2.41.dist-info/METADATA +205 -0
  41. cobweb_launcher-1.2.41.dist-info/RECORD +44 -0
  42. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/WHEEL +1 -1
  43. cobweb/bbb.py +0 -191
  44. cobweb/db/oss_db.py +0 -127
  45. cobweb/db/scheduler/__init__.py +0 -0
  46. cobweb/db/scheduler/default.py +0 -8
  47. cobweb/db/scheduler/textfile.py +0 -27
  48. cobweb/db/storer/__init__.py +0 -0
  49. cobweb/db/storer/console.py +0 -9
  50. cobweb/db/storer/loghub.py +0 -54
  51. cobweb/db/storer/redis.py +0 -15
  52. cobweb/db/storer/textfile.py +0 -15
  53. cobweb/decorators.py +0 -16
  54. cobweb/distributed/__init__.py +0 -0
  55. cobweb/distributed/launcher.py +0 -243
  56. cobweb/distributed/models.py +0 -143
  57. cobweb/interface.py +0 -34
  58. cobweb/single/__init__.py +0 -0
  59. cobweb/single/launcher.py +0 -231
  60. cobweb/single/models.py +0 -134
  61. cobweb/single/nest.py +0 -153
  62. cobweb/task.py +0 -50
  63. cobweb/utils.py +0 -90
  64. cobweb_launcher-0.1.8.dist-info/METADATA +0 -45
  65. cobweb_launcher-0.1.8.dist-info/RECORD +0 -31
  66. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/LICENSE +0 -0
  67. {cobweb_launcher-0.1.8.dist-info → cobweb_launcher-1.2.41.dist-info}/top_level.txt +0 -0
cobweb/db/api_db.py ADDED
@@ -0,0 +1,82 @@
+ import os
+ import json
+ import requests
+
+
+ class ApiDB:
+
+     def __init__(self, host=None, **kwargs):
+         self.host = host or os.getenv("REDIS_API_HOST", "http://127.0.0.1:4396")
+
+     def _get_response(self, api, params: dict = None):
+         try:
+             url = self.host + api
+             response = requests.get(url, params=params)
+             json_data = response.json()
+             response.close()
+             return json_data["data"]
+         except:
+             return None
+
+     def _post_response(self, api, params: dict = None, data: dict = None):
+         try:
+             url = self.host + api
+             headers = {"Content-Type": "application/json"}
+             response = requests.post(url, headers=headers, params=params, data=json.dumps(data))
+             json_data = response.json()
+             response.close()
+             return json_data["data"]
+         except:
+             return None
+
+     def get(self, name):
+         return self._get_response(api="/get", params=dict(name=name))
+
+     def setnx(self, name, value=""):
+         return self._get_response(api="/setnx", params=dict(name=name, value=value))
+
+     def setex(self, name, t, value=""):
+         return self._get_response(api="/setex", params=dict(name=name, value=value, t=t))
+
+     def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
+         return self._get_response(api="/expire", params=dict(name=name, t=t, nx=nx, xx=xx, gt=gt, lt=lt))
+
+     def ttl(self, name):
+         return self._get_response(api="/ttl", params=dict(name=name))
+
+     def delete(self, name):
+         return self._get_response(api="/delete", params=dict(name=name))
+
+     def exists(self, name):
+         return self._get_response(api="/exists", params=dict(name=name))
+
+     def incrby(self, name, value):
+         return self._get_response(api="/incrby", params=dict(name=name, value=value))
+
+     def zcard(self, name) -> bool:
+         return self._get_response(api="/zcard", params=dict(name=name))
+
+     def zadd(self, name, item: dict, **kwargs):
+         return self._post_response(api="/zadd", data=dict(name=name, mapping=item, **kwargs))
+
+     def zrem(self, name, *values):
+         return self._post_response(api="/zrem", data=dict(name=name, values=values))
+
+     def zcount(self, name, _min, _max):
+         return self._get_response(api="/zcount", params=dict(name=name, min=_min, max=_max))
+
+     def lock(self, name, t=15) -> bool:
+         return self._get_response(api="/lock", params=dict(name=name, t=t))
+
+     def auto_incr(self, name, t=15, limit=1000) -> bool:
+         return self._get_response(api="/auto_incr", params=dict(name=name, t=t, limit=limit))
+
+     def members(self, name, score, start=0, count=5000, _min="-inf", _max="+inf"):
+         return self._get_response(api="/members", params=dict(name=name, score=score, start=start, count=count, min=_min, max=_max))
+
+     def done(self, name: list, *values):
+         return self._post_response(api="/done", data=dict(name=name, values=values))
+
+
+
+
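For orientation, here is a minimal usage sketch of the new ApiDB wrapper. It is not part of the diff: it assumes a compatible Redis-over-HTTP gateway is reachable at REDIS_API_HOST (default http://127.0.0.1:4396) and that it serves the /get, /lock, /zadd, /members and /done endpoints the class calls; the key names are illustrative placeholders.

# Usage sketch only -- assumes a Redis HTTP gateway is running at REDIS_API_HOST.
from cobweb.db.api_db import ApiDB

db = ApiDB()  # or ApiDB(host="http://redis-gateway.internal:4396")

# Every method returns the "data" field of the JSON response, or None if the request fails.
db.setnx("demo:init", "1")
print(db.get("demo:init"), db.ttl("demo:init"))

# Distributed lock and a sorted-set style seed queue.
if db.lock("demo:lock", t=15):
    db.zadd("demo:seeds", {"https://example.com": 300})
    print(db.members("demo:seeds", score=0, count=10))
    db.done(["demo:seeds", "demo:succeed"], "https://example.com")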
cobweb/db/redis_db.py CHANGED
@@ -1,223 +1,130 @@
- import time
  import redis
- from cobweb import Seed, log
- from cobweb.decorators import check_redis_status
+ from cobweb import setting


  class RedisDB:

-     def __init__(
-             self,
-             project: str,
-             task_name: str,
-             config: dict,
-             model: int,
-             cs_lct: int,
-             rs_time: int,
-     ):
-         pool = redis.ConnectionPool(**config)
-         self.heartbeat_key = f"{project}:{task_name}:heartbeat"  # redis type string
-         self.spider_key = f"{project}:{task_name}:seed_info:spider"  # redis type zset, .format(priority)
-         self.storer_key = f"{project}:{task_name}:seed_info:storer:%s"  # redis type set,
-         self.failed_key = f"{project}:{task_name}:seed_info:failed"  # redis type set, .format(priority)
-         self.succeed_key = f"{project}:{task_name}:seed_info:succeed"  # redis type set, .format(priority)
-         self.update_lock = f"{project}:{task_name}:update_seed_lock"  # redis type string
-         self.check_lock = f"{project}:{task_name}:check_seed_lock"  # redis type string
-         self.scheduler_lock = f"{project}:{task_name}:scheduler_lock"  # redis type string
-         self.client = redis.Redis(connection_pool=pool)
-         self.model = model
-         self.cs_lct = cs_lct
-         self.rs_time = rs_time
-
-     @check_redis_status
-     def _get_lock(self, key, t=15, timeout=3, sleep_time=0.1):
-         begin_time = int(time.time())
-         while True:
-             if self.client.setnx(key, ""):
-                 self.client.expire(key, t)
-                 return True
-             if int(time.time()) - begin_time > timeout:
-                 break
-             time.sleep(sleep_time)
-
-         if self.client.ttl(key) == -1:
-             delete_status = True
-             for _ in range(3):
-                 if self.client.ttl(key) != -1:
-                     delete_status = False
-                     break
-                 time.sleep(0.5)
-             if delete_status:
-                 self.client.expire(key, t)
-             return False
-         else:
-             ttl = self.client.ttl(key)
-             log.info("ttl: " + str(ttl))
-             return False
-
-     @check_redis_status
-     def _deal_seed(self, seeds, is_add: bool):
-         if not seeds:
-             return None
-
-         if not isinstance(seeds, list):
-             seeds = [seeds]
-
-         item_info = dict()
-
-         for seed in seeds:
-             if not isinstance(seed, Seed):
-                 seed = Seed(seed)
-             item_info[seed.format_seed] = seed._priority
-
-         if item_info:
-             self.client.zadd(self.spider_key, mapping=item_info, nx=is_add, xx=not is_add)
-
-     @check_redis_status
-     def add_seed(self, seeds):
-         self._deal_seed(seeds, is_add=True)
-
-     @check_redis_status
-     def reset_seed(self, seeds):
-         self._deal_seed(seeds, is_add=False)
-
-     @check_redis_status
-     def del_seed(self, seeds, spider_status: bool = True):
-         if not seeds:
-             return None
-
-         if not isinstance(seeds, list):
-             seeds = [seeds]
-
-         seeds = [seed if isinstance(seed, Seed) else Seed(seed) for seed in seeds]
-
-         if seeds:
-             # redis_key = self.succeed_key if spider_status else self.failed_key
-             redis_key = None
-             if spider_status:
-                 if isinstance(self.model, int) and self.model == 2:
-                     redis_key = self.succeed_key
-             else:
-                 redis_key = self.failed_key
-             if redis_key:
-                 self.client.sadd(redis_key, *(str(seed) for seed in seeds))
-             self.client.zrem(self.spider_key, *(seed.format_seed for seed in seeds))
-
-     @check_redis_status
-     def set_storer(self, key, seeds):
-         if not seeds:
-             return None
-
-         if not isinstance(seeds, list):
-             seeds = [seeds]
-
-         item_info = dict()
-         score = -int(time.time())
-         for seed in seeds:
-             if not isinstance(seed, Seed):
-                 seed = Seed(seed)
-             item_info[seed.format_seed] = score
-
-         if item_info:
-             self.client.zadd(self.storer_key % key, mapping=item_info)
-             log.info(f"zadd storer key: length {len(item_info.keys())}")
-
-     @check_redis_status
-     def get_seed(self, length: int = 200):
-         cs = time.time()
-
-         if self._get_lock(key=self.update_lock):
-
-             update_item, result = {}, []
-
-             version = int(time.time())
-
-             items = self.client.zrangebyscore(self.spider_key, min=0, max="+inf", start=0, num=length, withscores=True)
-
-             for value, priority in items:
-                 score = -(version + int(priority) / 1000)
-                 seed = Seed(value, priority=priority, version=version)
-                 update_item[value] = score
-                 result.append(seed)
-
-             log.info("set seeds into queue time: " + str(time.time() - cs))
-             if result:
-                 self.client.zadd(self.spider_key, mapping=update_item, xx=True)
-
-             self.client.delete(self.update_lock)
-             log.info("push seeds into queue time: " + str(time.time() - cs))
-             return result
-
-     @check_redis_status
-     def check_spider_queue(self, stop, storer_num):
-         while not stop.is_set():
-             # Try to acquire the check lock every 15s; if it is still unavailable after waiting 600s, retry. Once acquired, set the lock TTL to ${cs_lct}s.
-             if self._get_lock(key=self.check_lock, t=self.cs_lct, timeout=600, sleep_time=3):
-                 heartbeat = True if self.client.exists(self.heartbeat_key) else False
-                 # Reset the score values after a restart; otherwise fetch the score values from ${rs_time} minutes ago.
-                 score = -int(time.time()) + self.rs_time if heartbeat else "-inf"
-
-                 keys = self.client.keys(self.storer_key % "*")
-
-                 if keys and len(keys) >= storer_num:
-                     intersection_key = self.storer_key % "intersection"
-                     self.client.delete(intersection_key)
-                     self.client.zinterstore(intersection_key, keys)
-
-                     while True:
-                         members = self.client.zrange(intersection_key, 0, 1999)
-                         if not members:
-                             break
-                         for key in keys:
-                             self.client.zrem(key, *members)
-                         if self.model == 2:
-                             self.client.sadd(self.succeed_key, *members)
-                         self.client.zrem(self.spider_key, *members)
-                         self.client.zrem(intersection_key, *members)
-                         log.info("succeed spider data ...")
-
-                 for key in keys:
-                     self.client.zremrangebyscore(key, min=score, max="(0")
-
-                 while True:
-                     items = self.client.zrangebyscore(self.spider_key, min=score, max="(0", start=0, num=5000, withscores=True)
-                     if not items:
-                         break
-                     reset_items = {}
-                     for value, priority in items:
-                         reset_score = "{:.3f}".format(priority).split(".")[1]
-                         reset_items[value] = int(reset_score)
-                     if reset_items:
-                         self.client.zadd(self.spider_key, mapping=reset_items, xx=True)
-
-                 if not heartbeat:
-                     self.client.setex(self.heartbeat_key, 15, "")
-
-             # self.client.delete(self.check_lock)
-             # time.sleep(3)
-
-     @check_redis_status
-     def set_heartbeat(self, stop):
-         time.sleep(5)
-         while not stop.is_set():
-             self.client.setex(self.heartbeat_key, 5, "")
-             time.sleep(3)
-
-     # @check_redis_status
-     # def heartbeat(self):
-     #     """
-     #     Return the remaining TTL of the heartbeat key.
-     #     """
-     #     return self.client.ttl(self.heartbeat_key)
-
-     @check_redis_status
-     def spider_queue_length(self):
-         return self.client.zcard(self.spider_key)
-
-     @check_redis_status
-     def ready_seed_length(self):
-         return self.client.zcount(self.spider_key, min=0, max="+inf")
-
-     @check_redis_status
-     def get_scheduler_lock(self):
-         return self._get_lock(self.scheduler_lock)
+     def __init__(self, **kwargs):
+         redis_config = kwargs or setting.REDIS_CONFIG
+         pool = redis.ConnectionPool(**redis_config)
+         self._client = redis.Redis(connection_pool=pool)
+
+     def setnx(self, name, value=""):
+         return self._client.setnx(name, value)
+
+     def setex(self, name, t, value=""):
+         return self._client.setex(name, t, value)
+
+     def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
+         return self._client.expire(name, t, nx, xx, gt, lt)
+
+     def ttl(self, name):
+         return self._client.ttl(name)
+
+     def delete(self, name):
+         return self._client.delete(name)
+
+     def exists(self, *name) -> bool:
+         return self._client.exists(*name)
+
+     def sadd(self, name, value):
+         return self._client.sadd(name, value)
+
+     def zcard(self, name) -> bool:
+         return self._client.zcard(name)
+
+     def zadd(self, name, item: dict, **kwargs):
+         return self._client.zadd(name, item, **kwargs)
+
+     def zrem(self, name, *value):
+         return self._client.zrem(name, *value)
+
+     def zcount(self, name, _min, _max):
+         return self._client.zcount(name, _min, _max)
+
+     # def zrangebyscore(self, name, _min, _max, start, num, withscores: bool = False, *args):
+     #     return self._client.zrangebyscore(name, _min, _max, start, num, withscores, *args)
+
+     def lua(self, script: str, keys: list = None, args: list = None):
+         keys = keys or []
+         args = args or []
+         keys_count = len(keys)
+         return self._client.eval(script, keys_count, *keys, *args)
+
+     def lua_sha(self, sha1: str, keys: list = None, args: list = None):
+         keys = keys or []
+         args = args or []
+         keys_count = len(keys)
+         return self._client.evalsha(sha1, keys_count, *keys, *args)
+
+     def execute_lua(self, lua_script: str, keys: list, *args):
+         execute = self._client.register_script(lua_script)
+         return execute(keys=keys, args=args)
+
+     def lock(self, key, t=15) -> bool:
+         lua_script = """
+             local status = redis.call('setnx', KEYS[1], 1)
+             if ( status == 1 ) then
+                 redis.call('expire', KEYS[1], ARGV[1])
+             end
+             return status
+         """
+         status = self.execute_lua(lua_script, [key], t)
+         return bool(status)
+
+     def members(self, key, score, start=0, count=5000, _min="-inf", _max="+inf") -> list:
+         lua_script = """
+             local min = ARGV[1]
+             local max = ARGV[2]
+             local start = ARGV[3]
+             local count = ARGV[4]
+             local score = ARGV[5]
+             local members = nil
+
+             if ( type(count) == string ) then
+                 members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES')
+             else
+                 members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES', 'limit', start, count)
+             end
+
+             local result = {}
+
+             for i = 1, #members, 2 do
+                 local priority = nil
+                 local member = members[i]
+                 local originPriority = nil
+                 if ( members[i+1] + 0 < 0 ) then
+                     originPriority = math.ceil(members[i+1]) * 1000 - members[i+1] * 1000
+                 else
+                     originPriority = math.floor(members[i+1])
+                 end
+
+                 if ( score + 0 >= 1000 ) then
+                     priority = -score - originPriority / 1000
+                 elseif ( score + 0 == 0 ) then
+                     priority = originPriority
+                 else
+                     originPriority = score
+                     priority = score
+                 end
+                 redis.call('zadd', KEYS[1], priority, member)
+                 table.insert(result, member)
+                 table.insert(result, originPriority)
+             end
+
+             return result
+         """
+         members = self.execute_lua(lua_script, [key], _min, _max, start, count, score)
+         return [(members[i].decode(), int(members[i + 1])) for i in range(0, len(members), 2)]
+
+     def done(self, keys: list, *args) -> list:
+         lua_script = """
+             for i, member in ipairs(ARGV) do
+                 redis.call("zrem", KEYS[1], member)
+                 redis.call("sadd", KEYS[2], member)
+             end
+         """
+         self.execute_lua(lua_script, keys, *args)
+
+
+
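A comparable sketch for the rewritten RedisDB, again not part of the diff: lock() wraps an atomic SETNX + EXPIRE Lua script, members() leases zset members by rewriting their scores to negative "in flight" values and returning (member, original_priority) pairs, and done() moves finished members from the zset into a set. The connection parameters and key names below are assumptions; calling RedisDB() with no arguments falls back to setting.REDIS_CONFIG.

# Usage sketch only -- connection details and key names are placeholders.
import time

from cobweb.db.redis_db import RedisDB

db = RedisDB(host="127.0.0.1", port=6379, db=0)

if db.lock("demo:scheduler_lock", t=15):
    # Two seeds with integer priorities.
    db.zadd("demo:seeds", {"https://example.com/a": 100, "https://example.com/b": 200})

    # Lease them: the Lua script flips each score to a negative value derived from
    # the passed timestamp, and returns the member with its original priority.
    for member, priority in db.members("demo:seeds", score=int(time.time()), count=10, _min=0, _max="+inf"):
        print(member, priority)

    # Mark one as finished: zrem from the seed zset, sadd into the succeed set.
    db.done(["demo:seeds", "demo:succeed"], "https://example.com/a")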
cobweb/exceptions/__init__.py ADDED
@@ -0,0 +1 @@
+ from .oss_db_exception import *
cobweb/exceptions/oss_db_exception.py ADDED
@@ -0,0 +1,28 @@
+ class OssDBException(Exception):
+     """Base OSS client exception that all others inherit."""
+
+
+ class OssDBMergeError(OssDBException):
+     """
+     Exception raised when a merge operation fails.
+     """
+
+
+ class OssDBPutPartError(OssDBException):
+     """
+     Exception raised when an upload-part operation fails.
+     """
+
+
+ class OssDBPutObjError(OssDBException):
+     """
+     Exception raised when an upload operation fails.
+     """
+
+
+ class OssDBAppendObjError(OssDBException):
+     """Exception raised when an append-object operation fails."""
+
+
+ class OssDBInitPartError(OssDBException):
+     """Exception raised when initializing a multipart upload fails."""
cobweb/launchers/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .launcher_air import LauncherAir
+ from .launcher_pro import LauncherPro
+ from .launcher_api import LauncherApi