cobweb-launcher 0.1.7__py3-none-any.whl → 1.2.41__py3-none-any.whl

Files changed (67)
  1. cobweb/__init__.py +2 -11
  2. cobweb/base/__init__.py +9 -0
  3. cobweb/base/basic.py +297 -0
  4. cobweb/base/common_queue.py +30 -0
  5. cobweb/base/decorators.py +40 -0
  6. cobweb/base/dotting.py +35 -0
  7. cobweb/base/item.py +46 -0
  8. cobweb/{log.py → base/log.py} +4 -6
  9. cobweb/base/request.py +82 -0
  10. cobweb/base/response.py +23 -0
  11. cobweb/base/seed.py +114 -0
  12. cobweb/constant.py +94 -0
  13. cobweb/crawlers/__init__.py +1 -0
  14. cobweb/crawlers/base_crawler.py +144 -0
  15. cobweb/crawlers/crawler.py +209 -0
  16. cobweb/crawlers/file_crawler.py +98 -0
  17. cobweb/db/__init__.py +2 -2
  18. cobweb/db/api_db.py +82 -0
  19. cobweb/db/redis_db.py +125 -218
  20. cobweb/exceptions/__init__.py +1 -0
  21. cobweb/exceptions/oss_db_exception.py +28 -0
  22. cobweb/launchers/__init__.py +3 -0
  23. cobweb/launchers/launcher.py +235 -0
  24. cobweb/launchers/launcher_air.py +88 -0
  25. cobweb/launchers/launcher_api.py +209 -0
  26. cobweb/launchers/launcher_pro.py +208 -0
  27. cobweb/pipelines/__init__.py +3 -0
  28. cobweb/pipelines/pipeline.py +69 -0
  29. cobweb/pipelines/pipeline_console.py +22 -0
  30. cobweb/pipelines/pipeline_loghub.py +34 -0
  31. cobweb/schedulers/__init__.py +3 -0
  32. cobweb/schedulers/scheduler_api.py +72 -0
  33. cobweb/schedulers/scheduler_redis.py +72 -0
  34. cobweb/setting.py +67 -6
  35. cobweb/utils/__init__.py +5 -0
  36. cobweb/utils/bloom.py +58 -0
  37. cobweb/utils/dotting.py +32 -0
  38. cobweb/utils/oss.py +94 -0
  39. cobweb/utils/tools.py +42 -0
  40. cobweb_launcher-1.2.41.dist-info/METADATA +205 -0
  41. cobweb_launcher-1.2.41.dist-info/RECORD +44 -0
  42. {cobweb_launcher-0.1.7.dist-info → cobweb_launcher-1.2.41.dist-info}/WHEEL +1 -1
  43. cobweb/bbb.py +0 -191
  44. cobweb/db/oss_db.py +0 -127
  45. cobweb/db/scheduler/__init__.py +0 -0
  46. cobweb/db/scheduler/default.py +0 -8
  47. cobweb/db/scheduler/textfile.py +0 -27
  48. cobweb/db/storer/__init__.py +0 -0
  49. cobweb/db/storer/console.py +0 -9
  50. cobweb/db/storer/loghub.py +0 -54
  51. cobweb/db/storer/redis.py +0 -15
  52. cobweb/db/storer/textfile.py +0 -15
  53. cobweb/decorators.py +0 -16
  54. cobweb/distributed/__init__.py +0 -0
  55. cobweb/distributed/launcher.py +0 -243
  56. cobweb/distributed/models.py +0 -143
  57. cobweb/interface.py +0 -34
  58. cobweb/single/__init__.py +0 -0
  59. cobweb/single/launcher.py +0 -231
  60. cobweb/single/models.py +0 -134
  61. cobweb/single/nest.py +0 -153
  62. cobweb/task.py +0 -50
  63. cobweb/utils.py +0 -90
  64. cobweb_launcher-0.1.7.dist-info/METADATA +0 -45
  65. cobweb_launcher-0.1.7.dist-info/RECORD +0 -31
  66. {cobweb_launcher-0.1.7.dist-info → cobweb_launcher-1.2.41.dist-info}/LICENSE +0 -0
  67. {cobweb_launcher-0.1.7.dist-info → cobweb_launcher-1.2.41.dist-info}/top_level.txt +0 -0
cobweb/db/api_db.py ADDED
@@ -0,0 +1,82 @@
+ import os
+ import json
+ import requests
+
+
+ class ApiDB:
+
+     def __init__(self, host=None, **kwargs):
+         self.host = host or os.getenv("REDIS_API_HOST", "http://127.0.0.1:4396")
+
+     def _get_response(self, api, params: dict = None):
+         try:
+             url = self.host + api
+             response = requests.get(url, params=params)
+             json_data = response.json()
+             response.close()
+             return json_data["data"]
+         except:
+             return None
+
+     def _post_response(self, api, params: dict = None, data: dict = None):
+         try:
+             url = self.host + api
+             headers = {"Content-Type": "application/json"}
+             response = requests.post(url, headers=headers, params=params, data=json.dumps(data))
+             json_data = response.json()
+             response.close()
+             return json_data["data"]
+         except:
+             return None
+
+     def get(self, name):
+         return self._get_response(api="/get", params=dict(name=name))
+
+     def setnx(self, name, value=""):
+         return self._get_response(api="/setnx", params=dict(name=name, value=value))
+
+     def setex(self, name, t, value=""):
+         return self._get_response(api="/setex", params=dict(name=name, value=value, t=t))
+
+     def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
+         return self._get_response(api="/expire", params=dict(name=name, t=t, nx=nx, xx=xx, gt=gt, lt=lt))
+
+     def ttl(self, name):
+         return self._get_response(api="/ttl", params=dict(name=name))
+
+     def delete(self, name):
+         return self._get_response(api="/delete", params=dict(name=name))
+
+     def exists(self, name):
+         return self._get_response(api="/exists", params=dict(name=name))
+
+     def incrby(self, name, value):
+         return self._get_response(api="/incrby", params=dict(name=name, value=value))
+
+     def zcard(self, name) -> bool:
+         return self._get_response(api="/zcard", params=dict(name=name))
+
+     def zadd(self, name, item: dict, **kwargs):
+         return self._post_response(api="/zadd", data=dict(name=name, mapping=item, **kwargs))
+
+     def zrem(self, name, *values):
+         return self._post_response(api="/zrem", data=dict(name=name, values=values))
+
+     def zcount(self, name, _min, _max):
+         return self._get_response(api="/zcount", params=dict(name=name, min=_min, max=_max))
+
+     def lock(self, name, t=15) -> bool:
+         return self._get_response(api="/lock", params=dict(name=name, t=t))
+
+     def auto_incr(self, name, t=15, limit=1000) -> bool:
+         return self._get_response(api="/auto_incr", params=dict(name=name, t=t, limit=limit))
+
+     def members(self, name, score, start=0, count=5000, _min="-inf", _max="+inf"):
+         return self._get_response(api="/members", params=dict(name=name, score=score, start=start, count=count, min=_min, max=_max))
+
+     def done(self, name: list, *values):
+         return self._post_response(api="/done", data=dict(name=name, values=values))
+
+
+
+
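The new ApiDB routes every Redis operation through an HTTP service instead of a direct connection and returns the `data` field of the JSON response (or None on any failure). A minimal usage sketch, assuming a backend at REDIS_API_HOST that serves the routes above; the key names below are illustrative, not from the package:

```python
from cobweb.db.api_db import ApiDB

# Host falls back to the REDIS_API_HOST env var, then http://127.0.0.1:4396.
db = ApiDB(host="http://127.0.0.1:4396")

task_key = "demo_project:demo_task:todo"              # hypothetical key name
if db.lock("demo_project:demo_task:lock", t=15):      # GET /lock, returns the "data" field
    db.zadd(task_key, {"https://example.com": 100})   # POST /zadd with a JSON body
    print(db.members(task_key, score=0, count=10))    # GET /members
```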
cobweb/db/redis_db.py CHANGED
@@ -1,223 +1,130 @@
- import time
  import redis
- from cobweb import Seed, log
- from cobweb.decorators import check_redis_status
+ from cobweb import setting


  class RedisDB:

-     def __init__(
-             self,
-             project: str,
-             task_name: str,
-             config: dict,
-             model: int,
-             cs_lct: int,
-             rs_time: int,
-     ):
-         pool = redis.ConnectionPool(**config)
-         self.heartbeat_key = f"{project}:{task_name}:heartbeat" # redis type string
-         self.spider_key = f"{project}:{task_name}:seed_info:spider" # redis type zset, .format(priority)
-         self.storer_key = f"{project}:{task_name}:seed_info:storer:%s" # redis type set,
-         self.failed_key = f"{project}:{task_name}:seed_info:failed" # redis type set, .format(priority)
-         self.succeed_key = f"{project}:{task_name}:seed_info:succeed" # redis type set, .format(priority)
-         self.update_lock = f"{project}:{task_name}:update_seed_lock" # redis type string
-         self.check_lock = f"{project}:{task_name}:check_seed_lock" # redis type string
-         self.scheduler_lock = f"{project}:{task_name}:scheduler_lock" # redis type string
-         self.client = redis.Redis(connection_pool=pool)
-         self.model = model
-         self.cs_lct = cs_lct
-         self.rs_time = rs_time
-
-     @check_redis_status
-     def _get_lock(self, key, t=15, timeout=3, sleep_time=0.1):
-         begin_time = int(time.time())
-         while True:
-             if self.client.setnx(key, ""):
-                 self.client.expire(key, t)
-                 return True
-             if int(time.time()) - begin_time > timeout:
-                 break
-             time.sleep(sleep_time)
-
-         if self.client.ttl(key) == -1:
-             delete_status = True
-             for _ in range(3):
-                 if self.client.ttl(key) != -1:
-                     delete_status = False
-                     break
-                 time.sleep(0.5)
-             if delete_status:
-                 self.client.expire(key, t)
-             return False
-         else:
-             ttl = self.client.ttl(key)
-             log.info("ttl: " + str(ttl))
-             return False
-
-     @check_redis_status
-     def _deal_seed(self, seeds, is_add: bool):
-         if not seeds:
-             return None
-
-         if not isinstance(seeds, list):
-             seeds = [seeds]
-
-         item_info = dict()
-
-         for seed in seeds:
-             if not isinstance(seed, Seed):
-                 seed = Seed(seed)
-             item_info[seed.format_seed] = seed._priority
-
-         if item_info:
-             self.client.zadd(self.spider_key, mapping=item_info, nx=is_add, xx=not is_add)
-
-     @check_redis_status
-     def add_seed(self, seeds):
-         self._deal_seed(seeds, is_add=True)
-
-     @check_redis_status
-     def reset_seed(self, seeds):
-         self._deal_seed(seeds, is_add=False)
-
-     @check_redis_status
-     def del_seed(self, seeds, spider_status: bool = True):
-         if not seeds:
-             return None
-
-         if not isinstance(seeds, list):
-             seeds = [seeds]
-
-         seeds = [seed if isinstance(seed, Seed) else Seed(seed) for seed in seeds]
-
-         if seeds:
-             # redis_key = self.succeed_key if spider_status else self.failed_key
-             redis_key = None
-             if spider_status:
-                 if isinstance(self.model, int) and self.model == 2:
-                     redis_key = self.succeed_key
-             else:
-                 redis_key = self.failed_key
-             if redis_key:
-                 self.client.sadd(redis_key, *(str(seed) for seed in seeds))
-             self.client.zrem(self.spider_key, *(seed.format_seed for seed in seeds))
-
-     @check_redis_status
-     def set_storer(self, key, seeds):
-         if not seeds:
-             return None
-
-         if not isinstance(seeds, list):
-             seeds = [seeds]
-
-         item_info = dict()
-         score = -int(time.time())
-         for seed in seeds:
-             if not isinstance(seed, Seed):
-                 seed = Seed(seed)
-             item_info[seed.format_seed] = score
-
-         if item_info:
-             self.client.zadd(self.storer_key % key, mapping=item_info)
-             log.info(f"zadd storer key: length {len(item_info.keys())}")
-
-     @check_redis_status
-     def get_seed(self, length: int = 200):
-         cs = time.time()
-
-         if self._get_lock(key=self.update_lock):
-
-             update_item, result = {}, []
-
-             version = int(time.time())
-
-             items = self.client.zrangebyscore(self.spider_key, min=0, max="+inf", start=0, num=length, withscores=True)
-
-             for value, priority in items:
-                 score = -(version + int(priority) / 1000)
-                 seed = Seed(value, priority=priority, version=version)
-                 update_item[value] = score
-                 result.append(seed)
-
-             log.info("set seeds into queue time: " + str(time.time() - cs))
-             if result:
-                 self.client.zadd(self.spider_key, mapping=update_item, xx=True)
-
-             self.client.delete(self.update_lock)
-             log.info("push seeds into queue time: " + str(time.time() - cs))
-             return result
-
-     @check_redis_status
-     def check_spider_queue(self, stop, storer_num):
-         while not stop.is_set():
-             # Try for the check lock every 15 s; keep retrying for up to 600 s. Once acquired, set the lock TTL to ${cs_lct} s.
-             if self._get_lock(key=self.check_lock, t=self.cs_lct, timeout=600, sleep_time=3):
-                 heartbeat = True if self.client.exists(self.heartbeat_key) else False
-                 # On restart, reset the score values; otherwise fetch the scores from ${rs_time} minutes ago.
-                 score = -int(time.time()) + self.rs_time if heartbeat else "-inf"
-
-                 keys = self.client.keys(self.storer_key % "*")
-
-                 if keys and len(keys) >= storer_num:
-                     intersection_key = self.storer_key % "intersection"
-                     self.client.delete(intersection_key)
-                     self.client.zinterstore(intersection_key, keys)
-
-                     while True:
-                         members = self.client.zrange(intersection_key, 0, 1999)
-                         if not members:
-                             break
-                         for key in keys:
-                             self.client.zrem(key, *members)
-                         if self.model == 2:
-                             self.client.sadd(self.succeed_key, *members)
-                         self.client.zrem(self.spider_key, *members)
-                         self.client.zrem(intersection_key, *members)
-                         log.info("succeed spider data ...")
-
-                     for key in keys:
-                         self.client.zremrangebyscore(key, min=score, max="(0")
-
-                 while True:
-                     items = self.client.zrangebyscore(self.spider_key, min=score, max="(0", start=0, num=5000, withscores=True)
-                     if not items:
-                         break
-                     reset_items = {}
-                     for value, priority in items:
-                         reset_score = "{:.3f}".format(priority).split(".")[1]
-                         reset_items[value] = int(reset_score)
-                     if reset_items:
-                         self.client.zadd(self.spider_key, mapping=reset_items, xx=True)
-
-                 if not heartbeat:
-                     self.client.setex(self.heartbeat_key, 15, "")
-
-                 # self.client.delete(self.check_lock)
-                 # time.sleep(3)
-
-     @check_redis_status
-     def set_heartbeat(self, stop):
-         time.sleep(5)
-         while not stop.is_set():
-             self.client.setex(self.heartbeat_key, 5, "")
-             time.sleep(3)
-
-     # @check_redis_status
-     # def heartbeat(self):
-     #     """
-     #     Return the remaining TTL of the heartbeat key.
-     #     """
-     #     return self.client.ttl(self.heartbeat_key)
-
-     @check_redis_status
-     def spider_queue_length(self):
-         return self.client.zcard(self.spider_key)
-
-     @check_redis_status
-     def ready_seed_length(self):
-         return self.client.zcount(self.spider_key, min=0, max="+inf")
-
-     @check_redis_status
-     def get_scheduler_lock(self):
-         return self._get_lock(self.scheduler_lock)
+     def __init__(self, **kwargs):
+         redis_config = kwargs or setting.REDIS_CONFIG
+         pool = redis.ConnectionPool(**redis_config)
+         self._client = redis.Redis(connection_pool=pool)
+
+     def setnx(self, name, value=""):
+         return self._client.setnx(name, value)
+
+     def setex(self, name, t, value=""):
+         return self._client.setex(name, t, value)
+
+     def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
+         return self._client.expire(name, t, nx, xx, gt, lt)
+
+     def ttl(self, name):
+         return self._client.ttl(name)
+
+     def delete(self, name):
+         return self._client.delete(name)
+
+     def exists(self, *name) -> bool:
+         return self._client.exists(*name)
+
+     def sadd(self, name, value):
+         return self._client.sadd(name, value)
+
+     def zcard(self, name) -> bool:
+         return self._client.zcard(name)
+
+     def zadd(self, name, item: dict, **kwargs):
+         return self._client.zadd(name, item, **kwargs)
+
+     def zrem(self, name, *value):
+         return self._client.zrem(name, *value)
+
+     def zcount(self, name, _min, _max):
+         return self._client.zcount(name, _min, _max)
+
+     # def zrangebyscore(self, name, _min, _max, start, num, withscores: bool = False, *args):
+     #     return self._client.zrangebyscore(name, _min, _max, start, num, withscores, *args)
+
+     def lua(self, script: str, keys: list = None, args: list = None):
+         keys = keys or []
+         args = args or []
+         keys_count = len(keys)
+         return self._client.eval(script, keys_count, *keys, *args)
+
+     def lua_sha(self, sha1: str, keys: list = None, args: list = None):
+         keys = keys or []
+         args = args or []
+         keys_count = len(keys)
+         return self._client.evalsha(sha1, keys_count, *keys, *args)
+
+     def execute_lua(self, lua_script: str, keys: list, *args):
+         execute = self._client.register_script(lua_script)
+         return execute(keys=keys, args=args)
+
+     def lock(self, key, t=15) -> bool:
+         lua_script = """
+             local status = redis.call('setnx', KEYS[1], 1)
+             if ( status == 1 ) then
+                 redis.call('expire', KEYS[1], ARGV[1])
+             end
+             return status
+         """
+         status = self.execute_lua(lua_script, [key], t)
+         return bool(status)
+
+     def members(self, key, score, start=0, count=5000, _min="-inf", _max="+inf") -> list:
+         lua_script = """
+             local min = ARGV[1]
+             local max = ARGV[2]
+             local start = ARGV[3]
+             local count = ARGV[4]
+             local score = ARGV[5]
+             local members = nil
+
+             if ( type(count) == string ) then
+                 members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES')
+             else
+                 members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES', 'limit', start, count)
+             end
+
+             local result = {}
+
+             for i = 1, #members, 2 do
+                 local priority = nil
+                 local member = members[i]
+                 local originPriority = nil
+                 if ( members[i+1] + 0 < 0 ) then
+                     originPriority = math.ceil(members[i+1]) * 1000 - members[i+1] * 1000
+                 else
+                     originPriority = math.floor(members[i+1])
+                 end
+
+                 if ( score + 0 >= 1000 ) then
+                     priority = -score - originPriority / 1000
+                 elseif ( score + 0 == 0 ) then
+                     priority = originPriority
+                 else
+                     originPriority = score
+                     priority = score
+                 end
+                 redis.call('zadd', KEYS[1], priority, member)
+                 table.insert(result, member)
+                 table.insert(result, originPriority)
+             end
+
+             return result
+         """
+         members = self.execute_lua(lua_script, [key], _min, _max, start, count, score)
+         return [(members[i].decode(), int(members[i + 1])) for i in range(0, len(members), 2)]
+
+     def done(self, keys: list, *args) -> list:
+         lua_script = """
+             for i, member in ipairs(ARGV) do
+                 redis.call("zrem", KEYS[1], member)
+                 redis.call("sadd", KEYS[2], member)
+             end
+         """
+         self.execute_lua(lua_script, keys, *args)
+
+
+
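The rewritten RedisDB drops the task-specific keys and decorators in favour of a thin client wrapper plus Lua scripts (`lock`, `members`, `done`) executed through `register_script`, so each of those operations runs atomically on the server. A sketch of how the Lua-backed lock might be used; the key name and connection kwargs are illustrative, not from the package:

```python
from cobweb.db.redis_db import RedisDB

# kwargs override setting.REDIS_CONFIG; these connection values are assumptions.
db = RedisDB(host="127.0.0.1", port=6379, db=0)

lock_key = "demo_project:demo_task:scheduler_lock"  # hypothetical key
if db.lock(lock_key, t=15):
    # The Lua script ran SETNX + EXPIRE in one atomic call, so only the worker
    # that set the key reaches this branch within the 15 s window.
    try:
        pass  # do the scheduled work here
    finally:
        db.delete(lock_key)
```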
cobweb/exceptions/__init__.py ADDED
@@ -0,0 +1 @@
+ from .oss_db_exception import *
cobweb/exceptions/oss_db_exception.py ADDED
@@ -0,0 +1,28 @@
+ class OssDBException(Exception):
+     """Base oss client exception that all others inherit."""
+
+
+ class OssDBMergeError(OssDBException):
+     """
+     Exception raised when execute merge operation fails.
+     """
+
+
+ class OssDBPutPartError(OssDBException):
+     """
+     Exception raised when upload part operation fails.
+     """
+
+
+ class OssDBPutObjError(OssDBException):
+     """
+     Exception raised when upload operation fails.
+     """
+
+
+ class OssDBAppendObjError(OssDBException):
+     """Exception raised when upload operation fails."""
+
+
+ class OssDBInitPartError(OssDBException):
+     """Exception raised when init upload operation fails."""
cobweb/launchers/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .launcher_air import LauncherAir
+ from .launcher_pro import LauncherPro
+ from .launcher_api import LauncherApi