cobweb-launcher 0.1.23__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of cobweb-launcher might be problematic.

@@ -0,0 +1,182 @@
+ import setting
+
+ from typing import Union
+
+ from cobweb.utils import OssUtil
+ from cobweb.crawlers import Crawler
+ from cobweb.base import Seed, BaseItem, Request, Response
+ from cobweb.exceptions import OssDBPutPartError, OssDBMergeError
+
+
+ class CrawlerAir(Crawler):
+
+     oss_util = OssUtil()
+
+     @staticmethod
+     def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
+         seed_dict = item.seed.to_dict
+         bucket_name = CrawlerAir.oss_util.bucket
+         try:
+             key = item.seed.oss_path
+             if CrawlerAir.oss_util.exists(key):
+                 content_length = CrawlerAir.oss_util.head(key).content_length
+                 yield Response(item.seed, "exists", bucket_name=bucket_name, data_size=content_length, **seed_dict)
+                 # data, cols = download_meta(item.seed, bucket_name=bucket_name, data_size=content_length, **seed_dict)
+                 # yield DownloadItem(item.seed, sid=item.seed.sid, cols=cols, data=data)
+
+             end = seed_dict.get("end", "")
+             start = seed_dict.get("start", "0")
+
+             if end or int(start):
+                 item.request_setting["headers"]['Range'] = f'bytes={start}-{end}'
+
+             if not item.seed.params.identifier:
+                 content = b""
+                 chunk_size = CrawlerAir.oss_util.chunk_size
+                 min_upload_size = CrawlerAir.oss_util.min_upload_size
+                 position = seed_dict.get("position", 1)
+
+                 response = item.download()
+
+                 content_length = response.headers.get("content-length") or 0
+                 content_type = response.headers.get("content-type", "").split(";")[0]
+                 if content_type and content_type in setting.FILE_FILTER_CONTENT_TYPE:
+                     yield Response(
+                         item.seed, response, filter=True, msg=f"response content type is {content_type}",
+                         bucket_name=bucket_name, data_size=content_length, **seed_dict
+                     )
+                 elif position == 1 and min_upload_size >= int(content_length) > 0:
+                     """File too small: return a flagged response."""
+                     yield Response(
+                         item.seed, response, filter=True, msg="file size is too small",
+                         bucket_name=bucket_name, data_size=content_length, **seed_dict
+                     )
+                 elif position == 1 and chunk_size > int(content_length) > min_upload_size:
+                     """Small file: download in one shot."""
+                     for part_data in response.iter_content(chunk_size):
+                         content += part_data
+                     CrawlerAir.oss_util.put(key, content)
+                     yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
+                     response.close()
+                 else:
+                     """Medium/large file: synchronous multipart download."""
+                     upload_content_length = 0
+                     if not seed_dict.get("upload_id"):
+                         seed_dict["upload_id"] = CrawlerAir.oss_util.init_part(key).upload_id
+                     upload_id = seed_dict["upload_id"]
+                     for part_data in response.iter_content(chunk_size):
+                         content += part_data
+                         if len(content) >= chunk_size:
+                             upload_data = content[:chunk_size]
+                             content = content[chunk_size:]
+                             CrawlerAir.oss_util.put_part(key, upload_id, position, upload_data)
+                             upload_content_length += len(upload_data)
+                             position += 1
+                             seed_dict['position'] = position
+                             seed_dict['start'] = upload_content_length
+
+                     response.close()
+                     if content:
+                         CrawlerAir.oss_util.put_part(key, upload_id, position, content)
+                         content_length += len(content)
+                     CrawlerAir.oss_util.merge(key, upload_id)
+                     yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
+                     # data, cols = download_meta(item.seed, bucket_name, data_size=content_length, **seed_dict)
+                     # yield DownloadItem(item.seed, sid=item.seed.sid, cols=cols, data=data)
+
+             elif item.seed.params.identifier == "merge":
+                 CrawlerAir.oss_util.merge(key, seed_dict["upload_id"])
+                 content_length = CrawlerAir.oss_util.head(key).content_length
+                 yield Response(item.seed, "merge", bucket_name=bucket_name, data_size=content_length, **seed_dict)
+                 # data, cols = download_meta(item.seed, bucket_name, data_size=content_length, **seed_dict)
+                 # yield DownloadItem(item.seed, sid=item.seed.sid, cols=cols, data=data)
+         except OssDBPutPartError:
+             yield Seed(seed_dict)
+         except OssDBMergeError:
+             yield Seed(seed_dict, identifier="merge")
+
+
+ class CrawlerPro(Crawler):
+
+     oss_util = OssUtil()
+
+     @staticmethod
+     def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
+         seed_dict = item.seed.to_dict
+         bucket_name = CrawlerAir.oss_util.bucket
+         try:
+             key = item.seed.oss_path
+             if CrawlerAir.oss_util.exists(key):
+                 content_length = CrawlerAir.oss_util.head(key).content_length
+                 yield Response(item.seed, "exists", bucket_name=bucket_name, data_size=content_length, **seed_dict)
+
+             end = seed_dict.get("end", "")
+             start = seed_dict.get("start", "0")
+
+             if end or int(start):
+                 item.request_setting["headers"]['Range'] = f'bytes={start}-{end}'
+
+             if not item.seed.params.identifier:
+                 content = b""
+                 chunk_size = CrawlerAir.oss_util.chunk_size
+                 min_upload_size = CrawlerAir.oss_util.min_upload_size
+                 position = seed_dict.get("position", 1)
+
+                 response = item.download()
+
+                 content_length = response.headers.get("content-length") or 0
+                 content_type = response.headers.get("content-type", "").split(";")[0]
+                 if content_type and content_type in setting.FILE_FILTER_CONTENT_TYPE:
+                     yield Response(
+                         item.seed, response, filter=True, msg=f"response content type is {content_type}",
+                         bucket_name=bucket_name, data_size=content_length, **seed_dict
+                     )
+                 elif position == 1 and min_upload_size >= int(content_length) > 0:
+                     """File too small: return a flagged response."""
+                     yield Response(
+                         item.seed, response, filter=True, msg="file size is too small",
+                         bucket_name=bucket_name, data_size=content_length, **seed_dict
+                     )
+                 elif position == 1 and chunk_size > int(content_length) > min_upload_size:
+                     """Small file: download in one shot."""
+                     for part_data in response.iter_content(chunk_size):
+                         content += part_data
+                     CrawlerAir.oss_util.put(key, content)
+                     yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
+                     response.close()
+                 else:
+                     """Medium/large file: synchronous multipart download."""
+                     upload_content_length = 0
+                     if not seed_dict.get("upload_id"):
+                         seed_dict["upload_id"] = CrawlerAir.oss_util.init_part(key).upload_id
+                     upload_id = seed_dict["upload_id"]
+                     for part_data in response.iter_content(chunk_size):
+                         content += part_data
+                         if len(content) >= chunk_size:
+                             upload_data = content[:chunk_size]
+                             content = content[chunk_size:]
+                             CrawlerAir.oss_util.put_part(key, upload_id, position, upload_data)
+                             upload_content_length += len(upload_data)
+                             position += 1
+                             seed_dict['position'] = position
+                             seed_dict['start'] = upload_content_length
+
+                     response.close()
+                     if content:
+                         CrawlerAir.oss_util.put_part(key, upload_id, position, content)
+                         content_length += len(content)
+                     CrawlerAir.oss_util.merge(key, upload_id)
+                     yield Response(item.seed, response, bucket_name=bucket_name, data_size=content_length, **seed_dict)
+                     # data, cols = download_meta(item.seed, bucket_name, data_size=content_length, **seed_dict)
+                     # yield DownloadItem(item.seed, sid=item.seed.sid, cols=cols, data=data)
+
+             elif item.seed.params.identifier == "merge":
+                 CrawlerAir.oss_util.merge(key, seed_dict["upload_id"])
+                 content_length = CrawlerAir.oss_util.head(key).content_length
+                 yield Response(item.seed, "merge", bucket_name=bucket_name, data_size=content_length, **seed_dict)
+                 # data, cols = download_meta(item.seed, bucket_name, data_size=content_length, **seed_dict)
+                 # yield DownloadItem(item.seed, sid=item.seed.sid, cols=cols, data=data)
+         except OssDBPutPartError:
+             yield Seed(seed_dict)
+         except OssDBMergeError:
+             yield Seed(seed_dict, identifier="merge")
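The resume logic above is easiest to see in isolation: a failed part upload re-yields the bare seed, whose dict still carries position, start, and upload_id, so the retried request continues mid-file via an HTTP Range header. Below is a minimal, self-contained sketch of that step, with a plain dict standing in for cobweb's Seed/Request objects (an illustration, not the package API):

def resume_headers(seed_dict: dict) -> dict:
    # Rebuild the Range header the way download() above does: a non-zero
    # start (or an explicit end) means "continue from byte `start`".
    start = seed_dict.get("start", "0")
    end = seed_dict.get("end", "")
    if end or int(start):
        return {"Range": f"bytes={start}-{end}"}
    return {}

# After two uploaded parts at a 1 MiB chunk_size, a re-yielded seed might carry:
seed_state = {"position": 3, "start": 2 * 1024 * 1024, "upload_id": "hypothetical-id"}
print(resume_headers(seed_state))  # {'Range': 'bytes=2097152-'}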
cobweb/db/__init__.py CHANGED
@@ -1,3 +1 @@
- from .. import log, Seed, decorators
- from ..constant import Setting, DealModel
- from ..interface import SchedulerInterface, StorerInterface
+ from .redis_db import RedisDB
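The package now exposes a single RedisDB that reads its connection settings from the project-level setting module instead of constructor arguments. A hedged usage sketch (assumes a setting.py with a REDIS_CONFIG dict on the import path and a reachable Redis server; neither is shown in this diff):

from cobweb.db import RedisDB

db = RedisDB()             # connection pool is built from setting.REDIS_CONFIG
print(db.ttl("demo:key"))  # -2 while the key does not exist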
cobweb/db/redis_db.py CHANGED
@@ -1,211 +1,128 @@
- import time
  import redis
- from . import log, decorators, Seed, Setting, DealModel
- # from cobweb.decorators import decorators.check_redis_status
- # from cobweb.constant import Setting, DealModel
+ import setting


  class RedisDB:

-     def __init__(
-             self,
-             project: str,
-             task_name: str,
-             config: dict,
-     ):
-         pool = redis.ConnectionPool(**config)
-         self.heartbeat_key = f"{project}:{task_name}:heartbeat"  # redis type string
-         self.spider_key = f"{project}:{task_name}:seed_info:spider"  # redis type zset, .format(priority)
-         self.storer_key = f"{project}:{task_name}:seed_info:storer:%s"  # redis type set
-         self.failed_key = f"{project}:{task_name}:seed_info:failed"  # redis type set, .format(priority)
-         self.succeed_key = f"{project}:{task_name}:seed_info:succeed"  # redis type set, .format(priority)
-         self.update_lock = f"{project}:{task_name}:update_seed_lock"  # redis type string
-         self.check_lock = f"{project}:{task_name}:check_seed_lock"  # redis type string
-         self.scheduler_lock = f"{project}:{task_name}:scheduler_lock"  # redis type string
-         self.client = redis.Redis(connection_pool=pool)
-
-     @decorators.check_redis_status
-     def _get_lock(self, key, t=15, timeout=3, sleep_time=0.1):
-         begin_time = int(time.time())
-         while True:
-             if self.client.setnx(key, ""):
-                 self.client.expire(key, t)
-                 return True
-             if int(time.time()) - begin_time > timeout:
-                 break
-             time.sleep(sleep_time)
-
-         if self.client.ttl(key) == -1:
-             delete_status = True
-             for _ in range(3):
-                 if self.client.ttl(key) != -1:
-                     delete_status = False
-                     break
-                 time.sleep(0.5)
-             if delete_status:
-                 self.client.expire(key, t)
-             return False
-         else:
-             ttl = self.client.ttl(key)
-             log.info("ttl: " + str(ttl))
-             return False
-
-     @decorators.check_redis_status
-     def _deal_seed(self, seeds, is_add: bool):
-         if not seeds:
-             return None
-
-         if not isinstance(seeds, list):
-             seeds = [seeds]
-
-         item_info = dict()
-
-         for seed in seeds:
-             if not isinstance(seed, Seed):
-                 seed = Seed(seed)
-             item_info[seed.format_seed] = seed._priority
-
-         if item_info:
-             self.client.zadd(self.spider_key, mapping=item_info, nx=is_add, xx=not is_add)
-
-     @decorators.check_redis_status
-     def add_seed(self, seeds):
-         self._deal_seed(seeds, is_add=True)
-
-     @decorators.check_redis_status
-     def reset_seed(self, seeds):
-         self._deal_seed(seeds, is_add=False)
-
-     @decorators.check_redis_status
-     def del_seed(self, seeds, spider_status: bool = True):
-         if not seeds:
-             return None
-
-         if not isinstance(seeds, list):
-             seeds = [seeds]
-
-         seeds = [seed if isinstance(seed, Seed) else Seed(seed) for seed in seeds]
-
-         if seeds:
-             redis_key = None
-             if spider_status and Setting.DEAL_MODEL in [DealModel.success, DealModel.polling]:
-                 self.client.sadd(redis_key, *(seed.format_seed for seed in seeds))
-             elif not spider_status:
-                 self.client.sadd(redis_key, *(str(seed) for seed in seeds))
-             self.client.zrem(self.spider_key, *(seed.format_seed for seed in seeds))
-
-     @decorators.check_redis_status
-     def set_storer(self, key, seeds):
-         if not seeds:
-             return None
-
-         if not isinstance(seeds, list):
-             seeds = [seeds]
-
-         item_info = dict()
-         score = -int(time.time())
-         for seed in seeds:
-             if not isinstance(seed, Seed):
-                 seed = Seed(seed)
-             item_info[seed.format_seed] = score
-
-         if item_info:
-             self.client.zadd(self.storer_key % key, mapping=item_info)
-             log.info(f"zadd storer key: length {len(item_info.keys())}")
-
-     @decorators.check_redis_status
-     def get_seed(self, length: int = 200):
-         cs = time.time()
-
-         if self._get_lock(key=self.update_lock):
-
-             update_item, result = {}, []
-
-             version = int(time.time())
-
-             items = self.client.zrangebyscore(self.spider_key, min=0, max="+inf", start=0, num=length, withscores=True)
-
-             for value, priority in items:
-                 score = -(version + int(priority) / 1000)
-                 seed = Seed(value, priority=priority, version=version)
-                 update_item[value] = score
-                 result.append(seed)
-
-             log.info("set seeds into queue time: " + str(time.time() - cs))
-             if result:
-                 self.client.zadd(self.spider_key, mapping=update_item, xx=True)
-
-             self.client.delete(self.update_lock)
-             log.info("push seeds into queue time: " + str(time.time() - cs))
-             return result
-
-     @decorators.check_redis_status
-     def check_spider_queue(self, stop, storer_num):
-         while not stop.is_set():
-             # Acquire the check lock every 15s; keep retrying for up to 600s. Once acquired, set the lock's TTL to ${cs_lct}s.
-             if self._get_lock(key=self.check_lock, t=Setting.CHECK_LOCK_TIME, timeout=600, sleep_time=3):
-                 heartbeat = True if self.client.exists(self.heartbeat_key) else False
-                 # After a restart, reset the scores; otherwise only pick up scores older than ${rs_time} minutes.
-                 score = -int(time.time()) + Setting.RESET_SCORE if heartbeat else "-inf"
-
-                 keys = self.client.keys(self.storer_key % "*")
-
-                 if keys and len(keys) >= storer_num:
-                     intersection_key = self.storer_key % "intersection"
-                     self.client.delete(intersection_key)
-                     self.client.zinterstore(intersection_key, keys)
-
-                     while True:
-                         members = self.client.zrange(intersection_key, 0, 1999)
-                         if not members:
-                             break
-                         for key in keys:
-                             self.client.zrem(key, *members)
-                         if Setting.DEAL_MODEL in [DealModel.success, DealModel.polling]:
-                             self.client.sadd(self.succeed_key, *members)
-                         self.client.zrem(self.spider_key, *members)
-                         self.client.zrem(intersection_key, *members)
-                         log.info("succeed spider data ...")
-
-                 for key in keys:
-                     self.client.zremrangebyscore(key, min=score, max="(0")
-
-                 while True:
-                     items = self.client.zrangebyscore(self.spider_key, min=score, max="(0", start=0, num=5000, withscores=True)
-                     if not items:
-                         break
-                     reset_items = {}
-                     for value, priority in items:
-                         reset_score = "{:.3f}".format(priority).split(".")[1]
-                         reset_items[value] = int(reset_score)
-                     if reset_items:
-                         self.client.zadd(self.spider_key, mapping=reset_items, xx=True)
-
-                 if not heartbeat:
-                     self.client.setex(self.heartbeat_key, 15, "")
-
-     @decorators.check_redis_status
-     def set_heartbeat(self, stop):
-         time.sleep(5)
-         while not stop.is_set():
-             self.client.setex(self.heartbeat_key, 5, "")
-             time.sleep(3)
-
-     # @decorators.check_redis_status
-     # def heartbeat(self):
-     #     """
-     #     Return the remaining TTL of the heartbeat key.
-     #     """
-     #     return self.client.ttl(self.heartbeat_key)
-
-     @decorators.check_redis_status
-     def spider_queue_length(self):
-         return self.client.zcard(self.spider_key)
-
-     @decorators.check_redis_status
-     def ready_seed_length(self):
-         return self.client.zcount(self.spider_key, min=0, max="+inf")
-
-     @decorators.check_redis_status
-     def get_scheduler_lock(self):
-         return self._get_lock(self.scheduler_lock)
+     def __init__(self):
+         pool = redis.ConnectionPool(**setting.REDIS_CONFIG)
+         self._client = redis.Redis(connection_pool=pool)
+
+     def setnx(self, name, value=""):
+         return self._client.setnx(name, value)
+
+     def setex(self, name, t, value=""):
+         return self._client.setex(name, t, value)
+
+     def expire(self, name, t, nx: bool = False, xx: bool = False, gt: bool = False, lt: bool = False):
+         return self._client.expire(name, t, nx, xx, gt, lt)
+
+     def ttl(self, name):
+         return self._client.ttl(name)
+
+     def delete(self, name):
+         return self._client.delete(name)
+
+     def exists(self, *name) -> bool:
+         return self._client.exists(*name)
+
+     def sadd(self, name, value):
+         return self._client.sadd(name, value)
+
+     def zcard(self, name) -> bool:
+         return self._client.zcard(name)
+
+     def zadd(self, name, item: dict, **kwargs):
+         return self._client.zadd(name, item, **kwargs)
+
+     def zrem(self, name, *value):
+         return self._client.zrem(name, *value)
+
+     def zcount(self, name, _min, _max):
+         return self._client.zcount(name, _min, _max)
+
+     # def zrangebyscore(self, name, _min, _max, start, num, withscores: bool = False, *args):
+     #     return self._client.zrangebyscore(name, _min, _max, start, num, withscores, *args)
+
+     def lua(self, script: str, keys: list = None, args: list = None):
+         keys = keys or []
+         args = args or []
+         keys_count = len(keys)
+         return self._client.eval(script, keys_count, *keys, *args)
+
+     def lua_sha(self, sha1: str, keys: list = None, args: list = None):
+         keys = keys or []
+         args = args or []
+         keys_count = len(keys)
+         return self._client.evalsha(sha1, keys_count, *keys, *args)
+
+     def execute_lua(self, lua_script: str, keys: list, *args):
+         execute = self._client.register_script(lua_script)
+         return execute(keys=keys, args=args)
+
+     def lock(self, key, t=15) -> bool:
+         lua_script = """
+         local status = redis.call('setnx', KEYS[1], ARGV[1])
+         if ( status == 1 ) then
+             redis.call('expire', KEYS[1], ARGV[1])
+         end
+         return status
+         """
+         status = self.execute_lua(lua_script, [key], t)
+         return bool(status)
+
+     def members(self, key, score, start=0, count=5000, _min="-inf", _max="+inf") -> list:
+         lua_script = """
+         local min = ARGV[1]
+         local max = ARGV[2]
+         local start = ARGV[3]
+         local count = ARGV[4]
+         local score = ARGV[5]
+         local members = nil
+
+         if ( type(count) == string ) then
+             members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES')
+         else
+             members = redis.call('zrangebyscore', KEYS[1], min, max, 'WITHSCORES', 'limit', start, count)
+         end
+
+         local result = {}
+
+         for i = 1, #members, 2 do
+             local priority = nil
+             local member = members[i]
+             local originPriority = nil
+             if ( members[i+1] + 0 < 0 ) then
+                 originPriority = math.ceil(members[i+1]) * 1000 - members[i+1] * 1000
+             else
+                 originPriority = math.floor(members[i+1])
+             end
+
+             if ( score + 0 >= 1000 ) then
+                 priority = -score - originPriority / 1000
+             elseif ( score + 0 == 0 ) then
+                 priority = originPriority
+             else
+                 originPriority = score
+                 priority = score
+             end
+             redis.call('zadd', KEYS[1], priority, member)
+             table.insert(result, member)
+             table.insert(result, originPriority)
+         end
+
+         return result
+         """
+         members = self.execute_lua(lua_script, [key], _min, _max, start, count, score)
+         return [(members[i].decode(), int(members[i + 1])) for i in range(0, len(members), 2)]
+
+     def done(self, keys: list, *args) -> list:
+         lua_script = """
+         for i, member in ipairs(ARGV) do
+             redis.call("zrem", KEYS[1], member)
+             redis.call("sadd", KEYS[2], member)
+         end
+         """
+         self.execute_lua(lua_script, keys, *args)
+
+
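The rewrite replaces the old _get_lock polling loop with a single Lua script, so the SETNX and EXPIRE happen atomically and a crash between the two calls can no longer leave a lock key without a TTL. The same pattern is easy to try with redis-py directly; this sketch assumes a Redis server on localhost and an illustrative key name:

import redis

client = redis.Redis()  # assumes a local Redis at the default port
LOCK_SCRIPT = """
local status = redis.call('setnx', KEYS[1], ARGV[1])
if ( status == 1 ) then
    redis.call('expire', KEYS[1], ARGV[1])
end
return status
"""
acquire = client.register_script(LOCK_SCRIPT)

# The first caller gets 1 (lock acquired, auto-expires after 15s);
# concurrent callers get 0 until the key expires or is deleted.
if bool(acquire(keys=["demo:update_seed_lock"], args=[15])):
    print("lock held for up to 15 seconds")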
@@ -0,0 +1 @@
+ from .oss_db_exception import *
@@ -0,0 +1,28 @@
+ class OssDBException(Exception):
+     """Base oss client exception that all others inherit."""
+
+
+ class OssDBMergeError(OssDBException):
+     """
+     Exception raised when execute merge operation fails.
+     """
+
+
+ class OssDBPutPartError(OssDBException):
+     """
+     Exception raised when upload part operation fails.
+     """
+
+
+ class OssDBPutObjError(OssDBException):
+     """
+     Exception raised when upload operation fails.
+     """
+
+
+ class OssDBAppendObjError(OssDBException):
+     """Exception raised when upload operation fails."""
+
+
+ class OssDBInitPartError(OssDBException):
+     """Exception raised when init upload operation fails."""
@@ -0,0 +1,2 @@
+ from .launcher import Launcher
+ from .launcher_pro import LauncherPro