cobweb-launcher 1.2.25__py3-none-any.whl → 3.2.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. cobweb/__init__.py +4 -1
  2. cobweb/base/__init__.py +3 -3
  3. cobweb/base/common_queue.py +37 -16
  4. cobweb/base/item.py +35 -16
  5. cobweb/base/{log.py → logger.py} +3 -3
  6. cobweb/base/request.py +741 -54
  7. cobweb/base/response.py +380 -13
  8. cobweb/base/seed.py +96 -48
  9. cobweb/base/task_queue.py +180 -0
  10. cobweb/base/test.py +257 -0
  11. cobweb/constant.py +10 -1
  12. cobweb/crawlers/crawler.py +12 -155
  13. cobweb/db/api_db.py +3 -2
  14. cobweb/db/redis_db.py +117 -28
  15. cobweb/launchers/__init__.py +4 -3
  16. cobweb/launchers/distributor.py +141 -0
  17. cobweb/launchers/launcher.py +95 -157
  18. cobweb/launchers/uploader.py +68 -0
  19. cobweb/log_dots/__init__.py +2 -0
  20. cobweb/log_dots/dot.py +258 -0
  21. cobweb/log_dots/loghub_dot.py +53 -0
  22. cobweb/pipelines/__init__.py +1 -1
  23. cobweb/pipelines/pipeline.py +5 -55
  24. cobweb/pipelines/pipeline_csv.py +25 -0
  25. cobweb/pipelines/pipeline_loghub.py +32 -12
  26. cobweb/schedulers/__init__.py +1 -0
  27. cobweb/schedulers/scheduler.py +66 -0
  28. cobweb/schedulers/scheduler_with_redis.py +189 -0
  29. cobweb/setting.py +27 -40
  30. cobweb/utils/__init__.py +5 -3
  31. cobweb/utils/bloom.py +58 -58
  32. cobweb/{base → utils}/decorators.py +14 -12
  33. cobweb/utils/dotting.py +300 -0
  34. cobweb/utils/oss.py +113 -94
  35. cobweb/utils/tools.py +3 -15
  36. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.20.dist-info}/METADATA +31 -43
  37. cobweb_launcher-3.2.20.dist-info/RECORD +44 -0
  38. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.20.dist-info}/WHEEL +1 -1
  39. cobweb/crawlers/base_crawler.py +0 -144
  40. cobweb/crawlers/file_crawler.py +0 -98
  41. cobweb/launchers/launcher_air.py +0 -88
  42. cobweb/launchers/launcher_api.py +0 -221
  43. cobweb/launchers/launcher_pro.py +0 -222
  44. cobweb/pipelines/base_pipeline.py +0 -54
  45. cobweb/pipelines/loghub_pipeline.py +0 -34
  46. cobweb/pipelines/pipeline_console.py +0 -22
  47. cobweb_launcher-1.2.25.dist-info/RECORD +0 -40
  48. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.20.dist-info}/LICENSE +0 -0
  49. {cobweb_launcher-1.2.25.dist-info → cobweb_launcher-3.2.20.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,180 @@
1
+ import time
2
+ import threading
3
+ from enum import Enum
4
+ from hashlib import md5
5
+ from dataclasses import dataclass
6
+ from typing import Dict, Any, Optional, List
7
+
8
+
9
class Status(Enum):
    """Lifecycle states of a task stored in :class:`TaskQueue`."""

    PENDING = 0  # waiting to be processed
    PROCESSING = 1  # being processed
    FINISHED = 2  # completed
    INSERT = 3  # newly added
    UPLOAD = 4  # upload


@dataclass
class Task:
    """One queue entry, optionally linked to a parent task and child tasks."""

    task_id: str  # unique seed ID
    data: Any  # seed payload
    status: Status  # current status
    priority: int  # priority (a smaller value is served first)
    created_at: float  # creation timestamp
    parent_id: Optional[str] = None  # parent seed ID
    children_ids: Optional[List[str]] = None  # child seed IDs (None becomes [])
    ttl_seconds: Optional[int] = None  # optional TTL in seconds

    def __post_init__(self):
        # A None default plus this hook gives every instance its own list.
        if self.children_ids is None:
            self.children_ids = []


class TaskQueue:
    """In-memory task store whose operations are serialized by one lock.

    Tasks are kept in a dict keyed by ``task_id``.  Whenever a single task
    has to be chosen, the one with the smallest ``(priority, created_at)``
    wins; ties keep insertion order.

    NOTE: ``Task.ttl_seconds`` is stored but never read -- TTL based expiry
    is not implemented, and ``cleanup_interval`` is accepted but unused.
    """

    def __init__(self, cleanup_interval=60):
        self._tasks: Dict[str, Task] = {}
        self._lock = threading.Lock()
        # Monotonic counter mixed into auto-generated ids (see add_task).
        self._auto_id_seq = 0

    @staticmethod
    def _order_key(task):
        """Sort key shared by every "next task" selection."""
        return task.priority, task.created_at

    def _generate_task_id(self) -> str:
        """Return an id not present in the queue.  Caller must hold the lock."""
        while True:
            self._auto_id_seq += 1
            # The timestamp alone can repeat for calls made within the same
            # clock tick; the counter keeps the digests distinct.
            raw = f"{time.time()}-{self._auto_id_seq}"
            candidate = md5(raw.encode()).hexdigest()
            if candidate not in self._tasks:
                return candidate

    def length(self) -> int:
        """Return the number of tasks currently stored."""
        with self._lock:
            return len(self._tasks)

    def status_length(self, status) -> int:
        """Return how many stored tasks have exactly ``status``."""
        with self._lock:
            return sum(1 for it in self._tasks.values() if it.status == status)

    def get_task(self, task_id) -> Optional[Task]:
        """Return the task stored under ``task_id``, or None if absent."""
        with self._lock:
            return self._tasks.get(task_id)

    def get_task_by_status(self, status: list, limit: int = None) -> List[Task]:
        """Return tasks whose status is in ``status``, best candidate first.

        ``status`` may be a list of statuses or a single status.  A falsy
        ``limit`` (None or 0) returns every match.
        """
        if not isinstance(status, list):
            status = [status]
        with self._lock:
            task_list = [it for it in self._tasks.values() if it.status in status]
        task_list.sort(key=self._order_key)
        return task_list[:limit] if limit else task_list

    def get_pending_task(self) -> Optional[Task]:
        """Claim the best PENDING task by marking it PROCESSING.

        Returns the claimed task, or None when nothing is pending.
        """
        with self._lock:
            pending = [it for it in self._tasks.values() if it.status == Status.PENDING]
            if not pending:
                return None
            # min() returns the first minimal element, i.e. the same task a
            # stable sort would place first.
            task_item = min(pending, key=self._order_key)
            task_item.status = Status.PROCESSING
            return task_item

    def pop_task(self, status) -> Optional[Task]:
        """Remove and return the best task with ``status`` plus its subtree.

        The task is unlinked from its parent's ``children_ids`` and every
        descendant reachable through ``children_ids`` is dropped from the
        queue as well.  Returns None when no task has ``status``.
        """
        with self._lock:
            candidates = [it for it in self._tasks.values() if it.status == status]
            if not candidates:
                return None
            task_item = min(candidates, key=self._order_key)

            # Unlink from the parent *before* deleting anything, so the
            # parent does not keep a stale child id.
            parent = self._tasks.get(task_item.parent_id)
            if parent is not None and task_item.task_id in parent.children_ids:
                parent.children_ids.remove(task_item.task_id)

            # Drop the task and all of its descendants; ids that are already
            # gone (or revisited through a cycle) are skipped.
            stack = [task_item.task_id]
            while stack:
                removed = self._tasks.pop(stack.pop(), None)
                if removed is not None:
                    stack.extend(removed.children_ids)

            return task_item

    def add_task(
            self,
            task_id: str = None,
            data: Any = None,
            status=Status.PENDING,
            priority: int = 500,
            parent_id: Optional[str] = None,
            ttl_seconds: Optional[int] = None
    ) -> bool:
        """Add a new task, optionally attached to a parent task.

        When ``task_id`` is falsy a unique id is generated.  Returns False
        (and stores nothing) if an explicit ``task_id`` already exists,
        True otherwise.
        """
        with self._lock:
            if not task_id:
                task_id = self._generate_task_id()
            elif task_id in self._tasks:
                return False  # refuse duplicates

            self._tasks[task_id] = Task(
                task_id=task_id,
                data=data,
                status=status,
                priority=priority,
                created_at=int(time.time()),
                parent_id=parent_id,
                ttl_seconds=ttl_seconds
            )

            if parent_id and parent_id in self._tasks:
                self._tasks[parent_id].children_ids.append(task_id)

            return True

    def update_task(self, task_id, status, data=None) -> Task:
        """Set a task's status (and its data, when truthy) and return it.

        When the new status is not FINISHED, children still in INSERT state
        are deleted and the task's child list is reset.

        Raises:
            KeyError: if ``task_id`` is not in the queue.
        """
        with self._lock:
            task_item = self._tasks[task_id]
            task_item.status = status
            if data:
                task_item.data = data

            if status != Status.FINISHED:
                for tid in task_item.children_ids:
                    # A child may already have been removed elsewhere.
                    child = self._tasks.get(tid)
                    if child is not None and child.status == Status.INSERT:
                        del self._tasks[tid]
                # NOTE(review): the source rendering lost indentation, so it
                # is unclear whether this reset also applied to FINISHED
                # tasks; it is kept inside the non-FINISHED branch -- confirm.
                task_item.children_ids = []

            return task_item

    def remove(self, task_ids: list) -> None:
        """Delete the given tasks, skipping unknown ids and tasks with children.

        Each deleted task is also unlinked from its parent's child list.
        """
        with self._lock:
            for task_id in task_ids:
                task_item = self._tasks.get(task_id)
                if task_item is None or task_item.children_ids:
                    continue

                parent = self._tasks.get(task_item.parent_id)
                if parent is not None and task_id in parent.children_ids:
                    parent.children_ids.remove(task_id)

                del self._tasks[task_id]

    def count_children(self, task_id: str) -> int:
        """Return the number of child ids recorded for ``task_id`` (0 if absent)."""
        with self._lock:
            task_item = self._tasks.get(task_id)
            return len(task_item.children_ids) if task_item is not None else 0
cobweb/base/test.py ADDED
@@ -0,0 +1,257 @@
1
+ import requests
2
+ from urllib.parse import urlparse
3
+ from typing import Dict, Optional
4
+
5
+
6
class FileTypeDetector:
    """Guess a file's type from its MIME type, URL extension and magic bytes.

    The three lookup tables are defined on the class (so they can be read
    without constructing a session) and copied onto each instance in
    ``__init__`` so per-instance customisation does not leak between
    instances.
    """

    # Magic-byte prefixes, checked in insertion order by detect_by_signature.
    file_signatures = {
        # images
        b'\x89PNG\r\n\x1a\n': 'PNG',
        b'\xff\xd8\xff': 'JPEG',
        b'GIF87a': 'GIF',
        b'GIF89a': 'GIF',
        # RIFF is a container shared by WEBP / AVI / WAV; the value below is
        # only used when fewer than 12 bytes are available to refine it.
        b'RIFF': 'WAV',
        b'BM': 'BMP',
        b'II*\x00': 'TIFF',
        b'MM\x00*': 'TIFF',
        b'\x00\x00\x01\x00': 'ICO',
        b'\x00\x00\x02\x00': 'CUR',

        # video
        b'\x00\x00\x00\x18ftypmp4': 'MP4',
        b'\x00\x00\x00\x20ftypM4V': 'M4V',
        b'FLV\x01': 'FLV',
        b'\x1aE\xdf\xa3': 'WEBM',
        b'\x00\x00\x01\xba': 'MPEG',
        b'\x00\x00\x01\xb3': 'MPEG',
        # Ogg is shared by audio (OGG) and video (OGV); the value below is
        # only used when fewer than 32 bytes are available to refine it.
        b'OggS': 'OGG',

        # audio
        b'ID3': 'MP3',
        b'\xff\xfb': 'MP3',
        b'\xff\xf3': 'MP3',
        b'\xff\xf2': 'MP3',
        b'fLaC': 'FLAC',
        b'ftypM4A': 'M4A',
        b'MAC ': 'APE',

        # other
        b'%PDF': 'PDF',
        b'PK\x03\x04': 'ZIP',
        b'Rar!\x1a\x07\x00': 'RAR',
        b'\x37\x7a\xbc\xaf\x27\x1c': '7Z',
    }

    # Lower-cased URL extension -> type name.
    extension_map = {
        # images
        '.jpg': 'JPEG', '.jpeg': 'JPEG', '.png': 'PNG', '.gif': 'GIF',
        '.webp': 'WEBP', '.bmp': 'BMP', '.tiff': 'TIFF', '.tif': 'TIFF',
        '.ico': 'ICO', '.svg': 'SVG', '.heic': 'HEIC', '.avif': 'AVIF',

        # video
        '.mp4': 'MP4', '.avi': 'AVI', '.mov': 'MOV', '.wmv': 'WMV',
        '.flv': 'FLV', '.webm': 'WEBM', '.mkv': 'MKV', '.m4v': 'M4V',
        '.mpg': 'MPEG', '.mpeg': 'MPEG', '.3gp': '3GP', '.ogv': 'OGV',
        '.ts': 'TS', '.mts': 'MTS', '.vob': 'VOB',

        # audio
        '.mp3': 'MP3', '.wav': 'WAV', '.flac': 'FLAC', '.aac': 'AAC',
        '.ogg': 'OGG', '.wma': 'WMA', '.m4a': 'M4A', '.ape': 'APE',
        '.opus': 'OPUS', '.aiff': 'AIFF', '.au': 'AU',
    }

    # MIME type (without parameters, lower-cased) -> type name.
    mime_type_map = {
        # images
        'image/jpeg': 'JPEG', 'image/png': 'PNG', 'image/gif': 'GIF',
        'image/webp': 'WEBP', 'image/bmp': 'BMP', 'image/tiff': 'TIFF',
        'image/svg+xml': 'SVG', 'image/x-icon': 'ICO',

        # video
        'video/mp4': 'MP4', 'video/avi': 'AVI', 'video/quicktime': 'MOV',
        'video/x-msvideo': 'AVI', 'video/webm': 'WEBM', 'video/x-flv': 'FLV',
        'video/3gpp': '3GP', 'video/ogg': 'OGV',

        # audio
        'audio/mpeg': 'MP3', 'audio/wav': 'WAV', 'audio/flac': 'FLAC',
        'audio/aac': 'AAC', 'audio/ogg': 'OGG', 'audio/x-ms-wma': 'WMA',
        'audio/mp4': 'M4A', 'audio/opus': 'OPUS',
    }

    # RIFF form type (bytes 8..12) -> type name.
    _RIFF_FORMS = {b'WEBP': 'WEBP', b'AVI ': 'AVI', b'WAVE': 'WAV'}

    # "ftyp" major brand (bytes 8..12) -> type name.
    _FTYP_BRANDS = {
        b'mp41': 'MP4', b'mp42': 'MP4', b'isom': 'MP4', b'avc1': 'MP4',
        b'M4A ': 'M4A', b'M4V ': 'M4V', b'qt  ': 'MOV',
    }

    _IMAGE_TYPES = frozenset(
        {'PNG', 'JPEG', 'GIF', 'WEBP', 'BMP', 'TIFF', 'ICO', 'SVG', 'HEIC', 'AVIF'})
    _VIDEO_TYPES = frozenset(
        {'MP4', 'AVI', 'MOV', 'WMV', 'FLV', 'WEBM', 'MKV', 'M4V', 'MPEG', '3GP',
         'OGV', 'TS', 'MTS', 'VOB'})
    _AUDIO_TYPES = frozenset(
        {'MP3', 'WAV', 'FLAC', 'AAC', 'OGG', 'WMA', 'M4A', 'APE', 'OPUS', 'AIFF', 'AU'})

    def __init__(self):
        # Per-instance copies, so mutating one detector's tables does not
        # affect other instances.
        self.file_signatures = dict(type(self).file_signatures)
        self.extension_map = dict(type(self).extension_map)
        self.mime_type_map = dict(type(self).mime_type_map)

        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def get_file_extension(self, url: str) -> tuple:
        """Return ``(extension, netloc)`` for ``url``.

        ``extension`` is lower-cased and includes the leading dot, or is
        ``''`` when the last path segment contains no dot.
        """
        parsed = urlparse(url)
        # Only the last path segment can carry the extension; a dot in a
        # directory name (e.g. "/v1.2/file") must not be mistaken for one.
        # urlparse already strips the query string from ``path``.
        filename = parsed.path.lower().rsplit('/', 1)[-1]
        if '.' in filename:
            return '.' + filename.rsplit('.', 1)[-1], parsed.netloc
        return '', parsed.netloc

    def detect_by_extension(self, url: str) -> Optional[str]:
        """Return the type mapped to the URL's extension, or None."""
        ext, _site = self.get_file_extension(url)
        return self.extension_map.get(ext)

    def detect_by_mime_type(self, content_type: str) -> Optional[str]:
        """Return the type mapped to a Content-Type value, or None."""
        if not content_type:
            return None

        # Drop parameters such as "; charset=..." before the lookup.
        mime_type = content_type.split(';')[0].strip().lower()
        return self.mime_type_map.get(mime_type)

    def get_partial_content(self, url: str, max_bytes: int = 64) -> Optional[bytes]:
        """Fetch at most ``max_bytes`` leading bytes of ``url``.

        Returns None when the request fails or the status is not 200/206.
        """
        try:
            headers = {'Range': f'bytes=0-{max_bytes - 1}'}
            # stream=True: a server that ignores Range answers 200 with the
            # full body; streaming lets us stop after max_bytes instead of
            # downloading the whole file.
            response = self.session.get(url, headers=headers, timeout=10, stream=True)
            try:
                if response.status_code in (200, 206):
                    head = b''
                    for chunk in response.iter_content(chunk_size=max_bytes):
                        head += chunk
                        if len(head) >= max_bytes:
                            break
                    return head[:max_bytes]
            finally:
                response.close()
        except Exception as e:  # best-effort probe: report and fall through
            print(f"获取内容失败: {e}")
        return None

    def detect_by_signature(self, data: bytes) -> Optional[str]:
        """Return the type indicated by the leading magic bytes, or None."""
        if not data:
            return None

        for signature, file_type in self.file_signatures.items():
            if not data.startswith(signature):
                continue

            if signature == b'RIFF' and len(data) >= 12:
                # Bytes 8..12 tell WEBP / AVI / WAV apart.
                form = self._RIFF_FORMS.get(bytes(data[8:12]))
                if form is not None:
                    return form
                continue  # unrecognised RIFF form: keep looking

            if signature == b'OggS' and len(data) >= 32:
                # Codec name in the first page decides audio vs. video;
                # vorbis wins when both appear, anything else counts as OGG.
                head = data[:64].lower()
                if b'vorbis' not in head and b'theora' in head:
                    return 'OGV'
                return 'OGG'

            return file_type

        # ISO base media files: "ftyp" at offset 4, major brand at offset 8.
        if len(data) >= 12 and data[4:8] == b'ftyp':
            return self._FTYP_BRANDS.get(bytes(data[8:12]))

        return None

    def get_detailed_info(self, url, content_type=None, data=None) -> Dict:
        """Combine MIME, extension and signature detection into one report.

        Args:
            url: URL of the file.
            content_type: Content-Type header value, if known.
            data: leading bytes of the file, if available (for example from
                ``get_partial_content``).  No request is made here.

        Returns:
            A dict with the keys ``url``, ``site``, ``detected_type``,
            ``confidence`` (``unknown`` / ``medium`` / ``high`` /
            ``very_high``), ``methods_used``, ``content_type`` and
            ``extension``.
        """
        extension, site = self.get_file_extension(url)
        result = {
            'url': url,
            'site': site,
            'detected_type': None,
            'confidence': 'unknown',
            'methods_used': [],
            'content_type': content_type,
            'extension': extension,
        }

        # 1. MIME type.  A non-string header value must not abort detection.
        try:
            mime_detected = self.detect_by_mime_type(content_type)
        except (AttributeError, TypeError) as e:
            print(f"HEAD请求失败: {e}")
            mime_detected = None
        if mime_detected:
            result['detected_type'] = mime_detected
            result['confidence'] = 'high'
            result['methods_used'].append('mime_type')

        # 2. URL extension.
        ext_detected = self.detect_by_extension(url)
        if ext_detected:
            if not result['detected_type']:
                result['detected_type'] = ext_detected
                result['confidence'] = 'medium'
            elif result['detected_type'] == ext_detected:
                result['confidence'] = 'very_high'  # MIME and extension agree
            result['methods_used'].append('extension')

        # 3. Magic bytes, only when the first two steps are inconclusive.
        if result['confidence'] in ('unknown', 'medium'):
            signature_detected = self.detect_by_signature(data)
            if signature_detected:
                if result['detected_type'] == signature_detected:
                    result['confidence'] = 'very_high'
                else:
                    # Nothing detected yet, or a conflict: trust the bytes.
                    result['detected_type'] = signature_detected
                    result['confidence'] = 'high'
                result['methods_used'].append('file_signature')

        return result

    def detect_file_type(self, url: str) -> str:
        """Return the type name detected from the URL alone, or ``'Unknown'``.

        Only the URL is inspected (no request is made); use
        ``get_detailed_info`` with a content type and/or leading bytes for a
        stronger answer.
        """
        info = self.get_detailed_info(url)
        # The key is always present, so a default passed to .get() would
        # never apply; fall back explicitly when nothing was detected.
        return info.get('detected_type') or 'Unknown'

    def get_file_category(self, file_type: str) -> str:
        """Map a type name to ``'Image'``, ``'Video'``, ``'Audio'``, ``'Other'`` or ``'Unknown'``."""
        if not file_type or file_type == 'Unknown':
            return 'Unknown'
        if file_type in self._IMAGE_TYPES:
            return 'Image'
        if file_type in self._VIDEO_TYPES:
            return 'Video'
        if file_type in self._AUDIO_TYPES:
            return 'Audio'
        return 'Other'
252
+
253
+
254
+ # if __name__ == "__main__":
255
+ # detector = FileTypeDetector()
256
+ # result = detector.get_detailed_info("https://cdn.pixabay.com/user/2024/12/10/12-18-33-812_96x96.jpeg")
257
+ # print(result)
cobweb/constant.py CHANGED
@@ -1,3 +1,5 @@
1
+ from enum import Enum
2
+
1
3
 
2
4
  class CrawlerModel:
3
5
 
@@ -28,6 +30,13 @@ class DealModel:
28
30
  poll = "deal model: poll"
29
31
 
30
32
 
33
class ResponseStatus(Enum):
    """String-valued status labels: failed, succeed, filter and max retry."""

    failed = "failed"
    succeed = "succeed"
    filter = "filter"
    max_retry = "max retry"
38
+
39
+
31
40
  class LogTemplate:
32
41
 
33
42
  console_item = """
@@ -51,7 +60,7 @@ class LogTemplate:
51
60
  launcher_pro_polling = """
52
61
  ----------------------- start - 轮训日志: {task} -----------------
53
62
  内存队列
54
- 种子数: {doing_len}
63
+ 消费中: {doing_len}
55
64
  待消费: {todo_len}
56
65
  已消费: {done_len}
57
66
  redis队列
@@ -1,170 +1,27 @@
1
- import json
2
- import threading
3
- import time
4
- import traceback
5
- from inspect import isgenerator
6
- from typing import Union, Callable, Mapping
7
-
8
- from cobweb.constant import DealModel, LogTemplate
1
+ from typing import Any, Generator
9
2
  from cobweb.base import (
10
- Queue,
11
- Seed,
12
- BaseItem,
13
- Request,
14
- Response,
15
- ConsoleItem,
16
- logger
3
+ Seed,
4
+ BaseItem,
5
+ Request,
6
+ Response,
7
+ CSVItem,
17
8
  )
18
9
 
19
10
 
20
- class Crawler(threading.Thread):
21
-
22
- def __init__(
23
- self,
24
- stop: threading.Event,
25
- pause: threading.Event,
26
- # launcher_queue: Union[Mapping[str, Queue]],
27
- get_seed: Callable,
28
- set_seed: Callable,
29
- add_seed: Callable,
30
- delete_seed: Callable,
31
- upload_data: Callable,
32
- custom_func: Union[Mapping[str, Callable]],
33
- thread_num: int,
34
- max_retries: int,
35
- time_sleep: int,
36
- ):
37
- super().__init__()
38
-
39
- self._stop = stop
40
- self._pause = pause
41
- self._get_seed = get_seed
42
- self._set_seed = set_seed
43
- self._add_seed = add_seed
44
- self._delete_seed = delete_seed
45
- self._upload_data = upload_data
46
-
47
- for func_name, _callable in custom_func.items():
48
- if isinstance(_callable, Callable):
49
- self.__setattr__(func_name, _callable)
50
-
51
- self.thread_num = thread_num
52
- self.time_sleep = time_sleep
53
- self.max_retries = max_retries
11
+ class Crawler:
54
12
 
55
13
  @staticmethod
56
- def request(seed: Seed) -> Union[Request, BaseItem]:
14
+ def request(seed: Seed) -> Generator[Request, Response, None]:
57
15
  yield Request(seed.url, seed, timeout=5)
58
16
 
59
17
  @staticmethod
60
- def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
18
+ def download(item: Request) -> Generator[Response, Any, None]:
61
19
  response = item.download()
62
20
  yield Response(item.seed, response, **item.to_dict)
63
21
 
64
22
  @staticmethod
65
- def parse(item: Response) -> BaseItem:
23
+ def parse(item: Response) -> Generator[BaseItem, Any, None]:
66
24
  upload_item = item.to_dict
67
- upload_item["text"] = item.response.text
68
- yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
69
-
70
- # def get_seed(self) -> Seed:
71
- # return self._todo.pop()
72
-
73
- def distribute(self, item, seed):
74
- if isinstance(item, BaseItem):
75
- self._upload_data(item)
76
- elif isinstance(item, Seed):
77
- self._add_seed(item)
78
- elif isinstance(item, str) and item == DealModel.poll:
79
- self._set_seed(seed)
80
- elif isinstance(item, str) and item == DealModel.done:
81
- self._delete_seed(seed)
82
- elif isinstance(item, str) and item == DealModel.fail:
83
- seed.params.seed_status = DealModel.fail
84
- self._delete_seed(seed)
85
- else:
86
- raise TypeError("yield value type error!")
87
-
88
- def spider(self):
89
- while not self._stop.is_set():
90
-
91
- seed = self._get_seed()
92
-
93
- if not seed:
94
- time.sleep(1)
95
- continue
96
-
97
- elif seed.params.retry > self.max_retries:
98
- seed.params.seed_status = DealModel.fail
99
- self._delete_seed(seed)
100
- continue
101
-
102
- seed_detail_log_info = LogTemplate.log_info(seed.to_dict)
103
-
104
- try:
105
- request_iterators = self.request(seed)
106
-
107
- if not isgenerator(request_iterators):
108
- raise TypeError("request function isn't a generator!")
109
-
110
- iterator_status = False
111
-
112
- for request_item in request_iterators:
113
-
114
- iterator_status = True
115
-
116
- if isinstance(request_item, Request):
117
- iterator_status = False
118
- download_iterators = self.download(request_item)
119
- if not isgenerator(download_iterators):
120
- raise TypeError("download function isn't a generator")
121
-
122
- for download_item in download_iterators:
123
- iterator_status = True
124
- if isinstance(download_item, Response):
125
- iterator_status = False
126
- logger.info(LogTemplate.download_info.format(
127
- detail=seed_detail_log_info,
128
- retry=seed.params.retry,
129
- priority=seed.params.priority,
130
- seed_version=seed.params.seed_version,
131
- identifier=seed.identifier or "",
132
- status=download_item.response,
133
- response=LogTemplate.log_info(download_item.to_dict)
134
- ))
135
- parse_iterators = self.parse(download_item)
136
- if not isgenerator(parse_iterators):
137
- raise TypeError("parse function isn't a generator")
138
- for parse_item in parse_iterators:
139
- iterator_status = True
140
- if isinstance(parse_item, Response):
141
- raise TypeError("upload_item can't be a Response instance")
142
- self.distribute(parse_item, seed)
143
- else:
144
- self.distribute(download_item, seed)
145
- else:
146
- self.distribute(request_item, seed)
147
-
148
- if not iterator_status:
149
- raise ValueError("request/download/parse function yield value error!")
150
- except Exception as e:
151
- logger.info(LogTemplate.download_exception.format(
152
- detail=seed_detail_log_info,
153
- retry=seed.params.retry,
154
- priority=seed.params.priority,
155
- seed_version=seed.params.seed_version,
156
- identifier=seed.identifier or "",
157
- exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
158
- ))
159
- seed.params.retry += 1
160
- # self._todo.push(seed)
161
- self._set_seed(seed)
162
- time.sleep(self.time_sleep * seed.params.retry)
163
- finally:
164
- time.sleep(0.1)
165
- logger.info("spider thread close")
166
-
167
- def run(self):
168
- for index in range(self.thread_num):
169
- threading.Thread(name=f"spider_{index}", target=self.spider).start()
25
+ upload_item["content"] = getattr(item.response, "text", item.response)
26
+ yield CSVItem(item.seed, data=upload_item)
170
27
 
cobweb/db/api_db.py CHANGED
@@ -57,7 +57,8 @@ class ApiDB:
57
57
  return self._get_response(api="/zcard", params=dict(name=name))
58
58
 
59
59
  def zadd(self, name, item: dict, **kwargs):
60
- return self._post_response(api="/zadd", data=dict(name=name, mapping=item, **kwargs))
60
+ if item:
61
+ return self._post_response(api="/zadd", data=dict(name=name, mapping=item, **kwargs))
61
62
 
62
63
  def zrem(self, name, *values):
63
64
  return self._post_response(api="/zrem", data=dict(name=name, values=values))
@@ -71,7 +72,7 @@ class ApiDB:
71
72
  def auto_incr(self, name, t=15, limit=1000) -> bool:
72
73
  return self._get_response(api="/auto_incr", params=dict(name=name, t=t, limit=limit))
73
74
 
74
- def members(self, name, score, start=0, count=5000, _min="-inf", _max="+inf"):
75
+ def members(self, name, score, start=0, count=1000, _min="-inf", _max="+inf"):
75
76
  return self._get_response(api="/members", params=dict(name=name, score=score, start=start, count=count, min=_min, max=_max))
76
77
 
77
78
  def done(self, name: list, *values):