github-ai-scraper 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_scraper/__init__.py +3 -0
- ai_scraper/api/__init__.py +6 -0
- ai_scraper/api/github.py +340 -0
- ai_scraper/api/gitlab.py +418 -0
- ai_scraper/api/rate_limiter.py +120 -0
- ai_scraper/api_server.py +196 -0
- ai_scraper/auth.py +68 -0
- ai_scraper/backup.py +112 -0
- ai_scraper/cache.py +95 -0
- ai_scraper/classifier.py +135 -0
- ai_scraper/cli.py +747 -0
- ai_scraper/config.py +237 -0
- ai_scraper/config_watcher.py +82 -0
- ai_scraper/dedup.py +148 -0
- ai_scraper/filters/__init__.py +5 -0
- ai_scraper/filters/ai_filter.py +93 -0
- ai_scraper/health.py +155 -0
- ai_scraper/i18n.py +141 -0
- ai_scraper/interactive.py +96 -0
- ai_scraper/keywords/__init__.py +5 -0
- ai_scraper/keywords/extractor.py +274 -0
- ai_scraper/logging_config.py +74 -0
- ai_scraper/models/__init__.py +5 -0
- ai_scraper/models/repository.py +72 -0
- ai_scraper/output/__init__.py +6 -0
- ai_scraper/output/excel.py +79 -0
- ai_scraper/output/html.py +152 -0
- ai_scraper/output/markdown.py +338 -0
- ai_scraper/output/rss.py +82 -0
- ai_scraper/output/translator.py +303 -0
- ai_scraper/plugin_system.py +146 -0
- ai_scraper/plugins/__init__.py +5 -0
- ai_scraper/retry.py +134 -0
- ai_scraper/scheduler.py +84 -0
- ai_scraper/scrape_progress.py +99 -0
- ai_scraper/secure_storage.py +127 -0
- ai_scraper/storage/__init__.py +5 -0
- ai_scraper/storage/async_database.py +237 -0
- ai_scraper/storage/database.py +456 -0
- ai_scraper/webhooks.py +95 -0
- github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
- github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
- github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
- github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
"""Simple translation module for repository descriptions."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Translation dictionary for common AI-project keywords (listed in priority
# order). Keys are lowercase English terms; values are the Chinese rendering.
# Note: translate_description() re-sorts the keys by length before matching.
TRANSLATION_DICT = {
    # Complete phrases, translated in preference to their individual words
    "production-ready platform": "生产就绪的平台",
    "personal ai assistant": "个人 AI 助手",
    "workflow automation platform": "工作流自动化平台",
    "ai coding assistant": "AI 编程助手",
    "coding assistant": "编程助手",
    "code generation": "代码生成",
    "image generation": "图像生成",
    "text generation": "文本生成",
    "speech recognition": "语音识别",
    "speech synthesis": "语音合成",
    "face recognition": "人脸识别",
    "object detection": "物体检测",
    "natural language processing": "自然语言处理",
    "large language model": "大语言模型",
    "machine learning": "机器学习",
    "deep learning": "深度学习",
    "neural network": "神经网络",
    "computer vision": "计算机视觉",
    "reinforcement learning": "强化学习",
    "generative ai": "生成式 AI",
    "artificial intelligence": "人工智能",
    "retrieval-augmented generation": "检索增强生成",
    "stable diffusion": "Stable Diffusion",
    "web ui": "网页界面",
    "webui": "网页界面",
    "open source": "开源",
    "open-source": "开源",
    "self-host": "自托管",
    "self-hosted": "自托管",
    "production-ready": "生产就绪",
    "user-friendly": "用户友好",
    "cross-platform": "跨平台",
    "real-time": "实时",
    "high performance": "高性能",
    "lightweight": "轻量级",
    "knowledge base": "知识库",
    "workflow automation": "工作流自动化",
    "image upscaling": "图像放大",
    "face swap": "换脸",
    "video generation": "视频生成",
    "text to image": "文本生成图像",
    "text to speech": "文本转语音",
    "speech to text": "语音转文本",
    "language model": "语言模型",
    "vision model": "视觉模型",
    "multimodal": "多模态",
    "getting started": "入门",
    "get started": "入门",
    "best practices": "最佳实践",
    "awesome list": "精选列表",
    "awesome": "精选",
    "alternative to": "...的替代方案",
    "next generation": "下一代",
    "next-gen": "下一代",
    "powered by": "基于",
    "based on": "基于",
    "built with": "使用...构建",
    "written in": "使用...编写",
    "designed for": "专为...设计",
    "easy to use": "易于使用",
    "free and open source": "免费开源",

    # AI-related terminology
    "llm": "大语言模型",
    "nlp": "自然语言处理",
    "chatbot": "聊天机器人",
    "chatgpt": "ChatGPT",
    "gpt": "GPT",
    "transformer": "Transformer",
    "diffusion": "扩散模型",
    "rag": "检索增强生成",
    "embedding": "嵌入",
    "vector": "向量",
    "prompt": "提示词",
    "prompts": "提示词",
    "agent": "代理",
    "agents": "代理",
    "assistant": "助手",
    "conversation": "对话",
    "conversational": "对话式",
    "context": "上下文",
    "memory": "记忆",
    "reasoning": "推理",
    "planning": "规划",
    "autonomous": "自主",
    "intelligent": "智能",
    "smart": "智能",

    # Technical terms
    "framework": "框架",
    "platform": "平台",
    "toolkit": "工具包",
    "library": "库",
    "api": "API",
    "sdk": "SDK",
    "cli": "命令行工具",
    "gui": "图形界面",
    "dashboard": "仪表盘",
    "interface": "界面",
    "workflow": "工作流",
    "automation": "自动化",
    "integration": "集成",
    "pipeline": "管道",
    "orchestration": "编排",
    "deployment": "部署",
    "training": "训练",
    "fine-tuning": "微调",
    "inference": "推理",
    "model": "模型",
    "models": "模型",
    "database": "数据库",
    "storage": "存储",
    "search": "搜索",
    "retrieval": "检索",
    "document": "文档",
    "pdf": "PDF",
    "markdown": "Markdown",
    "server": "服务器",
    "client": "客户端",
    "backend": "后端",
    "frontend": "前端",
    "service": "服务",
    "services": "服务",
    "endpoint": "端点",
    "request": "请求",
    "response": "响应",

    # Verbs
    "build": "构建",
    "building": "构建",
    "create": "创建",
    "develop": "开发",
    "deploy": "部署",
    "run": "运行",
    "manage": "管理",
    "analyze": "分析",
    "process": "处理",
    "generate": "生成",
    "extract": "提取",
    "convert": "转换",
    "scrape": "抓取",
    "crawl": "爬取",
    "monitor": "监控",
    "track": "追踪",
    "visualize": "可视化",
    "optimize": "优化",
    "scale": "扩展",
    "integrate": "集成",
    "connect": "连接",
    "support": "支持",
    "supports": "支持",

    # Adjectives
    "simple": "简单",
    "fast": "快速",
    "efficient": "高效",
    "powerful": "强大",
    "flexible": "灵活",
    "extensible": "可扩展",
    "modular": "模块化",
    "customizable": "可定制",
    "scalable": "可扩展",
    "robust": "稳健",
    "secure": "安全",
    "privacy": "隐私",
    "free": "免费",
    "modern": "现代",
    "latest": "最新",

    # Nouns
    "developer": "开发者",
    "developers": "开发者",
    "user": "用户",
    "users": "用户",
    "team": "团队",
    "organization": "组织",
    "enterprise": "企业",
    "business": "商业",
    "application": "应用",
    "applications": "应用",
    "project": "项目",
    "projects": "项目",
    "repository": "仓库",
    "code": "代码",
    "data": "数据",
    "file": "文件",
    "files": "文件",
    "community": "社区",
    "resources": "资源",
    "tools": "工具",
    "utilities": "实用工具",
    "extensions": "扩展",
    "plugins": "插件",
    "tutorial": "教程",
    "documentation": "文档",
    "examples": "示例",
    "demo": "演示",
    "sample": "示例",
    "template": "模板",
    "benchmark": "基准测试",
    "testing": "测试",
    "experiment": "实验",
    "research": "研究",
    "education": "教育",
    "learning": "学习",
    "course": "课程",
    "lesson": "课程",
    "roadmap": "路线图",
    "guide": "指南",
    "handbook": "手册",
    "collection": "合集",
    "list": "列表",
    "curated": "精选",
    "version": "版本",
    "release": "发布",
    "update": "更新",
    "feature": "功能",
    "features": "功能",
    "capability": "能力",
    "capabilities": "能力",
    "functionality": "功能",
    "component": "组件",
    "module": "模块",
    "package": "包",
}
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def translate_description(
    description: Optional[str],
    translations: Optional[dict[str, str]] = None,
) -> str:
    """Translate known terms in a repository description into Chinese.

    The translation is dictionary based: every known English term found in
    the text is replaced by its Chinese counterpart, everything else is left
    untouched. Matching is case-insensitive and anchored on word boundaries.

    All terms are substituted in a single pass over the text, so text that
    was produced by a substitution is never translated again. This keeps
    protected names intact (e.g. "stable diffusion" -> "Stable Diffusion" is
    not subsequently rewritten by the shorter "diffusion" entry). When several
    terms could match at the same position the longest one wins.

    Args:
        description: Original description, or None when the repository has
            no description.
        translations: Optional mapping of English term -> replacement text.
            Defaults to the module-level TRANSLATION_DICT.

    Returns:
        "暂无描述" when description is None; the description unchanged when
        more than 30% of its characters are Chinese; the whitespace-normalized
        description when it is shorter than 20 characters; otherwise the
        whitespace-normalized description with known terms translated.
    """
    if description is None:
        return "暂无描述"

    # Already (mostly) Chinese: more than 30% of the characters are in the
    # CJK Unified Ideographs range, so return the text as-is.
    chinese_chars = re.findall(r'[\u4e00-\u9fff]', description)
    if len(chinese_chars) > len(description) * 0.3:
        return description

    # Collapse every run of whitespace into a single space.
    cleaned = " ".join(description.split())

    # Very short text is likely a name or brand; do not translate it.
    if len(cleaned) < 20:
        return cleaned

    table = TRANSLATION_DICT if translations is None else translations

    # Case-insensitive lookup table; empty keys would match everywhere.
    lookup = {term.lower(): text for term, text in table.items() if term}

    translated = cleaned
    if lookup:
        # Longest alternatives first so "machine learning" beats "learning"
        # when both could match at the same position.
        ordered_terms = sorted(lookup, key=len, reverse=True)
        pattern = re.compile(
            r'\b(?:' + '|'.join(re.escape(term) for term in ordered_terms) + r')\b',
            re.IGNORECASE,
        )

        def _replace(match):
            found = match.group(0)
            # Fall back to the matched text if case folding produced a
            # spelling that is not a key of the lookup table.
            return lookup.get(found.lower(), found)

        # A callable replacement also means the translated text is inserted
        # literally (no backslash/group-reference processing).
        translated = pattern.sub(_replace, cleaned)

    # Tidy up awkward mixed fragments such as "代理ic" -> "代理" should they
    # be present in the text.
    leftovers = (
        ('代理ic', '代理'),
        ('开发ment', '开发'),
        ('集成s', '集成'),
        ('构建ing', '构建'),
        ('分析er', '分析器'),
        ('生成ive', '生成式'),
    )
    for fragment, fixed in leftovers:
        translated = translated.replace(fragment, fixed)

    return translated
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def get_bilingual_description(description: Optional[str]) -> tuple[str, str]:
    """Pair a description with its translation.

    Args:
        description: Original description; None or an empty string is shown
            as the "暂无描述" placeholder.

    Returns:
        An (original, translated) tuple. The second element is an empty
        string when translating produced exactly the original text, so the
        caller does not display the same text twice.
    """
    source_text = description if description else "暂无描述"
    rendered = translate_description(description)

    # Identical output means nothing was translated; suppress the duplicate.
    return (source_text, "" if rendered == source_text else rendered)
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Plugin system for ai-scraper."""
|
|
2
|
+
|
|
3
|
+
import importlib.util
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Optional
|
|
8
|
+
|
|
9
|
+
from ai_scraper.models.repository import Repository
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class PluginInfo:
    """Descriptive metadata a plugin reports about itself.

    Attributes:
        name: Plugin name.
        version: Plugin version string.
        description: Short description of the plugin.
        author: Optional author name; None when not given.
    """

    # Required fields
    name: str
    version: str
    description: str

    # Optional field
    author: Optional[str] = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class BasePlugin(ABC):
    """Abstract base class for ai-scraper plugins.

    Subclasses must implement the ``info`` property. The ``on_*`` hooks are
    optional: the defaults either do nothing or hand their input back
    unchanged.
    """

    @property
    @abstractmethod
    def info(self) -> PluginInfo:
        """Return the metadata describing this plugin."""

    def on_scrape_start(self, config: dict) -> None:
        """Hook called when scraping starts; does nothing by default."""
        return None

    def on_repo_found(self, repo: Repository) -> Optional[Repository]:
        """Hook called for each repository that is found.

        A plugin may modify the repository or filter it out.

        Args:
            repo: The repository that was found.

        Returns:
            The (possibly modified) repository, or None to filter it out.
            The default implementation returns ``repo`` unchanged.
        """
        return repo

    def on_scrape_complete(self, repos: list[Repository], stats: dict) -> None:
        """Hook called when scraping completes; does nothing by default."""
        return None

    def on_export(self, data: Any, format: str) -> Any:
        """Hook called before data is exported.

        Args:
            data: The data about to be exported.
            format: The export format.

        Returns:
            The (possibly modified) data. The default implementation returns
            ``data`` unchanged.
        """
        return data
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class PluginManager:
    """Keep track of loaded plugins and dispatch events to them."""

    def __init__(self):
        # Loaded plugin instances, keyed by the name reported in ``info.name``.
        self.plugins: dict[str, "BasePlugin"] = {}

    def load_plugin(self, plugin_path: Path) -> Optional[str]:
        """Load a plugin from a Python file.

        The file is imported as a module and the first ``BasePlugin``
        subclass found among its attributes (in ``dir()`` order) is
        instantiated and registered under its ``info.name``.

        NOTE: importing the file executes its top-level code, so only load
        plugin files from trusted locations.

        Args:
            plugin_path: Path to the plugin file.

        Returns:
            The plugin name if a plugin was loaded, otherwise None (no
            loadable spec, no plugin class in the file, or any exception
            while importing/instantiating, which is printed).
        """
        try:
            spec = importlib.util.spec_from_file_location("custom_plugin", plugin_path)
            if not spec or not spec.loader:
                return None

            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)

            # Find the plugin class
            for attr_name in dir(module):
                attr = getattr(module, attr_name)
                if isinstance(attr, type) and issubclass(attr, BasePlugin) and attr is not BasePlugin:
                    plugin = attr()
                    name = plugin.info.name
                    self.plugins[name] = plugin
                    return name

            return None
        except Exception as e:
            # Best effort: a broken plugin must not take the application down.
            print(f"Failed to load plugin: {e}")
            return None

    def load_plugins_from_dir(self, plugin_dir: Path) -> list[str]:
        """Load all plugins from a directory.

        Args:
            plugin_dir: Directory containing plugin files (``*.py``).

        Returns:
            Names of the plugins that were loaded successfully.
        """
        loaded = []
        for plugin_file in plugin_dir.glob("*.py"):
            name = self.load_plugin(plugin_file)
            if name:
                loaded.append(name)
        return loaded

    def get_plugin(self, name: str) -> Optional["BasePlugin"]:
        """Return the loaded plugin registered under ``name``, or None."""
        return self.plugins.get(name)

    def trigger(self, event: str, *args, **kwargs) -> Any:
        """Trigger an event on all plugins.

        The first positional argument is treated as the value being passed
        along the plugin chain: each handler receives the value returned by
        the previous handler, together with the remaining positional and
        keyword arguments. While the chained value is None (no positional
        arguments were given, or a handler returned None) handlers are called
        with the original arguments and their return value is ignored.

        An exception raised by a handler is printed and the chain continues
        with the next plugin.

        Args:
            event: Event name (e.g., "on_repo_found").
            *args: Positional arguments; the first one is the chained value.
            **kwargs: Keyword arguments.

        Returns:
            The chained value after all handlers ran (None if there was no
            first positional argument or a handler returned None).
        """
        result = args[0] if args else None
        # Arguments after the chained value must reach every handler too,
        # e.g. ``format`` in on_export(data, format).
        trailing = args[1:]

        # Iterate over a snapshot so a handler may register further plugins.
        for plugin in list(self.plugins.values()):
            handler = getattr(plugin, event, None)
            if not handler:
                continue
            try:
                if result is not None:
                    result = handler(result, *trailing, **kwargs)
                else:
                    handler(*args, **kwargs)
            except Exception as e:
                print(f"Plugin {plugin.info.name} error in {event}: {e}")

        return result


# Global plugin manager
plugin_manager = PluginManager()
|
ai_scraper/retry.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Retry logic with exponential backoff."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import functools
|
|
5
|
+
import logging
|
|
6
|
+
import random
|
|
7
|
+
from typing import Callable, Optional, Type, Tuple
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RetryHandler:
    """Run async callables with retries and exponential backoff."""

    def __init__(
        self,
        max_retries: int = 3,
        base_delay: float = 1.0,
        exponential_base: float = 2.0,
        max_delay: float = 60.0,
        jitter: bool = True,
        retryable_exceptions: Tuple[Type[Exception], ...] = (
            ConnectionError,
            TimeoutError,
        ),
    ):
        """Initialize retry handler.

        Args:
            max_retries: Maximum number of retry attempts (on top of the
                first call). Values below zero behave like zero.
            base_delay: Initial delay in seconds.
            exponential_base: Base for exponential backoff.
            max_delay: Maximum delay in seconds (before jitter is added).
            jitter: Add random jitter to delays.
            retryable_exceptions: Exceptions that trigger retry.
        """
        self.max_retries = max_retries
        self.base_delay = base_delay
        self.exponential_base = exponential_base
        self.max_delay = max_delay
        self.jitter = jitter
        self.retryable_exceptions = retryable_exceptions

    def _calculate_delay(self, attempt: int) -> float:
        """Calculate the delay before retrying after the given attempt.

        The delay is ``base_delay * exponential_base ** attempt`` capped at
        ``max_delay``. With jitter enabled, up to 25% is added on top of the
        capped value, so the result may exceed ``max_delay`` by that margin.

        Args:
            attempt: Zero-based index of the attempt that just failed.

        Returns:
            Delay in seconds.
        """
        try:
            delay = self.base_delay * (self.exponential_base ** attempt)
        except OverflowError:
            # The growth factor no longer fits in a float; any positive base
            # delay is far beyond the cap at this point.
            delay = self.max_delay if self.base_delay > 0 else self.base_delay
        delay = min(delay, self.max_delay)

        if self.jitter:
            # Add up to 25% jitter
            delay = delay * (1 + random.random() * 0.25)

        return delay

    async def execute(
        self,
        func: Callable,
        *args,
        **kwargs,
    ):
        """Execute a function with retry logic.

        Only exceptions listed in ``retryable_exceptions`` trigger a retry;
        any other exception propagates immediately.

        Args:
            func: Async function to execute.
            *args: Positional arguments for func.
            **kwargs: Keyword arguments for func.

        Returns:
            Result of the function.

        Raises:
            Last exception after all retries exhausted.
        """
        # A negative setting must still allow the initial call.
        retry_budget = max(self.max_retries, 0)

        for attempt in range(retry_budget + 1):
            try:
                return await func(*args, **kwargs)
            except self.retryable_exceptions as exc:
                if attempt >= retry_budget:
                    logger.error("All %s retries exhausted", self.max_retries)
                    # Re-raise in place to keep the original traceback.
                    raise

                delay = self._calculate_delay(attempt)
                logger.warning(
                    "Retry %s/%s after %.2fs: %s",
                    attempt + 1,
                    self.max_retries,
                    delay,
                    exc,
                )
                await asyncio.sleep(delay)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def with_retry(
    max_retries: int = 3,
    base_delay: float = 1.0,
    exponential_base: float = 2.0,
    max_delay: float = 60.0,
    jitter: bool = True,
    retryable_exceptions: Tuple[Type[Exception], ...] = (
        ConnectionError,
        TimeoutError,
    ),
):
    """Build a decorator that runs an async function through a RetryHandler.

    Args:
        max_retries: Maximum number of retry attempts.
        base_delay: Initial delay in seconds.
        exponential_base: Base for exponential backoff.
        max_delay: Maximum delay in seconds.
        jitter: Add random jitter to delays.
        retryable_exceptions: Exceptions that trigger retry.

    Returns:
        A decorator; the function it returns awaits the wrapped function via
        ``RetryHandler.execute``.
    """
    # One handler is built per with_retry(...) call and shared by everything
    # the resulting decorator wraps.
    settings = {
        "max_retries": max_retries,
        "base_delay": base_delay,
        "exponential_base": exponential_base,
        "max_delay": max_delay,
        "jitter": jitter,
        "retryable_exceptions": retryable_exceptions,
    }
    policy = RetryHandler(**settings)

    def decorator(func):
        async def retrying_call(*call_args, **call_kwargs):
            return await policy.execute(func, *call_args, **call_kwargs)

        # Copy name, docstring, etc. from the wrapped function.
        return functools.wraps(func)(retrying_call)

    return decorator
|
ai_scraper/scheduler.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Scheduled task management."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import signal
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import Callable, Optional
|
|
7
|
+
|
|
8
|
+
from croniter import croniter
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TaskScheduler:
    """Schedule and run periodic tasks described by cron expressions."""

    # Longest time (seconds) the loop sleeps before re-checking the task
    # table and the ``running`` flag; keeps stop() responsive.
    _TICK_SECONDS = 1.0

    def __init__(self):
        # name -> {"cron", "callback", "args", "kwargs", "next_run"}
        self.tasks: dict[str, dict] = {}
        self.running = False

    def add_task(
        self,
        name: str,
        cron_expr: str,
        callback: Callable,
        *args,
        **kwargs,
    ) -> None:
        """Add a scheduled task.

        A task added under an existing name replaces the previous one.

        Args:
            name: Task name.
            cron_expr: Cron expression (e.g., "0 9 * * *" for daily at 9am).
            callback: Function to call (sync or async).
            *args: Positional arguments for callback.
            **kwargs: Keyword arguments for callback.
        """
        self.tasks[name] = {
            "cron": croniter(cron_expr, datetime.now()),
            "callback": callback,
            "args": args,
            "kwargs": kwargs,
            "next_run": None,
        }
        self._update_next_run(name)

    def _update_next_run(self, name: str) -> None:
        """Advance a task to the next occurrence of its cron schedule."""
        task = self.tasks[name]
        task["next_run"] = task["cron"].get_next(datetime)

    def _reschedule(self, name: str) -> None:
        """Move a task's next run to the first occurrence in the future.

        Occurrences that were missed while the task (or the process) was
        busy are skipped instead of being replayed back to back.
        """
        self._update_next_run(name)
        task = self.tasks[name]
        while task["next_run"] <= datetime.now():
            self._update_next_run(name)

    def _seconds_until_next_check(self) -> float:
        """Return how long the loop may sleep before something is due."""
        upcoming = [t["next_run"] for t in self.tasks.values() if t["next_run"]]
        if not upcoming:
            return self._TICK_SECONDS
        remaining = (min(upcoming) - datetime.now()).total_seconds()
        return min(self._TICK_SECONDS, max(0.0, remaining))

    async def run(self) -> None:
        """Start the scheduler loop.

        Runs until stop() is called (directly, or via SIGINT/SIGTERM where
        the event loop supports signal handlers). Exceptions raised by a task
        are printed and do not stop the scheduler.
        """
        self.running = True

        # Handle shutdown signals. Not every platform/thread supports this
        # (e.g. Windows event loops, non-main threads); in that case the
        # scheduler still runs and can be stopped with stop().
        loop = asyncio.get_running_loop()
        installed = []
        for sig in (signal.SIGINT, signal.SIGTERM):
            try:
                loop.add_signal_handler(sig, self.stop)
            except (NotImplementedError, RuntimeError, ValueError):
                continue
            installed.append(sig)

        try:
            while self.running:
                now = datetime.now()

                # Iterate over a snapshot: callbacks are awaited here and may
                # add or remove tasks while we are looping.
                for name, task in list(self.tasks.items()):
                    if not (task["next_run"] and task["next_run"] <= now):
                        continue

                    # Run the task
                    try:
                        outcome = task["callback"](*task["args"], **task["kwargs"])
                        # Await coroutines/futures regardless of how the
                        # callable was declared (async def, partial, ...).
                        if asyncio.iscoroutine(outcome) or asyncio.isfuture(outcome):
                            await outcome
                    except Exception as e:
                        print(f"Task {name} failed: {e}")

                    # Schedule next run, unless the callback removed or
                    # replaced this task in the meantime.
                    if self.tasks.get(name) is task:
                        self._reschedule(name)

                if not self.running:
                    break

                await asyncio.sleep(self._seconds_until_next_check())
        finally:
            # Do not leave our handlers installed once the loop has ended.
            for sig in installed:
                loop.remove_signal_handler(sig)

    def stop(self) -> None:
        """Stop the scheduler; run() returns after its current step."""
        self.running = False


# Global scheduler instance
scheduler = TaskScheduler()
|