github-ai-scraper 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_scraper/__init__.py +3 -0
- ai_scraper/api/__init__.py +6 -0
- ai_scraper/api/github.py +340 -0
- ai_scraper/api/gitlab.py +418 -0
- ai_scraper/api/rate_limiter.py +120 -0
- ai_scraper/api_server.py +196 -0
- ai_scraper/auth.py +68 -0
- ai_scraper/backup.py +112 -0
- ai_scraper/cache.py +95 -0
- ai_scraper/classifier.py +135 -0
- ai_scraper/cli.py +747 -0
- ai_scraper/config.py +237 -0
- ai_scraper/config_watcher.py +82 -0
- ai_scraper/dedup.py +148 -0
- ai_scraper/filters/__init__.py +5 -0
- ai_scraper/filters/ai_filter.py +93 -0
- ai_scraper/health.py +155 -0
- ai_scraper/i18n.py +141 -0
- ai_scraper/interactive.py +96 -0
- ai_scraper/keywords/__init__.py +5 -0
- ai_scraper/keywords/extractor.py +274 -0
- ai_scraper/logging_config.py +74 -0
- ai_scraper/models/__init__.py +5 -0
- ai_scraper/models/repository.py +72 -0
- ai_scraper/output/__init__.py +6 -0
- ai_scraper/output/excel.py +79 -0
- ai_scraper/output/html.py +152 -0
- ai_scraper/output/markdown.py +338 -0
- ai_scraper/output/rss.py +82 -0
- ai_scraper/output/translator.py +303 -0
- ai_scraper/plugin_system.py +146 -0
- ai_scraper/plugins/__init__.py +5 -0
- ai_scraper/retry.py +134 -0
- ai_scraper/scheduler.py +84 -0
- ai_scraper/scrape_progress.py +99 -0
- ai_scraper/secure_storage.py +127 -0
- ai_scraper/storage/__init__.py +5 -0
- ai_scraper/storage/async_database.py +237 -0
- ai_scraper/storage/database.py +456 -0
- ai_scraper/webhooks.py +95 -0
- github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
- github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
- github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
- github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
"""Markdown exporter for generating beautiful reports."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from ai_scraper.models import Repository
|
|
8
|
+
from ai_scraper.output.translator import translate_description
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Category display names: English category key -> Chinese label.
# NOTE(review): not referenced by the code visible in this module; presumably
# consumed by other modules or kept for future use - confirm before removing.
CATEGORY_NAMES = {
    "LLM": "大语言模型",
    "Chatbot": "聊天机器人",
    "Generative AI": "生成式 AI",
    "Computer Vision": "计算机视觉",
    "NLP": "自然语言处理",
    "Machine Learning": "机器学习",
    "Deep Learning": "深度学习",
    "MLOps": "机器学习运维",
    "AI Infrastructure": "AI 基础设施",
    "AI Ethics": "AI 伦理",
    "Reinforcement Learning": "强化学习",
    "Robotics": "机器人",
    "AutoML": "自动化机器学习",
    "Data Science": "数据科学",
    "AI Tools": "AI 工具",
    "Other": "其他",
}
|
|
30
|
+
|
|
31
|
+
# Emoji shown next to a repository's primary language. Lookups use
# ``LANGUAGE_ICONS.get(name, "📁")``, so languages missing here fall back to
# the folder icon. Each key must appear exactly once: a repeated key in a dict
# literal silently overrides the earlier entry.
LANGUAGE_ICONS = {
    "Python": "🐍",
    "TypeScript": "📘",
    "JavaScript": "💛",
    "Java": "☕",
    "Go": "🐹",
    "Rust": "🦀",
    "C++": "⚡",
    "C": "⚙️",
    "Jupyter Notebook": "📊",
    "HTML": "🌐",
    "CSS": "🎨",
    "Ruby": "💎",
    "PHP": "🐘",
    "Swift": "🍎",
    "Kotlin": "🎯",
    "Lua": "🌙",
    "Shell": "🖥️",
    "Dart": "🎯",
    "Scala": "🔴",
    "R": "📈",
    "MATLAB": "📐",
    "Julia": "💜",
    "Haskell": "λ",
    "Elixir": "💧",
    "Clojure": "🧬",
    "F#": "🔷",
    "OCaml": "🐫",
    "Nim": "👑",
    "Crystal": "💎",
    "Elm": "🌳",
    "V": "⚡",
    "Zig": "⚡",
}
|
|
67
|
+
|
|
68
|
+
# Popularity tiers as (minimum stars, icon, Chinese label) tuples.
# Entries are ordered from the highest threshold down; the lookup in
# MarkdownExporter._get_star_level returns the first tier whose threshold the
# star count meets, so this ordering must be preserved.
STAR_LEVELS = [
    (100000, "🔥🔥🔥🔥🔥", "超热门"),
    (50000, "🔥🔥🔥🔥", "非常热门"),
    (10000, "🔥🔥🔥", "热门"),
    (5000, "🔥🔥", "较热门"),
    (1000, "🔥", "值得关注"),
    (0, "⭐", "新星"),
]
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class MarkdownExporter:
    """Export repositories to a Markdown report.

    The report consists of a title, a statistics overview (timestamp, project
    count, total stars, language distribution), a table of contents, one
    section per star tier containing a card for every repository, and a
    footer.
    """

    def __init__(self, output_dir: Path, filename: str = "repositories.md"):
        """Initialize the exporter.

        Args:
            output_dir: Directory for output files.
            filename: Name of the output file.
        """
        self.output_dir = Path(output_dir)
        self.filename = filename

    def export_repositories(self, repos: list[Repository]) -> Path:
        """Render the report and write it to ``output_dir / filename``.

        The output directory is created if it does not exist, and an existing
        file with the same name is overwritten.

        Args:
            repos: List of repositories to export.

        Returns:
            Path to the created file.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)

        content = self._generate_content(repos)

        output_path = self.output_dir / self.filename
        output_path.write_text(content, encoding="utf-8")
        return output_path

    def _generate_content(self, repos: list[Repository]) -> str:
        """Generate the full Markdown content.

        Args:
            repos: Repositories to include in the report.

        Returns:
            The complete document as a single newline-joined string.
        """
        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        total_stars = sum(repo.stars for repo in repos)
        languages = self._get_language_stats(repos)

        # Only tiers that actually contain repositories are rendered, so the
        # table of contents is built from the same list to keep both in sync.
        sections = [
            (name, members)
            for name, members in self._group_by_stars(repos)
            if members
        ]

        # Title, introduction and statistics overview.
        lines = [
            "# 🤖 AI 开源项目精选",
            "",
            "> 精选 GitHub 上最受欢迎的 AI 相关开源项目,助你发现优质资源!",
            "",
            "## 📊 统计概览",
            "",
            "| 📅 更新时间 | 📦 项目数量 | ⭐ 总星数 |",
            "|:---:|:---:|:---:|",
            f"| {now} | {len(repos)} | {self._format_number(total_stars)} |",
            "",
            "### 🌈 语言分布",
            "",
            self._generate_language_chart(languages),
            "",
            "## 📑 目录",
            "",
        ]

        # Table of contents: one entry per rendered section.
        lines.extend(
            f"- [{name}](#{self._heading_anchor(name)})" for name, _ in sections
        )
        lines.append("")

        # Repository cards grouped by star tier, each followed by a rule.
        for name, members in sections:
            lines.append(f"## {name}")
            lines.append("")
            for repo in members:
                lines.append(self._format_repo_card(repo))
                lines.append("")
                lines.append("---")
                lines.append("")

        # Footer.
        lines.extend(
            [
                "",
                "## 📝 说明",
                "",
                "- 数据来源于 GitHub API,按 Star 数排序",
                "- 项目描述包含英文原文和中文翻译",
                "- 语言图标用于快速识别项目技术栈",
                "- 热门程度图标表示项目的受欢迎程度",
                "",
                "---",
                "",
                "*由 [AI Scraper](https://github.com/lwx66615/github-ai-scraper) 自动生成*",
            ]
        )

        return "\n".join(lines)

    @staticmethod
    def _heading_anchor(heading: str) -> str:
        """Build the fragment identifier for a heading.

        Approximates the GitHub-style slug: the heading is lower-cased,
        characters other than letters, digits, spaces, hyphens and
        underscores (emoji and punctuation included) are dropped, and each
        remaining space becomes a hyphen. Other Markdown renderers may use a
        different scheme.

        Args:
            heading: Heading text without the leading ``#`` markers.

        Returns:
            Anchor text without the leading ``#``.
        """
        kept = [
            ch for ch in heading.strip().lower() if ch.isalnum() or ch in " -_"
        ]
        return "".join(kept).replace(" ", "-")

    def _format_repo_card(self, repo: Repository) -> str:
        """Format a single repository as a Markdown card.

        The card has a linked heading, a bold subtitle (language, stars,
        popularity label), the description as a blockquote (bilingual when the
        translation differs from the original) and up to eight topic tags.

        Args:
            repo: Repository to format.

        Returns:
            Markdown formatted card.
        """
        lang_icon = LANGUAGE_ICONS.get(repo.language or "", "📁")
        star_level = self._get_star_level(repo.stars)

        title = f"### [{repo.full_name}]({repo.url})"
        subtitle = f"{lang_icon} {repo.language or '未知'} | {star_level[0]} {self._format_number(repo.stars)} Stars | {star_level[1]}"

        lines = [title, "", f"**{subtitle}**", ""]

        # Description: original text plus the Chinese translation.
        original_desc = self._clean_description(repo.description, 200)
        translated_desc = translate_description(repo.description)

        # Show both versions only when the translation differs from the
        # original (raw or cleaned); otherwise the quote would repeat itself.
        is_translated = (
            translated_desc != original_desc
            and translated_desc != (repo.description or "暂无描述")
        )
        lines.append(f"> {original_desc}")
        if is_translated:
            lines.append("> ")
            lines.append(f"> **中文:** {translated_desc}")
        lines.append("")

        # Topic tags, limited to the first eight.
        if repo.topics:
            topics_display = " ".join(f"`{topic}`" for topic in repo.topics[:8])
            lines.append(f"**标签:** {topics_display}")

        return "\n".join(lines)

    def _group_by_stars(self, repos: list[Repository]) -> list[tuple[str, list[Repository]]]:
        """Group repositories by star count.

        Every tier is returned, including empty ones, ordered from the most
        to the least popular. Input order is preserved inside each tier.

        Args:
            repos: List of repositories.

        Returns:
            List of (group_name, repos) tuples.
        """
        # (inclusive lower bound, section heading, members); ``None`` marks
        # the catch-all tier for everything below 1,000 stars.
        tiers = [
            (100000, "🔥 超热门项目 (100K+ Stars)", []),
            (50000, "🌟 热门项目 (50K-100K Stars)", []),
            (10000, "⭐ 优质项目 (10K-50K Stars)", []),
            (1000, "💡 新兴项目 (1K-10K Stars)", []),
            (None, "🌱 新星项目 (<1K Stars)", []),
        ]

        for repo in repos:
            for floor, _, members in tiers:
                if floor is None or repo.stars >= floor:
                    members.append(repo)
                    break

        return [(name, members) for _, name, members in tiers]

    def _get_star_level(self, stars: int) -> tuple[str, str]:
        """Get star level icon and description.

        Args:
            stars: Star count.

        Returns:
            (icon, description) tuple for the first tier in ``STAR_LEVELS``
            whose threshold is met; the lowest tier's values when none match.
        """
        for threshold, icon, desc in STAR_LEVELS:
            if stars >= threshold:
                return (icon, desc)
        # Only reachable when no threshold matches (e.g. a negative count).
        return ("⭐", "新星")

    def _format_number(self, num: int) -> str:
        """Format large numbers with K/M suffix.

        Args:
            num: Number to format.

        Returns:
            ``"2.5M"`` style for millions, ``"1.5K"`` style for thousands and
            the plain number below 1,000.
        """
        # Values from 999,950 upwards would round to "1000.0K" with one
        # decimal, so they are promoted to the M suffix instead.
        if num >= 999_950:
            return f"{num / 1000000:.1f}M"
        if num >= 1000:
            return f"{num / 1000:.1f}K"
        return str(num)

    def _get_language_stats(self, repos: list[Repository]) -> dict[str, int]:
        """Get language distribution statistics.

        Repositories without a language are counted under ``"Unknown"``.

        Args:
            repos: List of repositories.

        Returns:
            Dictionary of language counts.
        """
        stats: dict[str, int] = {}
        for repo in repos:
            lang = repo.language or "Unknown"
            stats[lang] = stats.get(lang, 0) + 1
        return stats

    def _generate_language_chart(self, languages: dict[str, int]) -> str:
        """Generate a text-based language distribution chart.

        The ten most common languages are rendered as a Markdown table with a
        20-cell bar scaled relative to the most common language. The header
        and delimiter rows are required for the rows to render as a table.

        Args:
            languages: Language statistics.

        Returns:
            Markdown table, or an empty string when there are no languages.
        """
        if not languages:
            return ""

        bar_width = 20
        top = sorted(languages.items(), key=lambda item: item[1], reverse=True)[:10]
        peak = top[0][1]  # largest count; sorting is descending

        rows = ["| 语言 | 分布 | 数量 |", "|:---|:---|---:|"]
        for lang, count in top:
            icon = LANGUAGE_ICONS.get(lang, "📁")
            # Guard against a non-positive peak to avoid dividing by zero.
            filled = int(count / peak * bar_width) if peak > 0 else 0
            filled = max(0, min(bar_width, filled))
            bar = "█" * filled + "░" * (bar_width - filled)
            rows.append(f"| {icon} {lang or '未知'} | `{bar}` | {count} |")

        return "\n".join(rows)

    def _clean_description(self, description: Optional[str], max_len: int = 200) -> str:
        """Clean description for display.

        Whitespace runs (including newlines) are collapsed to single spaces,
        the text is truncated to ``max_len`` characters with a ``...`` suffix,
        and pipe characters are escaped. Truncation happens before escaping so
        the cut can never split an escape sequence.

        Args:
            description: Original description.
            max_len: Maximum length of the text before the ``...`` suffix.

        Returns:
            Cleaned description, or the placeholder ``"暂无描述"`` when the
            description is missing, empty or whitespace-only.
        """
        if description is None:
            return "暂无描述"

        cleaned = " ".join(description.split())
        if not cleaned:
            return "暂无描述"

        if len(cleaned) > max_len:
            cleaned = cleaned[:max_len] + "..."

        return cleaned.replace("|", r"\|")
|
ai_scraper/output/rss.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""RSS exporter for generating feed subscriptions."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from xml.etree import ElementTree as ET
|
|
6
|
+
|
|
7
|
+
from ai_scraper.models import Repository
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RSSExporter:
    """Export repositories to an RSS 2.0 feed file."""

    def __init__(self, output_dir: Path, filename: str = "repositories.xml"):
        """Initialize the exporter.

        Args:
            output_dir: Directory for output files.
            filename: Name of the output file.
        """
        self.output_dir = Path(output_dir)
        self.filename = filename

    @staticmethod
    def _rfc822(moment: datetime) -> str:
        """Format a datetime as an RFC 822 date in GMT.

        Timezone-aware values are converted to UTC first, so the ``GMT``
        label is always accurate. Naive values are taken to be UTC already.
        ``email.utils.format_datetime`` always emits English day and month
        names, unlike ``strftime`` with ``%a``/``%b``, which follows the
        process locale.

        Args:
            moment: Datetime to format.

        Returns:
            A string such as ``"Mon, 01 Jan 2024 00:00:00 GMT"``.
        """
        # Local import: keeps this block self-contained without touching the
        # module-level import section.
        from email.utils import format_datetime

        if moment.utcoffset() is None:
            moment = moment.replace(tzinfo=timezone.utc)
        else:
            moment = moment.astimezone(timezone.utc)
        return format_datetime(moment, usegmt=True)

    def export_repositories(
        self,
        repos: list[Repository],
        title: str = "AI Repositories Feed",
        description: str = "Latest AI repositories from GitHub",
    ) -> Path:
        """Export repositories to an RSS feed.

        The output directory is created if needed and an existing file with
        the same name is overwritten. Each repository becomes one ``<item>``
        with title, link, description, an optional ``pubDate`` and the custom
        ``stars`` / ``language`` elements.

        Args:
            repos: List of repositories to export.
            title: Feed title.
            description: Feed description.

        Returns:
            Path to the created file.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Feed skeleton.
        rss = ET.Element("rss", version="2.0")
        channel = ET.SubElement(rss, "channel")

        # Channel metadata.
        ET.SubElement(channel, "title").text = title
        ET.SubElement(channel, "description").text = description
        ET.SubElement(channel, "link").text = "https://github.com/topics/ai"
        ET.SubElement(channel, "language").text = "en-us"
        ET.SubElement(channel, "lastBuildDate").text = self._rfc822(
            datetime.now(timezone.utc)
        )

        # One item per repository.
        for repo in repos:
            item = ET.SubElement(channel, "item")

            ET.SubElement(item, "title").text = repo.full_name
            ET.SubElement(item, "link").text = repo.url
            ET.SubElement(item, "description").text = repo.description or "No description"

            # pubDate is optional in RSS 2.0; omit it rather than emit an
            # empty element that is not a valid date.
            if repo.updated_at:
                ET.SubElement(item, "pubDate").text = self._rfc822(repo.updated_at)

            # Custom (non-standard) elements.
            ET.SubElement(item, "stars").text = str(repo.stars)
            if repo.language:
                ET.SubElement(item, "language").text = repo.language

        output_path = self.output_dir / self.filename

        # Pretty print before serializing.
        ET.indent(rss, space="  ")
        tree = ET.ElementTree(rss)

        with open(output_path, "wb") as f:
            tree.write(f, encoding="utf-8", xml_declaration=True)

        return output_path
|