github-ai-scraper 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. ai_scraper/__init__.py +3 -0
  2. ai_scraper/api/__init__.py +6 -0
  3. ai_scraper/api/github.py +340 -0
  4. ai_scraper/api/gitlab.py +418 -0
  5. ai_scraper/api/rate_limiter.py +120 -0
  6. ai_scraper/api_server.py +196 -0
  7. ai_scraper/auth.py +68 -0
  8. ai_scraper/backup.py +112 -0
  9. ai_scraper/cache.py +95 -0
  10. ai_scraper/classifier.py +135 -0
  11. ai_scraper/cli.py +747 -0
  12. ai_scraper/config.py +237 -0
  13. ai_scraper/config_watcher.py +82 -0
  14. ai_scraper/dedup.py +148 -0
  15. ai_scraper/filters/__init__.py +5 -0
  16. ai_scraper/filters/ai_filter.py +93 -0
  17. ai_scraper/health.py +155 -0
  18. ai_scraper/i18n.py +141 -0
  19. ai_scraper/interactive.py +96 -0
  20. ai_scraper/keywords/__init__.py +5 -0
  21. ai_scraper/keywords/extractor.py +274 -0
  22. ai_scraper/logging_config.py +74 -0
  23. ai_scraper/models/__init__.py +5 -0
  24. ai_scraper/models/repository.py +72 -0
  25. ai_scraper/output/__init__.py +6 -0
  26. ai_scraper/output/excel.py +79 -0
  27. ai_scraper/output/html.py +152 -0
  28. ai_scraper/output/markdown.py +338 -0
  29. ai_scraper/output/rss.py +82 -0
  30. ai_scraper/output/translator.py +303 -0
  31. ai_scraper/plugin_system.py +146 -0
  32. ai_scraper/plugins/__init__.py +5 -0
  33. ai_scraper/retry.py +134 -0
  34. ai_scraper/scheduler.py +84 -0
  35. ai_scraper/scrape_progress.py +99 -0
  36. ai_scraper/secure_storage.py +127 -0
  37. ai_scraper/storage/__init__.py +5 -0
  38. ai_scraper/storage/async_database.py +237 -0
  39. ai_scraper/storage/database.py +456 -0
  40. ai_scraper/webhooks.py +95 -0
  41. github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
  42. github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
  43. github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
  44. github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,338 @@
1
+ """Markdown exporter for generating beautiful reports."""
2
+
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from ai_scraper.models import Repository
8
+ from ai_scraper.output.translator import translate_description
9
+
10
+
11
# Category display names (English -> Chinese).
CATEGORY_NAMES = {
    "LLM": "大语言模型",
    "Chatbot": "聊天机器人",
    "Generative AI": "生成式 AI",
    "Computer Vision": "计算机视觉",
    "NLP": "自然语言处理",
    "Machine Learning": "机器学习",
    "Deep Learning": "深度学习",
    "MLOps": "机器学习运维",
    "AI Infrastructure": "AI 基础设施",
    "AI Ethics": "AI 伦理",
    "Reinforcement Learning": "强化学习",
    "Robotics": "机器人",
    "AutoML": "自动化机器学习",
    "Data Science": "数据科学",
    "AI Tools": "AI 工具",
    "Other": "其他",
}

# Emoji icon displayed next to each programming language name.
# Each language appears exactly once: a repeated key in a dict literal is
# silently overwritten by the later entry, which hides typos.
LANGUAGE_ICONS = {
    "Python": "🐍",
    "TypeScript": "📘",
    "JavaScript": "💛",
    "Java": "☕",
    "Go": "🐹",
    "Rust": "🦀",
    "C++": "⚡",
    "C": "⚙️",
    "Jupyter Notebook": "📊",
    "HTML": "🌐",
    "CSS": "🎨",
    "Ruby": "💎",
    "PHP": "🐘",
    "Swift": "🍎",
    "Kotlin": "🎯",
    "Lua": "🌙",
    "Shell": "🖥️",
    "Dart": "🎯",
    "Scala": "🔴",
    "R": "📈",
    "MATLAB": "📐",
    "Julia": "💜",
    "Haskell": "λ",
    "Elixir": "💧",
    "Clojure": "🧬",
    "F#": "🔷",
    "OCaml": "🐫",
    "Nim": "👑",
    "Crystal": "💎",
    "Elm": "🌳",
    "V": "⚡",
    "Zig": "⚡",
}

# Popularity tiers as (minimum star count, icon, label).
# Must stay sorted by descending threshold: lookups return the first tier
# whose threshold the star count reaches.
STAR_LEVELS = [
    (100000, "🔥🔥🔥🔥🔥", "超热门"),
    (50000, "🔥🔥🔥🔥", "非常热门"),
    (10000, "🔥🔥🔥", "热门"),
    (5000, "🔥🔥", "较热门"),
    (1000, "🔥", "值得关注"),
    (0, "⭐", "新星"),
]
77
+
78
+
79
class MarkdownExporter:
    """Export repositories to a Markdown report.

    The report contains a statistics overview, a language distribution
    table, a table of contents, and one "card" per repository grouped by
    star count. Relies on the module-level ``LANGUAGE_ICONS`` and
    ``STAR_LEVELS`` tables and on ``translate_description``.
    """

    def __init__(self, output_dir: Path, filename: str = "repositories.md"):
        """Initialize the exporter.

        Args:
            output_dir: Directory for output files.
            filename: Name of the output file.
        """
        self.output_dir = Path(output_dir)
        self.filename = filename

    def export_repositories(self, repos: list[Repository]) -> Path:
        """Export repositories to a Markdown file.

        Creates ``output_dir`` (including parents) if it does not exist and
        overwrites any existing file with the same name.

        Args:
            repos: List of repositories to export.

        Returns:
            Path to the created file.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)

        content = self._generate_content(repos)

        output_path = self.output_dir / self.filename
        output_path.write_text(content, encoding="utf-8")

        return output_path

    def _generate_content(self, repos: list[Repository]) -> str:
        """Generate the full Markdown content.

        Args:
            repos: List of repositories to render.

        Returns:
            The complete report as a single newline-joined string.
        """
        lines = []

        # Title and introduction.
        lines.append("# 🤖 AI 开源项目精选")
        lines.append("")
        lines.append("> 精选 GitHub 上最受欢迎的 AI 相关开源项目,助你发现优质资源!")
        lines.append("")

        # Statistics overview.
        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        total_stars = sum(repo.stars for repo in repos)
        languages = self._get_language_stats(repos)

        lines.append("## 📊 统计概览")
        lines.append("")
        lines.append("| 📅 更新时间 | 📦 项目数量 | ⭐ 总星数 |")
        lines.append("|:---:|:---:|:---:|")
        lines.append(f"| {now} | {len(repos)} | {self._format_number(total_stars)} |")
        lines.append("")
        lines.append("### 🌈 语言分布")
        lines.append("")
        lines.append(self._generate_language_chart(languages))
        lines.append("")

        # Only groups that actually contain repositories get a section, so
        # the table of contents is derived from the same list. This avoids
        # links to sections that were never written and includes the
        # "<1K Stars" group when it is present.
        populated_groups = [
            (group_name, group_repos)
            for group_name, group_repos in self._group_by_stars(repos)
            if group_repos
        ]

        # Table of contents.
        lines.append("## 📑 目录")
        lines.append("")
        for group_name, _ in populated_groups:
            lines.append(f"- [{group_name}](#{self._heading_anchor(group_name)})")
        lines.append("")

        # Repository cards, grouped by star count.
        for group_name, group_repos in populated_groups:
            lines.append(f"## {group_name}")
            lines.append("")
            for repo in group_repos:
                lines.append(self._format_repo_card(repo))
                lines.append("")
                lines.append("---")
                lines.append("")

        # Footer.
        lines.append("")
        lines.append("## 📝 说明")
        lines.append("")
        lines.append("- 数据来源于 GitHub API,按 Star 数排序")
        lines.append("- 项目描述包含英文原文和中文翻译")
        lines.append("- 语言图标用于快速识别项目技术栈")
        lines.append("- 热门程度图标表示项目的受欢迎程度")
        lines.append("")
        lines.append("---")
        lines.append("")
        lines.append("*由 [AI Scraper](https://github.com/lwx66615/github-ai-scraper) 自动生成*")

        return "\n".join(lines)

    @staticmethod
    def _heading_anchor(heading: str) -> str:
        """Build the in-page anchor used to link to a section heading.

        Lowercases the heading, drops ASCII punctuation, and replaces
        spaces with hyphens. Non-ASCII characters (CJK text, emoji) are kept.

        Args:
            heading: Heading text without the leading ``#`` markers.

        Returns:
            Anchor fragment without the leading ``#``.
        """
        # NOTE(review): some renderers strip emoji when generating heading
        # slugs, which would leave a leading "-" instead of the emoji --
        # confirm against the target renderer before relying on these links.
        kept = "".join(
            ch
            for ch in heading.lower()
            if ch.isalnum() or ch in " -_" or ord(ch) > 127
        )
        return kept.replace(" ", "-")

    def _format_repo_card(self, repo: Repository) -> str:
        """Format a single repository as a Markdown card.

        Args:
            repo: Repository to format.

        Returns:
            Markdown formatted card.
        """
        lang_icon = LANGUAGE_ICONS.get(repo.language or "", "📁")
        level_icon, level_label = self._get_star_level(repo.stars)

        # Heading line and bold subtitle.
        title = f"### [{repo.full_name}]({repo.url})"
        subtitle = (
            f"{lang_icon} {repo.language or '未知'} | "
            f"{level_icon} {self._format_number(repo.stars)} Stars | "
            f"{level_label}"
        )

        lines = [title, "", f"**{subtitle}**", ""]

        # Description: original text plus the Chinese translation.
        original_desc = self._clean_description(repo.description, 200)
        translated_desc = translate_description(repo.description)

        # Show both languages only when the translation is non-empty and
        # actually differs from the source text; otherwise the card would
        # repeat itself or end with a dangling "中文:" label.
        is_bilingual = (
            bool(translated_desc)
            and translated_desc != original_desc
            and translated_desc != (repo.description or "暂无描述")
        )
        lines.append(f"> {original_desc}")
        if is_bilingual:
            lines.append("> ")
            lines.append(f"> **中文:** {translated_desc}")
        lines.append("")

        # Topic tags, capped at eight.
        if repo.topics:
            topics_display = " ".join(f"`{topic}`" for topic in repo.topics[:8])
            lines.append(f"**标签:** {topics_display}")

        return "\n".join(lines)

    def _group_by_stars(self, repos: list[Repository]) -> list[tuple[str, list[Repository]]]:
        """Group repositories by star count.

        Input order is preserved inside each group. All five groups are
        always returned, including empty ones.

        Args:
            repos: List of repositories.

        Returns:
            List of (group_name, repos) tuples, from most to least starred.
        """
        groups = [
            ("🔥 超热门项目 (100K+ Stars)", []),
            ("🌟 热门项目 (50K-100K Stars)", []),
            ("⭐ 优质项目 (10K-50K Stars)", []),
            ("💡 新兴项目 (1K-10K Stars)", []),
            ("🌱 新星项目 (<1K Stars)", []),
        ]
        # Lower bound of each group above, in the same order; the final
        # group has no bound and catches everything else.
        lower_bounds = (100000, 50000, 10000, 1000)

        for repo in repos:
            for index, bound in enumerate(lower_bounds):
                if repo.stars >= bound:
                    groups[index][1].append(repo)
                    break
            else:
                groups[-1][1].append(repo)

        return groups

    def _get_star_level(self, stars: int) -> tuple[str, str]:
        """Get star level icon and description.

        Args:
            stars: Star count.

        Returns:
            (icon, description) tuple.
        """
        for threshold, icon, desc in STAR_LEVELS:
            if stars >= threshold:
                return (icon, desc)
        # Reached only when no threshold matches (e.g. a negative count).
        return ("⭐", "新星")

    def _format_number(self, num: int) -> str:
        """Format large numbers with K/M suffix.

        Args:
            num: Number to format.

        Returns:
            Formatted string, e.g. ``1500`` -> ``"1.5K"``.
        """
        if num >= 1000000:
            return f"{num / 1000000:.1f}M"
        if num >= 1000:
            return f"{num / 1000:.1f}K"
        return str(num)

    def _get_language_stats(self, repos: list[Repository]) -> dict[str, int]:
        """Get language distribution statistics.

        Repositories without a language are counted under ``"Unknown"``.

        Args:
            repos: List of repositories.

        Returns:
            Dictionary of language counts.
        """
        stats = {}
        for repo in repos:
            lang = repo.language or "Unknown"
            stats[lang] = stats.get(lang, 0) + 1
        return stats

    def _generate_language_chart(self, languages: dict[str, int]) -> str:
        """Generate a text-based language distribution table.

        Shows at most the ten most common languages, each with a 20-cell
        bar scaled relative to the most common language.

        Args:
            languages: Language statistics.

        Returns:
            Markdown table, or an empty string when there are no languages.
        """
        if not languages:
            return ""

        ranked = sorted(languages.items(), key=lambda item: item[1], reverse=True)[:10]
        # Loop-invariant scale; "or 1" guards against division by zero if
        # every count happens to be 0.
        peak = max(languages.values()) or 1

        # A Markdown table needs a header row and a delimiter row; body rows
        # on their own are rendered as plain text.
        rows = [
            "| 语言 | 分布 | 数量 |",
            "|:---|:---|---:|",
        ]
        for lang, count in ranked:
            icon = LANGUAGE_ICONS.get(lang, "📁")
            bar_len = max(0, min(20, int(count / peak * 20)))
            bar = "█" * bar_len + "░" * (20 - bar_len)
            rows.append(f"| {icon} {lang or '未知'} | `{bar}` | {count} |")

        return "\n".join(rows)

    def _clean_description(self, description: Optional[str], max_len: int = 200) -> str:
        """Clean description for display.

        Collapses all whitespace runs (including newlines) to single
        spaces, escapes pipe characters, and truncates long text.

        Args:
            description: Original description.
            max_len: Maximum length before truncation; ``"..."`` is appended
                to truncated text.

        Returns:
            Cleaned description, or ``"暂无描述"`` when the description is
            missing, empty, or whitespace-only.
        """
        if not description:
            return "暂无描述"

        # Remove newlines and collapse spaces.
        cleaned = " ".join(description.split())
        if not cleaned:
            # Whitespace-only input would otherwise render an empty quote.
            return "暂无描述"

        # Escape pipe characters.
        cleaned = cleaned.replace("|", r"\|")

        # Truncate if needed.
        if len(cleaned) > max_len:
            cleaned = cleaned[:max_len] + "..."

        return cleaned
@@ -0,0 +1,82 @@
1
+ """RSS exporter for generating feed subscriptions."""
2
+
3
+ from datetime import datetime, timezone
4
+ from pathlib import Path
5
+ from xml.etree import ElementTree as ET
6
+
7
+ from ai_scraper.models import Repository
8
+
9
+
10
class RSSExporter:
    """Export repositories to an RSS 2.0 feed file."""

    def __init__(self, output_dir: Path, filename: str = "repositories.xml"):
        """Initialize the exporter.

        Args:
            output_dir: Directory for output files.
            filename: Name of the output file.
        """
        self.output_dir = Path(output_dir)
        self.filename = filename

    @staticmethod
    def _rfc822(moment: datetime) -> str:
        """Format a datetime as an RFC 822 date in GMT.

        Aware datetimes are converted to UTC first, so the "GMT" label is
        always truthful. Naive datetimes are taken to already be in UTC.
        Day and month names are always English, independent of the process
        locale (unlike ``strftime`` with ``%a``/``%b``).

        Args:
            moment: Datetime to format.

        Returns:
            A string such as ``"Mon, 01 Jan 2024 04:00:00 GMT"``.
        """
        # Local import keeps this block self-contained without editing the
        # module's import section.
        from email.utils import format_datetime

        if moment.tzinfo is None:
            moment = moment.replace(tzinfo=timezone.utc)
        else:
            moment = moment.astimezone(timezone.utc)
        return format_datetime(moment, usegmt=True)

    def export_repositories(
        self,
        repos: list[Repository],
        title: str = "AI Repositories Feed",
        description: str = "Latest AI repositories from GitHub",
    ) -> Path:
        """Export repositories to an RSS feed.

        Creates ``output_dir`` (including parents) if it does not exist and
        overwrites any existing file with the same name.

        Args:
            repos: List of repositories to export.
            title: Feed title.
            description: Feed description.

        Returns:
            Path to the created file.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Create RSS structure.
        rss = ET.Element("rss", version="2.0")
        channel = ET.SubElement(rss, "channel")

        # Channel info.
        ET.SubElement(channel, "title").text = title
        ET.SubElement(channel, "description").text = description
        ET.SubElement(channel, "link").text = "https://github.com/topics/ai"
        ET.SubElement(channel, "language").text = "en-us"
        ET.SubElement(channel, "lastBuildDate").text = self._rfc822(
            datetime.now(timezone.utc)
        )

        # Add one <item> per repository.
        for repo in repos:
            item = ET.SubElement(channel, "item")

            ET.SubElement(item, "title").text = repo.full_name
            ET.SubElement(item, "link").text = repo.url
            ET.SubElement(item, "description").text = repo.description or "No description"
            # pubDate is optional; omit it rather than emit an empty
            # element, which is not a valid date.
            if repo.updated_at:
                ET.SubElement(item, "pubDate").text = self._rfc822(repo.updated_at)

            # Custom (non-standard) elements.
            ET.SubElement(item, "stars").text = str(repo.stars)
            if repo.language:
                ET.SubElement(item, "language").text = repo.language

        # Write to file.
        output_path = self.output_dir / self.filename

        # Pretty print.
        ET.indent(rss, space=" ")
        tree = ET.ElementTree(rss)

        with open(output_path, "wb") as f:
            tree.write(f, encoding="utf-8", xml_declaration=True)

        return output_path
+ return output_path