github-ai-scraper 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_scraper/__init__.py +3 -0
- ai_scraper/api/__init__.py +6 -0
- ai_scraper/api/github.py +340 -0
- ai_scraper/api/gitlab.py +418 -0
- ai_scraper/api/rate_limiter.py +120 -0
- ai_scraper/api_server.py +196 -0
- ai_scraper/auth.py +68 -0
- ai_scraper/backup.py +112 -0
- ai_scraper/cache.py +95 -0
- ai_scraper/classifier.py +135 -0
- ai_scraper/cli.py +747 -0
- ai_scraper/config.py +237 -0
- ai_scraper/config_watcher.py +82 -0
- ai_scraper/dedup.py +148 -0
- ai_scraper/filters/__init__.py +5 -0
- ai_scraper/filters/ai_filter.py +93 -0
- ai_scraper/health.py +155 -0
- ai_scraper/i18n.py +141 -0
- ai_scraper/interactive.py +96 -0
- ai_scraper/keywords/__init__.py +5 -0
- ai_scraper/keywords/extractor.py +274 -0
- ai_scraper/logging_config.py +74 -0
- ai_scraper/models/__init__.py +5 -0
- ai_scraper/models/repository.py +72 -0
- ai_scraper/output/__init__.py +6 -0
- ai_scraper/output/excel.py +79 -0
- ai_scraper/output/html.py +152 -0
- ai_scraper/output/markdown.py +338 -0
- ai_scraper/output/rss.py +82 -0
- ai_scraper/output/translator.py +303 -0
- ai_scraper/plugin_system.py +146 -0
- ai_scraper/plugins/__init__.py +5 -0
- ai_scraper/retry.py +134 -0
- ai_scraper/scheduler.py +84 -0
- ai_scraper/scrape_progress.py +99 -0
- ai_scraper/secure_storage.py +127 -0
- ai_scraper/storage/__init__.py +5 -0
- ai_scraper/storage/async_database.py +237 -0
- ai_scraper/storage/database.py +456 -0
- ai_scraper/webhooks.py +95 -0
- github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
- github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
- github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
- github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
ai_scraper/health.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""Repository health assessment."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime, timedelta
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from ai_scraper.models.repository import Repository
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class HealthScore:
    """Result of a repository health assessment.

    Bundles the weighted overall score, the four component scores it is
    derived from, and the letter grade assigned to the overall score.
    """

    # Weighted combination of the four component scores.
    overall: float
    # Component score based on how recently the repository was pushed to.
    activity: float
    # Component score based on the star count.
    popularity: float
    # Component score based on the open-issues-to-stars ratio.
    maintenance: float
    # Component score based on the fork count.
    community: float
    # Letter grade ("A"-"F") for the overall score.
    grade: str
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class HealthAssessor:
    """Assess repository health based on multiple factors.

    Four component scores (activity, popularity, maintenance, community) are
    each mapped onto a 0-100 scale through fixed threshold bands, then
    combined into a weighted overall score and a letter grade.
    """

    # (max days since last push, score); anything older scores 0.
    _ACTIVITY_BANDS = ((7, 100), (30, 80), (90, 60), (180, 40), (365, 20))
    # (min stars, score); anything lower scores 10.
    _STAR_BANDS = (
        (10000, 100), (5000, 85), (1000, 70), (500, 55), (100, 40), (50, 25),
    )
    # (min forks, score); anything lower scores 10.
    _FORK_BANDS = (
        (1000, 100), (500, 85), (100, 70), (50, 55), (10, 40), (5, 25),
    )
    # (exclusive upper bound on open_issues / stars, score); higher scores 20.
    _ISSUE_RATIO_BANDS = ((0.01, 100), (0.05, 80), (0.1, 60), (0.2, 40))
    # (min overall score, grade); anything lower is an "F".
    _GRADE_BANDS = ((90, "A"), (80, "B"), (70, "C"), (60, "D"))

    def assess(self, repo: Repository) -> HealthScore:
        """Assess repository health.

        Args:
            repo: Repository to assess.

        Returns:
            Health score breakdown.
        """
        activity = self._score_activity(repo)
        popularity = self._score_popularity(repo)
        maintenance = self._score_maintenance(repo)
        community = self._score_community(repo)

        # Weighted overall score (weights sum to 1.0).
        overall = (
            activity * 0.3 +
            popularity * 0.25 +
            maintenance * 0.25 +
            community * 0.2
        )

        return HealthScore(
            overall=overall,
            activity=activity,
            popularity=popularity,
            maintenance=maintenance,
            community=community,
            grade=self.get_grade(overall),
        )

    def _score_activity(self, repo: Repository) -> float:
        """Score repository activity (0-100).

        The score depends on the number of whole days since ``repo.pushed_at``.
        A missing push timestamp scores 0.
        """
        pushed_at = repo.pushed_at
        if not pushed_at:
            return 0

        # Build "now" with the same tzinfo as pushed_at: subtracting an aware
        # timestamp from a naive datetime.now() raises TypeError. With
        # tzinfo=None this is the plain naive local time, as before.
        now = datetime.now(tz=pushed_at.tzinfo)
        days_since_push = (now - pushed_at).days

        for max_days, score in self._ACTIVITY_BANDS:
            if days_since_push <= max_days:
                return score
        return 0

    def _score_popularity(self, repo: Repository) -> float:
        """Score repository popularity (0-100) from the star count."""
        # A missing star count is treated as 0, mirroring the fork handling
        # in _score_community.
        stars = repo.stars or 0

        for min_stars, score in self._STAR_BANDS:
            if stars >= min_stars:
                return score
        return 10

    def _score_maintenance(self, repo: Repository) -> float:
        """Score repository maintenance (0-100).

        Uses the ratio of open issues to stars; a lower ratio scores higher.
        Returns the neutral score 50 when the ratio cannot be computed.
        """
        open_issues = repo.open_issues
        # NOTE(review): a count of exactly 0 is falsy and is therefore also
        # scored as "unknown" (50) here - confirm whether 0 open issues
        # should instead count as the best ratio.
        if not open_issues:
            return 50  # Unknown

        stars = repo.stars or 0
        if stars <= 0:
            # No stars to normalise against.
            return 50

        # Lower open issues ratio is better
        issue_ratio = open_issues / stars
        for upper_bound, score in self._ISSUE_RATIO_BANDS:
            if issue_ratio < upper_bound:
                return score
        return 20

    def _score_community(self, repo: Repository) -> float:
        """Score community engagement (0-100) from the fork count."""
        forks = repo.forks or 0

        for min_forks, score in self._FORK_BANDS:
            if forks >= min_forks:
                return score
        return 10

    def get_grade(self, score: float) -> str:
        """Convert score to letter grade.

        Args:
            score: Score (0-100).

        Returns:
            Letter grade (A-F).
        """
        for min_score, grade in self._GRADE_BANDS:
            if score >= min_score:
                return grade
        return "F"
|
ai_scraper/i18n.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Internationalization support for multi-language search."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Default keyword translations, keyed by language code and then by the
# lowercase English term.
DEFAULT_TRANSLATIONS = {
    # English maps every supported term onto itself.
    "en": {
        term: term
        for term in (
            "ai",
            "artificial intelligence",
            "machine learning",
            "deep learning",
            "neural network",
            "llm",
            "large language model",
            "gpt",
            "transformer",
            "nlp",
            "natural language processing",
            "computer vision",
            "reinforcement learning",
            "pytorch",
            "tensorflow",
            "huggingface",
        )
    },
    # Chinese; product names keep their usual capitalisation.
    "zh": {
        "ai": "人工智能",
        "artificial intelligence": "人工智能",
        "machine learning": "机器学习",
        "deep learning": "深度学习",
        "neural network": "神经网络",
        "llm": "大语言模型",
        "large language model": "大语言模型",
        "gpt": "GPT",
        "transformer": "Transformer",
        "nlp": "自然语言处理",
        "natural language processing": "自然语言处理",
        "computer vision": "计算机视觉",
        "reinforcement learning": "强化学习",
        "pytorch": "PyTorch",
        "tensorflow": "TensorFlow",
        "huggingface": "Hugging Face",
    },
}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class I18nManager:
    """Manage internationalization for keywords.

    Translations are stored per target language as a mapping from the
    lowercase source term to its translation in that language.
    """

    def __init__(self):
        """Initialize i18n manager with default translations."""
        # Copy the per-language tables too. A shallow dict() copy would share
        # them with DEFAULT_TRANSLATIONS, so add_translation() on one manager
        # would leak into the module defaults and into every other manager.
        self._translations = {
            lang: dict(terms) for lang, terms in DEFAULT_TRANSLATIONS.items()
        }

    def get_keywords(self, language: str) -> set[str]:
        """Get all keywords for a language.

        Args:
            language: Language code (e.g., "en", "zh").

        Returns:
            Set of keywords for the language. Falls back to the English
            keywords when the language has no translation table.
        """
        table = self._translations.get(language)
        if table is None:
            # Fallback to English
            table = self._translations.get("en", {})
        return set(table.values())

    def add_translation(
        self,
        source_lang: str,
        source_term: str,
        target_lang: str,
        target_term: str,
    ) -> None:
        """Add a custom translation.

        Args:
            source_lang: Source language code. Currently not used: tables are
                keyed by target language only.
            source_term: Term in source language (matched case-insensitively).
            target_lang: Target language code.
            target_term: Translation in target language.
        """
        # translate() looks terms up by their lowercase form, so the key must
        # be stored lowercase or mixed-case terms could never be found.
        table = self._translations.setdefault(target_lang, {})
        table[source_term.lower()] = target_term

    def translate(
        self,
        term: str,
        source_lang: str = "en",
        target_lang: str = "zh",
    ) -> Optional[str]:
        """Translate a term between languages.

        Args:
            term: Term to translate (matched case-insensitively).
            source_lang: Source language code. Currently not used by the
                lookup.
            target_lang: Target language code.

        Returns:
            Translated term or None if not found.
        """
        table = self._translations.get(target_lang)
        if table is None:
            return None
        return table.get(term.lower())
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def get_translated_keywords(
    keywords: list[str],
    languages: Optional[list[str]] = None,
) -> list[str]:
    """Get keywords translated to multiple languages.

    For "en" the original keyword is kept as given. For every other language
    the lowercase keyword is looked up and added only when a translation
    exists.

    Args:
        keywords: List of keywords to translate.
        languages: List of target language codes. Defaults to ["en", "zh"].

    Returns:
        List of keywords in all specified languages, without duplicates, in
        first-seen order.
    """
    if languages is None:
        languages = ["en", "zh"]

    i18n = I18nManager()
    # De-duplicate through dict keys: they keep insertion order, so the result
    # is stable across runs (list(set(...)) order varies with string hash
    # randomization).
    unique: dict[str, None] = {}

    for keyword in keywords:
        keyword_lower = keyword.lower()
        for lang in languages:
            if lang == "en":
                # Add original keyword
                unique[keyword] = None
                continue
            # Add translation if available
            translated = i18n.translate(keyword_lower, "en", lang)
            if translated:
                unique[translated] = None

    return list(unique)
|
ai_scraper/interactive.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Interactive CLI mode."""
|
|
2
|
+
|
|
3
|
+
from rich.console import Console
|
|
4
|
+
from rich.prompt import Prompt
|
|
5
|
+
from rich.panel import Panel
|
|
6
|
+
|
|
7
|
+
# Module-level Rich console that the helpers in this module print through.
console = Console()
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def show_main_menu() -> str:
    """Show main menu and get user choice.

    Prints the banner and the option list, then asks for a selection with the
    valid keys passed to the prompt as ``choices``.

    Returns:
        User's menu choice.
    """
    console.print(Panel.fit(
        "[bold cyan]GitHub AI Scraper[/bold cyan]\n"
        "AI Repository Discovery Tool",
        border_style="cyan"
    ))

    console.print("\n[bold]What would you like to do?[/bold]\n")
    for entry in (
        " [1] Quick Scrape - Fetch top AI repos (fast)",
        " [2] Deep Scrape - Comprehensive search (slow)",
        " [3] Custom Scrape - Set your own parameters",
        " [4] View Results - List scraped repositories",
        " [5] Trending - See trending repos",
        " [6] Export Data - Export to CSV/JSON",
        " [7] Settings - Configure options",
    ):
        console.print(entry)
    # "[q]" starts with a letter, so Rich would parse it as a markup tag and
    # drop it from the output; the backslash escapes it. The numeric keys
    # above are not valid tag names and print literally.
    console.print(" \\[q] Quit\n")

    return Prompt.ask("Select an option", choices=["1", "2", "3", "4", "5", "6", "7", "q"])
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_scrape_params() -> dict:
    """Interactively get scrape parameters.

    Asks for the minimum star count, the maximum number of results and an
    optional language filter. Each numeric question is repeated until the
    answer is a valid integer in range.

    Returns:
        Dictionary of scrape parameters with the keys "min_stars",
        "max_results" and "language" (None when left empty).
    """
    console.print("\n[bold]Custom Scrape Configuration[/bold]\n")

    def ask_number(label, accepts, complaint):
        # Re-prompt until the answer parses as an int and passes `accepts`.
        while True:
            try:
                number = int(Prompt.ask(label, default="100"))
            except ValueError:
                console.print("[red]Please enter a valid number[/red]")
                continue
            if accepts(number):
                return number
            console.print(complaint)

    min_stars = ask_number(
        "Minimum stars",
        lambda value: value >= 0,
        "[red]Minimum stars must be non-negative[/red]",
    )
    max_results = ask_number(
        "Maximum results",
        lambda value: value > 0,
        "[red]Maximum results must be greater than 0[/red]",
    )
    language = Prompt.ask("Language filter (leave empty for all)", default="")

    params = {
        "min_stars": min_stars,
        "max_results": max_results,
        "language": language if language else None,
    }
    return params
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def show_scrape_progress(current: int, total: int, repo_name: str):
    """Print one progress line for the repository being scraped.

    Args:
        current: Current count.
        total: Total expected; a non-positive total is shown as 0%.
        repo_name: Name of current repo (only the first 40 characters are
            printed).
    """
    if total > 0:
        percent = current / total * 100
    else:
        percent = 0
    console.print(f" [{current}/{total}] {percent:.0f}% - {repo_name[:40]}")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def confirm_action(message: str) -> bool:
    """Ask the user to confirm an action.

    Args:
        message: Confirmation message.

    Returns:
        True if confirmed.
    """
    from rich.prompt import Confirm

    confirmed = Confirm.ask(message)
    return confirmed
|
ai_scraper/keywords/extractor.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""Keyword extraction from repository metadata."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from ai_scraper.models import Repository
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Known low-quality tokens picked up from repository noise.
LOW_QUALITY_KEYWORDS = {"aaif"}

# Common English stopwords to filter out.
STOPWORDS: set[str] = {
    # Articles, conjunctions and auxiliary verbs
    "a", "an", "the", "and", "or", "but", "is", "are", "was", "were",
    "be", "been", "being", "have", "has", "had", "do", "does", "did",
    "will", "would", "could", "should", "may", "might", "must", "shall",
    "can", "need", "dare", "ought", "used",
    # Prepositions and adverbs
    "to", "of", "in", "for",
    "on", "with", "at", "by", "from", "as", "into", "through", "during",
    "before", "after", "above", "below", "between", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why",
    "how", "all", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "just", "also", "now", "that", "this", "these", "those",
    "what", "which", "who", "whom", "whose", "if", "else", "because",
    "while", "although", "though", "since", "until", "unless", "however",
    "therefore", "thus", "hence", "either", "neither", "both",
    "even", "still", "already", "yet",
    # Pronouns
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves", "he", "him",
    "his", "himself", "she", "her", "hers", "herself", "it", "its",
    "itself", "they", "them", "their", "theirs", "themselves",
}

# Invalid keyword patterns (matching keywords must be filtered out).
INVALID_PATTERNS = [
    r'^\d+/\w+$',  # number/word pattern, e.g. "0/zero", "112/ai"
    r'^[\w-]+/[\w-]+$',  # path pattern, e.g. "owner/repo"
    r'^\d+$',  # digits only
]

# Minimum keyword length (AI-related abbreviations are exempt).
MIN_KEYWORD_LENGTH = 3
VALID_SHORT_KEYWORDS = {
    "ai", "ml", "dl", "nlp", "cv", "llm", "gpt", "rag", "mcp",
    "rnn", "cnn", "gan", "vae", "rl", "cl", "asr", "tts",
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class KeywordExtractor:
    """Extract and manage keywords from repository metadata.

    Keywords are collected from repository topics, descriptions and names,
    passed through a quality filter, and persisted one per line in a text
    file.
    """

    def __init__(self, keywords_file: Path, max_keywords: int = 100):
        """Initialize the extractor.

        Args:
            keywords_file: Path to file for persisting keywords.
            max_keywords: Maximum number of keywords to keep.
        """
        self.keywords_file = keywords_file
        self.max_keywords = max_keywords

    def load_keywords(self) -> set[str]:
        """Load keywords from file.

        Returns:
            Set of keywords, or empty set if file doesn't exist. Blank lines
            are ignored and surrounding whitespace is stripped.
        """
        if not self.keywords_file.exists():
            return set()

        with open(self.keywords_file, "r", encoding="utf-8") as f:
            stripped = (line.strip() for line in f)
            return {keyword for keyword in stripped if keyword}

    def save_keywords(self, keywords: set[str]) -> None:
        """Save keywords to file, one per line in sorted order.

        Args:
            keywords: Set of keywords to save.
        """
        # Ensure parent directory exists
        self.keywords_file.parent.mkdir(parents=True, exist_ok=True)

        with open(self.keywords_file, "w", encoding="utf-8") as f:
            f.writelines(f"{keyword}\n" for keyword in sorted(keywords))

    def extract_from_repos(self, repos: list[Repository]) -> set[str]:
        """Extract keywords from a list of repositories.

        Args:
            repos: List of repositories to extract from.

        Returns:
            Set of extracted keywords after quality filtering.
        """
        keywords: set[str] = set()
        for repo in repos:
            keywords.update(self._extract_from_topics(repo))
            keywords.update(self._extract_from_description(repo))
            keywords.update(self._extract_from_name(repo))

        # Apply quality filter
        return self._filter_keywords(keywords)

    def _filter_keywords(self, keywords: set[str]) -> set[str]:
        """Filter out low-quality keywords.

        Args:
            keywords: Set of keywords to filter.

        Returns:
            Filtered set of high-quality keywords (stripped and lowercased).
        """
        normalized = (keyword.strip().lower() for keyword in keywords)
        return {
            keyword for keyword in normalized if self._is_high_quality(keyword)
        }

    @staticmethod
    def _is_high_quality(keyword: str) -> bool:
        """Return True if an already normalized keyword passes all checks."""
        # Empty after stripping, or a known noise token.
        if not keyword or keyword in LOW_QUALITY_KEYWORDS:
            return False

        # Matches one of the invalid shapes (paths, pure numbers, ...).
        if any(re.match(pattern, keyword) for pattern in INVALID_PATTERNS):
            return False

        # Too short, unless it is a known short AI term.
        if (
            len(keyword) < MIN_KEYWORD_LENGTH
            and keyword not in VALID_SHORT_KEYWORDS
        ):
            return False

        # Looks like a file path with extension (dot anywhere but the start).
        if '.' in keyword and not keyword.startswith('.'):
            return False

        # Mostly numbers: at least half of the characters are digits.
        digit_count = sum(1 for c in keyword if c.isdigit())
        return digit_count < len(keyword) * 0.5

    @staticmethod
    def _clean_tokens(tokens) -> set[str]:
        """Keep tokens of 2+ chars that are neither stopwords nor pure digits.

        Args:
            tokens: Iterable of lowercase string tokens (may contain empty
                strings, which are dropped by the length check).

        Returns:
            Set of accepted tokens.
        """
        return {
            token
            for token in tokens
            if len(token) >= 2
            and token not in STOPWORDS
            and not token.isdigit()
        }

    def _extract_from_topics(self, repo: Repository) -> set[str]:
        """Extract keywords from repository topics.

        Args:
            repo: Repository to extract from.

        Returns:
            Set of keywords from topics (lowercase); empty when the
            repository has no topics.
        """
        # `or ()` guards against topics being None rather than an empty list.
        return {topic.lower() for topic in (repo.topics or ())}

    def _extract_from_description(self, repo: Repository) -> set[str]:
        """Extract keywords from repository description.

        Splits on non-alphanumeric characters, filters stopwords,
        requires min 2 chars, and excludes pure digits.

        Args:
            repo: Repository to extract from.

        Returns:
            Set of keywords from description.
        """
        if not repo.description:
            return set()

        # Split on non-alphanumeric (ASCII) characters
        words = re.split(r"[^a-zA-Z0-9]+", repo.description.lower())
        return self._clean_tokens(words)

    def _extract_from_name(self, repo: Repository) -> set[str]:
        """Extract keywords from repository name.

        Splits on hyphens and underscores, filters stopwords,
        requires min 2 chars, and excludes pure digits.

        Args:
            repo: Repository to extract from.

        Returns:
            Set of keywords from name.
        """
        # Split on hyphens and underscores
        parts = re.split(r"[-_]+", repo.name.lower())
        return self._clean_tokens(parts)

    def merge_keywords(
        self, existing: set[str], new: set[str]
    ) -> set[str]:
        """Merge new keywords with existing, respecting max_keywords limit.

        Existing keywords take priority. Remaining room is filled with new
        keywords in sorted order. If the existing set alone exceeds the
        limit, it is trimmed to its first ``max_keywords`` entries in sorted
        order.

        Args:
            existing: Existing set of keywords.
            new: New keywords to merge.

        Returns:
            Merged set of keywords, limited to max_keywords.
        """
        merged = existing | new
        limit = max(self.max_keywords, 0)
        if len(merged) <= limit:
            return merged

        # Prioritize existing keywords, then add new ones up to the limit.
        # The room is computed before adding anything, so the result can
        # never grow past the limit.
        kept = sorted(existing)[:limit]
        room = limit - len(kept)
        additions = sorted(new - existing)[:room]
        return set(kept) | set(additions)

    def get_keywords_for_search(self) -> list[str]:
        """Get keywords as a sorted list for search queries.

        Returns:
            Sorted list of keywords loaded from the keywords file.
        """
        return sorted(self.load_keywords())
|