github-ai-scraper 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. ai_scraper/__init__.py +3 -0
  2. ai_scraper/api/__init__.py +6 -0
  3. ai_scraper/api/github.py +340 -0
  4. ai_scraper/api/gitlab.py +418 -0
  5. ai_scraper/api/rate_limiter.py +120 -0
  6. ai_scraper/api_server.py +196 -0
  7. ai_scraper/auth.py +68 -0
  8. ai_scraper/backup.py +112 -0
  9. ai_scraper/cache.py +95 -0
  10. ai_scraper/classifier.py +135 -0
  11. ai_scraper/cli.py +747 -0
  12. ai_scraper/config.py +237 -0
  13. ai_scraper/config_watcher.py +82 -0
  14. ai_scraper/dedup.py +148 -0
  15. ai_scraper/filters/__init__.py +5 -0
  16. ai_scraper/filters/ai_filter.py +93 -0
  17. ai_scraper/health.py +155 -0
  18. ai_scraper/i18n.py +141 -0
  19. ai_scraper/interactive.py +96 -0
  20. ai_scraper/keywords/__init__.py +5 -0
  21. ai_scraper/keywords/extractor.py +274 -0
  22. ai_scraper/logging_config.py +74 -0
  23. ai_scraper/models/__init__.py +5 -0
  24. ai_scraper/models/repository.py +72 -0
  25. ai_scraper/output/__init__.py +6 -0
  26. ai_scraper/output/excel.py +79 -0
  27. ai_scraper/output/html.py +152 -0
  28. ai_scraper/output/markdown.py +338 -0
  29. ai_scraper/output/rss.py +82 -0
  30. ai_scraper/output/translator.py +303 -0
  31. ai_scraper/plugin_system.py +146 -0
  32. ai_scraper/plugins/__init__.py +5 -0
  33. ai_scraper/retry.py +134 -0
  34. ai_scraper/scheduler.py +84 -0
  35. ai_scraper/scrape_progress.py +99 -0
  36. ai_scraper/secure_storage.py +127 -0
  37. ai_scraper/storage/__init__.py +5 -0
  38. ai_scraper/storage/async_database.py +237 -0
  39. ai_scraper/storage/database.py +456 -0
  40. ai_scraper/webhooks.py +95 -0
  41. github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
  42. github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
  43. github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
  44. github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
ai_scraper/config.py ADDED
@@ -0,0 +1,237 @@
1
+ """Configuration management."""
2
+
3
+ import os
4
+ import re
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ import yaml
10
+
11
+
12
@dataclass
class GitHubConfig:
    """Settings used when talking to the GitHub API."""

    # Access token; stays ``None`` when no token is configured.
    token: Optional[str] = None

    # Cache time-to-live (presumably seconds -- the unit is not shown here).
    cache_ttl: int = 3600
18
+
19
+
20
@dataclass
class GitLabConfig:
    """Settings used when talking to the GitLab API."""

    # Access token; stays ``None`` when no token is configured.
    token: Optional[str] = None

    # Root URL of the GitLab REST API; defaults to the public gitlab.com v4 API.
    base_url: str = "https://gitlab.com/api/v4"

    # Cache time-to-live (presumably seconds -- the unit is not shown here).
    cache_ttl: int = 3600
27
+
28
+
29
@dataclass
class FilterConfigYaml:
    """Repository filter settings as they appear in the YAML file.

    Every list field is built by a factory, so each instance owns its own
    independent list objects.
    """

    # Star-count threshold; 100 unless overridden.
    min_stars: int = 100

    # Default keyword list.
    keywords: list[str] = field(
        default_factory=lambda: [
            "ai",
            "artificial intelligence",
            "machine learning",
            "deep learning",
            "neural network",
            "llm",
            "gpt",
            "transformer",
            "nlp",
            "computer vision",
            "reinforcement learning",
            "pytorch",
            "tensorflow",
            "huggingface",
        ]
    )

    # Default topic list.
    topics: list[str] = field(
        default_factory=lambda: [
            "ai",
            "machine-learning",
            "deep-learning",
            "neural-network",
            "natural-language-processing",
            "computer-vision",
            "llm",
            "gpt",
            "pytorch",
            "tensorflow",
            "huggingface",
            "openai",
            "langchain",
        ]
    )

    # Language list; empty by default.
    languages: list[str] = field(default_factory=list)

    # Exclusion keyword list; empty by default.
    exclude_keywords: list[str] = field(default_factory=list)
46
+
47
+
48
@dataclass
class ScrapeConfigYaml:
    """Scrape settings as they appear in the YAML file."""

    # Names of the data fields to use; a fresh list is built per instance.
    data_fields: list[str] = field(
        default_factory=lambda: [
            "stars",
            "language",
            "topics",
            "contributors",
        ]
    )

    # Result limit; 500 by default.
    max_results: int = 500

    # Concurrency level; 5 by default.
    concurrency: int = 5

    # Cache time-to-live (presumably seconds -- the unit is not shown here).
    cache_ttl: int = 3600
58
+
59
+
60
@dataclass
class DatabaseConfig:
    """Settings for the database."""

    # Database file path; a relative path under ./data by default.
    path: str = "./data/ai_scraper.db"
65
+
66
+
67
@dataclass
class SchedulerConfig:
    """Settings for the Go scheduler."""

    # Scheduler on/off switch; on by default.
    enabled: bool = True

    # Worker count; 4 by default.
    workers: int = 4
73
+
74
+
75
@dataclass
class KeywordsConfig:
    """Settings for the keywords file."""

    # Keywords file path; ./keywords.txt by default.
    file: str = "./keywords.txt"

    # Keyword count limit; 100 by default.
    max_keywords: int = 100
81
+
82
+
83
@dataclass
class OutputConfig:
    """Settings for generated output."""

    # Output directory; ./output by default.
    dir: str = "./output"

    # Output file name; repositories.md by default.
    filename: str = "repositories.md"
89
+
90
+
91
@dataclass
class WebhookEndpointConfig:
    """Settings for a single webhook endpoint."""

    # Endpoint URL; empty string when not provided.
    url: str = ""

    # Event names attached to this endpoint; a fresh empty list per instance.
    events: list[str] = field(default_factory=list)
97
+
98
+
99
@dataclass
class WebhooksConfig:
    """Settings for webhooks as a whole."""

    # Webhook on/off switch; off by default.
    enabled: bool = False

    # Configured endpoints; a fresh empty list per instance.
    endpoints: list[WebhookEndpointConfig] = field(default_factory=list)
105
+
106
+
107
@dataclass
class Config:
    """Top-level configuration object: one attribute per settings section.

    Each section is created by its own dataclass as the default factory, so
    ``Config()`` with no arguments yields a complete all-defaults
    configuration whose section objects are not shared between instances.
    """

    # Platform API sections.
    github: GitHubConfig = field(default_factory=GitHubConfig)
    gitlab: GitLabConfig = field(default_factory=GitLabConfig)

    # Filter and scrape sections.
    filter: FilterConfigYaml = field(default_factory=FilterConfigYaml)
    scrape: ScrapeConfigYaml = field(default_factory=ScrapeConfigYaml)

    # Database and scheduler sections.
    database: DatabaseConfig = field(default_factory=DatabaseConfig)
    scheduler: SchedulerConfig = field(default_factory=SchedulerConfig)

    # Keywords, output and webhooks sections.
    keywords: KeywordsConfig = field(default_factory=KeywordsConfig)
    output: OutputConfig = field(default_factory=OutputConfig)
    webhooks: WebhooksConfig = field(default_factory=WebhooksConfig)
120
+
121
+
122
+ def _substitute_env_vars(value: str) -> str:
123
+ """Substitute environment variables in string value."""
124
+ pattern = r'\$\{([^}]+)\}'
125
+
126
+ def replace(match):
127
+ var_name = match.group(1)
128
+ return os.environ.get(var_name, "")
129
+
130
+ return re.sub(pattern, replace, value)
131
+
132
+
133
+ def _process_config_values(config_dict: dict) -> dict:
134
+ """Recursively process config values for env var substitution."""
135
+ result = {}
136
+ for key, value in config_dict.items():
137
+ if isinstance(value, dict):
138
+ result[key] = _process_config_values(value)
139
+ elif isinstance(value, str):
140
+ result[key] = _substitute_env_vars(value)
141
+ elif isinstance(value, list):
142
+ result[key] = [
143
+ _substitute_env_vars(item) if isinstance(item, str) else item
144
+ for item in value
145
+ ]
146
+ else:
147
+ result[key] = value
148
+ return result
149
+
150
+
151
def load_config(config_path: Optional[Path] = None) -> Config:
    """Load configuration from a YAML file, falling back to defaults.

    The parsed document is passed through ``_process_config_values`` before
    the typed configuration objects are built. Any option missing from the
    file keeps the default declared on its dataclass.

    Args:
        config_path: Location of the YAML file. ``None`` or a path that does
            not exist yields an all-default ``Config``. A plain string path
            is accepted as well.

    Returns:
        The populated ``Config``.

    Raises:
        ValueError: If the document, or one of its known sections, is
            present but is not a YAML mapping.
    """
    if config_path is None:
        return Config()

    config_path = Path(config_path)  # tolerate plain string paths
    if not config_path.exists():
        return Config()

    with open(config_path, "r", encoding="utf-8") as f:
        raw_config = yaml.safe_load(f) or {}

    if not isinstance(raw_config, dict):
        raise ValueError(
            f"Configuration file {config_path} must contain a mapping at the "
            f"top level, got {type(raw_config).__name__}"
        )

    processed_config = _process_config_values(raw_config)

    # Single source of truth for fallback values: the dataclass defaults.
    defaults = Config()

    def section(name: str) -> dict:
        """Return the mapping stored under *name*, or {} when absent/empty."""
        value = processed_config.get(name)
        if value is None:
            # Covers both a missing key and a bare "name:" line, which YAML
            # parses as null.
            return {}
        if not isinstance(value, dict):
            raise ValueError(
                f"Configuration section '{name}' must be a mapping, "
                f"got {type(value).__name__}"
            )
        return value

    github_dict = section("github")
    github = GitHubConfig(
        # A "${VAR}" placeholder whose variable is unset expands to "";
        # treat that the same as "no token configured".
        token=github_dict.get("token") or None,
        cache_ttl=github_dict.get("cache_ttl", defaults.github.cache_ttl),
    )

    gitlab_dict = section("gitlab")
    gitlab_config = GitLabConfig(
        token=gitlab_dict.get("token") or None,
        base_url=gitlab_dict.get("base_url", defaults.gitlab.base_url),
        cache_ttl=gitlab_dict.get("cache_ttl", defaults.gitlab.cache_ttl),
    )

    filter_dict = section("filter")
    filter_config = FilterConfigYaml(
        min_stars=filter_dict.get("min_stars", defaults.filter.min_stars),
        keywords=filter_dict.get("keywords", defaults.filter.keywords),
        topics=filter_dict.get("topics", defaults.filter.topics),
        languages=filter_dict.get("languages", defaults.filter.languages),
        exclude_keywords=filter_dict.get(
            "exclude_keywords", defaults.filter.exclude_keywords
        ),
    )

    scrape_dict = section("scrape")
    scrape_config = ScrapeConfigYaml(
        data_fields=scrape_dict.get("data_fields", defaults.scrape.data_fields),
        max_results=scrape_dict.get("max_results", defaults.scrape.max_results),
        concurrency=scrape_dict.get("concurrency", defaults.scrape.concurrency),
        cache_ttl=scrape_dict.get("cache_ttl", defaults.scrape.cache_ttl),
    )

    database_dict = section("database")
    database_config = DatabaseConfig(
        path=database_dict.get("path", defaults.database.path),
    )

    scheduler_dict = section("scheduler")
    scheduler_config = SchedulerConfig(
        enabled=scheduler_dict.get("enabled", defaults.scheduler.enabled),
        workers=scheduler_dict.get("workers", defaults.scheduler.workers),
    )

    keywords_dict = section("keywords")
    keywords_config = KeywordsConfig(
        file=keywords_dict.get("file", defaults.keywords.file),
        max_keywords=keywords_dict.get(
            "max_keywords", defaults.keywords.max_keywords
        ),
    )

    output_dict = section("output")
    output_config = OutputConfig(
        dir=output_dict.get("dir", defaults.output.dir),
        filename=output_dict.get("filename", defaults.output.filename),
    )

    webhooks_dict = section("webhooks")
    # "endpoints:" with no entries parses as null; treat it as an empty list.
    endpoints_list = webhooks_dict.get("endpoints") or []
    endpoints = [
        WebhookEndpointConfig(
            url=endpoint.get("url", ""),
            events=endpoint.get("events", []),
        )
        for endpoint in endpoints_list
    ]
    webhooks_config = WebhooksConfig(
        enabled=webhooks_dict.get("enabled", defaults.webhooks.enabled),
        endpoints=endpoints,
    )

    return Config(
        github=github,
        gitlab=gitlab_config,
        filter=filter_config,
        scrape=scrape_config,
        database=database_config,
        scheduler=scheduler_config,
        keywords=keywords_config,
        output=output_config,
        webhooks=webhooks_config,
    )
ai_scraper/config_watcher.py ADDED
@@ -0,0 +1,82 @@
1
+ """Configuration file watcher for hot reload."""
2
+
3
+ import logging
4
+ import threading
5
+ import time
6
+ from pathlib import Path
7
+ from typing import Callable, Optional
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class ConfigWatcher:
    """Watch a configuration file and invoke a callback when it changes.

    A daemon thread polls the file's modification time every
    ``poll_interval`` seconds. Whenever the file exists and its mtime
    differs from the last one seen, ``on_change(config_path)`` is called on
    that polling thread.
    """

    def __init__(
        self,
        config_path: Path,
        on_change: Callable[[Path], None],
        poll_interval: float = 1.0,
    ):
        """Initialize config watcher.

        Args:
            config_path: Path to configuration file.
            on_change: Callback when file changes.
            poll_interval: Polling interval in seconds.
        """
        self.config_path = Path(config_path)
        self.on_change = on_change
        self.poll_interval = poll_interval
        self._running = False
        self._thread: Optional[threading.Thread] = None
        self._last_mtime: Optional[float] = None
        # Set by stop() so a polling thread that is waiting between polls
        # wakes up immediately instead of finishing its full interval.
        self._stop_event = threading.Event()

    def start(self) -> None:
        """Start watching for changes. Does nothing if already running."""
        if self._running:
            return

        # Use a fresh event per run: a previous thread that has not exited
        # yet keeps its own (already set) event and cannot be revived.
        self._stop_event = threading.Event()
        self._running = True
        self._last_mtime = self._get_mtime()

        self._thread = threading.Thread(
            target=self._watch_loop,
            args=(self._stop_event,),
            daemon=True,
        )
        self._thread.start()

        logger.info("Started watching %s", self.config_path)

    def stop(self) -> None:
        """Stop watching for changes and wait briefly for the thread to end."""
        self._running = False
        self._stop_event.set()

        thread, self._thread = self._thread, None
        # The callback may call stop() from the polling thread itself;
        # joining the current thread would raise RuntimeError.
        if thread is not None and thread is not threading.current_thread():
            thread.join(timeout=2)

        logger.info("Stopped config watcher")

    def _get_mtime(self) -> Optional[float]:
        """Return the file's modification time, or None if it is missing."""
        try:
            return self.config_path.stat().st_mtime
        except FileNotFoundError:
            return None

    def _check_once(self) -> bool:
        """Run one poll and fire the callback if the file changed.

        Returns:
            True when a change was detected (whether or not the callback
            raised), False otherwise.
        """
        current_mtime = self._get_mtime()
        if current_mtime is None or current_mtime == self._last_mtime:
            return False

        logger.info("Config file changed: %s", self.config_path)
        # Record the new mtime before calling back so a failing callback is
        # not retried on every poll.
        self._last_mtime = current_mtime
        try:
            self.on_change(self.config_path)
        except Exception as e:
            # Keep watching; include the traceback for diagnosis.
            logger.exception("Error in on_change callback: %s", e)
        return True

    def _watch_loop(self, stop_event: Optional[threading.Event] = None) -> None:
        """Poll until stopped.

        Args:
            stop_event: Event that ends this loop when set. Defaults to the
                watcher's current stop event.
        """
        if stop_event is None:
            stop_event = self._stop_event

        while self._running and not stop_event.is_set():
            try:
                self._check_once()
            except Exception as e:
                logger.error("Error in watch loop: %s", e)

            # Interruptible sleep: returns as soon as stop() sets the event.
            stop_event.wait(self.poll_interval)
ai_scraper/dedup.py ADDED
@@ -0,0 +1,148 @@
1
+ """Repository deduplication utilities."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+ from ai_scraper.models.repository import Repository
7
+
8
+
9
@dataclass
class DuplicationInfo:
    """Result record describing whether a repository duplicates another."""

    # Flags for the individual kinds of duplication.
    is_fork: bool
    is_mirror: bool
    is_similar: bool

    # Name of the original repository, or None.
    original_repo: Optional[str]

    # One of "fork", "mirror", "similar", "none".
    duplicate_type: str

    # Similarity score; 0.0 unless supplied.
    similarity_score: float = 0.0
18
+
19
+
20
class DeduplicationChecker:
    """Check for repository duplicates (forks, mirrors, same-named copies)."""

    # Name fragments that mark a repository as a mirror.
    MIRROR_PATTERNS = [
        "-mirror",
        "-mirror.git",
        ".mirror",
        "mirror-",
    ]

    # Fragments removed from a name before names are compared for grouping.
    _NORMALIZE_FRAGMENTS = ["-mirror", "-mirror.git", ".mirror", "-fork"]

    @staticmethod
    def _strip_fragments(name: str, fragments: list[str]) -> str:
        """Remove every occurrence of each fragment from *name*.

        Matching ignores case, consistent with the lowercased detection in
        ``check``. Longer fragments are removed first so that "-mirror.git"
        is stripped as a whole instead of "-mirror" leaving ".git" behind.
        """
        import re  # local import: this module does not import re at file level

        for fragment in sorted(fragments, key=len, reverse=True):
            name = re.sub(re.escape(fragment), "", name, flags=re.IGNORECASE)
        return name

    def check(self, repo: Repository, is_fork: bool = False) -> DuplicationInfo:
        """Check if repository is a duplicate.

        Args:
            repo: Repository to check.
            is_fork: Whether the repo is a fork (from API data).

        Returns:
            Duplication information. ``is_similar`` is always False here;
            fork status takes precedence over mirror status when choosing
            ``duplicate_type``.
        """
        name_lower = repo.name.lower()
        is_mirror = any(pattern in name_lower for pattern in self.MIRROR_PATTERNS)

        original = None
        duplicate_type = "none"

        if is_fork:
            duplicate_type = "fork"
            original = self._extract_original_from_fork(repo.name)
        elif is_mirror:
            duplicate_type = "mirror"
            original = self._extract_original_name(repo.name)

        return DuplicationInfo(
            is_fork=is_fork,
            is_mirror=is_mirror,
            is_similar=False,
            original_repo=original,
            duplicate_type=duplicate_type,
        )

    def _extract_original_name(self, mirror_name: str) -> str:
        """Extract original repository name from mirror name.

        Mirror fragments are removed regardless of case and the result is
        trimmed of leading/trailing "-" and "_". The remaining characters
        keep their original case.
        """
        stripped = self._strip_fragments(mirror_name, self.MIRROR_PATTERNS)
        return stripped.strip("-_")

    def _extract_original_from_fork(self, fork_name: str) -> str:
        """Extract original repo name from fork.

        The name is returned unchanged: identifying the real upstream would
        need API data that is not available here.
        """
        return fork_name

    def find_similar_content(
        self,
        repos: list[Repository],
        threshold: float = 0.8,
    ) -> list[tuple[Repository, Repository, float]]:
        """Find repositories with similar content.

        Every unordered pair is compared once.

        Args:
            repos: List of repositories.
            threshold: Similarity threshold (0-1).

        Returns:
            List of (repo1, repo2, similarity) tuples whose similarity is at
            least *threshold*, in input order.
        """
        similar_pairs = []

        for i, repo1 in enumerate(repos):
            for repo2 in repos[i + 1:]:
                similarity = self._calculate_similarity(repo1, repo2)
                if similarity >= threshold:
                    similar_pairs.append((repo1, repo2, similarity))

        return similar_pairs

    def _calculate_similarity(self, repo1: Repository, repo2: Repository) -> float:
        """Calculate similarity between two repositories.

        Uses Jaccard similarity over the lowercased, whitespace-split words
        of the two descriptions. Returns 0.0 when either description is
        missing or empty.
        """
        words1 = set((repo1.description or "").lower().split())
        words2 = set((repo2.description or "").lower().split())

        if not words1 or not words2:
            return 0.0

        return len(words1 & words2) / len(words1 | words2)

    def find_duplicates(self, repos: list[Repository]) -> dict[str, list[Repository]]:
        """Find groups of duplicate repositories.

        Args:
            repos: List of repositories.

        Returns:
            Dictionary mapping normalized names to duplicate groups. Only
            names shared by more than one repository are included.
        """
        groups: dict[str, list[Repository]] = {}

        for repo in repos:
            groups.setdefault(self._normalize_name(repo.name), []).append(repo)

        return {name: members for name, members in groups.items() if len(members) > 1}

    def _normalize_name(self, name: str) -> str:
        """Normalize repository name for comparison.

        Lowercases the name, removes mirror/fork fragments, drops everything
        up to the last "/" and trims leading/trailing "-" and "_".
        """
        name = self._strip_fragments(name.lower(), self._NORMALIZE_FRAGMENTS)
        if "/" in name:
            name = name.split("/")[-1]
        return name.strip("-_")
ai_scraper/filters/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Filters for ai_scraper."""
2
+
3
+ from ai_scraper.filters.ai_filter import AIFilter
4
+
5
+ __all__ = ["AIFilter"]
ai_scraper/filters/ai_filter.py ADDED
@@ -0,0 +1,93 @@
1
+ """AI-related content filter."""
2
+
3
+ from ai_scraper.models.repository import Repository, FilterConfig
4
+ from ai_scraper.classifier import RepositoryClassifier, Classification
5
+
6
+
7
class AIFilter:
    """Filter for detecting AI-related repositories."""

    # Default AI indicators used by score_relevance. Built once at class
    # level instead of on every call.
    _AI_KEYWORDS = (
        "ai", "artificial intelligence", "machine learning", "deep learning",
        "neural network", "llm", "gpt", "transformer", "nlp", "computer vision",
        "pytorch", "tensorflow", "huggingface", "openai", "langchain",
    )
    _AI_TOPICS = frozenset((
        "ai", "machine-learning", "deep-learning", "neural-network",
        "natural-language-processing", "computer-vision", "llm", "gpt",
        "pytorch", "tensorflow", "huggingface", "openai", "langchain",
    ))

    def __init__(self):
        """Initialize the AI filter with a classifier."""
        self._classifier = RepositoryClassifier()

    @staticmethod
    def _searchable_text(repo: Repository) -> tuple[str, str]:
        """Return the lowercased "name description" text in two forms.

        The first form is the text as-is. The second has hyphens replaced by
        spaces, so "deep-learning" in a repository name can be matched by
        the keyword "deep learning".
        """
        raw = f"{repo.name} {repo.description or ''}".lower()
        return raw, raw.replace("-", " ")

    @staticmethod
    def _mentions(term: str, raw: str, dehyphenated: str) -> bool:
        """Return True if *term* occurs in the text (substring match).

        The term is matched literally against the raw text, and with its
        hyphens replaced by spaces against the de-hyphenated text. Blank
        terms never match, because an empty string is a substring of every
        text and would otherwise match all repositories.

        NOTE(review): this is a plain substring test, so a short term such
        as "ai" also matches inside longer words (e.g. "email"). Confirm
        whether whole-word matching is wanted.
        """
        term = term.lower()
        if not term.strip():
            return False
        return term in raw or term.replace("-", " ") in dehyphenated

    @staticmethod
    def _topic_set(repo: Repository) -> set[str]:
        """Return the repository's topics lowercased (empty if none)."""
        # Tolerates topics being None; presumably it is normally a list.
        return {topic.lower() for topic in (repo.topics or [])}

    def is_ai_related(self, repo: Repository, config: FilterConfig) -> bool:
        """Check if repository is AI-related.

        Exclusion keywords are checked first and win over everything else.
        After that a matching topic or a matching keyword in the name or
        description marks the repository as AI-related.

        Args:
            repo: Repository to check.
            config: Filter configuration.

        Returns:
            True if repository is AI-related.
        """
        raw, dehyphenated = self._searchable_text(repo)

        # Check exclude keywords first
        if any(self._mentions(term, raw, dehyphenated) for term in config.exclude_keywords):
            return False

        # Check topics
        repo_topics = self._topic_set(repo)
        if any(topic.lower() in repo_topics for topic in config.topics):
            return True

        # Check keywords in name and description
        return any(self._mentions(term, raw, dehyphenated) for term in config.keywords)

    def score_relevance(self, repo: Repository) -> float:
        """Calculate AI relevance score for a repository.

        Each default keyword found in the name or description adds 0.2, up
        to 0.6. Each default topic present on the repository adds 0.15, up
        to 0.4. Keywords are matched the same way as in ``is_ai_related``.

        Args:
            repo: Repository to score.

        Returns:
            Relevance score between 0.0 and 1.0.
        """
        raw, dehyphenated = self._searchable_text(repo)

        # Count keyword matches
        keyword_matches = sum(
            1 for keyword in self._AI_KEYWORDS
            if self._mentions(keyword, raw, dehyphenated)
        )
        score = min(keyword_matches * 0.2, 0.6)

        # Count topic matches
        topic_matches = len(self._AI_TOPICS & self._topic_set(repo))
        score += min(topic_matches * 0.15, 0.4)

        return min(score, 1.0)

    def classify(self, repo: Repository) -> Classification:
        """Classify a repository into an AI category.

        Delegates to the classifier created in ``__init__``.

        Args:
            repo: Repository to classify.

        Returns:
            Classification result with primary category, secondary categories,
            confidence, tech stack, and maturity assessment.
        """
        return self._classifier.classify(repo)