github-ai-scraper 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. ai_scraper/__init__.py +3 -0
  2. ai_scraper/api/__init__.py +6 -0
  3. ai_scraper/api/github.py +340 -0
  4. ai_scraper/api/gitlab.py +418 -0
  5. ai_scraper/api/rate_limiter.py +120 -0
  6. ai_scraper/api_server.py +196 -0
  7. ai_scraper/auth.py +68 -0
  8. ai_scraper/backup.py +112 -0
  9. ai_scraper/cache.py +95 -0
  10. ai_scraper/classifier.py +135 -0
  11. ai_scraper/cli.py +747 -0
  12. ai_scraper/config.py +237 -0
  13. ai_scraper/config_watcher.py +82 -0
  14. ai_scraper/dedup.py +148 -0
  15. ai_scraper/filters/__init__.py +5 -0
  16. ai_scraper/filters/ai_filter.py +93 -0
  17. ai_scraper/health.py +155 -0
  18. ai_scraper/i18n.py +141 -0
  19. ai_scraper/interactive.py +96 -0
  20. ai_scraper/keywords/__init__.py +5 -0
  21. ai_scraper/keywords/extractor.py +274 -0
  22. ai_scraper/logging_config.py +74 -0
  23. ai_scraper/models/__init__.py +5 -0
  24. ai_scraper/models/repository.py +72 -0
  25. ai_scraper/output/__init__.py +6 -0
  26. ai_scraper/output/excel.py +79 -0
  27. ai_scraper/output/html.py +152 -0
  28. ai_scraper/output/markdown.py +338 -0
  29. ai_scraper/output/rss.py +82 -0
  30. ai_scraper/output/translator.py +303 -0
  31. ai_scraper/plugin_system.py +146 -0
  32. ai_scraper/plugins/__init__.py +5 -0
  33. ai_scraper/retry.py +134 -0
  34. ai_scraper/scheduler.py +84 -0
  35. ai_scraper/scrape_progress.py +99 -0
  36. ai_scraper/secure_storage.py +127 -0
  37. ai_scraper/storage/__init__.py +5 -0
  38. ai_scraper/storage/async_database.py +237 -0
  39. ai_scraper/storage/database.py +456 -0
  40. ai_scraper/webhooks.py +95 -0
  41. github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
  42. github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
  43. github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
  44. github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,196 @@
1
+ """REST API server for ai-scraper."""
2
+
3
+ from contextlib import asynccontextmanager
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ from fastapi import Depends, FastAPI, Header, HTTPException, Query
9
+ from pydantic import BaseModel
10
+
11
+ from ai_scraper.auth import verify_api_key
12
+ from ai_scraper.config import load_config
13
+ from ai_scraper.storage.async_database import AsyncDatabase
14
+
15
+
16
# Process-wide AsyncDatabase handle. It is None at import time and is
# assigned by the lifespan hook when the application starts.
db: Optional[AsyncDatabase] = None

# API-key enforcement switch read by verify_auth; off by default.
_auth_enabled = False
21
+
22
+
23
def set_auth_enabled(enabled: bool) -> None:
    """Turn API-key enforcement on or off for this module.

    Args:
        enabled: True to make verify_auth check keys, False to skip the check.
    """
    # Rebind the module-level flag that verify_auth consults.
    globals()["_auth_enabled"] = enabled
27
+
28
+
29
async def verify_auth(x_api_key: Optional[str] = Header(None)):
    """Dependency that rejects requests lacking a valid API key.

    Args:
        x_api_key: Value of the X-API-Key request header, if present.

    Returns:
        True when authentication is disabled or the key is accepted.

    Raises:
        HTTPException: 401 when authentication is enabled and the key is
            missing or not recognised by verify_api_key.
    """
    # With authentication switched off the key is never inspected.
    if not _auth_enabled:
        return True
    if verify_api_key(x_api_key):
        return True
    raise HTTPException(status_code=401, detail="Invalid or missing API key")
37
+
38
+
39
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Open the shared database for the application's lifetime.

    On startup the configured database path is wrapped in an AsyncDatabase
    and stored in the module-level ``db``. ``init_db()`` is only awaited when
    the database file already exists. On shutdown the handle is closed.

    Args:
        app: The FastAPI application being started (unused here).
    """
    global db
    config = load_config()
    db_path = Path(config.database.path)
    db = AsyncDatabase(db_path)
    try:
        if db_path.exists():
            await db.init_db()
        yield
    finally:
        # Close the handle even if startup or the running application raised,
        # so the connection is not leaked on an abnormal exit.
        if db:
            await db.close()
50
+
51
+
52
# ASGI application object; the route handlers below register on it via
# @app.get, and startup/shutdown are delegated to the lifespan hook.
# NOTE(review): version is hard-coded to "0.1.0" here - confirm it is meant
# to differ from the package version.
app = FastAPI(
    lifespan=lifespan,
    title="GitHub AI Scraper API",
    description="REST API for accessing scraped AI repositories",
    version="0.1.0",
)
58
+
59
+
60
class RepositoryResponse(BaseModel):
    """Shape of a single repository as returned by the API endpoints."""

    # Identity
    id: int
    name: str
    full_name: str
    # Free-text summary; may be absent.
    description: Optional[str]
    # Popularity and classification data
    stars: int
    language: Optional[str]
    topics: list[str]
    # Link to the repository
    url: str
70
+
71
+
72
class StatsResponse(BaseModel):
    """Aggregate counters returned by the /api/stats endpoint."""

    repository_count: int  # number of stored repositories
    snapshot_count: int  # number of stored snapshots
    total_stars: int  # stars summed over repositories
77
+
78
+
79
@app.get("/api/repos", response_model=list[RepositoryResponse], dependencies=[Depends(verify_auth)])
async def list_repositories(
    limit: int = Query(default=20, ge=1, le=100),
    sort: str = Query(default="stars", pattern="^(stars|updated|relevance)$"),
    language: Optional[str] = None,
    min_stars: Optional[int] = None,
):
    """Return stored repositories, optionally narrowed by language and stars.

    Args:
        limit: Maximum number of rows requested from the database (1-100).
        sort: Sort key; one of "stars", "updated" or "relevance".
        language: Keep only repositories whose language matches,
            case-insensitively.
        min_stars: Keep only repositories with at least this many stars.

    Returns:
        A list of RepositoryResponse objects.

    Raises:
        HTTPException: 503 when the database handle is not available.
    """
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    fetched = await db.get_all_repositories(limit=limit, sort_by=sort)

    # NOTE(review): filters run after the database applied ``limit``, so a
    # filtered response can hold fewer than ``limit`` rows - confirm intended.
    def wanted(candidate) -> bool:
        if language and not (
            candidate.language and candidate.language.lower() == language.lower()
        ):
            return False
        if min_stars and not candidate.stars >= min_stars:
            return False
        return True

    fields = ("id", "name", "full_name", "description", "stars", "language", "topics", "url")
    return [
        RepositoryResponse(**{field: getattr(item, field) for field in fields})
        for item in fetched
        if wanted(item)
    ]
110
+
111
+
112
@app.get("/api/repos/{repo_id}", response_model=RepositoryResponse, dependencies=[Depends(verify_auth)])
async def get_repository(repo_id: int):
    """Return one repository looked up by its numeric ID.

    Args:
        repo_id: Identifier taken from the URL path.

    Raises:
        HTTPException: 503 when the database handle is not available,
            404 when no repository is found for the ID.
    """
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    # Single-row lookup by ID rather than scanning every repository.
    record = await db.get_repository_by_id(repo_id)
    if record:
        fields = ("id", "name", "full_name", "description", "stars", "language", "topics", "url")
        return RepositoryResponse(**{field: getattr(record, field) for field in fields})

    raise HTTPException(status_code=404, detail="Repository not found")
134
+
135
+
136
@app.get("/api/stats", response_model=StatsResponse, dependencies=[Depends(verify_auth)])
async def get_stats():
    """Return the database's aggregate statistics.

    Raises:
        HTTPException: 503 when the database handle is not available.
    """
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    # The mapping returned by the database is expanded into the model's fields.
    counters = await db.get_stats()
    return StatsResponse(**counters)
144
+
145
+
146
@app.get("/api/trending", dependencies=[Depends(verify_auth)])
async def get_trending(
    days: int = Query(default=7, ge=1, le=30),
    limit: int = Query(default=10, ge=1, le=50),
):
    """Return trend entries for the requested look-back window.

    Args:
        days: Look-back window passed to the database (1-30).
        limit: Maximum number of entries to return (1-50).

    Returns:
        A list of dicts with repo_id, repo_name, initial_stars, current_stars
        and growth_rate, where growth_rate is the stored rate multiplied by
        100 and rounded to two decimals.

    Raises:
        HTTPException: 503 when the database handle is not available.
    """
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    def as_payload(entry) -> dict:
        return {
            "repo_id": entry.repo_id,
            "repo_name": entry.repo_name,
            "initial_stars": entry.initial_stars,
            "current_stars": entry.current_stars,
            "growth_rate": round(entry.growth_rate * 100, 2),
        }

    entries = await db.get_trending(days=days, limit=limit)
    return [as_payload(entry) for entry in entries]
166
+
167
+
168
@app.get("/api/search", dependencies=[Depends(verify_auth)])
async def search_repositories(
    q: str = Query(..., min_length=2),
    limit: int = Query(default=20, ge=1, le=100),
):
    """Search the local database and return the matching repositories.

    Args:
        q: Search text; required, at least two characters.
        limit: Maximum number of results (1-100).

    Returns:
        A list of RepositoryResponse objects.

    Raises:
        HTTPException: 503 when the database handle is not available.
    """
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    matches = await db.search_local(query=q, limit=limit)
    fields = ("id", "name", "full_name", "description", "stars", "language", "topics", "url")
    return [
        RepositoryResponse(**{field: getattr(match, field) for field in fields})
        for match in matches
    ]
191
+
192
+
193
def run_server(host: str = "0.0.0.0", port: int = 8080):
    """Start uvicorn serving the module-level ``app``.

    Args:
        host: Address to bind. The default "0.0.0.0" listens on every
            network interface.
        port: TCP port to listen on.
    """
    # Imported here so uvicorn is only needed when the server is started.
    import uvicorn

    bind = {"host": host, "port": port}
    uvicorn.run(app, **bind)
ai_scraper/auth.py ADDED
@@ -0,0 +1,68 @@
1
+ """API authentication module."""
2
+
3
+ import hashlib
4
+ import os
5
+ import secrets
6
+ from typing import Optional
7
+
8
+
9
# Accepted API keys, held only in process memory: they are lost on restart.
# A production deployment should keep them in secure storage instead.
_api_keys: set[str] = set()
11
+
12
+
13
def create_api_key() -> str:
    """Mint a random API key and register it as valid.

    Returns:
        The new key: the prefix 'as_' followed by 32 hexadecimal characters.
    """
    api_key = "as_" + secrets.token_hex(16)
    _api_keys.add(api_key)
    return api_key
23
+
24
+
25
def verify_api_key(api_key: Optional[str]) -> bool:
    """Check whether an API key is one of the registered keys.

    The candidate is compared against every stored key with a constant-time
    comparison, so the time taken does not reveal how much of a key matched.

    Args:
        api_key: Key supplied by the caller; None or empty is rejected.

    Returns:
        True if the key is registered, False otherwise.
    """
    if not api_key:
        return False

    # compare_digest only accepts non-ASCII input as bytes, so encode both
    # sides; surrogatepass keeps odd header/env values from raising.
    candidate = api_key.encode("utf-8", "surrogatepass")
    matched = False
    # Iterate over a snapshot and do not stop at the first hit, so the work
    # done is the same whichever key (if any) matches.
    for known in tuple(_api_keys):
        if secrets.compare_digest(candidate, known.encode("utf-8", "surrogatepass")):
            matched = True
    return matched
37
+
38
+
39
def hash_token(token: str) -> str:
    """Return the SHA-256 digest of a token as a hexadecimal string.

    Args:
        token: Text to hash; it is encoded as UTF-8 first.

    Returns:
        The 64-character lowercase hex digest.
    """
    digest = hashlib.sha256()
    digest.update(token.encode())
    return digest.hexdigest()
49
+
50
+
51
def load_api_keys_from_env() -> None:
    """Register the keys listed in the AI_SCRAPER_API_KEYS variable.

    The variable holds a comma-separated list. Whitespace around each entry
    is stripped and empty entries are ignored. Previously registered keys
    are kept.
    """
    raw = os.environ.get("AI_SCRAPER_API_KEYS", "")
    stripped = (piece.strip() for piece in raw.split(","))
    _api_keys.update(piece for piece in stripped if piece)
59
+
60
+
61
def get_valid_api_keys() -> set[str]:
    """Return a snapshot of the registered API keys (intended for tests).

    The result is a new set, so changing it does not affect the registry.
    """
    return set(_api_keys)
64
+
65
+
66
def clear_api_keys() -> None:
    """Forget every registered API key (intended for tests).

    The registry set is emptied in place rather than replaced.
    """
    _api_keys.clear()
ai_scraper/backup.py ADDED
@@ -0,0 +1,112 @@
1
+ """Database backup management."""
2
+
3
+ import gzip
4
+ import logging
5
+ import shutil
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class BackupManager:
    """Create, restore, list and prune file-level copies of a database.

    Backups live in a single directory. Each one is either a plain copy
    (``.db``) or a gzip-compressed copy (``.db.gz``) of the database file.
    """

    def __init__(
        self,
        backup_dir: Path,
        max_backups: int = 10,
        compress: bool = True,
    ):
        """Initialize backup manager.

        Args:
            backup_dir: Directory for storing backups; created if missing.
            max_backups: Maximum number of backups to keep.
            compress: Whether to gzip-compress new backups.
        """
        self.backup_dir = Path(backup_dir)
        self.backup_dir.mkdir(parents=True, exist_ok=True)
        self.max_backups = max_backups
        self.compress = compress

    def create_backup(self, db_path: Path, name: Optional[str] = None) -> Path:
        """Create a backup of the database.

        Args:
            db_path: Path to database file (a str is accepted as well).
            name: Optional custom base name; the ``.db`` / ``.db.gz``
                extension is appended to it. Defaults to
                ``backup_<YYYYmmdd_HHMMSS>``.

        Returns:
            Path to created backup file.

        Raises:
            FileNotFoundError: If ``db_path`` does not exist.
        """
        # Coerce like __init__ does, so str paths do not fail on .exists().
        db_path = Path(db_path)
        if not db_path.exists():
            raise FileNotFoundError(f"Database not found: {db_path}")

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_name = name or f"backup_{timestamp}"
        backup_name += ".db.gz" if self.compress else ".db"

        backup_path = self.backup_dir / backup_name

        if self.compress:
            with open(db_path, "rb") as f_in, gzip.open(backup_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        else:
            shutil.copy2(db_path, backup_path)

        logger.info("Created backup: %s", backup_path)

        # Enforce the retention limit after every successful backup.
        self._cleanup_old_backups()

        return backup_path

    def restore_backup(self, backup_path: Path, target_path: Path) -> None:
        """Restore database from backup.

        A backup whose name ends in ``.gz`` is decompressed; any other file
        is copied as is.

        Args:
            backup_path: Path to backup file.
            target_path: Path to restore database to; missing parent
                directories are created.

        Raises:
            FileNotFoundError: If ``backup_path`` does not exist.
        """
        backup_path = Path(backup_path)
        target_path = Path(target_path)
        if not backup_path.exists():
            raise FileNotFoundError(f"Backup not found: {backup_path}")

        # Restoring into a directory that does not exist yet should not fail.
        target_path.parent.mkdir(parents=True, exist_ok=True)

        if backup_path.suffix == ".gz":
            with gzip.open(backup_path, "rb") as f_in, open(target_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        else:
            shutil.copy2(backup_path, target_path)

        logger.info("Restored backup to: %s", target_path)

    def list_backups(self) -> list[Path]:
        """List all available backups.

        Only files matching ``backup_*.db*`` in the backup directory are
        considered.

        Returns:
            List of backup file paths, sorted by modification time (newest first).
        """
        backups = list(self.backup_dir.glob("backup_*.db*"))
        backups.sort(key=lambda p: p.stat().st_mtime, reverse=True)
        return backups

    def delete_backup(self, backup_path: Path) -> None:
        """Delete a backup file.

        A path that does not exist is ignored.

        Args:
            backup_path: Path to backup file.
        """
        backup_path = Path(backup_path)
        if backup_path.exists():
            backup_path.unlink()
            logger.info("Deleted backup: %s", backup_path)

    def _cleanup_old_backups(self) -> None:
        """Remove old backups exceeding max_backups limit."""
        # Clamp at zero: a negative limit means "keep none", not an IndexError
        # from popping an already-empty list.
        keep = max(self.max_backups, 0)
        for stale in self.list_backups()[keep:]:
            self.delete_backup(stale)
ai_scraper/cache.py ADDED
@@ -0,0 +1,95 @@
1
+ """Request caching for GitHub API."""
2
+
3
+ import hashlib
4
+ import json
5
+ import time
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+
10
class RequestCache:
    """File-based cache for API responses.

    Each entry is one JSON file named after the MD5 of the URL plus its
    parameters, holding the write timestamp and the cached data.
    """

    def __init__(self, cache_dir: Path, ttl: int = 3600):
        """Initialize cache.

        Args:
            cache_dir: Directory for cache files; created if missing.
            ttl: Time-to-live in seconds.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.ttl = ttl

    def _get_cache_key(self, url: str, params: Optional[dict] = None) -> str:
        """Generate cache key from URL and params."""
        # sort_keys makes the key independent of parameter ordering.
        key_data = url + json.dumps(params or {}, sort_keys=True)
        return hashlib.md5(key_data.encode()).hexdigest()

    def get(self, url: str, params: Optional[dict] = None) -> Optional[dict]:
        """Get cached response.

        Unreadable, malformed or expired entries count as a miss; expired
        entries are deleted.

        Args:
            url: Request URL.
            params: Request parameters.

        Returns:
            Cached data or None.
        """
        key = self._get_cache_key(url, params)
        cache_file = self.cache_dir / f"{key}.json"

        # Open directly instead of checking exists() first: the file may be
        # removed in between. ValueError covers JSON and decoding errors.
        try:
            with open(cache_file, "r", encoding="utf-8") as f:
                cached = json.load(f)
        except (OSError, ValueError):
            return None

        # A file holding valid JSON that is not an object is not one of ours.
        if not isinstance(cached, dict):
            return None

        timestamp = cached.get("timestamp", 0)
        expired = (
            not isinstance(timestamp, (int, float))
            or time.time() - timestamp > self.ttl
        )
        if expired:
            cache_file.unlink(missing_ok=True)
            return None

        return cached.get("data")

    def set(self, url: str, params: Optional[dict], data: dict) -> None:
        """Cache response.

        The entry is written to a temporary file and then moved into place,
        so a failed write never leaves a truncated cache file behind.

        Args:
            url: Request URL.
            params: Request parameters.
            data: Response data to cache; must be JSON-serializable.
        """
        key = self._get_cache_key(url, params)
        cache_file = self.cache_dir / f"{key}.json"
        # The ".tmp" suffix keeps the scratch file out of the "*.json" globs.
        tmp_file = cache_file.with_name(cache_file.name + ".tmp")

        try:
            with open(tmp_file, "w", encoding="utf-8") as f:
                json.dump({
                    "timestamp": time.time(),
                    "data": data,
                }, f)
            tmp_file.replace(cache_file)
        finally:
            # No-op after a successful replace; removes the scratch file when
            # serialization or the move failed.
            tmp_file.unlink(missing_ok=True)

    def clear(self) -> int:
        """Clear all cached data.

        Returns:
            Number of files deleted.
        """
        count = 0
        for cache_file in self.cache_dir.glob("*.json"):
            try:
                cache_file.unlink()
            except FileNotFoundError:
                # Already removed by someone else; do not count it.
                continue
            count += 1
        return count

    def get_stats(self) -> dict:
        """Get cache statistics.

        Returns:
            Dict with file_count, total_size_bytes and total_size_mb
            (rounded to two decimals).
        """
        files = list(self.cache_dir.glob("*.json"))
        total_size = sum(f.stat().st_size for f in files)
        return {
            "file_count": len(files),
            "total_size_bytes": total_size,
            "total_size_mb": round(total_size / 1024 / 1024, 2),
        }
@@ -0,0 +1,135 @@
1
+ """Repository classification system."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+ from ai_scraper.models.repository import Repository
7
+
8
+
9
@dataclass
class Classification:
    """Outcome of classifying one repository."""

    # Best-scoring category name.
    primary_category: str
    # Further category names that also scored.
    secondary_categories: list[str]
    # Score attached to the classification.
    confidence: float
    # Names of technologies detected for the repository.
    tech_stack: list[str]
    # One of: experimental, production, enterprise.
    maturity: str
17
+
18
+
19
+ # Category definitions with keywords and topics
20
+ CATEGORIES = {
21
+ "llm": {
22
+ "keywords": ["llm", "large language model", "gpt", "claude", "llama", "mistral", "transformer"],
23
+ "topics": ["llm", "gpt", "language-model", "transformers"],
24
+ },
25
+ "computer-vision": {
26
+ "keywords": ["computer vision", "image recognition", "object detection", "segmentation", "yolo", "opencv"],
27
+ "topics": ["computer-vision", "object-detection", "image-segmentation", "opencv"],
28
+ },
29
+ "nlp": {
30
+ "keywords": ["nlp", "natural language", "text processing", "sentiment", "ner", "spacy", "nltk"],
31
+ "topics": ["nlp", "natural-language-processing", "text-analysis", "spacy"],
32
+ },
33
+ "ml-framework": {
34
+ "keywords": ["pytorch", "tensorflow", "jax", "keras", "machine learning framework"],
35
+ "topics": ["pytorch", "tensorflow", "jax", "keras", "deep-learning"],
36
+ },
37
+ "reinforcement-learning": {
38
+ "keywords": ["reinforcement learning", "rl", "q-learning", "policy gradient", "dqn", "ppo"],
39
+ "topics": ["reinforcement-learning", "deep-reinforcement-learning", "rl"],
40
+ },
41
+ "generative-ai": {
42
+ "keywords": ["generative", "diffusion", "gan", "stable diffusion", "midjourney", "image generation"],
43
+ "topics": ["generative-ai", "diffusion-model", "gan", "stable-diffusion"],
44
+ },
45
+ "ai-tools": {
46
+ "keywords": ["langchain", "llamaindex", "autogpt", "agent", "ai tool", "ai framework"],
47
+ "topics": ["langchain", "llamaindex", "autogpt", "ai-agent"],
48
+ },
49
+ "mlops": {
50
+ "keywords": ["mlops", "ml pipeline", "model deployment", "model serving", "mlflow", "kubeflow", "model registry"],
51
+ "topics": ["mlops", "ml-pipeline", "model-deployment", "mlflow", "kubeflow"],
52
+ },
53
+ "ai-infrastructure": {
54
+ "keywords": ["gpu", "cuda", "inference", "optimization", "quantization", "tensorrt", "onnx runtime"],
55
+ "topics": ["gpu-computing", "inference", "model-optimization", "quantization"],
56
+ },
57
+ "ai-ethics": {
58
+ "keywords": ["ai ethics", "bias detection", "fairness", "explainability", "interpretability", "responsible ai"],
59
+ "topics": ["ai-ethics", "fairness", "explainability", "responsible-ai"],
60
+ },
61
+ }
62
+
63
# Technology name -> phrases whose presence in a repository's text marks
# that technology as part of its stack.
TECH_STACK = dict(
    pytorch=["pytorch", "torch"],
    tensorflow=["tensorflow", "tf"],
    jax=["jax", "flax"],
    huggingface=["huggingface", "transformers", "hugging face"],
    onnx=["onnx"],
    openai=["openai", "gpt-4", "gpt-3.5"],
    anthropic=["anthropic", "claude"],
)
72
+
73
+
74
class RepositoryClassifier:
    """Classify repositories into AI categories."""

    def classify(self, repo: Repository) -> Classification:
        """Classify a repository.

        Every category in CATEGORIES is scored from the repository's name,
        description and topics: 0.3 per keyword found in the text and 0.5
        per matching topic, capped at 1.0. Keywords must appear as whole
        words or phrases (an optional trailing "s" is allowed), so "rl" does
        not match inside "url" and "ner" does not match inside "container".

        Args:
            repo: Repository to classify.

        Returns:
            Classification result. The primary category is "other" unless
            the best score exceeds 0.3, which means a single keyword hit
            alone is not enough. Up to three runner-up categories scoring
            above 0.2 are reported as secondary.
        """
        # Local import: this module does not import re at file level.
        import re

        text = f"{repo.name} {repo.description or ''}".lower()
        # Tolerate a missing topic list.
        topics_lower = {t.lower() for t in (repo.topics or [])}

        def mentions(keyword: str) -> bool:
            # Require a non-alphanumeric character (or the text boundary) on
            # both sides; "-" and "_" therefore count as separators.
            pattern = rf"(?<![a-z0-9]){re.escape(keyword)}s?(?![a-z0-9])"
            return re.search(pattern, text) is not None

        # Score each category
        scores = {}
        for category, rules in CATEGORIES.items():
            score = 0.0

            # Keyword matches
            for kw in rules["keywords"]:
                if mentions(kw):
                    score += 0.3

            # Topic matches
            for topic in rules["topics"]:
                if topic in topics_lower:
                    score += 0.5

            scores[category] = min(score, 1.0)

        # Get primary and secondary categories (highest score first)
        sorted_cats = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        primary = sorted_cats[0][0] if sorted_cats[0][1] > 0.3 else "other"
        secondary = [cat for cat, score in sorted_cats[1:4] if score > 0.2]

        # Detect tech stack
        tech_stack = [
            tech
            for tech, keywords in TECH_STACK.items()
            if any(mentions(kw) for kw in keywords)
        ]

        # Determine maturity
        maturity = self._assess_maturity(repo)

        return Classification(
            primary_category=primary,
            secondary_categories=secondary,
            # The top score is reported even when the primary is "other".
            confidence=sorted_cats[0][1],
            tech_stack=tech_stack,
            maturity=maturity,
        )

    def _assess_maturity(self, repo: Repository) -> str:
        """Assess repository maturity.

        Returns:
            "enterprise" for at least 10000 stars and at least 500 forks,
            "production" for at least 1000 stars, otherwise "experimental".
        """
        if repo.stars >= 10000 and repo.forks and repo.forks >= 500:
            return "enterprise"
        elif repo.stars >= 1000:
            return "production"
        else:
            return "experimental"