github-ai-scraper 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_scraper/__init__.py +3 -0
- ai_scraper/api/__init__.py +6 -0
- ai_scraper/api/github.py +340 -0
- ai_scraper/api/gitlab.py +418 -0
- ai_scraper/api/rate_limiter.py +120 -0
- ai_scraper/api_server.py +196 -0
- ai_scraper/auth.py +68 -0
- ai_scraper/backup.py +112 -0
- ai_scraper/cache.py +95 -0
- ai_scraper/classifier.py +135 -0
- ai_scraper/cli.py +747 -0
- ai_scraper/config.py +237 -0
- ai_scraper/config_watcher.py +82 -0
- ai_scraper/dedup.py +148 -0
- ai_scraper/filters/__init__.py +5 -0
- ai_scraper/filters/ai_filter.py +93 -0
- ai_scraper/health.py +155 -0
- ai_scraper/i18n.py +141 -0
- ai_scraper/interactive.py +96 -0
- ai_scraper/keywords/__init__.py +5 -0
- ai_scraper/keywords/extractor.py +274 -0
- ai_scraper/logging_config.py +74 -0
- ai_scraper/models/__init__.py +5 -0
- ai_scraper/models/repository.py +72 -0
- ai_scraper/output/__init__.py +6 -0
- ai_scraper/output/excel.py +79 -0
- ai_scraper/output/html.py +152 -0
- ai_scraper/output/markdown.py +338 -0
- ai_scraper/output/rss.py +82 -0
- ai_scraper/output/translator.py +303 -0
- ai_scraper/plugin_system.py +146 -0
- ai_scraper/plugins/__init__.py +5 -0
- ai_scraper/retry.py +134 -0
- ai_scraper/scheduler.py +84 -0
- ai_scraper/scrape_progress.py +99 -0
- ai_scraper/secure_storage.py +127 -0
- ai_scraper/storage/__init__.py +5 -0
- ai_scraper/storage/async_database.py +237 -0
- ai_scraper/storage/database.py +456 -0
- ai_scraper/webhooks.py +95 -0
- github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
- github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
- github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
- github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
ai_scraper/api_server.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""REST API server for ai-scraper."""
|
|
2
|
+
|
|
3
|
+
from contextlib import asynccontextmanager
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from fastapi import Depends, FastAPI, Header, HTTPException, Query
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
|
|
11
|
+
from ai_scraper.auth import verify_api_key
|
|
12
|
+
from ai_scraper.config import load_config
|
|
13
|
+
from ai_scraper.storage.async_database import AsyncDatabase
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Shared AsyncDatabase handle used by every request handler. It stays None
# until the `lifespan` context manager assigns it during application startup.
db: Optional[AsyncDatabase] = None

# When True, `verify_auth` rejects requests whose API key fails
# `verify_api_key`. Off by default; toggled through `set_auth_enabled`.
_auth_enabled = False
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def set_auth_enabled(enabled: bool) -> None:
    """Turn API-key enforcement on or off for the whole process.

    Args:
        enabled: New value for the module-level ``_auth_enabled`` flag that
            ``verify_auth`` consults on each request.
    """
    global _auth_enabled
    _auth_enabled = enabled
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
async def verify_auth(x_api_key: Optional[str] = Header(None)):
    """FastAPI dependency that enforces the API key when auth is switched on.

    Args:
        x_api_key: Value of the request's API-key header, or None when the
            header is absent.

    Returns:
        True when the request may proceed.

    Raises:
        HTTPException: 401 when authentication is enabled and the supplied
            key is rejected by ``verify_api_key``.
    """
    # Authentication is opt-in: with the flag off every request is accepted
    # and the key is never inspected.
    if not _auth_enabled:
        return True
    if verify_api_key(x_api_key):
        return True
    raise HTTPException(
        status_code=401,
        detail="Invalid or missing API key"
    )
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Open the shared database for the lifetime of the application.

    On startup the configured database path is wrapped in an
    ``AsyncDatabase`` and published through the module-level ``db``;
    ``init_db`` is only awaited when the database file already exists.
    On shutdown the handle is closed and ``db`` is reset to None.

    Args:
        app: The FastAPI application being started (unused).
    """
    global db
    db_path = Path(load_config().database.path)
    db = AsyncDatabase(db_path)
    # NOTE(review): when the file is missing, `db` is still a truthy handle
    # that was never initialised, so the handlers' `if not db` guards do not
    # fire. Confirm whether AsyncDatabase connects lazily in that case.
    if db_path.exists():
        await db.init_db()
    try:
        yield
    finally:
        # Runs even when shutdown is triggered by an exception/cancellation
        # thrown in at the `yield`, so the connection is never leaked.
        # Unpublish the handle first so nothing uses it after it is closed.
        handle, db = db, None
        if handle:
            await handle.close()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ASGI application object. `lifespan` opens the shared database on startup
# and closes it on shutdown.
# NOTE(review): `version` is hard-coded to "0.1.0" while this package is
# released as 0.1.2 — confirm whether the API version should track it.
app = FastAPI(
    title="GitHub AI Scraper API",
    description="REST API for accessing scraped AI repositories",
    version="0.1.0",
    lifespan=lifespan,
)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class RepositoryResponse(BaseModel):
    """Repository payload returned by the repository and search endpoints.

    Handlers fill every field by copying the same-named attribute of a
    stored repository object.
    """
    id: int
    name: str
    full_name: str
    # Nullable: handlers pass the stored value through unchanged.
    description: Optional[str]
    stars: int
    # Nullable: handlers pass the stored value through unchanged.
    language: Optional[str]
    topics: list[str]
    url: str
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class StatsResponse(BaseModel):
    """Statistics payload returned by ``GET /api/stats``.

    Built by unpacking the mapping returned by ``db.get_stats()``, so that
    mapping's keys must match these field names.
    """
    repository_count: int
    snapshot_count: int
    total_stars: int
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@app.get("/api/repos", response_model=list[RepositoryResponse], dependencies=[Depends(verify_auth)])
async def list_repositories(
    limit: int = Query(default=20, ge=1, le=100),
    sort: str = Query(default="stars", pattern="^(stars|updated|relevance)$"),
    language: Optional[str] = None,
    min_stars: Optional[int] = None,
):
    """Return stored repositories, optionally narrowed by language and stars.

    Args:
        limit: Maximum number of rows requested from the database (1-100).
        sort: Sort key forwarded to the database: stars, updated or relevance.
        language: When given, keep only repositories whose language matches
            case-insensitively.
        min_stars: When given and non-zero, keep only repositories with at
            least this many stars.

    Returns:
        One ``RepositoryResponse`` per repository that passes the filters.

    Raises:
        HTTPException: 503 when the database handle is not available.
    """
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    candidates = await db.get_all_repositories(limit=limit, sort_by=sort)

    # NOTE(review): both filters run here, after the database has already
    # applied `limit`, so a filtered response may hold fewer than `limit`
    # rows. Pushing the filters into the query would need DB-layer support.
    wanted_language = language.lower() if language else None

    payload = []
    for repo in candidates:
        if wanted_language is not None:
            if not (repo.language and repo.language.lower() == wanted_language):
                continue
        if min_stars and not repo.stars >= min_stars:
            continue
        payload.append(
            RepositoryResponse(
                id=repo.id,
                name=repo.name,
                full_name=repo.full_name,
                description=repo.description,
                stars=repo.stars,
                language=repo.language,
                topics=repo.topics,
                url=repo.url,
            )
        )
    return payload
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
@app.get("/api/repos/{repo_id}", response_model=RepositoryResponse, dependencies=[Depends(verify_auth)])
async def get_repository(repo_id: int):
    """Return the single repository stored under ``repo_id``.

    Args:
        repo_id: Identifier taken from the URL path.

    Returns:
        The matching repository as a ``RepositoryResponse``.

    Raises:
        HTTPException: 503 when the database handle is not available,
            404 when the lookup returns nothing.
    """
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    # Single-row lookup by id rather than scanning the whole table.
    record = await db.get_repository_by_id(repo_id)
    if record:
        return RepositoryResponse(
            id=record.id,
            name=record.name,
            full_name=record.full_name,
            description=record.description,
            stars=record.stars,
            language=record.language,
            topics=record.topics,
            url=record.url,
        )

    raise HTTPException(status_code=404, detail="Repository not found")
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@app.get("/api/stats", response_model=StatsResponse, dependencies=[Depends(verify_auth)])
async def get_stats():
    """Return aggregate counters for the stored data.

    Returns:
        A ``StatsResponse`` built from the mapping ``db.get_stats()`` returns.

    Raises:
        HTTPException: 503 when the database handle is not available.
    """
    if db:
        counters = await db.get_stats()
        return StatsResponse(**counters)

    raise HTTPException(status_code=503, detail="Database not available")
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@app.get("/api/trending", dependencies=[Depends(verify_auth)])
async def get_trending(
    days: int = Query(default=7, ge=1, le=30),
    limit: int = Query(default=10, ge=1, le=50),
):
    """Return trend entries reported by the database.

    Args:
        days: Look-back window forwarded to the database (1-30).
        limit: Maximum number of entries requested (1-50).

    Returns:
        A list of dicts with ``repo_id``, ``repo_name``, ``initial_stars``,
        ``current_stars`` and ``growth_rate``. The growth rate is the stored
        value multiplied by 100 and rounded to two decimals.

    Raises:
        HTTPException: 503 when the database handle is not available.
    """
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    rows = []
    for trend in await db.get_trending(days=days, limit=limit):
        rows.append(
            {
                "repo_id": trend.repo_id,
                "repo_name": trend.repo_name,
                "initial_stars": trend.initial_stars,
                "current_stars": trend.current_stars,
                "growth_rate": round(trend.growth_rate * 100, 2),
            }
        )
    return rows
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@app.get("/api/search", response_model=list[RepositoryResponse], dependencies=[Depends(verify_auth)])
async def search_repositories(
    q: str = Query(..., min_length=2),
    limit: int = Query(default=20, ge=1, le=100),
):
    """Search the locally stored repositories.

    The response model is declared on the route, as ``/api/repos`` does, so
    both list endpoints publish the same schema.

    Args:
        q: Search text, at least two characters; forwarded to
            ``db.search_local``.
        limit: Maximum number of results requested (1-100).

    Returns:
        One ``RepositoryResponse`` per repository the database returns.

    Raises:
        HTTPException: 503 when the database handle is not available.
    """
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    repos = await db.search_local(query=q, limit=limit)
    return [
        RepositoryResponse(
            id=r.id,
            name=r.name,
            full_name=r.full_name,
            description=r.description,
            stars=r.stars,
            language=r.language,
            topics=r.topics,
            url=r.url,
        )
        for r in repos
    ]
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def run_server(host: str = "0.0.0.0", port: int = 8080):
    """Serve ``app`` with uvicorn until the server stops.

    Args:
        host: Interface to bind. The default ``0.0.0.0`` listens on every
            interface.
        port: TCP port to listen on.
    """
    # Imported lazily so uvicorn is only required when the server is started.
    from uvicorn import run as serve

    serve(app, host=host, port=port)
|
ai_scraper/auth.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""API authentication module."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import os
|
|
5
|
+
import secrets
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Process-local registry of accepted API keys. Keys are held as plain strings
# in memory only (in production, use secure storage).
_api_keys: set[str] = set()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def create_api_key() -> str:
    """Mint a random API key and register it as valid.

    Returns:
        The new key: the prefix ``as_`` followed by 32 hexadecimal
        characters (16 random bytes).
    """
    api_key = "as_" + secrets.token_hex(16)
    _api_keys.add(api_key)
    return api_key
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def verify_api_key(api_key: Optional[str]) -> bool:
    """Report whether ``api_key`` is one of the registered keys.

    Args:
        api_key: Key presented by the caller; None or an empty string is
            always rejected.

    Returns:
        True when the key is registered, False otherwise.
    """
    # Short-circuit keeps missing/empty keys away from the registry lookup.
    return bool(api_key) and api_key in _api_keys
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def hash_token(token: str) -> str:
    """Return the SHA-256 digest of ``token`` for storage.

    Args:
        token: Text to hash; it is encoded with the default UTF-8 codec.

    Returns:
        The digest as 64 lowercase hexadecimal characters.
    """
    digest = hashlib.sha256()
    digest.update(token.encode())
    return digest.hexdigest()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def load_api_keys_from_env() -> None:
    """Register every key listed in ``AI_SCRAPER_API_KEYS``.

    The variable holds a comma-separated list. Surrounding whitespace is
    stripped from each entry and empty entries are skipped. Keys that are
    already registered are kept.
    """
    raw = os.environ.get("AI_SCRAPER_API_KEYS", "")
    stripped = (piece.strip() for piece in raw.split(","))
    _api_keys.update(key for key in stripped if key)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def get_valid_api_keys() -> set[str]:
    """Return a snapshot of the registered API keys (for testing).

    Returns:
        A new set; mutating it does not affect the registry.
    """
    return set(_api_keys)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def clear_api_keys() -> None:
    """Remove every registered API key (for testing).

    The registry set is emptied in place.
    """
    _api_keys.clear()
|
ai_scraper/backup.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Database backup management."""
|
|
2
|
+
|
|
3
|
+
import gzip
|
|
4
|
+
import logging
|
|
5
|
+
import shutil
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)


class BackupManager:
    """Create, restore, list and prune file-level copies of the database.

    Backups live in a single directory and are optionally gzip-compressed.
    Only files matching ``backup_*.db*`` are listed and pruned, so a backup
    created with a custom ``name`` that lacks the ``backup_`` prefix is
    never removed automatically.
    """

    def __init__(
        self,
        backup_dir: Path,
        max_backups: int = 10,
        compress: bool = True,
    ):
        """Initialize backup manager.

        Args:
            backup_dir: Directory for storing backups; created if missing.
            max_backups: Maximum number of listed backups to keep.
            compress: Whether to gzip-compress new backups.
        """
        self.backup_dir = Path(backup_dir)
        self.backup_dir.mkdir(parents=True, exist_ok=True)
        self.max_backups = max_backups
        self.compress = compress

    def create_backup(self, db_path: Path, name: Optional[str] = None) -> Path:
        """Create a backup of the database.

        Args:
            db_path: Path to database file (``str`` or ``Path``).
            name: Optional backup name without extension. Defaults to
                ``backup_<YYYYmmdd_HHMMSS>``.

        Returns:
            Path to created backup file.

        Raises:
            FileNotFoundError: If ``db_path`` does not exist.
        """
        # Accept plain strings as well as Path objects.
        db_path = Path(db_path)
        if not db_path.exists():
            raise FileNotFoundError(f"Database not found: {db_path}")

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_name = name or f"backup_{timestamp}"
        backup_name += ".db.gz" if self.compress else ".db"
        backup_path = self.backup_dir / backup_name

        if self.compress:
            with open(db_path, "rb") as f_in, gzip.open(backup_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        else:
            shutil.copy2(db_path, backup_path)

        logger.info("Created backup: %s", backup_path)

        # Enforce the retention limit after every successful backup.
        self._cleanup_old_backups()

        return backup_path

    def restore_backup(self, backup_path: Path, target_path: Path) -> None:
        """Restore database from backup.

        The backup is first materialised next to the target and then moved
        into place, so a failed or corrupt restore leaves an existing target
        file untouched.

        Args:
            backup_path: Path to backup file; a ``.gz`` suffix selects
                decompression.
            target_path: Path to restore database to. Missing parent
                directories are created.

        Raises:
            FileNotFoundError: If ``backup_path`` does not exist.
        """
        backup_path = Path(backup_path)
        target_path = Path(target_path)
        if not backup_path.exists():
            raise FileNotFoundError(f"Backup not found: {backup_path}")

        target_path.parent.mkdir(parents=True, exist_ok=True)
        # Stage in the target's own directory so the final rename never
        # crosses a filesystem boundary.
        staging_path = target_path.with_name(target_path.name + ".restore-tmp")
        try:
            if backup_path.suffix == ".gz":
                with gzip.open(backup_path, "rb") as f_in, open(staging_path, "wb") as f_out:
                    shutil.copyfileobj(f_in, f_out)
            else:
                shutil.copy2(backup_path, staging_path)
            staging_path.replace(target_path)
        finally:
            # No-op after a successful rename; removes the partial file
            # when anything above failed.
            staging_path.unlink(missing_ok=True)

        logger.info("Restored backup to: %s", target_path)

    def list_backups(self) -> list[Path]:
        """List all available backups.

        Returns:
            Paths matching ``backup_*.db*`` in the backup directory, sorted
            by modification time (newest first).
        """
        return sorted(
            self.backup_dir.glob("backup_*.db*"),
            key=lambda p: p.stat().st_mtime,
            reverse=True,
        )

    def delete_backup(self, backup_path: Path) -> None:
        """Delete a backup file.

        Args:
            backup_path: Path to backup file; a missing file is ignored.
        """
        backup_path = Path(backup_path)
        if backup_path.exists():
            backup_path.unlink()
            logger.info("Deleted backup: %s", backup_path)

    def _cleanup_old_backups(self) -> None:
        """Remove old backups exceeding max_backups limit."""
        # Newest first, so everything past the first `keep` entries is stale.
        # Clamp so a negative limit deletes everything instead of crashing.
        keep = max(self.max_backups, 0)
        for stale in self.list_backups()[keep:]:
            self.delete_backup(stale)
|
ai_scraper/cache.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Request caching for GitHub API."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RequestCache:
    """File-based cache for API responses.

    Each entry is one JSON file named after the MD5 of the URL plus its
    sorted parameters, holding ``{"timestamp": <epoch seconds>, "data": ...}``.
    """

    def __init__(self, cache_dir: Path, ttl: int = 3600):
        """Initialize cache.

        Args:
            cache_dir: Directory for cache files; created if missing.
            ttl: Time-to-live in seconds.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.ttl = ttl

    def _get_cache_key(self, url: str, params: Optional[dict] = None) -> str:
        """Generate cache key from URL and params."""
        # MD5 is only used to derive a file name, not for security.
        key_data = url + json.dumps(params or {}, sort_keys=True)
        return hashlib.md5(key_data.encode()).hexdigest()

    def get(self, url: str, params: Optional[dict] = None) -> Optional[dict]:
        """Get cached response.

        Args:
            url: Request URL.
            params: Request parameters.

        Returns:
            Cached data, or None when there is no entry, the entry has
            expired (the file is then removed), or the file cannot be
            interpreted as a cache entry.
        """
        key = self._get_cache_key(url, params)
        cache_file = self.cache_dir / f"{key}.json"

        # Open directly instead of checking existence first, so a file that
        # disappears in between is simply a miss.
        try:
            with open(cache_file, "r", encoding="utf-8") as f:
                cached = json.load(f)
        except FileNotFoundError:
            return None
        except (json.JSONDecodeError, UnicodeDecodeError):
            return None

        # Valid JSON that is not an entry object (e.g. a list), or whose
        # timestamp is not numeric, is treated as a miss rather than raising.
        if not isinstance(cached, dict):
            return None
        stamp = cached.get("timestamp", 0)
        if not isinstance(stamp, (int, float)):
            return None

        # Check TTL
        if time.time() - stamp > self.ttl:
            cache_file.unlink(missing_ok=True)
            return None

        return cached.get("data")

    def set(self, url: str, params: Optional[dict], data: dict) -> None:
        """Cache response.

        Args:
            url: Request URL.
            params: Request parameters.
            data: Response data to cache; must be JSON-serializable.
        """
        key = self._get_cache_key(url, params)
        cache_file = self.cache_dir / f"{key}.json"

        with open(cache_file, "w", encoding="utf-8") as f:
            json.dump({
                "timestamp": time.time(),
                "data": data,
            }, f)

    def clear(self) -> int:
        """Clear all cached data.

        Returns:
            Number of files deleted.
        """
        count = 0
        for cache_file in self.cache_dir.glob("*.json"):
            cache_file.unlink()
            count += 1
        return count

    def get_stats(self) -> dict:
        """Get cache statistics.

        Returns:
            Dict with ``file_count``, ``total_size_bytes`` and
            ``total_size_mb`` (rounded to two decimals).
        """
        files = list(self.cache_dir.glob("*.json"))
        total_size = sum(f.stat().st_size for f in files)
        return {
            "file_count": len(files),
            "total_size_bytes": total_size,
            "total_size_mb": round(total_size / 1024 / 1024, 2),
        }
|
ai_scraper/classifier.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Repository classification system."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from ai_scraper.models.repository import Repository
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class Classification:
    """Repository classification result."""
    # Best-scoring CATEGORIES key, or "other" when the top score is not
    # above 0.3.
    primary_category: str
    # Up to three runner-up categories whose score is above 0.2.
    secondary_categories: list[str]
    # Score of the top-ranked category, capped at 1.0. It is reported even
    # when primary_category falls back to "other".
    confidence: float
    # TECH_STACK keys whose keywords were found in the name/description.
    tech_stack: list[str]
    maturity: str  # experimental, production, enterprise
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Category definitions with keywords and topics.
# Each category name maps to:
#   "keywords": lower-case phrases looked for in the repository's
#               name + description (0.3 added to the score per hit)
#   "topics":   repository topic tags compared case-insensitively
#               (0.5 added to the score per hit)
CATEGORIES = {
    "llm": {
        "keywords": ["llm", "large language model", "gpt", "claude", "llama", "mistral", "transformer"],
        "topics": ["llm", "gpt", "language-model", "transformers"],
    },
    "computer-vision": {
        "keywords": ["computer vision", "image recognition", "object detection", "segmentation", "yolo", "opencv"],
        "topics": ["computer-vision", "object-detection", "image-segmentation", "opencv"],
    },
    "nlp": {
        "keywords": ["nlp", "natural language", "text processing", "sentiment", "ner", "spacy", "nltk"],
        "topics": ["nlp", "natural-language-processing", "text-analysis", "spacy"],
    },
    "ml-framework": {
        "keywords": ["pytorch", "tensorflow", "jax", "keras", "machine learning framework"],
        "topics": ["pytorch", "tensorflow", "jax", "keras", "deep-learning"],
    },
    "reinforcement-learning": {
        "keywords": ["reinforcement learning", "rl", "q-learning", "policy gradient", "dqn", "ppo"],
        "topics": ["reinforcement-learning", "deep-reinforcement-learning", "rl"],
    },
    "generative-ai": {
        "keywords": ["generative", "diffusion", "gan", "stable diffusion", "midjourney", "image generation"],
        "topics": ["generative-ai", "diffusion-model", "gan", "stable-diffusion"],
    },
    "ai-tools": {
        "keywords": ["langchain", "llamaindex", "autogpt", "agent", "ai tool", "ai framework"],
        "topics": ["langchain", "llamaindex", "autogpt", "ai-agent"],
    },
    "mlops": {
        "keywords": ["mlops", "ml pipeline", "model deployment", "model serving", "mlflow", "kubeflow", "model registry"],
        "topics": ["mlops", "ml-pipeline", "model-deployment", "mlflow", "kubeflow"],
    },
    "ai-infrastructure": {
        "keywords": ["gpu", "cuda", "inference", "optimization", "quantization", "tensorrt", "onnx runtime"],
        "topics": ["gpu-computing", "inference", "model-optimization", "quantization"],
    },
    "ai-ethics": {
        "keywords": ["ai ethics", "bias detection", "fairness", "explainability", "interpretability", "responsible ai"],
        "topics": ["ai-ethics", "fairness", "explainability", "responsible-ai"],
    },
}
|
|
62
|
+
|
|
63
|
+
# Technology name -> lower-case keywords that signal its use. A technology is
# reported when any of its keywords is found in the repository's
# name + description text.
TECH_STACK = {
    "pytorch": ["pytorch", "torch"],
    "tensorflow": ["tensorflow", "tf"],
    "jax": ["jax", "flax"],
    "huggingface": ["huggingface", "transformers", "hugging face"],
    "onnx": ["onnx"],
    "openai": ["openai", "gpt-4", "gpt-3.5"],
    "anthropic": ["anthropic", "claude"],
}
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class RepositoryClassifier:
    """Classify repositories into AI categories.

    Keywords are matched at word starts in a normalized form of the
    repository name and description, not as raw substrings. This keeps
    short keywords from firing inside unrelated words ("ppo" in "support",
    "rl" in "url", "tf" in "platform"), while plurals and version suffixes
    ("agents", "yolov8") and hyphenated spellings ("stable-diffusion")
    still match.

    NOTE: a keyword embedded at the end of a longer word no longer counts,
    e.g. "chatgpt" is not a mention of "gpt"; add such spellings to the
    keyword tables if they should match.
    """

    @staticmethod
    def _normalize(value: str) -> str:
        """Lower-case ``value``, turn non-alphanumerics into single spaces and pad with spaces."""
        cleaned = "".join(ch if ch.isalnum() else " " for ch in value.lower())
        return " " + " ".join(cleaned.split()) + " "

    @classmethod
    def _mentions(cls, haystack: str, phrase: str) -> bool:
        """Return True when ``phrase`` starts at a word boundary of the normalized ``haystack``."""
        # Keep the leading space (word-start anchor) and drop the trailing
        # one so suffixes such as a plural "s" or a version number are allowed.
        needle = cls._normalize(phrase).rstrip()
        return bool(needle.strip()) and needle in haystack

    def classify(self, repo: Repository) -> Classification:
        """Classify a repository.

        Each category scores 0.3 per keyword mentioned in the name or
        description and 0.5 per matching topic, capped at 1.0.

        Args:
            repo: Repository to classify. Reads ``name``, ``description``,
                ``topics``, ``stars`` and ``forks``.

        Returns:
            Classification result. ``primary_category`` is "other" when the
            best score is not above 0.3; ``confidence`` is always the best
            score.
        """
        text = self._normalize(f"{repo.name} {repo.description or ''}")
        # Tolerate a missing topic list.
        topics_lower = {t.lower() for t in (repo.topics or [])}

        # Score each category
        scores = {}
        for category, rules in CATEGORIES.items():
            score = 0.0

            # Keyword matches
            for kw in rules["keywords"]:
                if self._mentions(text, kw):
                    score += 0.3

            # Topic matches
            for topic in rules["topics"]:
                if topic in topics_lower:
                    score += 0.5

            scores[category] = min(score, 1.0)

        # Get primary and secondary categories. The sort is stable, so ties
        # keep CATEGORIES' declaration order.
        sorted_cats = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        primary = sorted_cats[0][0] if sorted_cats[0][1] > 0.3 else "other"
        secondary = [cat for cat, score in sorted_cats[1:4] if score > 0.2]

        # Detect tech stack
        tech_stack = [
            tech
            for tech, keywords in TECH_STACK.items()
            if any(self._mentions(text, kw) for kw in keywords)
        ]

        # Determine maturity
        maturity = self._assess_maturity(repo)

        return Classification(
            primary_category=primary,
            secondary_categories=secondary,
            confidence=sorted_cats[0][1],
            tech_stack=tech_stack,
            maturity=maturity,
        )

    def _assess_maturity(self, repo: Repository) -> str:
        """Assess repository maturity.

        Returns:
            "enterprise" for at least 10000 stars and 500 forks,
            "production" for at least 1000 stars, otherwise "experimental".
        """
        if repo.stars >= 10000 and repo.forks and repo.forks >= 500:
            return "enterprise"
        if repo.stars >= 1000:
            return "production"
        return "experimental"
|