github-ai-scraper 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. ai_scraper/__init__.py +3 -0
  2. ai_scraper/api/__init__.py +6 -0
  3. ai_scraper/api/github.py +340 -0
  4. ai_scraper/api/gitlab.py +418 -0
  5. ai_scraper/api/rate_limiter.py +120 -0
  6. ai_scraper/api_server.py +196 -0
  7. ai_scraper/auth.py +68 -0
  8. ai_scraper/backup.py +112 -0
  9. ai_scraper/cache.py +95 -0
  10. ai_scraper/classifier.py +135 -0
  11. ai_scraper/cli.py +747 -0
  12. ai_scraper/config.py +237 -0
  13. ai_scraper/config_watcher.py +82 -0
  14. ai_scraper/dedup.py +148 -0
  15. ai_scraper/filters/__init__.py +5 -0
  16. ai_scraper/filters/ai_filter.py +93 -0
  17. ai_scraper/health.py +155 -0
  18. ai_scraper/i18n.py +141 -0
  19. ai_scraper/interactive.py +96 -0
  20. ai_scraper/keywords/__init__.py +5 -0
  21. ai_scraper/keywords/extractor.py +274 -0
  22. ai_scraper/logging_config.py +74 -0
  23. ai_scraper/models/__init__.py +5 -0
  24. ai_scraper/models/repository.py +72 -0
  25. ai_scraper/output/__init__.py +6 -0
  26. ai_scraper/output/excel.py +79 -0
  27. ai_scraper/output/html.py +152 -0
  28. ai_scraper/output/markdown.py +338 -0
  29. ai_scraper/output/rss.py +82 -0
  30. ai_scraper/output/translator.py +303 -0
  31. ai_scraper/plugin_system.py +146 -0
  32. ai_scraper/plugins/__init__.py +5 -0
  33. ai_scraper/retry.py +134 -0
  34. ai_scraper/scheduler.py +84 -0
  35. ai_scraper/scrape_progress.py +99 -0
  36. ai_scraper/secure_storage.py +127 -0
  37. ai_scraper/storage/__init__.py +5 -0
  38. ai_scraper/storage/async_database.py +237 -0
  39. ai_scraper/storage/database.py +456 -0
  40. ai_scraper/webhooks.py +95 -0
  41. github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
  42. github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
  43. github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
  44. github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,99 @@
1
+ """Scrape progress tracking for resume support."""
2
+
3
+ import json
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+ from typing import Optional
7
+ import hashlib
8
+
9
+
10
class ScrapeProgress:
    """Track and persist scrape progress for resume support.

    Progress for each query is stored as a small JSON file inside
    ``storage_dir``. The file name is derived from a hash of the query text,
    so arbitrary query strings never end up in a path.
    """

    def __init__(self, storage_dir: Path):
        """Initialize progress tracker.

        Args:
            storage_dir: Directory for storing progress files. It is created
                (including parents) if it does not exist yet.
        """
        self.storage_dir = Path(storage_dir)
        self.storage_dir.mkdir(parents=True, exist_ok=True)

    def _query_to_filename(self, query: str) -> str:
        """Convert query to a safe filename."""
        # The digest is only a file-name key, not a security measure. The
        # 8-character truncation is kept so that files written by earlier
        # versions are still found.
        query_hash = hashlib.md5(query.encode()).hexdigest()[:8]
        return f"progress_{query_hash}.json"

    def _path_for(self, query: str) -> Path:
        """Return the progress file path that belongs to ``query``."""
        return self.storage_dir / self._query_to_filename(query)

    def save(
        self,
        query: str,
        last_page: int,
        total_found: int,
        timestamp: datetime,
    ) -> None:
        """Save scrape progress.

        Args:
            query: Search query.
            last_page: Last successfully fetched page.
            total_found: Total repositories found so far.
            timestamp: Timestamp of the progress.
        """
        filepath = self._path_for(query)

        data = {
            "query": query,
            "last_page": last_page,
            "total_found": total_found,
            "timestamp": timestamp.isoformat(),
        }

        # Write to a sibling temp file and rename it over the target so an
        # interrupted save never leaves a truncated progress file behind.
        tmp_path = filepath.with_name(filepath.name + ".tmp")
        tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
        tmp_path.replace(filepath)

    def load(self, query: str) -> Optional[dict]:
        """Load scrape progress.

        Args:
            query: Search query.

        Returns:
            Progress data (with ``timestamp`` parsed back into a
            ``datetime``) or None if no usable progress file exists.
        """
        filepath = self._path_for(query)

        try:
            data = json.loads(filepath.read_text(encoding="utf-8"))
            data["timestamp"] = datetime.fromisoformat(data["timestamp"])
        except FileNotFoundError:
            return None
        except (json.JSONDecodeError, KeyError, TypeError, ValueError):
            # Corrupt or unexpected content (including valid JSON that is
            # not an object) is treated the same as "no progress".
            return None
        return data

    def clear(self, query: str) -> None:
        """Clear progress for a query.

        Args:
            query: Search query.
        """
        try:
            self._path_for(query).unlink()
        except FileNotFoundError:
            # Nothing to clear; also covers a concurrent removal.
            pass

    def has_progress(self, query: str) -> bool:
        """Check if progress exists for a query.

        Args:
            query: Search query.

        Returns:
            True if a progress file exists for the query.
        """
        return self._path_for(query).exists()
@@ -0,0 +1,127 @@
1
+ """Secure token storage using encryption."""
2
+
3
+ import base64
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+
10
class SecureStorage:
    """Secure storage for sensitive tokens.

    All tokens are kept as one JSON object in ``tokens.enc`` inside the
    storage directory. When the optional ``cryptography`` package can be
    imported the object is encrypted with Fernet; otherwise it is only
    base64-encoded, which is obfuscation rather than encryption.
    """

    def __init__(self, storage_dir: Path):
        """Initialize secure storage.

        Args:
            storage_dir: Directory for storing encrypted tokens. It is
                created (including parents) if it does not exist yet.
        """
        self.storage_dir = Path(storage_dir)
        self.storage_dir.mkdir(parents=True, exist_ok=True)
        self.token_file = self.storage_dir / "tokens.enc"
        self._cipher = None

    def _get_cipher(self):
        """Get or create cipher for encryption.

        Returns:
            A Fernet cipher, or None when ``cryptography`` is not installed
            (callers then fall back to base64 encoding).
        """
        if self._cipher is None:
            try:
                from cryptography.fernet import Fernet
                from cryptography.hazmat.primitives import hashes
                from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
            except ImportError:
                # Fallback to base64 encoding if cryptography not available
                return None

            # Use a key derived from machine-specific info.
            # NOTE(review): USERNAME / COMPUTERNAME look like Windows
            # environment variables; wherever they are unset the literal
            # defaults are used and the derived key is the same everywhere.
            # Changing the derivation would make existing token files
            # unreadable, so it is deliberately left as is - confirm.
            machine_id = f"{os.environ.get('USERNAME', 'user')}{os.environ.get('COMPUTERNAME', 'host')}"

            # Use a fixed salt for simplicity
            salt = b'ai_scraper_salt_v1'

            kdf = PBKDF2HMAC(
                algorithm=hashes.SHA256(),
                length=32,
                salt=salt,
                iterations=100000,
            )

            key = base64.urlsafe_b64encode(kdf.derive(machine_id.encode()))
            self._cipher = Fernet(key)

        return self._cipher

    def _write_tokens(self, tokens: dict) -> None:
        """Serialize ``tokens`` and write them to the token file."""
        payload = json.dumps(tokens).encode()
        cipher = self._get_cipher()
        if cipher:
            blob = cipher.encrypt(payload)
        else:
            # Fallback: base64 encode
            blob = base64.b64encode(payload)
        self.token_file.write_bytes(blob)

        # Best effort: keep the secrets readable by the owner only. Some
        # platforms/filesystems do not support this, which must not make
        # storing a token fail.
        try:
            self.token_file.chmod(0o600)
        except OSError:
            pass

    def store_token(self, name: str, token: str) -> None:
        """Store a token securely.

        Args:
            name: Token name/identifier.
            token: Token value to store.
        """
        tokens = self._load_tokens()
        tokens[name] = token
        self._write_tokens(tokens)

    def get_token(self, name: str) -> Optional[str]:
        """Retrieve a stored token.

        Args:
            name: Token name/identifier.

        Returns:
            Token value or None if not found.
        """
        return self._load_tokens().get(name)

    def delete_token(self, name: str) -> None:
        """Delete a stored token.

        The token file is only rewritten when the token was actually
        present, so deleting an unknown name never touches existing data.

        Args:
            name: Token name/identifier.
        """
        tokens = self._load_tokens()
        if name not in tokens:
            return

        del tokens[name]
        self._write_tokens(tokens)

    def _load_tokens(self) -> dict:
        """Load tokens from encrypted storage.

        Returns:
            The stored name -> token mapping, or an empty dict when the file
            is missing or cannot be decoded.
        """
        try:
            raw = self.token_file.read_bytes()
        except OSError:
            return {}

        payload = None
        cipher = self._get_cipher()
        if cipher:
            try:
                payload = cipher.decrypt(raw)
            except Exception:
                # The concrete error type lives in the optional
                # ``cryptography`` package; any failure here means "not a
                # token this cipher produced" and falls through below.
                payload = None

        if payload is None:
            # Either no cipher is available, or the file was written in
            # base64 fallback mode before ``cryptography`` was installed.
            # Reading it here lets the next write upgrade it instead of
            # silently discarding the stored tokens.
            try:
                payload = base64.b64decode(raw)
            except ValueError:
                return {}

        try:
            tokens = json.loads(payload.decode())
        except ValueError:
            # Covers UnicodeDecodeError and json.JSONDecodeError.
            return {}

        return tokens if isinstance(tokens, dict) else {}
@@ -0,0 +1,5 @@
1
+ """Storage module for ai_scraper."""
2
+
3
+ from ai_scraper.storage.database import Database
4
+
5
+ __all__ = ["Database"]
@@ -0,0 +1,237 @@
1
+ """Async SQLite database storage."""
2
+
3
+ import json
4
+ import aiosqlite
5
+ from dataclasses import dataclass
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ from ai_scraper.models.repository import Repository
11
+
12
+
13
@dataclass
class TrendResult:
    """Star-growth figures for a single repository."""

    # Identifier of the repository the figures belong to.
    repo_id: int
    # Name of that repository.
    repo_name: str
    # Star count at the start of the compared period.
    initial_stars: int
    # Star count at the end of the compared period.
    current_stars: int
    # Growth between the two star counts, expressed as a float.
    growth_rate: float
22
+
23
+
24
class AsyncDatabase:
    """Async SQLite database for storing repository data.

    ``init_db`` must be awaited before any other method: it opens the
    connection and creates the schema. ``close`` releases the connection.
    """

    # Columns that callers may sort by in ``get_all_repositories``.
    _SORT_FIELDS = ("stars", "updated_at", "relevance_score")

    def __init__(self, db_path: Path):
        """Initialize database.

        Args:
            db_path: Path to SQLite database file. Its parent directory is
                created if it does not exist yet.
        """
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self.conn: Optional[aiosqlite.Connection] = None

    def _connection(self) -> aiosqlite.Connection:
        """Return the open connection.

        Raises:
            RuntimeError: If ``init_db`` has not been awaited yet (or the
                database was closed).
        """
        if self.conn is None:
            raise RuntimeError("Database is not initialized; call init_db() first")
        return self.conn

    @staticmethod
    def _iso(value: Optional[datetime]) -> Optional[str]:
        """Return ``value`` as an ISO 8601 string, or None for None."""
        return value.isoformat() if value is not None else None

    async def init_db(self) -> None:
        """Initialize database tables.

        Opens the connection on first use; calling it again reuses the
        existing connection instead of leaking it.
        """
        if self.conn is None:
            self.conn = await aiosqlite.connect(self.db_path)
            self.conn.row_factory = aiosqlite.Row

        await self.conn.executescript("""
            CREATE TABLE IF NOT EXISTS repositories (
                id INTEGER PRIMARY KEY,
                name TEXT UNIQUE NOT NULL,
                full_name TEXT,
                description TEXT,
                stars INTEGER,
                language TEXT,
                topics TEXT,
                created_at TIMESTAMP,
                updated_at TIMESTAMP,
                pushed_at TIMESTAMP,
                url TEXT,
                open_issues INTEGER,
                forks INTEGER,
                contributors INTEGER,
                relevance_score REAL,
                first_seen_at TIMESTAMP,
                last_updated_at TIMESTAMP
            );

            CREATE TABLE IF NOT EXISTS snapshots (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                repo_id INTEGER,
                stars INTEGER,
                snapshot_at TIMESTAMP,
                FOREIGN KEY (repo_id) REFERENCES repositories(id)
            );

            CREATE INDEX IF NOT EXISTS idx_stars ON repositories(stars DESC);
            CREATE INDEX IF NOT EXISTS idx_updated ON repositories(last_updated_at DESC);
            CREATE INDEX IF NOT EXISTS idx_repo_id ON snapshots(repo_id);
            CREATE INDEX IF NOT EXISTS idx_language ON repositories(language);
            CREATE INDEX IF NOT EXISTS idx_created_at ON repositories(created_at DESC);
            CREATE INDEX IF NOT EXISTS idx_relevance ON repositories(relevance_score DESC);
            CREATE INDEX IF NOT EXISTS idx_snapshot_at ON snapshots(snapshot_at DESC);
        """)
        await self.conn.commit()

    async def save_repository(self, repo: Repository, relevance_score: float = 0.0) -> None:
        """Save or update a repository.

        A new row is inserted, or the row with the same ``id`` is updated.
        ``first_seen_at`` is only written on insert.

        Args:
            repo: Repository to persist.
            relevance_score: Relevance score stored alongside the repository.
        """
        conn = self._connection()
        now = datetime.now().isoformat()

        # NOTE(review): the upsert only targets conflicts on ``id``. Because
        # ``name`` is declared UNIQUE, a repository with a new id but an
        # already stored name would still raise an integrity error -
        # confirm whether that can happen for the callers.
        await conn.execute("""
            INSERT INTO repositories (
                id, name, full_name, description, stars, language, topics,
                created_at, updated_at, pushed_at, url, open_issues, forks,
                contributors, relevance_score, first_seen_at, last_updated_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                full_name = excluded.full_name,
                description = excluded.description,
                stars = excluded.stars,
                language = excluded.language,
                topics = excluded.topics,
                updated_at = excluded.updated_at,
                pushed_at = excluded.pushed_at,
                open_issues = excluded.open_issues,
                forks = excluded.forks,
                contributors = excluded.contributors,
                relevance_score = excluded.relevance_score,
                last_updated_at = excluded.last_updated_at
        """, (
            repo.id, repo.name, repo.full_name, repo.description, repo.stars,
            repo.language, json.dumps(repo.topics),
            # The timestamp columns are nullable and ``_row_to_repo`` reads
            # NULL back as None, so None is accepted on the way in as well.
            self._iso(repo.created_at), self._iso(repo.updated_at),
            self._iso(repo.pushed_at), repo.url,
            repo.open_issues, repo.forks, repo.contributors, relevance_score,
            now, now,
        ))
        await conn.commit()

    async def get_repository_by_id(self, repo_id: int) -> Optional[Repository]:
        """Get a specific repository by ID.

        Returns:
            The repository, or None when no row has that id.
        """
        cursor = await self._connection().execute(
            "SELECT * FROM repositories WHERE id = ?", (repo_id,)
        )
        row = await cursor.fetchone()
        if row is None:
            return None
        return self._row_to_repo(row)

    async def get_all_repositories(self, limit: int = 100, sort_by: str = "stars") -> list[Repository]:
        """Get all repositories.

        Args:
            limit: Maximum number of repositories to return.
            sort_by: Column to sort by in descending order. Unknown values
                fall back to ``"stars"``.
        """
        # Only whitelisted column names are ever interpolated into the SQL.
        sort_field = sort_by if sort_by in self._SORT_FIELDS else "stars"

        cursor = await self._connection().execute(f"""
            SELECT * FROM repositories
            ORDER BY {sort_field} DESC
            LIMIT ?
        """, (limit,))

        rows = await cursor.fetchall()
        return [self._row_to_repo(row) for row in rows]

    async def get_stats(self) -> dict:
        """Get database statistics.

        Returns:
            Dict with ``repository_count``, ``snapshot_count`` and
            ``total_stars`` (0 when there are no repositories).
        """
        conn = self._connection()

        cursor = await conn.execute("SELECT COUNT(*) as count FROM repositories")
        row = await cursor.fetchone()
        repo_count = row["count"]

        cursor = await conn.execute("SELECT COUNT(*) as count FROM snapshots")
        row = await cursor.fetchone()
        snapshot_count = row["count"]

        cursor = await conn.execute("SELECT SUM(stars) as total FROM repositories")
        row = await cursor.fetchone()
        # SUM() over zero rows yields NULL.
        total_stars = row["total"] or 0

        return {
            "repository_count": repo_count,
            "snapshot_count": snapshot_count,
            "total_stars": total_stars,
        }

    async def get_last_scrape_time(self) -> Optional[datetime]:
        """Get the timestamp of the most recent repository update.

        Returns:
            The newest ``last_updated_at`` value, or None for an empty table.
        """
        cursor = await self._connection().execute(
            "SELECT MAX(last_updated_at) as max_time FROM repositories"
        )
        row = await cursor.fetchone()

        if row["max_time"] is None:
            return None

        return datetime.fromisoformat(row["max_time"])

    async def search_local(self, query: str, limit: int = 20) -> list[Repository]:
        """Search repositories locally.

        Matches ``query`` as a literal substring of the repository name or
        description and returns the hits ordered by stars, descending.

        Args:
            query: Text to look for.
            limit: Maximum number of repositories to return.
        """
        # Escape LIKE wildcards so that "%" and "_" typed by the user are
        # matched literally instead of acting as patterns.
        escaped = (
            query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        )
        pattern = f"%{escaped}%"

        cursor = await self._connection().execute("""
            SELECT * FROM repositories
            WHERE name LIKE ? ESCAPE '\\' OR description LIKE ? ESCAPE '\\'
            ORDER BY stars DESC
            LIMIT ?
        """, (pattern, pattern, limit))

        rows = await cursor.fetchall()
        return [self._row_to_repo(row) for row in rows]

    async def get_trending(self, days: int = 7, limit: int = 10) -> list[TrendResult]:
        """Get trending repositories by star growth.

        Compares each repository's current star count with the star count of
        its earliest snapshot inside the last ``days`` days.

        Args:
            days: Size of the look-back window in days.
            limit: Maximum number of results.

        Returns:
            Repositories whose stars grew, ordered by relative growth.
        """
        # MIN(s1.snapshot_at) is selected so that SQLite takes the bare
        # column ``s1.stars`` from the row holding the earliest snapshot of
        # each group; without it the snapshot row used is arbitrary.
        # NOTE(review): the window filter compares ``snapshot_at`` as text
        # with SQLite's datetime('now', ...) output. Snapshot writes are not
        # visible here - confirm the stored format and time zone match.
        cursor = await self._connection().execute("""
            SELECT
                r.id as repo_id,
                r.name as repo_name,
                s1.stars as initial_stars,
                r.stars as current_stars,
                MIN(s1.snapshot_at) as first_snapshot_at
            FROM repositories r
            JOIN snapshots s1 ON r.id = s1.repo_id
            WHERE s1.snapshot_at >= datetime('now', ?)
            GROUP BY r.id
            HAVING current_stars > initial_stars
            ORDER BY (CAST(current_stars AS FLOAT) / initial_stars - 1) DESC
            LIMIT ?
        """, (f'-{days} days', limit))

        results = []
        async for row in cursor:
            initial = row["initial_stars"]
            current = row["current_stars"]
            # Guard against division by zero for repositories that started
            # the window without any stars.
            growth = (current - initial) / initial if initial > 0 else 0.0

            results.append(TrendResult(
                repo_id=row["repo_id"],
                repo_name=row["repo_name"],
                initial_stars=initial,
                current_stars=current,
                growth_rate=growth,
            ))

        return results

    async def close(self) -> None:
        """Close database connection. Safe to call when already closed."""
        if self.conn:
            await self.conn.close()
            self.conn = None

    def _row_to_repo(self, row: aiosqlite.Row) -> Repository:
        """Convert database row to Repository object.

        NULL ``topics`` become an empty list and NULL timestamps become None.
        """
        return Repository(
            id=row["id"],
            name=row["name"],
            full_name=row["full_name"],
            description=row["description"],
            stars=row["stars"],
            language=row["language"],
            topics=json.loads(row["topics"]) if row["topics"] else [],
            created_at=datetime.fromisoformat(row["created_at"]) if row["created_at"] else None,
            updated_at=datetime.fromisoformat(row["updated_at"]) if row["updated_at"] else None,
            pushed_at=datetime.fromisoformat(row["pushed_at"]) if row["pushed_at"] else None,
            url=row["url"],
            open_issues=row["open_issues"],
            forks=row["forks"],
            contributors=row["contributors"],
        )