github-ai-scraper 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. ai_scraper/__init__.py +3 -0
  2. ai_scraper/api/__init__.py +6 -0
  3. ai_scraper/api/github.py +340 -0
  4. ai_scraper/api/gitlab.py +418 -0
  5. ai_scraper/api/rate_limiter.py +120 -0
  6. ai_scraper/api_server.py +196 -0
  7. ai_scraper/auth.py +68 -0
  8. ai_scraper/backup.py +112 -0
  9. ai_scraper/cache.py +95 -0
  10. ai_scraper/classifier.py +135 -0
  11. ai_scraper/cli.py +747 -0
  12. ai_scraper/config.py +237 -0
  13. ai_scraper/config_watcher.py +82 -0
  14. ai_scraper/dedup.py +148 -0
  15. ai_scraper/filters/__init__.py +5 -0
  16. ai_scraper/filters/ai_filter.py +93 -0
  17. ai_scraper/health.py +155 -0
  18. ai_scraper/i18n.py +141 -0
  19. ai_scraper/interactive.py +96 -0
  20. ai_scraper/keywords/__init__.py +5 -0
  21. ai_scraper/keywords/extractor.py +274 -0
  22. ai_scraper/logging_config.py +74 -0
  23. ai_scraper/models/__init__.py +5 -0
  24. ai_scraper/models/repository.py +72 -0
  25. ai_scraper/output/__init__.py +6 -0
  26. ai_scraper/output/excel.py +79 -0
  27. ai_scraper/output/html.py +152 -0
  28. ai_scraper/output/markdown.py +338 -0
  29. ai_scraper/output/rss.py +82 -0
  30. ai_scraper/output/translator.py +303 -0
  31. ai_scraper/plugin_system.py +146 -0
  32. ai_scraper/plugins/__init__.py +5 -0
  33. ai_scraper/retry.py +134 -0
  34. ai_scraper/scheduler.py +84 -0
  35. ai_scraper/scrape_progress.py +99 -0
  36. ai_scraper/secure_storage.py +127 -0
  37. ai_scraper/storage/__init__.py +5 -0
  38. ai_scraper/storage/async_database.py +237 -0
  39. ai_scraper/storage/database.py +456 -0
  40. ai_scraper/webhooks.py +95 -0
  41. github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
  42. github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
  43. github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
  44. github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,456 @@
1
+ """SQLite database storage."""
2
+
3
+ import json
4
+ import sqlite3
5
+ from dataclasses import dataclass
6
+ from datetime import datetime, timedelta
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ from ai_scraper.models.repository import Repository
11
+
12
+
13
@dataclass
class TrendResult:
    """Star-growth summary for a single repository.

    Instances are built by ``Database.get_trending`` from one aggregated
    query row per repository.
    """

    # Primary key of the repository row the trend refers to.
    repo_id: int
    # Value of the repository's ``name`` column.
    repo_name: str
    # Lowest star count among the snapshots inside the analysed window.
    initial_stars: int
    # Star count currently stored on the repository row.
    current_stars: int
    # (current_stars - initial_stars) / initial_stars, or 0.0 when
    # initial_stars is not positive.
    growth_rate: float
22
+
23
+
24
class Database:
    """SQLite-backed store for repositories and their star-count snapshots.

    ``init_db`` must be called before any other method: it opens the
    connection and creates the schema. All timestamps are persisted as
    ISO-8601 text produced by ``datetime.isoformat()``; range filters are
    therefore always compared against strings built the same way.
    """

    def __init__(self, db_path: Path):
        """Remember the database location and make sure its folder exists.

        Args:
            db_path: Path to SQLite database file.
        """
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self.conn: Optional[sqlite3.Connection] = None

    def _cursor(self) -> sqlite3.Cursor:
        """Return a cursor on the open connection.

        Raises:
            RuntimeError: If ``init_db`` has not been called (or the
                database was closed).
        """
        if self.conn is None:
            raise RuntimeError("Database is not initialized; call init_db() first")
        return self.conn.cursor()

    @staticmethod
    def _to_iso(value: Optional[datetime]) -> Optional[str]:
        """Serialize a datetime to ISO-8601 text, passing ``None`` through."""
        return value.isoformat() if value is not None else None

    @staticmethod
    def _escape_like(text: str) -> str:
        """Escape LIKE wildcards so *text* is matched literally.

        Intended for patterns used with ``ESCAPE '\\'``.
        """
        return text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    def init_db(self) -> None:
        """Open the connection and create tables and indexes if missing."""
        self.conn = sqlite3.connect(self.db_path)
        # Rows are accessed by column name throughout this class.
        self.conn.row_factory = sqlite3.Row

        cursor = self.conn.cursor()

        # NOTE(review): ``name`` is UNIQUE, so two repositories that share a
        # short name cannot both be stored. Changing that needs a schema
        # migration and is intentionally left untouched here.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS repositories (
                id INTEGER PRIMARY KEY,
                name TEXT UNIQUE NOT NULL,
                full_name TEXT,
                description TEXT,
                stars INTEGER,
                language TEXT,
                topics TEXT,
                created_at TIMESTAMP,
                updated_at TIMESTAMP,
                pushed_at TIMESTAMP,
                url TEXT,
                open_issues INTEGER,
                forks INTEGER,
                contributors INTEGER,
                relevance_score REAL,
                first_seen_at TIMESTAMP,
                last_updated_at TIMESTAMP
            )
        """)

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS snapshots (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                repo_id INTEGER,
                stars INTEGER,
                snapshot_at TIMESTAMP,
                FOREIGN KEY (repo_id) REFERENCES repositories(id)
            )
        """)

        index_statements = (
            "CREATE INDEX IF NOT EXISTS idx_stars ON repositories(stars DESC)",
            "CREATE INDEX IF NOT EXISTS idx_updated ON repositories(last_updated_at DESC)",
            "CREATE INDEX IF NOT EXISTS idx_repo_id ON snapshots(repo_id)",
            "CREATE INDEX IF NOT EXISTS idx_language ON repositories(language)",
            "CREATE INDEX IF NOT EXISTS idx_created_at ON repositories(created_at DESC)",
            "CREATE INDEX IF NOT EXISTS idx_relevance ON repositories(relevance_score DESC)",
            "CREATE INDEX IF NOT EXISTS idx_snapshot_at ON snapshots(snapshot_at DESC)",
        )
        for statement in index_statements:
            cursor.execute(statement)

        self.conn.commit()

    def save_repository(self, repo: Repository, relevance_score: float = 0.0) -> None:
        """Insert a repository, or refresh the stored row with the same id.

        On an id conflict every mutable column is overwritten while ``name``,
        ``created_at``, ``url`` and ``first_seen_at`` keep their stored
        values. Missing (``None``) timestamps are stored as NULL, which is
        what ``_row_to_repo`` expects when reading rows back.

        Args:
            repo: Repository to save.
            relevance_score: AI relevance score.
        """
        cursor = self._cursor()

        now = datetime.now().isoformat()

        cursor.execute("""
            INSERT INTO repositories (
                id, name, full_name, description, stars, language, topics,
                created_at, updated_at, pushed_at, url, open_issues, forks,
                contributors, relevance_score, first_seen_at, last_updated_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                full_name = excluded.full_name,
                description = excluded.description,
                stars = excluded.stars,
                language = excluded.language,
                topics = excluded.topics,
                updated_at = excluded.updated_at,
                pushed_at = excluded.pushed_at,
                open_issues = excluded.open_issues,
                forks = excluded.forks,
                contributors = excluded.contributors,
                relevance_score = excluded.relevance_score,
                last_updated_at = excluded.last_updated_at
        """, (
            repo.id, repo.name, repo.full_name, repo.description, repo.stars,
            repo.language, json.dumps(repo.topics), self._to_iso(repo.created_at),
            self._to_iso(repo.updated_at), self._to_iso(repo.pushed_at), repo.url,
            repo.open_issues, repo.forks, repo.contributors, relevance_score,
            now, now,
        ))

        self.conn.commit()

    def save_snapshot(self, repo_id: int, stars: int, snapshot_at: datetime) -> None:
        """Record the star count of a repository at a point in time.

        Args:
            repo_id: Repository ID.
            stars: Star count at snapshot time.
            snapshot_at: Snapshot timestamp.
        """
        cursor = self._cursor()

        cursor.execute("""
            INSERT INTO snapshots (repo_id, stars, snapshot_at)
            VALUES (?, ?, ?)
        """, (repo_id, stars, snapshot_at.isoformat()))

        self.conn.commit()

    def get_snapshots(self, repo_id: int) -> list[dict]:
        """Return the snapshots of one repository, newest first.

        Args:
            repo_id: Repository ID.

        Returns:
            List of dicts with the keys ``stars`` and ``snapshot_at``.
        """
        cursor = self._cursor()

        cursor.execute("""
            SELECT stars, snapshot_at FROM snapshots
            WHERE repo_id = ?
            ORDER BY snapshot_at DESC
        """, (repo_id,))

        return [dict(row) for row in cursor.fetchall()]

    def get_all_repositories(self, limit: int = 100, sort_by: str = "stars") -> list[Repository]:
        """Return stored repositories in descending order of a chosen column.

        Args:
            limit: Maximum number of repositories to return.
            sort_by: One of ``stars``, ``updated_at`` or ``relevance_score``;
                any other value falls back to ``stars``.

        Returns:
            List of repositories.
        """
        cursor = self._cursor()

        # The column name is interpolated into the SQL text, so it must come
        # from this fixed whitelist and never straight from the caller.
        valid_sort_fields = ("stars", "updated_at", "relevance_score")
        sort_field = sort_by if sort_by in valid_sort_fields else "stars"

        cursor.execute(f"""
            SELECT * FROM repositories
            ORDER BY {sort_field} DESC
            LIMIT ?
        """, (limit,))

        return [self._row_to_repo(row) for row in cursor.fetchall()]

    def get_trending(self, days: int = 7, limit: int = 10) -> list[TrendResult]:
        """Return the repositories whose stars grew most within a window.

        The baseline of a repository is the lowest star count among its
        snapshots taken in the last ``days`` days; only repositories whose
        stored star count exceeds that baseline are reported.

        Args:
            days: Number of days to analyze.
            limit: Maximum number of results.

        Returns:
            List of trending repositories, highest relative growth first.
        """
        cursor = self._cursor()

        cutoff = (datetime.now() - timedelta(days=days)).isoformat()

        cursor.execute("""
            SELECT
                r.id as repo_id,
                r.name as repo_name,
                MIN(s1.stars) as initial_stars,
                r.stars as current_stars
            FROM repositories r
            JOIN snapshots s1 ON r.id = s1.repo_id
            WHERE s1.snapshot_at >= ?
            GROUP BY r.id
            HAVING current_stars > initial_stars
            ORDER BY (CAST(current_stars AS FLOAT) / initial_stars - 1) DESC
            LIMIT ?
        """, (cutoff, limit))

        results = []
        for row in cursor.fetchall():
            initial = row["initial_stars"]
            current = row["current_stars"]
            # A zero baseline has no meaningful relative growth.
            growth = (current - initial) / initial if initial > 0 else 0.0

            results.append(TrendResult(
                repo_id=row["repo_id"],
                repo_name=row["repo_name"],
                initial_stars=initial,
                current_stars=current,
                growth_rate=growth,
            ))

        return results

    def search_local(self, query: str, limit: int = 20) -> list[Repository]:
        """Find repositories whose name or description contains ``query``.

        The query is matched as literal text: ``%`` and ``_`` typed by the
        user are escaped instead of acting as LIKE wildcards.

        Args:
            query: Search query.
            limit: Maximum number of results.

        Returns:
            List of matching repositories, most stars first.
        """
        cursor = self._cursor()

        pattern = f"%{self._escape_like(query)}%"

        cursor.execute("""
            SELECT * FROM repositories
            WHERE name LIKE ? ESCAPE '\\' OR description LIKE ? ESCAPE '\\'
            ORDER BY stars DESC
            LIMIT ?
        """, (pattern, pattern, limit))

        return [self._row_to_repo(row) for row in cursor.fetchall()]

    def get_stats(self) -> dict:
        """Return row counts and the total number of stars.

        Returns:
            Dictionary with the keys ``repository_count``,
            ``snapshot_count`` and ``total_stars``.
        """
        cursor = self._cursor()

        cursor.execute("SELECT COUNT(*) as count FROM repositories")
        repo_count = cursor.fetchone()["count"]

        cursor.execute("SELECT COUNT(*) as count FROM snapshots")
        snapshot_count = cursor.fetchone()["count"]

        cursor.execute("SELECT SUM(stars) as total FROM repositories")
        # SUM() over an empty table yields NULL.
        total_stars = cursor.fetchone()["total"] or 0

        return {
            "repository_count": repo_count,
            "snapshot_count": snapshot_count,
            "total_stars": total_stars,
        }

    def clean_old_snapshots(self, days: int = 30) -> int:
        """Delete snapshots older than the given number of days.

        Args:
            days: Number of days to keep.

        Returns:
            Number of deleted snapshots.
        """
        cursor = self._cursor()

        # Build the cutoff exactly like the stored values (local time,
        # isoformat). SQLite's datetime('now', ...) yields UTC text with a
        # space separator, which does not order correctly against them.
        cutoff = (datetime.now() - timedelta(days=days)).isoformat()

        cursor.execute("""
            DELETE FROM snapshots
            WHERE snapshot_at < ?
        """, (cutoff,))

        deleted = cursor.rowcount
        self.conn.commit()

        return deleted

    def _row_to_repo(self, row: sqlite3.Row) -> Repository:
        """Build a Repository from a ``repositories`` row.

        NULL timestamps become ``None`` and a NULL/empty ``topics`` column
        becomes an empty list.

        Args:
            row: Database row.

        Returns:
            Repository object.
        """
        return Repository(
            id=row["id"],
            name=row["name"],
            full_name=row["full_name"],
            description=row["description"],
            stars=row["stars"],
            language=row["language"],
            topics=json.loads(row["topics"]) if row["topics"] else [],
            created_at=datetime.fromisoformat(row["created_at"]) if row["created_at"] else None,
            updated_at=datetime.fromisoformat(row["updated_at"]) if row["updated_at"] else None,
            pushed_at=datetime.fromisoformat(row["pushed_at"]) if row["pushed_at"] else None,
            url=row["url"],
            open_issues=row["open_issues"],
            forks=row["forks"],
            contributors=row["contributors"],
        )

    def get_last_scrape_time(self) -> Optional[datetime]:
        """Return when a repository row was last written.

        Returns:
            MAX(last_updated_at) from repositories, or None if empty.
        """
        cursor = self._cursor()

        cursor.execute("SELECT MAX(last_updated_at) as max_time FROM repositories")
        row = cursor.fetchone()

        if row["max_time"] is None:
            return None

        return datetime.fromisoformat(row["max_time"])

    def get_repos_updated_since(self, since: datetime) -> list[Repository]:
        """Return repositories written on or after the given time.

        Args:
            since: Cutoff datetime.

        Returns:
            List of repositories where last_updated_at >= since, most
            recently written first.
        """
        cursor = self._cursor()

        cursor.execute("""
            SELECT * FROM repositories
            WHERE last_updated_at >= ?
            ORDER BY last_updated_at DESC
        """, (since.isoformat(),))

        return [self._row_to_repo(row) for row in cursor.fetchall()]

    def needs_update(self, repo_id: int, max_age_days: int = 7) -> bool:
        """Tell whether a repository is missing or stale.

        Args:
            repo_id: Repository ID to check.
            max_age_days: Maximum age in days before needing update.

        Returns:
            True if the repository is not stored, has no recorded update
            time, or was last written more than ``max_age_days`` whole days
            ago.
        """
        cursor = self._cursor()

        cursor.execute("""
            SELECT last_updated_at FROM repositories WHERE id = ?
        """, (repo_id,))

        row = cursor.fetchone()

        # An unknown repository or a row without a timestamp is always stale.
        if row is None or row["last_updated_at"] is None:
            return True

        last_updated = datetime.fromisoformat(row["last_updated_at"])
        age = datetime.now() - last_updated

        return age.days > max_age_days

    def get_repos_by_language(self, language: str, limit: int = 50) -> list[Repository]:
        """Return repositories written in a language, most stars first.

        Args:
            language: Programming language (exact match on the stored value).
            limit: Maximum results.

        Returns:
            List of repositories.
        """
        cursor = self._cursor()
        cursor.execute("""
            SELECT * FROM repositories
            WHERE language = ?
            ORDER BY stars DESC
            LIMIT ?
        """, (language, limit))

        return [self._row_to_repo(row) for row in cursor.fetchall()]

    def get_top_repos(self, limit: int = 100) -> list[Repository]:
        """Return the repositories with the most stars.

        Args:
            limit: Maximum results.

        Returns:
            List of top repositories.
        """
        cursor = self._cursor()
        cursor.execute("""
            SELECT * FROM repositories
            ORDER BY stars DESC
            LIMIT ?
        """, (limit,))

        return [self._row_to_repo(row) for row in cursor.fetchall()]

    def vacuum(self) -> None:
        """Optimize database by running VACUUM."""
        cursor = self._cursor()
        cursor.execute("VACUUM")
        self.conn.commit()

    def clean_invalid_repos(self) -> int:
        """Remove repositories whose name or URL is NULL or empty.

        Returns:
            Number of removed repositories.
        """
        cursor = self._cursor()

        cursor.execute("""
            DELETE FROM repositories
            WHERE name IS NULL OR name = ''
            OR url IS NULL OR url = ''
        """)

        removed = cursor.rowcount
        self.conn.commit()

        return removed

    def close(self) -> None:
        """Close the connection, if open. Safe to call more than once."""
        if self.conn:
            self.conn.close()
            self.conn = None
ai_scraper/webhooks.py ADDED
@@ -0,0 +1,95 @@
1
+ """Webhook notification support."""
2
+
3
+ import asyncio
4
+ import json
5
+ from dataclasses import dataclass
6
+ from typing import Optional
7
+
8
+ import aiohttp
9
+
10
+
11
@dataclass
class WebhookConfig:
    """Settings for one webhook endpoint."""

    # Address the notification is POSTed to.
    url: str
    # Event names this endpoint subscribes to,
    # e.g. ["scrape_complete", "trending_found", "error"].
    events: list[str]
    # Extra HTTP headers sent with every request; None sends no extras.
    headers: Optional[dict] = None
17
+
18
+
19
class WebhookNotifier:
    """Deliver event payloads to the configured webhooks via HTTP POST.

    One aiohttp session is created lazily on the first delivery and reused
    afterwards; call ``close`` to release it.
    """

    def __init__(self, webhooks: list[WebhookConfig]):
        """Store the webhook list; no network resources are opened yet.

        Args:
            webhooks: Endpoints to notify.
        """
        self.webhooks = webhooks
        self.session: Optional[aiohttp.ClientSession] = None

    async def _get_session(self) -> aiohttp.ClientSession:
        """Return the shared session, creating one if absent or closed."""
        if self.session is None or self.session.closed:
            self.session = aiohttp.ClientSession()
        return self.session

    async def notify(self, event: str, data: dict) -> None:
        """Send notification for an event.

        Only webhooks whose ``events`` list contains ``event`` are called.
        Delivery is best-effort: HTTP error statuses and exceptions are
        printed and never raised to the caller.

        Args:
            event: Event name.
            data: Event data. Its ``timestamp`` entry, if any, is copied to
                the top level of the payload.
        """
        targets = [webhook for webhook in self.webhooks if event in webhook.events]
        if not targets:
            # Nothing to deliver: do not open a session that would then
            # have to be closed by the caller.
            return

        session = await self._get_session()

        payload = {
            "event": event,
            "timestamp": data.get("timestamp"),
            "data": data,
        }

        for webhook in targets:
            try:
                async with session.post(
                    webhook.url,
                    json=payload,
                    headers=webhook.headers,
                ) as response:
                    if response.status >= 400:
                        print(f"Webhook failed: {response.status}")
            except Exception as e:
                # Deliberately broad: one failing endpoint must not stop
                # delivery to the remaining ones.
                print(f"Webhook error: {e}")

    async def close(self) -> None:
        """Close the shared session, if any. Safe to call more than once."""
        if self.session and not self.session.closed:
            await self.session.close()
        self.session = None
64
+
65
+
66
+ # Built-in formatters for common services
67
def format_slack_message(event: str, data: dict) -> dict:
    """Build the Slack webhook payload for an event.

    ``scrape_complete`` produces a single mrkdwn section summarising
    ``repos_count`` and ``total_stars`` from ``data`` (both default to 0);
    every other event produces a plain text payload naming the event.
    """
    if event != "scrape_complete":
        return {"text": f"Event: {event}"}

    summary = (
        f"*Scrape Complete*\n"
        f"Found {data.get('repos_count', 0)} AI repositories\n"
        f"Total stars: {data.get('total_stars', 0):,}"
    )
    section = {
        "type": "section",
        "text": {"type": "mrkdwn", "text": summary},
    }
    return {"text": "Scrape Complete", "blocks": [section]}
85
+
86
+
87
def format_telegram_message(event: str, data: dict) -> str:
    """Build the Telegram message text for an event.

    ``scrape_complete`` yields a three-line summary of ``repos_count`` and
    ``total_stars`` from ``data`` (both default to 0); every other event
    yields a one-line message naming the event.
    """
    if event != "scrape_complete":
        return f"Event: {event}"

    lines = [
        f"🤖 *Scrape Complete*\n",
        f"📊 Found {data.get('repos_count', 0)} AI repositories",
        f"⭐ Total stars: {data.get('total_stars', 0):,}",
    ]
    return "\n".join(lines)