github-ai-scraper 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. ai_scraper/__init__.py +3 -0
  2. ai_scraper/api/__init__.py +6 -0
  3. ai_scraper/api/github.py +340 -0
  4. ai_scraper/api/gitlab.py +418 -0
  5. ai_scraper/api/rate_limiter.py +120 -0
  6. ai_scraper/api_server.py +196 -0
  7. ai_scraper/auth.py +68 -0
  8. ai_scraper/backup.py +112 -0
  9. ai_scraper/cache.py +95 -0
  10. ai_scraper/classifier.py +135 -0
  11. ai_scraper/cli.py +747 -0
  12. ai_scraper/config.py +237 -0
  13. ai_scraper/config_watcher.py +82 -0
  14. ai_scraper/dedup.py +148 -0
  15. ai_scraper/filters/__init__.py +5 -0
  16. ai_scraper/filters/ai_filter.py +93 -0
  17. ai_scraper/health.py +155 -0
  18. ai_scraper/i18n.py +141 -0
  19. ai_scraper/interactive.py +96 -0
  20. ai_scraper/keywords/__init__.py +5 -0
  21. ai_scraper/keywords/extractor.py +274 -0
  22. ai_scraper/logging_config.py +74 -0
  23. ai_scraper/models/__init__.py +5 -0
  24. ai_scraper/models/repository.py +72 -0
  25. ai_scraper/output/__init__.py +6 -0
  26. ai_scraper/output/excel.py +79 -0
  27. ai_scraper/output/html.py +152 -0
  28. ai_scraper/output/markdown.py +338 -0
  29. ai_scraper/output/rss.py +82 -0
  30. ai_scraper/output/translator.py +303 -0
  31. ai_scraper/plugin_system.py +146 -0
  32. ai_scraper/plugins/__init__.py +5 -0
  33. ai_scraper/retry.py +134 -0
  34. ai_scraper/scheduler.py +84 -0
  35. ai_scraper/scrape_progress.py +99 -0
  36. ai_scraper/secure_storage.py +127 -0
  37. ai_scraper/storage/__init__.py +5 -0
  38. ai_scraper/storage/async_database.py +237 -0
  39. ai_scraper/storage/database.py +456 -0
  40. ai_scraper/webhooks.py +95 -0
  41. github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
  42. github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
  43. github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
  44. github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
ai_scraper/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """GitHub AI high-star repositories scraper."""
2
+
3
# Keep in sync with the distribution metadata: this wheel is published as
# github_ai_scraper 0.1.2, so the module must not keep reporting 0.1.0.
__version__ = "0.1.2"
@@ -0,0 +1,6 @@
1
+ """API module for ai_scraper."""
2
+
3
from ai_scraper.api.github import GitHubAPIError, GitHubClient
from ai_scraper.api.rate_limiter import RateLimitInfo, RateLimiter

# GitHubClient raises GitHubAPIError and returns RateLimitInfo from
# get_rate_limit(), so both are re-exported alongside the client; the two
# names that were already public are unchanged.
__all__ = ["GitHubAPIError", "GitHubClient", "RateLimitInfo", "RateLimiter"]
@@ -0,0 +1,340 @@
1
+ """GitHub API client."""
2
+
3
+ import asyncio
4
+ import logging
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ import aiohttp
10
+
11
+ from ai_scraper.api.rate_limiter import RateLimitInfo, RateLimiter
12
+ from ai_scraper.cache import RequestCache
13
+ from ai_scraper.models.repository import Repository
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class GitHubAPIError(Exception):
    """Error raised when the GitHub API answers with a failure status.

    Attributes:
        status: HTTP status code associated with the failure.
        message: Human-readable detail (fixed text or the response body).
    """

    def __init__(self, status: int, message: str):
        """Store the status/message pair and build the exception text.

        Args:
            status: HTTP status code of the failed response.
            message: Description of the failure.
        """
        self.status = status
        self.message = message
        super().__init__(f"GitHub API error {status}: {message}")

    def __reduce__(self):
        """Rebuild the exception from ``(status, message)``.

        ``Exception`` pickles/copies itself by calling the class with
        ``self.args``, which here is the single formatted string. That does
        not match the two-argument ``__init__`` and raised ``TypeError`` on
        ``pickle.loads`` / ``copy.copy``. Supplying the original arguments
        makes the round trip work.
        """
        return (type(self), (self.status, self.message))
25
+
26
+
27
class GitHubClient:
    """Asynchronous GitHub API client.

    Wraps a lazily created ``aiohttp.ClientSession`` with a client-side rate
    limiter and an optional on-disk response cache. The client can be used
    as an async context manager so the session is always closed::

        async with GitHubClient(token) as client:
            repos = await client.search_repositories("topic:llm")
    """

    BASE_URL = "https://api.github.com"

    def __init__(
        self,
        token: Optional[str] = None,
        cache_dir: Optional[Path] = None,
        cache_ttl: int = 3600,
        connection_pool_size: int = 10,
    ):
        """Initialize GitHub client.

        Args:
            token: GitHub Personal Access Token (optional).
            cache_dir: Directory for cache files (optional). No cache is
                created when this is not given.
            cache_ttl: Cache time-to-live in seconds.
            connection_pool_size: Maximum number of connections in pool.
        """
        self.token = token
        self.session: Optional[aiohttp.ClientSession] = None
        self.connection_pool_size = connection_pool_size

        # Rate limiter: 60/hour without token, 5000/hour with token.
        # NOTE(review): get_rate_limit() reads a separate "search" quota from
        # GitHub; this limiter only models one hourly budget — confirm that
        # is sufficient for the search endpoints.
        rate = 5000 if token else 60
        self.rate_limiter = RateLimiter(requests_per_hour=rate)

        # Request cache
        self.cache: Optional[RequestCache] = None
        if cache_dir:
            self.cache = RequestCache(cache_dir=cache_dir, ttl=cache_ttl)

    async def __aenter__(self) -> "GitHubClient":
        """Enter the async context; the session is still created lazily."""
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        """Close the HTTP session when leaving the async context."""
        await self.close()

    async def _get_session(self) -> aiohttp.ClientSession:
        """Get or create aiohttp session with connection pooling.

        A new session is built when none exists yet or the previous one was
        closed. It carries the GitHub v3 ``Accept`` header, the token (when
        configured) and a 30 second total timeout.
        """
        if self.session is None or self.session.closed:
            headers = {"Accept": "application/vnd.github.v3+json"}
            if self.token:
                headers["Authorization"] = f"token {self.token}"

            # Configure connection pool
            connector = aiohttp.TCPConnector(
                limit=self.connection_pool_size,
                limit_per_host=self.connection_pool_size,
                enable_cleanup_closed=True,
            )

            self.session = aiohttp.ClientSession(
                headers=headers,
                connector=connector,
                timeout=aiohttp.ClientTimeout(total=30),
            )
        return self.session

    async def close(self) -> None:
        """Close the HTTP session."""
        if self.session and not self.session.closed:
            await self.session.close()

    async def _wait_for_rate_limit(self) -> None:
        """Block until the client-side rate limiter grants a request slot."""
        while not self.rate_limiter.try_acquire():
            wait_time = self.rate_limiter.wait_time()
            logger.debug("Rate limited, waiting %.1fs", wait_time)
            # Sleep in slices of at most one second, then re-check.
            await asyncio.sleep(min(wait_time, 1.0))

    @staticmethod
    def _is_rate_limited(headers) -> bool:
        """Tell whether response headers indicate a GitHub rate limit.

        Args:
            headers: Mapping of response headers.

        Returns:
            True when ``X-RateLimit-Remaining`` is ``"0"`` or a
            ``Retry-After`` header is present.
        """
        return headers.get("X-RateLimit-Remaining") == "0" or "Retry-After" in headers

    @staticmethod
    def _last_page_from_link(link_header: str) -> Optional[int]:
        """Extract the ``page`` number of the ``rel="last"`` link.

        The URL's query string is parsed instead of pattern-matching
        ``page=<n>>``: that pattern also matches the tail of ``per_page=<n>``
        whenever ``page`` is not the final query parameter.

        Args:
            link_header: Raw value of the HTTP ``Link`` header (may be empty).

        Returns:
            The last page number, or None when there is no usable
            ``rel="last"`` entry.
        """
        import re
        from urllib.parse import parse_qs, urlsplit

        # Each entry looks like: <url>; rel="name"
        for target, attrs in re.findall(r"<([^>]*)>([^<]*)", link_header or ""):
            if 'rel="last"' not in attrs:
                continue
            pages = parse_qs(urlsplit(target).query).get("page")
            if not pages:
                return None
            try:
                return int(pages[-1])
            except ValueError:
                return None
        return None

    async def _raise_for_status(self, response: aiohttp.ClientResponse) -> None:
        """Raise GitHubAPIError for any response with status >= 400.

        Args:
            response: The aiohttp response to inspect.

        Raises:
            GitHubAPIError: Carrying the response status. 403/429 responses
                are reported as rate limiting only when the headers say so;
                other failures carry the response body as message.
        """
        status = response.status
        if status < 400:
            return
        if status == 401:
            raise GitHubAPIError(401, "Unauthorized - check your token")
        if status in (403, 429) and self._is_rate_limited(response.headers):
            try:
                reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
            except ValueError:
                reset_time = 0
            raise GitHubAPIError(status, f"Rate limited, resets at {reset_time}")
        if status == 503:
            raise GitHubAPIError(503, "Service unavailable")
        # Everything else (including a plain "forbidden" 403) keeps the body
        # so the caller can see GitHub's own explanation.
        raise GitHubAPIError(status, await response.text())

    async def _request(
        self,
        endpoint: str,
        params: Optional[dict] = None,
        *,
        use_cache: bool = True,
    ) -> dict:
        """Make an API request.

        Args:
            endpoint: API endpoint (without base URL).
            params: Query parameters.
            use_cache: When False the response cache is neither read nor
                written for this call (for data that must be fresh).

        Returns:
            JSON response data.

        Raises:
            GitHubAPIError: On API errors.
        """
        url = f"{self.BASE_URL}{endpoint}"
        cache = self.cache if use_cache else None

        # Check cache first
        if cache:
            cached = cache.get(url, params)
            if cached is not None:
                logger.debug("Cache hit for %s", endpoint)
                return cached

        await self._wait_for_rate_limit()
        session = await self._get_session()

        async with session.get(url, params=params) as response:
            await self._raise_for_status(response)
            data = await response.json()

        # Cache successful response
        if cache:
            cache.set(url, params, data)
            logger.debug("Cached response for %s", endpoint)

        return data

    async def search_repositories(
        self,
        query: str,
        sort: str = "stars",
        order: str = "desc",
        page: int = 1,
        per_page: int = 100,
    ) -> list[Repository]:
        """Search repositories.

        Args:
            query: Search query.
            sort: Sort field (stars, forks, updated).
            order: Sort order (asc, desc).
            page: Page number.
            per_page: Results per page (max 100; larger values are clamped).

        Returns:
            List of repositories.
        """
        params = {
            "q": query,
            "sort": sort,
            "order": order,
            "page": page,
            "per_page": min(per_page, 100),
        }

        data = await self._request("/search/repositories", params)
        return [self._parse_repository(item) for item in data.get("items", [])]

    async def search_repositories_concurrent(
        self,
        query: str,
        max_pages: int = 5,
        per_page: int = 100,
        sort: str = "stars",
        order: str = "desc",
        max_concurrent: int = 5,
    ) -> list[Repository]:
        """Search repositories concurrently across multiple pages.

        Pages 1..max_pages are fetched through ``search_repositories``.
        Pages that fail are logged and skipped, so the result may be partial.

        Args:
            query: Search query.
            max_pages: Maximum number of pages to fetch.
            per_page: Results per page (max 100).
            sort: Sort field (stars, forks, updated).
            order: Sort order (asc, desc).
            max_concurrent: Maximum concurrent requests (values below 1 are
                treated as 1 so the fetch cannot block forever).

        Returns:
            List of repositories from all pages, in page order.
        """
        semaphore = asyncio.Semaphore(max(1, max_concurrent))

        async def fetch_page(page: int) -> list[Repository]:
            async with semaphore:
                return await self.search_repositories(
                    query, sort=sort, order=order, page=page, per_page=per_page
                )

        results = await asyncio.gather(
            *(fetch_page(page) for page in range(1, max_pages + 1)),
            return_exceptions=True,
        )

        # Flatten results, skipping failures. BaseException is checked
        # because a cancelled page is reported as CancelledError, which is
        # not an Exception subclass.
        all_repos: list[Repository] = []
        for result in results:
            if isinstance(result, BaseException):
                logger.warning("Page fetch failed: %s", result)
                continue
            all_repos.extend(result)

        return all_repos

    async def get_repository(self, owner: str, repo: str) -> Repository:
        """Get a single repository.

        Args:
            owner: Repository owner.
            repo: Repository name.

        Returns:
            Repository data.
        """
        data = await self._request(f"/repos/{owner}/{repo}")
        return self._parse_repository(data)

    async def get_contributors(self, owner: str, repo: str) -> int:
        """Get contributor count for a repository.

        GitHub doesn't provide the count directly, so one contributor per
        page is requested and the number of the last page (from the ``Link``
        header) is used as the total. A single request now supplies both the
        header and the fallback body; it honours the rate limiter and its
        status code is checked.

        Args:
            owner: Repository owner.
            repo: Repository name.

        Returns:
            Number of contributors (anonymous ones included); 0 when the
            API reports an error or the repository has no content.
        """
        url = f"{self.BASE_URL}/repos/{owner}/{repo}/contributors"
        params = {"per_page": 1, "anon": "true"}

        try:
            await self._wait_for_rate_limit()
            session = await self._get_session()

            async with session.get(url, params=params) as response:
                # An empty repository is answered with 204 and no JSON body.
                if response.status == 204:
                    return 0
                await self._raise_for_status(response)

                last_page = self._last_page_from_link(
                    response.headers.get("Link", "")
                )
                if last_page is not None:
                    return last_page

                # Fallback: no pagination links, so everything fits on the
                # single page that was returned.
                return len(await response.json())
        except GitHubAPIError:
            return 0

    async def get_rate_limit(self) -> RateLimitInfo:
        """Get current rate limit status.

        The response cache is bypassed: a cached answer could be up to
        ``cache_ttl`` seconds old, which defeats the purpose of the call.

        Returns:
            Rate limit information for the ``search`` and ``core`` resources;
            missing values default to 0.
        """
        data = await self._request("/rate_limit", use_cache=False)

        resources = data.get("resources", {})
        search = resources.get("search", {})
        core = resources.get("core", {})

        return RateLimitInfo(
            search_limit=search.get("limit", 0),
            search_remaining=search.get("remaining", 0),
            search_reset=search.get("reset", 0),
            core_limit=core.get("limit", 0),
            core_remaining=core.get("remaining", 0),
            core_reset=core.get("reset", 0),
        )

    def _parse_repository(self, data: dict) -> Repository:
        """Parse repository data from API response.

        Args:
            data: API response data for one repository. ``id`` and
                ``full_name`` are required; other fields are optional.

        Returns:
            Repository object.

        Raises:
            KeyError: If ``id`` or ``full_name`` is missing.
        """
        return Repository(
            id=data["id"],
            # NOTE(review): ``name`` is filled from "full_name", not from the
            # API's short "name" field. Kept as-is because other modules may
            # rely on it — confirm whether this is intentional.
            name=data["full_name"],
            full_name=data["full_name"],
            description=data.get("description"),
            stars=data.get("stargazers_count", 0),
            language=data.get("language"),
            # ``or []`` also covers an explicit null in the payload.
            topics=data.get("topics") or [],
            created_at=self._parse_datetime(data.get("created_at")),
            updated_at=self._parse_datetime(data.get("updated_at")),
            pushed_at=self._parse_datetime(data.get("pushed_at")),
            url=data.get("html_url", ""),
            open_issues=data.get("open_issues_count"),
            forks=data.get("forks_count"),
        )

    def _parse_datetime(self, value: Optional[str]) -> Optional[datetime]:
        """Parse ISO datetime string.

        A trailing ``Z`` or ``+00:00`` is dropped before parsing, so UTC
        timestamps come back as naive datetimes (unchanged from the previous
        behaviour). Strings with any other offset are parsed as given.

        Args:
            value: ISO datetime string.

        Returns:
            datetime object, or None for empty, non-string or unparseable
            input.
        """
        if not value or not isinstance(value, str):
            return None

        if value.endswith("Z"):
            value = value[:-1]
        else:
            value = value.removesuffix("+00:00")

        try:
            return datetime.fromisoformat(value)
        except ValueError:
            return None