github-ai-scraper 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_scraper/__init__.py +3 -0
- ai_scraper/api/__init__.py +6 -0
- ai_scraper/api/github.py +340 -0
- ai_scraper/api/gitlab.py +418 -0
- ai_scraper/api/rate_limiter.py +120 -0
- ai_scraper/api_server.py +196 -0
- ai_scraper/auth.py +68 -0
- ai_scraper/backup.py +112 -0
- ai_scraper/cache.py +95 -0
- ai_scraper/classifier.py +135 -0
- ai_scraper/cli.py +747 -0
- ai_scraper/config.py +237 -0
- ai_scraper/config_watcher.py +82 -0
- ai_scraper/dedup.py +148 -0
- ai_scraper/filters/__init__.py +5 -0
- ai_scraper/filters/ai_filter.py +93 -0
- ai_scraper/health.py +155 -0
- ai_scraper/i18n.py +141 -0
- ai_scraper/interactive.py +96 -0
- ai_scraper/keywords/__init__.py +5 -0
- ai_scraper/keywords/extractor.py +274 -0
- ai_scraper/logging_config.py +74 -0
- ai_scraper/models/__init__.py +5 -0
- ai_scraper/models/repository.py +72 -0
- ai_scraper/output/__init__.py +6 -0
- ai_scraper/output/excel.py +79 -0
- ai_scraper/output/html.py +152 -0
- ai_scraper/output/markdown.py +338 -0
- ai_scraper/output/rss.py +82 -0
- ai_scraper/output/translator.py +303 -0
- ai_scraper/plugin_system.py +146 -0
- ai_scraper/plugins/__init__.py +5 -0
- ai_scraper/retry.py +134 -0
- ai_scraper/scheduler.py +84 -0
- ai_scraper/scrape_progress.py +99 -0
- ai_scraper/secure_storage.py +127 -0
- ai_scraper/storage/__init__.py +5 -0
- ai_scraper/storage/async_database.py +237 -0
- ai_scraper/storage/database.py +456 -0
- ai_scraper/webhooks.py +95 -0
- github_ai_scraper-0.1.2.dist-info/METADATA +299 -0
- github_ai_scraper-0.1.2.dist-info/RECORD +44 -0
- github_ai_scraper-0.1.2.dist-info/WHEEL +4 -0
- github_ai_scraper-0.1.2.dist-info/entry_points.txt +2 -0
ai_scraper/__init__.py
ADDED
ai_scraper/api/github.py
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
"""GitHub API client."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
import aiohttp
|
|
10
|
+
|
|
11
|
+
from ai_scraper.api.rate_limiter import RateLimitInfo, RateLimiter
|
|
12
|
+
from ai_scraper.cache import RequestCache
|
|
13
|
+
from ai_scraper.models.repository import Repository
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GitHubAPIError(Exception):
    """Error raised for a failed GitHub API call.

    Attributes:
        status: HTTP status code associated with the failure.
        message: Human-readable description of the failure.
    """

    def __init__(self, status: int, message: str):
        # Build the exception text first, then expose the raw parts so that
        # callers can branch on ``status`` without parsing the string.
        super().__init__(f"GitHub API error {status}: {message}")
        self.status = status
        self.message = message
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class GitHubClient:
    """Asynchronous GitHub REST API client.

    Every network call is gated by a client-side ``RateLimiter``; when a
    cache directory is supplied, JSON responses fetched through
    ``_request`` are additionally stored in a ``RequestCache``.
    """

    BASE_URL = "https://api.github.com"

    def __init__(
        self,
        token: Optional[str] = None,
        cache_dir: Optional[Path] = None,
        cache_ttl: int = 3600,
        connection_pool_size: int = 10,
    ):
        """Initialize GitHub client.

        Args:
            token: GitHub Personal Access Token (optional).
            cache_dir: Directory for cache files (optional). Caching is
                disabled when this is falsy.
            cache_ttl: Cache time-to-live in seconds.
            connection_pool_size: Maximum number of connections in pool.
        """
        self.token = token
        # The HTTP session is created lazily by ``_get_session``.
        self.session: Optional[aiohttp.ClientSession] = None
        self.connection_pool_size = connection_pool_size

        # Rate limiter: 60/hour without token, 5000/hour with token
        rate = 5000 if token else 60
        self.rate_limiter = RateLimiter(requests_per_hour=rate)

        # Request cache
        self.cache: Optional[RequestCache] = None
        if cache_dir:
            self.cache = RequestCache(cache_dir=cache_dir, ttl=cache_ttl)

    async def _get_session(self) -> "aiohttp.ClientSession":
        """Get or create aiohttp session with connection pooling.

        A new session is built when none exists yet or the previous one has
        been closed; otherwise the existing session is reused.
        """
        if self.session is None or self.session.closed:
            headers = {"Accept": "application/vnd.github.v3+json"}
            if self.token:
                headers["Authorization"] = f"token {self.token}"

            # Configure connection pool
            connector = aiohttp.TCPConnector(
                limit=self.connection_pool_size,
                limit_per_host=self.connection_pool_size,
                enable_cleanup_closed=True,
            )

            self.session = aiohttp.ClientSession(
                headers=headers,
                connector=connector,
                timeout=aiohttp.ClientTimeout(total=30),
            )
        return self.session

    async def close(self) -> None:
        """Close the HTTP session if one is open."""
        if self.session and not self.session.closed:
            await self.session.close()

    async def _fetch(self, endpoint: str, params: Optional[dict] = None) -> tuple:
        """Perform one rate-limited GET, bypassing the response cache.

        Args:
            endpoint: API endpoint (without base URL).
            params: Query parameters.

        Returns:
            A ``(json_body, response_headers)`` tuple.

        Raises:
            GitHubAPIError: On any HTTP status >= 400.
        """
        url = f"{self.BASE_URL}{endpoint}"

        # Wait for rate limiter. Sleep in slices of at most one second so the
        # limiter is re-polled regularly instead of sleeping the full wait.
        while not self.rate_limiter.try_acquire():
            wait_time = self.rate_limiter.wait_time()
            logger.debug("Rate limited, waiting %.1fs", wait_time)
            await asyncio.sleep(min(wait_time, 1.0))

        session = await self._get_session()

        async with session.get(url, params=params) as response:
            if response.status == 401:
                raise GitHubAPIError(401, "Unauthorized - check your token")
            elif response.status == 403:
                # NOTE(review): every 403 is reported as rate limiting, even
                # though GitHub can also answer 403 for permission problems.
                reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
                raise GitHubAPIError(403, f"Rate limited, resets at {reset_time}")
            elif response.status == 503:
                raise GitHubAPIError(503, "Service unavailable")
            elif response.status >= 400:
                text = await response.text()
                raise GitHubAPIError(response.status, text)

            return await response.json(), response.headers

    async def _request(self, endpoint: str, params: Optional[dict] = None) -> dict:
        """Make an API request, consulting the response cache first.

        Args:
            endpoint: API endpoint (without base URL).
            params: Query parameters.

        Returns:
            JSON response data.

        Raises:
            GitHubAPIError: On API errors.
        """
        url = f"{self.BASE_URL}{endpoint}"

        # Check cache first
        if self.cache:
            cached = self.cache.get(url, params)
            if cached is not None:
                logger.debug("Cache hit for %s", endpoint)
                return cached

        data, _headers = await self._fetch(endpoint, params)

        # Cache successful response
        if self.cache:
            self.cache.set(url, params, data)
            logger.debug("Cached response for %s", endpoint)

        return data

    async def search_repositories(
        self,
        query: str,
        sort: str = "stars",
        order: str = "desc",
        page: int = 1,
        per_page: int = 100,
    ) -> "list[Repository]":
        """Search repositories.

        Args:
            query: Search query.
            sort: Sort field (stars, forks, updated).
            order: Sort order (asc, desc).
            page: Page number.
            per_page: Results per page (max 100; larger values are clamped).

        Returns:
            List of repositories.
        """
        params = {
            "q": query,
            "sort": sort,
            "order": order,
            "page": page,
            "per_page": min(per_page, 100),
        }

        data = await self._request("/search/repositories", params)
        items = data.get("items", [])

        return [self._parse_repository(item) for item in items]

    async def search_repositories_concurrent(
        self,
        query: str,
        max_pages: int = 5,
        per_page: int = 100,
        sort: str = "stars",
        order: str = "desc",
        max_concurrent: int = 5,
    ) -> "list[Repository]":
        """Search repositories concurrently across multiple pages.

        Pages that fail to load are logged and skipped, so the result may be
        partial.

        Args:
            query: Search query.
            max_pages: Maximum number of pages to fetch.
            per_page: Results per page (max 100).
            sort: Sort field (stars, forks, updated).
            order: Sort order (asc, desc).
            max_concurrent: Maximum concurrent requests (values below 1 are
                treated as 1).

        Returns:
            List of repositories from all pages, in page order.
        """
        # A semaphore of size 0 would block forever and a negative size
        # raises ValueError, so clamp to at least one slot.
        semaphore = asyncio.Semaphore(max(1, max_concurrent))

        async def fetch_page(page: int):
            async with semaphore:
                return await self.search_repositories(
                    query,
                    sort=sort,
                    order=order,
                    page=page,
                    per_page=per_page,
                )

        # Execute concurrently
        results = await asyncio.gather(
            *(fetch_page(page) for page in range(1, max_pages + 1)),
            return_exceptions=True,
        )

        # Flatten results, skipping failures. BaseException is checked because
        # gather() can hand back CancelledError, which is not an Exception.
        all_repos = []
        for result in results:
            if isinstance(result, BaseException):
                logger.warning("Page fetch failed: %s", result)
                continue
            all_repos.extend(result)

        return all_repos

    async def get_repository(self, owner: str, repo: str) -> "Repository":
        """Get a single repository.

        Args:
            owner: Repository owner.
            repo: Repository name.

        Returns:
            Repository data.
        """
        data = await self._request(f"/repos/{owner}/{repo}")
        return self._parse_repository(data)

    @staticmethod
    def _last_page_from_link(link_header: str) -> Optional[int]:
        """Extract the ``page`` number of the ``rel="last"`` link.

        Args:
            link_header: Raw value of an HTTP ``Link`` header.

        Returns:
            The page number of the last page, or None when the header has no
            ``rel="last"`` entry carrying a ``page`` query parameter.
        """
        import re

        # Anchor on "?page=" / "&page=" so that "per_page=" is never mistaken
        # for the page number, whatever the query-parameter order is.
        match = re.search(
            r'<[^>]*[?&]page=(\d+)[^>]*>;\s*rel="last"', link_header
        )
        return int(match.group(1)) if match else None

    async def get_contributors(self, owner: str, repo: str) -> int:
        """Get contributor count for a repository.

        GitHub doesn't provide the count directly, so a single page of size
        one is requested and the number of the last page (taken from the
        ``Link`` header) is used as the count.

        Args:
            owner: Repository owner.
            repo: Repository name.

        Returns:
            Number of contributors, or 0 when the API call fails.
        """
        try:
            # The response cache is bypassed because the Link header, which
            # the cache does not store, is what carries the count.
            data, headers = await self._fetch(
                f"/repos/{owner}/{repo}/contributors",
                params={"per_page": 1},
            )
        except GitHubAPIError:
            return 0

        last_page = self._last_page_from_link(headers.get("Link", ""))
        if last_page is not None:
            return last_page

        # Fallback: no pagination links, so the single page is everything.
        return len(data) if isinstance(data, list) else 0

    async def get_rate_limit(self) -> "RateLimitInfo":
        """Get current rate limit status.

        The response cache is bypassed so the reported quota is never stale.

        Returns:
            Rate limit information.
        """
        data, _headers = await self._fetch("/rate_limit")

        resources = data.get("resources", {})
        search = resources.get("search", {})
        core = resources.get("core", {})

        return RateLimitInfo(
            search_limit=search.get("limit", 0),
            search_remaining=search.get("remaining", 0),
            search_reset=search.get("reset", 0),
            core_limit=core.get("limit", 0),
            core_remaining=core.get("remaining", 0),
            core_reset=core.get("reset", 0),
        )

    def _parse_repository(self, data: dict) -> "Repository":
        """Parse repository data from API response.

        Args:
            data: API response data for one repository; must contain the
                ``id`` and ``full_name`` keys.

        Returns:
            Repository object.
        """
        return Repository(
            id=data["id"],
            # NOTE(review): ``name`` is filled from "full_name", not "name" -
            # confirm this is intentional before changing it.
            name=data["full_name"],
            full_name=data["full_name"],
            description=data.get("description"),
            stars=data.get("stargazers_count", 0),
            language=data.get("language"),
            topics=data.get("topics", []),
            created_at=self._parse_datetime(data.get("created_at")),
            updated_at=self._parse_datetime(data.get("updated_at")),
            pushed_at=self._parse_datetime(data.get("pushed_at")),
            url=data.get("html_url", ""),
            open_issues=data.get("open_issues_count"),
            forks=data.get("forks_count"),
        )

    def _parse_datetime(self, value: Optional[str]) -> Optional[datetime]:
        """Parse ISO datetime string.

        A trailing ``Z`` and any ``+00:00`` offset are removed before
        parsing, so UTC timestamps come back as naive datetimes.

        Args:
            value: ISO datetime string.

        Returns:
            datetime object, or None when the value is empty or malformed.
        """
        if not value:
            return None

        # Drop the "Z" (UTC) suffix, which older fromisoformat() rejects.
        if value.endswith("Z"):
            value = value[:-1]

        try:
            return datetime.fromisoformat(value.replace("+00:00", ""))
        except ValueError:
            return None
|