chatsbom-0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,453 @@
+ import datetime
+ import json
+ import os
+ import time
+ from collections.abc import Generator
+ from dataclasses import dataclass
+ from dataclasses import field
+
+ import dotenv
+ import requests
+ import structlog
+ import typer
+ from rich.console import Console
+ from rich.progress import Progress
+ from rich.progress import SpinnerColumn
+ from rich.progress import TaskID
+ from rich.progress import TextColumn
+ from rich.progress import TimeElapsedColumn
+ from rich.table import Table
+
+ from chatsbom.core.client import get_http_client
+ from chatsbom.core.config import get_config
+ from chatsbom.models.language import Language
+
+ dotenv.load_dotenv()
+ logger = structlog.get_logger('Searcher')
+ console = Console()
+
+
+ @dataclass
+ class SearchStats:
+     api_requests: int = 0
+     cache_hits: int = 0
+     repos_found: int = 0
+     repos_saved: int = 0
+     start_time: float = field(default_factory=time.time)
+
+
+ class GitHubClient:
+     """Handles GitHub API interaction, authentication, and rate limiting."""
+
+     def __init__(self, token: str, delay: float = 2.0):
+         self.session = get_http_client()
+         self.session.headers.update({
+             'Authorization': f"Bearer {token}",
+             'Accept': 'application/vnd.github.v3+json',
+             'User-Agent': 'SBOM-Insight',
+         })
+         self.delay = delay
+         self.last_req_time = 0.0
+
+     def search_repositories(self, query: str, task_id: TaskID, progress: Progress, stats: SearchStats) -> Generator[dict, None, None]:
+         """
+         Iterates through pagination (pages 1-10) for a given query.
+         Handles API rate limits automatically.
+         """
+         page = 1
+         # GitHub API Search limit: 1000 results (10 pages * 100)
+         max_pages = 10
+
+         while page <= max_pages:
+             self._wait_for_rate_limit()
+
+             url = 'https://api.github.com/search/repositories'
+             params = {
+                 'q': query,
+                 'sort': 'stars',
+                 'order': 'desc',
+                 'per_page': '100',
+                 'page': str(page),
+             }
+
+             try:
+                 progress.update(
+                     task_id, status=f"Page {page}",
+                 )
+                 start_time = time.time()
+                 resp = self.session.get(url, params=params, timeout=20)
+                 elapsed = time.time() - start_time
+                 is_cached = getattr(resp, 'from_cache', False)
+
+                 if not is_cached:
+                     self.last_req_time = time.time()
+                     logger.info(
+                         'API Request',
+                         page=page,
+                         status=resp.status_code,
+                         elapsed=f"{elapsed:.2f}s",
+                         url=resp.url,
+                         query=query,
+                     )
+                 else:
+                     stats.cache_hits += 1
+                     timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+                     log_msg = (
+                         f"{timestamp} \\[info ] API Request "
+                         f"elapsed={elapsed:.2f}s page={page} "
+                         f"query='{query}' status={resp.status_code} "
+                         f"url='{resp.url}' [green](Cached)[/green]"
+                     )
+                     console.print(f"[dim]{log_msg}[/dim]")
+
+                 stats.api_requests += 1
+
+                 if resp.status_code == 200:
+                     data = resp.json()
+                     items = data.get('items', [])
+                     if not items:
+                         return
+                     stats.repos_found += len(items)
+                     yield from items
+                     if len(items) < 100:  # End of results
+                         return
+                     page += 1
+
+                 elif resp.status_code in [403, 429]:
+                     self._handle_api_limit(resp, task_id, progress)
+                 elif resp.status_code == 422:
+                     logger.error(
+                         'API 422 Error (Unprocessable Entity). Stopping current query.',
+                     )
+                     return
+                 else:
+                     logger.error(f"API Error {resp.status_code}: {resp.text}")
+                     return
+
+             except requests.RequestException as e:
+                 logger.error(f"Network error: {e}. Retrying in 5s...")
+                 time.sleep(5)
+
+     def _wait_for_rate_limit(self):
+         """Enforces a fixed minimum delay between live API requests."""
+         gap = time.time() - self.last_req_time
+         if gap < self.delay:
+             time.sleep(self.delay - gap)
+
+     def _handle_api_limit(self, resp: requests.Response, task_id: TaskID, progress: Progress):
+         """Handles 403/429 responses by waiting until reset."""
+         reset_time = int(
+             resp.headers.get(
+                 'X-RateLimit-Reset', time.time() + 60,
+             ),
+         )
+         wait_seconds = max(60, reset_time - int(time.time())) + 2
+
+         logger.warning(f"Rate limit triggered. Waiting {wait_seconds}s...")
+         for i in range(wait_seconds, 0, -1):
+             progress.update(
+                 task_id, status=f"[bold red]Limit {i}s",
+             )
+             time.sleep(1)
+
+
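For orientation, a minimal sketch of driving `GitHubClient` on its own, outside the `Searcher` loop below. The import path `chatsbom.search` and the example query are assumptions; any `rich` progress bar works, since `status` is just a task field, and a valid token is expected in `GITHUB_TOKEN`.

```python
# Hypothetical standalone usage of GitHubClient; the module path
# 'chatsbom.search' and the query string are assumptions.
import os

from rich.progress import Progress

from chatsbom.search import GitHubClient, SearchStats

client = GitHubClient(token=os.environ['GITHUB_TOKEN'], delay=2.0)
stats = SearchStats()
with Progress() as progress:
    # 'status' is an arbitrary task field read by search_repositories().
    task = progress.add_task('demo', total=None, status='Init')
    for repo in client.search_repositories(
        'language:python stars:>50000', task, progress, stats,
    ):
        print(repo['full_name'], repo['stargazers_count'])
print(f"{stats.api_requests} requests, {stats.cache_hits} cached")
```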
+ class Storage:
+     """Manages file persistence and deduplication."""
+
+     def __init__(self, filepath: str):
+         self.filepath = filepath
+         self.visited_ids: set[int] = set()
+         self.min_stars_seen: float = float('inf')
+         self._load_existing()
+
+     def _load_existing(self):
+         if not os.path.exists(self.filepath):
+             return
+
+         count = 0
+         try:
+             with open(self.filepath, encoding='utf-8') as f:
+                 for line in f:
+                     if line.strip():
+                         try:
+                             data = json.loads(line)
+                             self.visited_ids.add(data['id'])
+
+                             # Track minimum stars seen. Records are written
+                             # by save() under the key 'stars'.
+                             stars = data.get('stars', float('inf'))
+                             self.min_stars_seen = min(
+                                 self.min_stars_seen, stars,
+                             )
+
+                             count += 1
+                         except json.JSONDecodeError:
+                             pass
+             logger.info(
+                 f"Loaded {count} existing records. Min stars: {self.min_stars_seen}",
+             )
+         except Exception as e:
+             logger.error(f"Failed to load existing data: {e}")
+
+     def save(self, item: dict) -> bool:
+         """Saves an item if it hasn't been seen before. Returns True if saved."""
+         if item['id'] in self.visited_ids:
+             return False
+
+         self.visited_ids.add(item['id'])
+
+         record = {
+             'id': item['id'],
+             'full_name': item['full_name'],
+             'stars': item['stargazers_count'],
+             'url': item['html_url'],
+             'created_at': item['created_at'],
+             'default_branch': item.get('default_branch', 'main'),
+             'description': item.get('description', ''),
+             'topics': item.get('topics', []),
+         }
+
+         with open(self.filepath, 'a', encoding='utf-8') as f:
+             f.write(json.dumps(record, ensure_ascii=False) + '\n')
+             f.flush()
+         return True
+
+
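The dedup contract is easiest to see in isolation: `save()` returns `True` exactly once per repository `id`, and a restarted `Storage` rebuilds `visited_ids` and `min_stars_seen` from the JSONL file. A sketch with a fabricated API item, assuming the class is importable and the path is writable:

```python
# Fabricated item carrying just the fields Storage.save() reads.
item = {
    'id': 1,
    'full_name': 'octo/demo',
    'stargazers_count': 120,
    'html_url': 'https://github.com/octo/demo',
    'created_at': '2020-01-01T00:00:00Z',
}

store = Storage('/tmp/repos.jsonl')   # fresh (or pre-existing) JSONL file
assert store.save(item) is True       # first sighting: appended
assert store.save(item) is False      # duplicate id: skipped

store2 = Storage('/tmp/repos.jsonl')  # a restart reloads persisted state
assert 1 in store2.visited_ids
assert store2.min_stars_seen == 120
```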
+ class Searcher:
+     """Main searcher logic controller."""
+
+     def __init__(self, token: str, lang: str, min_stars: int, output: str):
+         self.client = GitHubClient(token)
+         self.storage = Storage(output)
+         self.lang = lang
+         self.min_stars = min_stars
+         self.current_max_stars: int | None = None
+
+     def run(self):
+         # Freshness check removed (handled by the HTTP cache)
+         stats = SearchStats()
+
+         with Progress(
+             SpinnerColumn(),
+             TextColumn('[bold blue]{task.description}'),
+             TextColumn('•'),
+             TextColumn('[bold yellow]{task.fields[status]}'),
+             TextColumn('•'),
+             TextColumn('[bold green]{task.completed} repos'),
+             TextColumn('•'),
+             TextColumn('[cyan]Stars: {task.fields[stars]}'),
+             TextColumn('•'),
+             TimeElapsedColumn(),
+             console=console,
+         ) as progress:
+             task = progress.add_task(
+                 '[green]Crawling...',
+                 total=None,
+                 status='Init',
+                 stars='N/A',
+             )
+
+             # 0. Completeness Check
+             if self.storage.min_stars_seen <= self.min_stars:
+                 logger.info(
+                     'Search already complete for this threshold.',
+                     min_stars_required=self.min_stars,
+                     min_stars_found=self.storage.min_stars_seen,
+                 )
+                 return
+
+             while True:
+                 # 1. Determine Query Range
+                 if self.current_max_stars is None:
+                     query = f"language:{self.lang} stars:>{self.min_stars}"
+                     desc = f"> {self.min_stars}"
+                 else:
+                     query = f"language:{self.lang} stars:{self.min_stars}..{self.current_max_stars}"
+                     desc = f"{self.min_stars}..{self.current_max_stars}"
+
+                 progress.update(
+                     task,
+                     stars=desc,
+                     status='Scanning',
+                 )
+
+                 # 2. Execute Batch
+                 batch_items = []
+                 min_stars_in_batch = float('inf')
+
+                 for item in self.client.search_repositories(query, task, progress, stats):
+                     batch_items.append(item)
+                     stars = item['stargazers_count']
+                     min_stars_in_batch = min(min_stars_in_batch, stars)
+
+                     if self.storage.save(item):
+                         progress.advance(task)
+                         stats.repos_saved += 1
+                         progress.console.print(
+                             f" [green]★[/] [bold]{item['full_name']}[/] "
+                             f"[dim]({stars:,} stars)[/]",
+                         )
+
+                 # 3. Analyze Batch for Next Cursor
+                 count = len(batch_items)
+                 if count == 0:
+                     logger.info('No more results. Done!')
+                     break
+
+                 if count < 1000:
+                     # Fewer results than the 1000-result cap came back,
+                     # so this specific query range is exhausted.
+                     if self.current_max_stars is None or min_stars_in_batch <= self.min_stars:
+                         logger.info(
+                             f"Batch ({count}) < 1000 and hit floor. Done.",
+                         )
+                         break
+                     else:
+                         # Move cursor down safely
+                         self.current_max_stars = min_stars_in_batch - 1
+                 else:
+                     # We hit the 1000 limit. Check for a "Star Wall":
+                     # an entire batch stuck at a single star count.
+                     if self.current_max_stars is not None and min_stars_in_batch == self.current_max_stars:
+                         logger.warning(
+                             f"Dense Star Wall at {min_stars_in_batch}★. Switching to Time Slicing...",
+                         )
+                         self._process_time_slice(
+                             min_stars_in_batch, task, progress, stats,
+                         )
+                         self.current_max_stars = min_stars_in_batch - 1
+                     else:
+                         # Normal cursor movement
+                         self.current_max_stars = min_stars_in_batch
+
+                 # Boundary Check
+                 if self.current_max_stars is not None and self.current_max_stars < self.min_stars:
+                     logger.info(
+                         'Reached minimum star threshold. Done.',
+                     )
+                     break
+
+         # Print Summary Table (End of Run)
+         elapsed_time = time.time() - stats.start_time
+         table = Table(title='Search Summary')
+         table.add_column('Metric', style='cyan')
+         table.add_column('Value', style='magenta')
+
+         table.add_row('Total API Requests', str(stats.api_requests))
+         table.add_row('API Cache Hits', str(stats.cache_hits))
+         table.add_row('Repos Discovered', str(stats.repos_found))
+         table.add_row('New Repos Saved', str(stats.repos_saved))
+         table.add_row('Total Duration', f"{elapsed_time:.2f}s")
+
+         console.print(table)
+
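The cursor arithmetic in step 3 above is the heart of the crawl, so here is a compact mirror of it with a fabricated dry run; `next_cursor` is a hypothetical helper written for illustration (it omits the final boundary check), not part of the package.

```python
def next_cursor(count, min_in_batch, cur_max, floor):
    # Simplified mirror of the step-3 branching in Searcher.run();
    # returns the next current_max_stars, or None when the crawl ends.
    if count == 0:
        return None                  # no results at all
    if count < 1000:
        if cur_max is None or min_in_batch <= floor:
            return None              # range exhausted
        return min_in_batch - 1      # safe descent
    if cur_max is not None and min_in_batch == cur_max:
        return min_in_batch - 1      # Star Wall: time-slice, then skip past it
    return min_in_batch              # normal, overlap-tolerant descent

# Fabricated batches: counts and star values are invented.
assert next_cursor(1000, 4200, None, 100) == 4200   # first window capped
assert next_cursor(1000, 1800, 4200, 100) == 1800   # keep descending
assert next_cursor(640, 100, 1800, 100) is None     # floor reached
```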
+     def _process_time_slice(self, stars: int, task_id: TaskID, progress: Progress, stats: SearchStats):
+         """Handles dense star counts by slicing via 'created' date."""
+         start_dt = datetime.datetime(2008, 1, 1)
+         end_dt = datetime.datetime.now()
+         stack = [(start_dt, end_dt)]
+
+         while stack:
+             s, e = stack.pop()
+             date_range = f"{s.strftime('%Y-%m-%d')}..{e.strftime('%Y-%m-%d')}"
+             query = f"language:{self.lang} stars:{stars} created:{date_range}"
+
+             progress.update(
+                 task_id,
+                 status='Time Slice',
+                 stars=f"{stars}★ [{date_range}]",
+             )
+
+             items = list(
+                 self.client.search_repositories(
+                     query, task_id, progress, stats,
+                 ),
+             )
+
+             if len(items) >= 1000 and (e - s) > datetime.timedelta(days=1):
+                 # Too many results: bisect the time range. The halves may
+                 # overlap on the boundary day (queries are day-granular),
+                 # but Storage deduplicates by id. Ranges narrower than a
+                 # day are processed as-is to avoid splitting forever.
+                 mid_ts = s.timestamp() + (e.timestamp() - s.timestamp()) / 2
+                 mid = datetime.datetime.fromtimestamp(mid_ts)
+                 stack.append((mid + datetime.timedelta(seconds=1), e))
+                 stack.append((s, mid))
+             else:
+                 # Process results
+                 for item in items:
+                     if self.storage.save(item):
+                         progress.advance(task_id)
+                         stats.repos_saved += 1
+
+
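The split itself is a plain midpoint bisection on POSIX timestamps. A small sketch with arbitrary dates, showing how one oversized range becomes two adjoining halves:

```python
import datetime

def split_range(s: datetime.datetime, e: datetime.datetime):
    # Illustrative copy of the midpoint split in _process_time_slice.
    mid_ts = s.timestamp() + (e.timestamp() - s.timestamp()) / 2
    mid = datetime.datetime.fromtimestamp(mid_ts)
    return (s, mid), (mid + datetime.timedelta(seconds=1), e)

first, second = split_range(
    datetime.datetime(2008, 1, 1), datetime.datetime(2024, 1, 1),
)
print(first[1].date(), second[0].date())  # halves meet near 2016-01-01
```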
+ def main(
+     token: str | None = typer.Option(
+         None, envvar='GITHUB_TOKEN', help='GitHub Token',
+     ),
+     language: Language | None = typer.Option(
+         None, help='Target Programming Language (default: all)',
+     ),
+     min_stars: int | None = typer.Option(None, help='Minimum Star Count'),
+     output_path_arg: str | None = typer.Option(
+         None, '--output', help='Output JSONL Path',
+     ),
+ ):
+     """
+     Collect repository links from GitHub.
+     Crawls repositories by star count, using cursor slicing to bypass the 1000-item search limit.
+     """
+     # Load config
+     config = get_config()
+
+     # Use config defaults if not provided
+     if token is None:
+         token = config.github.token
+     if min_stars is None:
+         min_stars = config.github.default_min_stars
+
+     if not token:
+         console.print(
+             '[bold red]Error:[/] GITHUB_TOKEN is not set.\n\n'
+             'The GitHub Search API requires authentication. '
+             'Please set the GITHUB_TOKEN environment variable:\n\n'
+             ' [cyan]export GITHUB_TOKEN="your_github_token"[/]\n\n'
+             'Or add it to your [cyan].env[/] file:\n\n'
+             ' [cyan]GITHUB_TOKEN=your_github_token[/]\n\n'
+             'You can create a fine-grained token at: '
+             '[link=https://github.com/settings/personal-access-tokens/new]'
+             'https://github.com/settings/personal-access-tokens/new[/link]\n\n'
+             '[dim]Note: Fine-grained tokens have read-only access to public repos by default.[/dim]',
+         )
+         raise typer.Exit(1)
+
+     if language is None:
+         logger.warning('No language specified. Crawling ALL languages...')
+         target_languages = list(Language)
+     else:
+         target_languages = [language]
+
+     for lang in target_languages:
+         # Determine output path for this language
+         if output_path_arg is None:
+             current_output = str(config.paths.get_repo_list_path(str(lang)))
+         else:
+             current_output = output_path_arg
+
+         logger.info(
+             'Starting Search',
+             language=str(lang),
+             min_stars=min_stars,
+             output=current_output,
+         )
+
+         try:
+             searcher = Searcher(token, lang, min_stars, current_output)
+             searcher.run()
+         except KeyboardInterrupt:
+             logger.warning('Aborted by user.')
+             raise typer.Exit(1)
+         except Exception as e:
+             logger.exception(f"Fatal Error processing {lang}: {e}")
+             continue
+
+
+ if __name__ == '__main__':
+     typer.run(main)
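
Because the module only calls `typer.run(main)` under `__main__`, exercising the CLI from code means wrapping `main` in an explicit app first. A hypothetical smoke test via Typer's `CliRunner`; the module path, the language value, and the star threshold are all assumptions, and with a real token in the environment this performs live API calls.

```python
# Hypothetical smoke test; wraps main() in a Typer app so that
# typer.testing.CliRunner can invoke it.
import typer
from typer.testing import CliRunner

from chatsbom.search import main  # assumed module path

app = typer.Typer()
app.command()(main)

runner = CliRunner()
result = runner.invoke(
    # Assumes the Language enum accepts the value 'python'.
    app, ['--language', 'python', '--min-stars', '50000'],
)
print(result.exit_code)
print(result.output)
```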