chatsbom 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatsbom/__init__.py +0 -0
- chatsbom/__main__.py +27 -0
- chatsbom/commands/__init__.py +1 -0
- chatsbom/commands/chat.py +297 -0
- chatsbom/commands/collect.py +453 -0
- chatsbom/commands/convert.py +263 -0
- chatsbom/commands/download.py +293 -0
- chatsbom/commands/index.py +327 -0
- chatsbom/commands/query.py +174 -0
- chatsbom/commands/status.py +223 -0
- chatsbom/core/__init__.py +1 -0
- chatsbom/core/clickhouse.py +98 -0
- chatsbom/core/client.py +54 -0
- chatsbom/core/config.py +145 -0
- chatsbom/core/repository.py +327 -0
- chatsbom/core/schema.py +31 -0
- chatsbom/core/validation.py +149 -0
- chatsbom/models/__init__.py +0 -0
- chatsbom/models/framework.py +129 -0
- chatsbom/models/language.py +167 -0
- chatsbom-0.2.1.dist-info/METADATA +125 -0
- chatsbom-0.2.1.dist-info/RECORD +24 -0
- chatsbom-0.2.1.dist-info/WHEEL +4 -0
- chatsbom-0.2.1.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,453 @@
import datetime
import json
import os
import time
from collections.abc import Generator
from dataclasses import dataclass
from dataclasses import field

import dotenv
import requests
import structlog
import typer
from rich.console import Console
from rich.progress import Progress
from rich.progress import SpinnerColumn
from rich.progress import TaskID
from rich.progress import TextColumn
from rich.progress import TimeElapsedColumn
from rich.table import Table

from chatsbom.core.client import get_http_client
from chatsbom.core.config import get_config
from chatsbom.models.language import Language

dotenv.load_dotenv()
logger = structlog.get_logger('Searcher')
console = Console()


@dataclass
class SearchStats:
    api_requests: int = 0
    cache_hits: int = 0
    repos_found: int = 0
    repos_saved: int = 0
    start_time: float = field(default_factory=time.time)


class GitHubClient:
    """Handles GitHub API interaction, authentication, and rate limiting."""

    def __init__(self, token: str, delay: float = 2.0):
        self.session = get_http_client()
        self.session.headers.update({
            'Authorization': f"Bearer {token}",
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'SBOM-Insight',
        })
        self.delay = delay
        self.last_req_time = 0.0

    def search_repositories(self, query: str, task_id: TaskID, progress: Progress, stats: SearchStats) -> Generator[dict, None, None]:
        """
        Iterates through pagination (pages 1-10) for a given query.
        Handles API rate limits automatically.
        """
        page = 1
        # GitHub API Search limit: 1000 results (10 pages * 100)
        max_pages = 10

        while page <= max_pages:
            self._wait_for_rate_limit()

            url = 'https://api.github.com/search/repositories'
            params = {
                'q': query,
                'sort': 'stars',
                'order': 'desc',
                'per_page': '100',
                'page': str(page),
            }

            try:
                progress.update(
                    task_id, status=f"Page {page}",
                )
                start_time = time.time()
                resp = self.session.get(url, params=params, timeout=20)
                elapsed = time.time() - start_time
                is_cached = getattr(resp, 'from_cache', False)

                if not is_cached:
                    self.last_req_time = time.time()
                    logger.info(
                        'API Request',
                        page=page,
                        status=resp.status_code,
                        elapsed=f"{elapsed:.2f}s",
                        url=resp.url,
                        query=query,
                    )
                else:
                    stats.cache_hits += 1
                    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    log_msg = (
                        f"{timestamp} \\[info ] API Request "
                        f"elapsed={elapsed:.2f}s page={page} "
                        f"query='{query}' status={resp.status_code} "
                        f"url='{resp.url}' [green](Cached)[/green]"
                    )
                    console.print(f"[dim]{log_msg}[/dim]")

                stats.api_requests += 1

                if resp.status_code == 200:
                    data = resp.json()
                    items = data.get('items', [])
                    if not items:
                        return
                    stats.repos_found += len(items)
                    yield from items
                    if len(items) < 100:  # End of results
                        return
                    page += 1

                elif resp.status_code in [403, 429]:
                    self._handle_api_limit(resp, task_id, progress)
                elif resp.status_code == 422:
                    logger.error(
                        'API 422 Error (Unprocessable Entity). Stopping current query.',
                    )
                    return
                else:
                    logger.error(f"API Error {resp.status_code}: {resp.text}")
                    return

            except requests.RequestException as e:
                logger.error(f"Network error: {e}. Retrying in 5s...")
                time.sleep(5)

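    # NOTE (editorial): `from_cache` is not an attribute of a plain
    # requests.Response. The check above assumes get_http_client() (defined in
    # chatsbom/core/client.py, not shown in this diff) returns a caching
    # session such as requests_cache.CachedSession, whose responses carry
    # from_cache=True when replayed locally. A minimal sketch of that
    # assumption:
    #
    #     import requests_cache
    #     def get_http_client():
    #         return requests_cache.CachedSession('github', expire_after=3600)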
    def _wait_for_rate_limit(self):
        """Token bucket style local rate limiting."""
        gap = time.time() - self.last_req_time
        if gap < self.delay:
            time.sleep(self.delay - gap)

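    # NOTE (editorial): despite the "token bucket" docstring, this is a
    # fixed-gap throttle: with delay=2.0s it allows at most ~30 live requests
    # per minute, matching the GitHub Search API's documented limit of 30
    # requests/minute for authenticated users. Worked example: if the last
    # live request was 0.5s ago, the next call sleeps 2.0 - 0.5 = 1.5s.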
    def _handle_api_limit(self, resp: requests.Response, task_id: TaskID, progress: Progress):
        """Handles 403/429 responses by waiting until reset."""
        reset_time = int(
            resp.headers.get(
                'X-RateLimit-Reset', time.time() + 60,
            ),
        )
        wait_seconds = max(60, reset_time - int(time.time())) + 2

        logger.warning(f"Rate limit triggered. Waiting {wait_seconds}s...")
        for i in range(wait_seconds, 0, -1):
            progress.update(
                task_id, status=f"[bold red]Limit {i}s",
            )
            time.sleep(1)


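# NOTE (editorial): X-RateLimit-Reset is GitHub's documented rate-limit header
# carrying the UTC epoch second at which the current quota window resets.
# Worked example: if the reset is 90s away, wait_seconds = max(60, 90) + 2 = 92;
# if it is only 10s away, the floor still enforces max(60, 10) + 2 = 62s.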
class Storage:
    """Manages file persistence and deduplication."""

    def __init__(self, filepath: str):
        self.filepath = filepath
        self.visited_ids: set[int] = set()
        self.min_stars_seen: float = float('inf')
        self._load_existing()

    def _load_existing(self):
        if not os.path.exists(self.filepath):
            return

        count = 0
        try:
            with open(self.filepath, encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        try:
                            data = json.loads(line)
                            self.visited_ids.add(data['id'])

                            # Track minimum stars seen. Saved records use the
                            # key 'stars' (see save() below), not GitHub's raw
                            # 'stargazers_count'.
                            stars = data.get('stars', float('inf'))
                            self.min_stars_seen = min(
                                self.min_stars_seen, stars,
                            )

                            count += 1
                        except json.JSONDecodeError:
                            pass
            logger.info(
                f"Loaded {count} existing records. Min stars: {self.min_stars_seen}",
            )
        except Exception as e:
            logger.error(f"Failed to load existing data: {e}")

    def save(self, item: dict) -> bool:
        """Saves an item if it hasn't been seen before. Returns True if saved."""
        if item['id'] in self.visited_ids:
            return False

        self.visited_ids.add(item['id'])

        record = {
            'id': item['id'],
            'full_name': item['full_name'],
            'stars': item['stargazers_count'],
            'url': item['html_url'],
            'created_at': item['created_at'],
            'default_branch': item.get('default_branch', 'main'),
            'description': item.get('description', ''),
            'topics': item.get('topics', []),
        }

        with open(self.filepath, 'a', encoding='utf-8') as f:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
            f.flush()
        return True


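# NOTE (editorial): the output is JSON Lines -- one repo record per line,
# appended and flushed immediately, so an interrupted crawl loses at most the
# current record. A minimal sketch of consuming it (hypothetical path):
#
#     import json
#     with open('data/python.jsonl', encoding='utf-8') as f:
#         repos = [json.loads(line) for line in f if line.strip()]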
class Searcher:
    """Main searcher logic controller."""

    def __init__(self, token: str, lang: str, min_stars: int, output: str):
        self.client = GitHubClient(token)
        self.storage = Storage(output)
        self.lang = lang
        self.min_stars = min_stars
        self.current_max_stars: int | None = None

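    # NOTE (editorial): run() works around the Search API's hard cap of 1000
    # results per query by sliding a star-count cursor. Worked example with
    # min_stars=100: the first query is `stars:>100`, sorted by stars
    # descending; if the least-starred repo in that 1000-item batch has 4321
    # stars, the next query becomes `stars:100..4321`, and so on until the
    # cursor drops below the floor or a batch comes back short.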
    def run(self):
        # Freshness Check removed (handled by HTTP cache)
        stats = SearchStats()

        with Progress(
            SpinnerColumn(),
            TextColumn('[bold blue]{task.description}'),
            TextColumn('•'),
            TextColumn('[bold yellow]{task.fields[status]}'),
            TextColumn('•'),
            TextColumn('[bold green]{task.completed} repos'),
            TextColumn('•'),
            TextColumn('[cyan]Values: {task.fields[stars]}'),
            TextColumn('•'),
            TimeElapsedColumn(),
            console=console,
        ) as progress:
            task = progress.add_task(
                '[green]Crawling...',
                total=None,
                status='Init',
                stars='N/A',
            )

            # 0. Completeness Check
            if self.storage.min_stars_seen <= self.min_stars:
                logger.info(
                    'Search already complete for this threshold.',
                    min_stars_required=self.min_stars,
                    min_stars_found=self.storage.min_stars_seen,
                )
                return

            while True:
                # 1. Determine Query Range
                if self.current_max_stars is None:
                    query = f"language:{self.lang} stars:>{self.min_stars}"
                    desc = f"> {self.min_stars}"
                else:
                    query = f"language:{self.lang} stars:{self.min_stars}..{self.current_max_stars}"
                    desc = f"{self.min_stars}..{self.current_max_stars}"

                progress.update(
                    task,
                    stars=desc,
                    status='Scanning',
                )

                # 2. Execute Batch
                batch_items = []
                min_stars_in_batch = float('inf')

                for item in self.client.search_repositories(query, task, progress, stats):
                    batch_items.append(item)
                    stars = item['stargazers_count']
                    min_stars_in_batch = min(min_stars_in_batch, stars)

                    if self.storage.save(item):
                        progress.advance(task)
                        stats.repos_saved += 1
                        progress.console.print(
                            f"  [green]★[/] [bold]{item['full_name']}[/] "
                            f"[dim]({stars:,} stars)[/]",
                        )

                # 3. Analyze Batch for Next Cursor
                count = len(batch_items)
                if count == 0:
                    logger.info('[bold green]No more results. Done!')
                    break

                if count < 1000:
                    # If we haven't hit the bottom star limit, but returned <1000,
                    # it implies we exhausted this specific query range.
                    if self.current_max_stars is None or min_stars_in_batch <= self.min_stars:
                        logger.info(
                            f"Batch ({count}) < 1000 and hit floor. Done.",
                        )
                        break
                    else:
                        # Move cursor down safely
                        self.current_max_stars = min_stars_in_batch - 1
                else:
                    # We hit the 1000 limit. Check for "Star Wall"
                    if self.current_max_stars is not None and min_stars_in_batch == self.current_max_stars:
                        logger.warning(
                            f"Dense Star Wall at {min_stars_in_batch}★. Switching to Time Slicing...",
                        )
                        self._process_time_slice(
                            min_stars_in_batch, task, progress, stats,
                        )
                        self.current_max_stars = min_stars_in_batch - 1
                    else:
                        # Normal cursor movement
                        self.current_max_stars = min_stars_in_batch

                # Boundary Check
                if self.current_max_stars is not None and self.current_max_stars < self.min_stars:
                    logger.info(
                        '[bold green]Reached minimum star threshold. Done.',
                    )
                    break

        # Print Summary Table (End of Run)
        elapsed_time = time.time() - stats.start_time
        table = Table(title='Search Summary')
        table.add_column('Metric', style='cyan')
        table.add_column('Value', style='magenta')

        table.add_row('Total API Requests', str(stats.api_requests))
        table.add_row('API Cache Hits', str(stats.cache_hits))
        table.add_row('Repos Discovered', str(stats.repos_found))
        table.add_row('New Repos Saved', str(stats.repos_saved))
        table.add_row('Total Duration', f"{elapsed_time:.2f}s")

        console.print(table)

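    # NOTE (editorial): the "Star Wall" case above covers star counts shared
    # by more than 1000 repos, where no star range can narrow further. Worked
    # example: if >1000 repos have exactly 123 stars, `stars:123` alone is
    # uncrawlable; the method below bisects on creation date until each
    # `stars:123 created:A..B` slice returns fewer than 1000 results.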
    def _process_time_slice(self, stars: int, task_id: TaskID, progress: Progress, stats: SearchStats):
        """Handles dense star counts by slicing via 'created' date."""
        start_dt = datetime.datetime(2008, 1, 1)
        end_dt = datetime.datetime.now()
        stack = [(start_dt, end_dt)]

        while stack:
            s, e = stack.pop()
            date_range = f"{s.strftime('%Y-%m-%d')}..{e.strftime('%Y-%m-%d')}"
            query = f"language:{self.lang} stars:{stars} created:{date_range}"

            progress.update(
                task_id,
                status='Time Slice',
                stars=f"{stars}★ [{date_range}]",
            )

            items = list(
                self.client.search_repositories(
                    query, task_id, progress, stats,
                ),
            )

            if len(items) >= 1000:
                # Too many results, split time range
                mid_ts = s.timestamp() + (e.timestamp() - s.timestamp()) / 2
                mid = datetime.datetime.fromtimestamp(mid_ts)
                stack.append((mid + datetime.timedelta(seconds=1), e))
                stack.append((s, mid))
            else:
                # Process results
                for item in items:
                    if self.storage.save(item):
                        progress.advance(task_id)
                        stats.repos_saved += 1


def main(
    token: str | None = typer.Option(
        None, envvar='GITHUB_TOKEN', help='GitHub Token',
    ),
    language: Language | None = typer.Option(
        None, help='Target Programming Language (default: all)',
    ),
    min_stars: int | None = typer.Option(None, help='Minimum Star Count'),
    output_path_arg: str | None = typer.Option(
        None, '--output', help='Output JSONL Path',
    ),
):
    """
    Collect repository links from GitHub.

    Crawls repositories by star count, using cursor slicing to bypass the 1000-item limit.
    """
    # Load config
    config = get_config()

    # Use config defaults if not provided
    if token is None:
        token = config.github.token
    if min_stars is None:
        min_stars = config.github.default_min_stars

    if not token:
        console.print(
            '[bold red]Error:[/] GITHUB_TOKEN is not set.\n\n'
            'The GitHub Search API requires authentication. '
            'Please set the GITHUB_TOKEN environment variable:\n\n'
            '  [cyan]export GITHUB_TOKEN="your_github_token"[/]\n\n'
            'Or add it to your [cyan].env[/] file:\n\n'
            '  [cyan]GITHUB_TOKEN=your_github_token[/]\n\n'
            'You can create a fine-grained token at: '
            '[link=https://github.com/settings/personal-access-tokens/new]'
            'https://github.com/settings/personal-access-tokens/new[/link]\n\n'
            '[dim]Note: Fine-grained tokens have read-only access to public repos by default.[/dim]',
        )
        raise typer.Exit(1)

    if language is None:
        logger.warning('No language specified. Crawling ALL languages...')
        target_languages = list(Language)
    else:
        target_languages = [language]

    for lang in target_languages:

        # Determine output path for this language
        if output_path_arg is None:
            current_output = str(config.paths.get_repo_list_path(str(lang)))
        else:
            current_output = output_path_arg

        logger.info(
            'Starting Search',
            language=str(lang),
            min_stars=min_stars,
            output=current_output,
        )

        try:
            searcher = Searcher(token, lang, min_stars, current_output)
            searcher.run()
        except KeyboardInterrupt:
            logger.warning('Aborted by user.')
            raise typer.Exit(1)
        except Exception as e:
            logger.exception(f"Fatal Error processing {lang}: {e}")
            continue


if __name__ == '__main__':
    typer.run(main)
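
Editorial note: the wheel ships a chatsbom/__main__.py and an entry_points.txt (contents not shown in this diff). Assuming this module is wired up as a `collect` subcommand -- suggested by its path chatsbom/commands/collect.py, but not confirmed here -- an invocation might look like:

    python -m chatsbom collect --language python --min-stars 100 --output repos.jsonl

The --language, --min-stars, and --output flags follow from typer's parameter-name conversion of main()'s signature; valid language values come from chatsbom/models/language.py. GITHUB_TOKEN can be supplied via the environment or a .env file, as the error message in main() explains.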