chatsbom 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,263 @@
+ import concurrent.futures
+ import subprocess
+ import time
+ from dataclasses import dataclass
+ from datetime import datetime
+ from pathlib import Path
+
+ import structlog
+ import typer
+ from rich.console import Console
+ from rich.progress import BarColumn
+ from rich.progress import Progress
+ from rich.progress import SpinnerColumn
+ from rich.progress import TextColumn
+ from rich.progress import TimeElapsedColumn
+ from rich.table import Table
+
+ from chatsbom.models.language import Language
+
+ logger = structlog.get_logger('converter')
+ console = Console()
+
+
+ @dataclass
+ class ConvertResult:
+     project_path: str
+     status_msg: str
+     converted: int = 0
+     skipped: int = 0
+     failed: int = 0
+
+
+ def find_project_dirs(base_dir: Path, language: Language | None = None) -> list[Path]:
+     """
+     Finds leaf project directories.
+     Structure: base_dir / language / owner / repo / branch / [files]
+     We assume the 'branch' directory is the project root.
+     """
+     projects = []
+     # base_dir / lang / owner / repo / branch
+     if not base_dir.exists():
+         return []
+
+     # Iterate languages
+     for lang_dir in base_dir.iterdir():
+         if not lang_dir.is_dir():
+             continue
+
+         # Filter by language if specified
+         if language and lang_dir.name != language.value:
+             continue
+
+         # Iterate owners
+         for owner_dir in lang_dir.iterdir():
+             if not owner_dir.is_dir():
+                 continue
+
+             # Iterate repos
+             for repo_dir in owner_dir.iterdir():
+                 if not repo_dir.is_dir():
+                     continue
+
+                 # Iterate branches (project roots)
+                 for branch_dir in repo_dir.iterdir():
+                     if branch_dir.is_dir():
+                         projects.append(branch_dir)
+     return projects
+
+
+ def convert_project(project_dir: Path, output_format: str, overwrite: bool) -> ConvertResult:
+     """Runs syft on a project directory."""
+     output_file = project_dir / 'sbom.json'
+     stats = ConvertResult(project_path=str(project_dir), status_msg='')
+
+     if output_file.exists() and not overwrite:
+         stats.skipped += 1
+         stats.status_msg = '[dim]Skip[/dim]'
+
+         # Log skipped/cached
+         timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+         log_msg = (
+             f"{timestamp} \\[info ] SBOM Generated "
+             f"elapsed=0.00s output={output_file} "
+             f"project={project_dir} size={output_file.stat().st_size} "
+             f"[green](Cached)[/green]"
+         )
+         console.print(f"[dim]{log_msg}[/dim]")
+
+         return stats
+
+     # syft dir:. -o json
+     cmd = [
+         'syft',
+         f"dir:{project_dir.absolute()}",
+         '-o', output_format,
+     ]
+
+     try:
+         # Capture output to avoid polluting CLI
+         start_time = time.time()
+         result = subprocess.run(
+             cmd,
+             capture_output=True,
+             text=True,
+             check=True,
+         )
+         elapsed = time.time() - start_time
+
+         # Write output to file
+         with open(output_file, 'w', encoding='utf-8') as f:
+             f.write(result.stdout)
+
+         logger.info(
+             'SBOM Generated',
+             project=str(project_dir),
+             output=str(output_file),
+             size=len(result.stdout),
+             elapsed=f"{elapsed:.2f}s",
+         )
+         stats.converted += 1
+         stats.status_msg = '[green]Done[/green]'
+         return stats
+     except subprocess.CalledProcessError as e:
+         logger.error(f"Syft failed for {project_dir}: {e.stderr}")
+         stats.failed += 1
+         stats.status_msg = '[red]Fail[/red]'
+         return stats
+     except Exception as e:
+         logger.error(f"Error {project_dir}: {e}")
+         stats.failed += 1
+         stats.status_msg = '[red]Err[/red]'
+         return stats
+
+
+ def main(
+     input_dir: str = typer.Option(
+         'data', help='Root data directory',
+     ),
+     concurrency: int = typer.Option(
+         4, help='Number of concurrent syft processes',
+     ),
+     output_format: str = typer.Option(
+         'json', '--format', help='Syft output format (json, spdx-json, cyclonedx-json)',
+     ),
+     overwrite: bool = typer.Option(
+         False, help='Overwrite existing SBOM files',
+     ),
+     limit: int | None = typer.Option(
+         None, help='Limit number of projects to convert (for testing)',
+     ),
+     language: Language | None = typer.Option(
+         None, help='Filter by Language (default: all)',
+     ),
+ ):
+     """
+     Convert downloaded project manifests to SBOMs using Syft.
+     """
+     # Check if syft is installed
+     try:
+         subprocess.run(
+             ['syft', 'version'],
+             capture_output=True,
+             check=True,
+         )
+     except FileNotFoundError:
+         console.print(
+             '[bold red]Error:[/] syft is not installed.\n\n'
+             'The convert-sbom command requires Syft to generate SBOMs. '
+             'Please install Syft:\n\n'
+             ' [cyan]# macOS / Linux (Homebrew)[/]\n'
+             ' [cyan]brew tap anchore/syft[/]\n'
+             ' [cyan]brew install syft[/]\n\n'
+             ' [cyan]# Or via install script[/]\n'
+             ' [cyan]curl -sSfL https://get.anchore.io/syft | sudo sh -s -- -b /usr/local/bin[/]\n\n'
+             'For more options, visit: '
+             '[link=https://github.com/anchore/syft?tab=readme-ov-file#installation]'
+             'https://github.com/anchore/syft?tab=readme-ov-file#installation[/link]',
+         )
+         raise typer.Exit(1)
+     except subprocess.CalledProcessError:
+         console.print(
+             '[bold yellow]Warning:[/] Could not verify syft version, proceeding anyway.',
+         )
+
+     base_path = Path(input_dir)
+     if not base_path.exists():
+         logger.error(f"Input directory not found: {base_path}")
+         raise typer.Exit(1)
+
+     if limit:
+         logger.warning(f"Test Mode: Limiting to top {limit} projects")
+
+     projects = find_project_dirs(base_path, language)
+     if limit:
+         projects = projects[:limit]
+
+     logger.info(
+         'Starting SBOM Conversion',
+         input_dir=input_dir,
+         concurrency=concurrency,
+         format=output_format,
+         overwrite=overwrite,
+         found_projects=len(projects),
+     )
+
+     with Progress(
+         SpinnerColumn(),
+         TextColumn('[bold blue]{task.description}'),
+         BarColumn(),
+         TextColumn('[progress.percentage]{task.percentage:>3.0f}%'),
+         TextColumn('•'),
+         TextColumn('[green]{task.completed}/{task.total}'),
+         TextColumn('•'),
+         TextColumn('[dim]{task.fields[status]}'),
+         TextColumn('•'),
+         TimeElapsedColumn(),
+         console=console,
+     ) as progress:
+
+         overall_stats = {
+             'converted': 0,
+             'skipped': 0,
+             'failed': 0,
+             'total': len(projects),
+         }
+         start_time = time.time()
+
+         task = progress.add_task(
+             'Converting...',
+             total=len(projects),
+             status='Starting',
+         )
+
+         with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
+             future_to_project = {
+                 executor.submit(convert_project, p, output_format, overwrite): p
+                 for p in projects
+             }
+
+             for future in concurrent.futures.as_completed(future_to_project):
+                 result = future.result()
+                 overall_stats['converted'] += result.converted
+                 overall_stats['skipped'] += result.skipped
+                 overall_stats['failed'] += result.failed
+                 progress.update(task, advance=1, status=result.status_msg)
+
+     # Print Summary Table
+     total_time = time.time() - start_time
+     table = Table(title='Conversion Summary')
+     table.add_column('Metric', style='cyan')
+     table.add_column('Value', style='magenta')
+
+     table.add_row('Total Projects', str(overall_stats['total']))
+     table.add_row('Converted (Success)', str(overall_stats['converted']))
+     table.add_row('Skipped (Exists)', str(overall_stats['skipped']))
+     table.add_row('Failed', str(overall_stats['failed']))
+     table.add_row('Total Duration', f"{total_time:.2f}s")
+
+     console.print(table)
+
+
+ if __name__ == '__main__':
+     typer.run(main)
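
As a reading aid, the converter above can also be driven without its Typer CLI by importing the two helpers directly. The snippet below is a minimal sketch, not part of the package: the import path chatsbom.cli.convert is an assumption (the diff shows file contents, not where they sit inside the wheel), and it presumes syft is on PATH.

# Minimal smoke-test sketch; 'chatsbom.cli.convert' is a hypothetical module path.
from pathlib import Path

from chatsbom.cli.convert import convert_project, find_project_dirs

projects = find_project_dirs(Path('data'))  # expects data/<language>/<owner>/<repo>/<branch>/
for project in projects[:3]:  # keep the smoke test small
    result = convert_project(project, output_format='json', overwrite=False)
    print(result.project_path, result.status_msg, result.converted, result.skipped, result.failed)

Each call shells out to syft exactly as convert_project does from the CLI, writing sbom.json next to the project's manifest files.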
@@ -0,0 +1,293 @@
+ import concurrent.futures
+ import json
+ import os
+ import time
+ from dataclasses import dataclass
+ from datetime import datetime
+ from pathlib import Path
+
+ import dotenv
+ import requests
+ import structlog
+ import typer
+ from rich.console import Console
+ from rich.progress import BarColumn
+ from rich.progress import Progress
+ from rich.progress import SpinnerColumn
+ from rich.progress import TextColumn
+ from rich.table import Table
+
+ from chatsbom.core.client import get_http_client
+ from chatsbom.models.language import Language
+ from chatsbom.models.language import LanguageFactory
+
+ dotenv.load_dotenv()
+ console = Console()
+ logger = structlog.get_logger('downloader')
+
+
+ @dataclass
+ class DownloadResult:
+     repo: str
+     status_msg: str
+     downloaded_files: int = 0
+     missing_files: int = 0
+     failed_files: int = 0
+     skipped_files: int = 0
+     cache_hits: int = 0
+
+
+ class SBOMDownloader:
41
+ """Handles concurrent downloading of SBOM files from GitHub."""
42
+
43
+ def __init__(self, token: str | None, base_dir: str, timeout: int = 10, pool_size: int = 50):
44
+ self.session = get_http_client(pool_size=pool_size)
45
+
46
+ if token:
47
+ self.session.headers.update({'Authorization': f"Bearer {token}"})
48
+
49
+ self.base_dir = Path(base_dir)
50
+ self.timeout = timeout
51
+
52
+ def download_repo(self, repo: dict, lang: Language) -> DownloadResult:
53
+ """Downloads SBOM files for a single repository."""
54
+ full_name = repo['full_name']
55
+ owner, name = full_name.split('/')
56
+ branch = repo.get('default_branch', 'master')
57
+
58
+ target_dir = self.base_dir / lang / owner / name / branch
59
+ target_dir.mkdir(parents=True, exist_ok=True)
60
+
61
+ base_url = f"https://raw.githubusercontent.com/{owner}/{name}/{branch}"
62
+ language_handler = LanguageFactory.get_handler(lang)
63
+ targets: list[str] = language_handler.get_sbom_paths()
64
+
65
+ result_msgs = []
66
+ stats = DownloadResult(repo=full_name, status_msg='')
67
+
68
+ for filename in targets:
69
+ file_path = target_dir / filename
70
+
71
+ try:
72
+ start_time = time.time()
73
+ url = f"{base_url}/{filename}"
74
+ resp = self.session.get(url, timeout=self.timeout)
75
+ elapsed = time.time() - start_time
76
+
77
+ # Check for cache hit (requests-cache adds 'from_cache' attribute)
78
+ if getattr(resp, 'from_cache', False):
79
+ stats.cache_hits += 1
80
+
81
+ if resp.status_code == 200:
82
+ file_path.parent.mkdir(parents=True, exist_ok=True)
83
+ with open(file_path, 'wb') as f:
84
+ f.write(resp.content)
85
+
86
+ # Visual Caching Indicator
87
+ is_cached = getattr(resp, 'from_cache', False)
88
+ if is_cached:
89
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
90
+ log_msg = (
91
+ f"{timestamp} \\[info ] Downloaded "
92
+ f"elapsed={elapsed:.2f}s file={filename} "
93
+ f"repo={full_name} size={len(resp.content)} "
94
+ f"url={resp.url} [green](Cached)[/green]"
95
+ )
96
+ console.print(f"[dim]{log_msg}[/dim]")
97
+ else:
98
+ logger.info(
99
+ 'Downloaded',
100
+ repo=full_name,
101
+ file=filename,
102
+ size=len(resp.content),
103
+ elapsed=f"{elapsed:.2f}s",
104
+ url=resp.url,
105
+ )
106
+
107
+ stats.downloaded_files += 1
108
+ result_msgs.append(f"[green]{filename}[/green]")
109
+ elif resp.status_code == 404:
110
+ stats.missing_files += 1
111
+ result_msgs.append(
112
+ f"[dim yellow]no {filename}[/dim yellow]",
113
+ )
114
+ else:
115
+ stats.failed_files += 1
116
+ logger.warning(
117
+ 'Download Failed',
118
+ repo=full_name,
119
+ file=filename,
120
+ status=resp.status_code,
121
+ elapsed=f"{elapsed:.2f}s",
122
+ url=url,
123
+ )
124
+
125
+ result_msgs.append(
126
+ f"[red]{filename} {resp.status_code}[/red]",
127
+ )
128
+
129
+ except requests.RequestException as e:
130
+ logger.error(
131
+ 'Download Error',
132
+ repo=full_name,
133
+ file=filename,
134
+ error=str(e),
135
+ )
136
+ stats.failed_files += 1
137
+ result_msgs.append(f"[red]{filename} Err[/red]")
138
+
139
+ if not result_msgs:
140
+ stats.status_msg = f"[dim]{full_name} skip[/dim]"
141
+ else:
142
+ stats.status_msg = f"{full_name}: {', '.join(result_msgs)}"
143
+
144
+ return stats
145
+
146
+
+ def load_targets(jsonl_path: str) -> list[dict]:
+     """Loads repository targets from a JSONL file."""
+     targets: list[dict] = []
+     path = Path(jsonl_path)
+     if not path.exists():
+         return targets
+
+     with path.open(encoding='utf-8') as f:
+         for line in f:
+             if line.strip():
+                 try:
+                     targets.append(json.loads(line))
+                 except json.JSONDecodeError:
+                     pass
+     return targets
+
+
+ def main(
+     input_file: str | None = typer.Option(
+         None, help='Input JSONL file path (default: {language}.jsonl)',
+     ),
+     output_dir: str = typer.Option(
+         'data', help='Download destination directory',
+     ),
+     language: Language | None = typer.Option(
+         None, help='Target Language (default: all)',
+     ),
+     token: str | None = typer.Option(
+         None, envvar='GITHUB_TOKEN', help='GitHub Token',
+     ),
+     concurrency: int = typer.Option(32, help='Number of concurrent threads'),
+     limit: int | None = typer.Option(
+         None, help='Limit number of processed repos (for testing)',
+     ),
+ ):
+     """
+     Download SBOM files from repositories.
+     """
+     if language is None:
+         if input_file:
+             logger.error(
+                 'Cannot specify input_file when targeting ALL languages.',
+             )
+             raise typer.Exit(1)
+         logger.warning('No language specified. Downloading ALL languages...')
+         target_languages = list(Language)
+     else:
+         target_languages = [language]
+
+     downloader = SBOMDownloader(token, output_dir, pool_size=concurrency)
+
+     with Progress(
+         SpinnerColumn(),
+         TextColumn('[bold blue]{task.description}'),
+         BarColumn(),
+         TextColumn('[progress.percentage]{task.percentage:>3.0f}%'),
+         TextColumn('•'),
+         TextColumn('[green]{task.completed}/{task.total}'),
+         TextColumn('•'),
+         TextColumn('[dim]{task.fields[status]}', justify='left'),
+         console=console,
+     ) as progress:
+
+         overall_stats = {
+             'repos': 0,
+             'downloaded': 0,
+             'missing': 0,
+             'failed': 0,
+             'cache_hits': 0,
+         }
+         start_time_all = time.time()
+
+         for lang in target_languages:
+             if input_file:
+                 target_file = input_file
+             else:
+                 target_file = f"{lang}.jsonl"
+
+             # Check if file exists, if not, skip efficiently
+             if not os.path.exists(target_file):
+                 logger.debug(
+                     f"Target file {target_file} not found. Skipping {lang}.",
+                 )
+                 continue
+
+             tasks = load_targets(target_file)
+             if not tasks:
+                 logger.warning(f"Input file empty: {target_file}. Skipping.")
+                 continue
+
+             if limit:
+                 tasks = tasks[:limit]
+
+             logger.info(
+                 'Starting Processing',
+                 language=str(lang),
+                 target_file=target_file,
+                 total_tasks=len(tasks),
+             )
+
+             main_task = progress.add_task(
+                 f'Downloading {lang}...', total=len(tasks), status='Starting...',
+             )
+
+             with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
+                 future_to_repo = {
+                     executor.submit(downloader.download_repo, repo, lang): repo
+                     for repo in tasks
+                 }
+
+                 for future in concurrent.futures.as_completed(future_to_repo):
+                     try:
+                         result = future.result()
+                         overall_stats['repos'] += 1
+                         overall_stats['downloaded'] += result.downloaded_files
+                         overall_stats['missing'] += result.missing_files
+                         overall_stats['failed'] += result.failed_files
+                         overall_stats['cache_hits'] += result.cache_hits
+                         progress.update(
+                             main_task, advance=1,
+                             status=result.status_msg,
+                         )
+                     except Exception as e:
+                         logger.error(f"Error processing repo: {e}")
+                         progress.update(
+                             main_task, advance=1,
+                             status='[red]Error[/red]',
+                         )
+
+     # Print Summary Table
+     total_time = time.time() - start_time_all
+     table = Table(title='Download Summary')
+     table.add_column('Metric', style='cyan')
+     table.add_column('Value', style='magenta')
+
+     table.add_row('Total Repos Processed', str(overall_stats['repos']))
+     table.add_row('Files Downloaded', str(overall_stats['downloaded']))
+     table.add_row('Files Missing (404)', str(overall_stats['missing']))
+     table.add_row('Failed Downloads', str(overall_stats['failed']))
+     table.add_row('Cache Hits', str(overall_stats['cache_hits']))
+     table.add_row('Total Duration', f"{total_time:.2f}s")
+
+     console.print(table)
+
+
+ if __name__ == '__main__':
+     typer.run(main)
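
Two pieces the downloader depends on are not part of this diff: chatsbom.core.client.get_http_client, whose responses carry a from_cache attribute (which points at a requests-cache style session), and the per-language handlers returned by LanguageFactory.get_handler, whose get_sbom_paths() supplies the manifest file names to fetch. The sketch below is an illustrative guess at their shape, purely as a reading aid; the package's actual implementations may differ.

# Illustrative guesses only; neither helper module appears in this diff.
from requests.adapters import HTTPAdapter
from requests_cache import CachedSession


def get_http_client(pool_size: int = 50) -> CachedSession:
    # A cached session would explain the `resp.from_cache` checks in download_repo.
    session = CachedSession('chatsbom_http_cache', expire_after=3600)
    adapter = HTTPAdapter(pool_connections=pool_size, pool_maxsize=pool_size)
    session.mount('https://', adapter)
    return session


class PythonHandler:
    """Hypothetical per-language handler; the real manifest list is not shown here."""

    def get_sbom_paths(self) -> list[str]:
        # Manifest/lock files that the converter can later feed to syft.
        return ['requirements.txt', 'pyproject.toml', 'poetry.lock', 'Pipfile.lock']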