chatsbom 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,327 @@
1
+ import json
2
+ import time
3
+ from dataclasses import dataclass
4
+ from dataclasses import field
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import clickhouse_connect
10
+ import typer
11
+ from clickhouse_connect.driver.client import Client
12
+ from rich.console import Console
13
+ from rich.progress import BarColumn
14
+ from rich.progress import MofNCompleteColumn
15
+ from rich.progress import Progress
16
+ from rich.progress import SpinnerColumn
17
+ from rich.progress import TaskProgressColumn
18
+ from rich.progress import TextColumn
19
+ from rich.progress import TimeElapsedColumn
20
+ from rich.table import Table
21
+
22
+ from chatsbom.models.language import Language
23
+
24
# Shared Rich console for all CLI output in this module.
console = Console()

# Column order for inserts into the `repositories` table; must match the DDL.
REPO_COLUMNS = [
    'id', 'owner', 'repo', 'full_name', 'url',
    'stars', 'description', 'created_at', 'language', 'topics',
]
# Column order for inserts into the `artifacts` table; must match the DDL.
ARTIFACT_COLUMNS = [
    'repository_id', 'artifact_id', 'name',
    'version', 'type', 'purl', 'found_by', 'licenses',
]
# Rows are buffered in memory and flushed to ClickHouse in batches of this size.
BATCH_SIZE = 1000
35
+
36
+
37
@dataclass
class ImportStats:
    """Mutable counters accumulated over one import run, shown in the summary."""

    # Number of jsonl files opened for import.
    files_processed: int = 0
    # Number of repository rows successfully parsed and queued for insert.
    repos_processed: int = 0
    # Total artifact rows collected from sbom.json files.
    artifacts_imported: int = 0
    # Wall-clock start of the run; used to compute the total duration.
    start_time: float = field(default_factory=time.time)
43
+
44
+
45
def get_client(host: str, port: int, user: str, password: str, database: str) -> Client:
    """Create a ClickHouse client bound to *database*.

    Args:
        host: ClickHouse server hostname.
        port: HTTP interface port.
        user: Username for authentication.
        password: Password for authentication.
        database: Database used for all subsequent queries.

    Returns:
        A connected ``clickhouse_connect`` client.
    """
    return clickhouse_connect.get_client(
        host=host, port=port, username=user, password=password, database=database,
    )
49
+
50
+
51
def init_db(client: Client) -> None:
    """Run the project's repositories/artifacts DDL against *client*."""
    # NOTE(review): these imports come from `sbom_insight` while every other
    # import in this module uses the `chatsbom` package — confirm this is the
    # intended package name and not a leftover from a rename.
    from sbom_insight.core.schema import ARTIFACTS_DDL
    from sbom_insight.core.schema import REPOSITORIES_DDL
    client.command(REPOSITORIES_DDL)
    client.command(ARTIFACTS_DDL)
56
+
57
+
58
+ def parse_iso_time(time_str: str | None) -> datetime:
59
+ if not time_str:
60
+ return datetime(1970, 1, 1)
61
+ try:
62
+ # Handle "2014-06-03T23:37:33Z" format
63
+ return datetime.fromisoformat(time_str.replace('Z', '+00:00'))
64
+ except Exception:
65
+ return datetime(1970, 1, 1)
66
+
67
+
68
def parse_repo_line(line: str, language: str) -> tuple[list[Any] | None, dict[str, Any] | None]:
    """Parse one jsonl record into a (repo_row, metadata_dict) pair.

    Returns ``(None, None)`` when the line is not valid JSON or has no
    ``full_name`` field.
    """
    try:
        record = json.loads(line)
    except json.JSONDecodeError:
        return None, None

    full_name = record.get('full_name', '')
    if not full_name:
        return None, None

    # Only a clean "owner/repo" form yields an owner; any other shape keeps
    # the whole string as the repo name with an empty owner.
    parts = full_name.split('/')
    owner, repo = parts if len(parts) == 2 else ('', full_name)

    repo_id = int(record.get('id', 0))

    repo_row = [
        repo_id,
        owner,
        repo,
        full_name,
        record.get('url', ''),
        int(record.get('stars', 0)),
        record.get('description', '') or '',
        parse_iso_time(record.get('created_at')),
        language,
        record.get('topics', []) or [],
    ]

    # Minimal context needed later to locate this repo's SBOM files on disk.
    meta_context = {
        'id': repo_id,
        'owner': owner,
        'repo': repo,
        'language': language,
    }

    return repo_row, meta_context
110
+
111
+
112
def scan_artifacts(meta_context: dict[str, Any]) -> list[list[Any]]:
    """Collect artifact rows from sbom.json files for one repository.

    Searches ``data/{language}/{owner}/{repo}/**/sbom.json`` and flattens
    every artifact entry into a row matching ARTIFACT_COLUMNS order.
    """
    if not meta_context:
        return []

    language = meta_context['language']
    owner = meta_context['owner']
    repo = meta_context['repo']
    repo_id = meta_context['id']

    base_dir = Path('data') / language / owner / repo
    if not base_dir.exists():
        return []

    rows: list[list[Any]] = []
    for sbom_path in list(base_dir.rglob('sbom.json')):
        try:
            with open(sbom_path) as handle:
                document = json.load(handle)
            for entry in document.get('artifacts', []):
                licenses = [
                    lic.get('value', '') or lic.get('spdxExpression', '')
                    for lic in entry.get('licenses', [])
                ]
                rows.append([
                    repo_id,
                    entry.get('id', ''),
                    entry.get('name', ''),
                    entry.get('version', ''),
                    entry.get('type', ''),
                    entry.get('purl', ''),
                    entry.get('foundBy', ''),
                    licenses,
                ])
        except Exception:
            # Best-effort: ignore files that cannot be read or parsed.
            continue

    return rows
158
+
159
+
160
def import_file(client: Client, file_path: str, progress: Progress, stats: ImportStats):
    """Stream one jsonl file into ClickHouse, flushing inserts in batches.

    The file stem (e.g. ``python`` for ``python.jsonl``) is used as the
    language label for every repository row.
    """
    language = Path(file_path).stem

    # First pass: count lines so the progress bar has a known total.
    try:
        with open(file_path) as fh:
            total_lines = sum(1 for _ in fh)
    except FileNotFoundError:
        console.print(f"[red]File {file_path} not found.[/red]")
        return

    task_id = progress.add_task(
        f"[cyan]Importing {language}[/cyan]", total=total_lines,
    )
    stats.files_processed += 1

    def flush_repos(batch):
        # One insert per batch, column order pinned to the table DDL.
        client.insert('repositories', batch, column_names=REPO_COLUMNS)

    def flush_artifacts(batch):
        client.insert('artifacts', batch, column_names=ARTIFACT_COLUMNS)

    repo_rows: list = []
    artifact_rows: list = []

    # Second pass: parse each record, scan its SBOM files, batch the inserts.
    with open(file_path) as fh:
        for raw_line in fh:
            repo_row, context = parse_repo_line(raw_line, language)
            if not repo_row or not context:
                progress.advance(task_id)
                continue

            repo_rows.append(repo_row)
            stats.repos_processed += 1

            found = scan_artifacts(context)
            if found:
                artifact_rows.extend(found)
                stats.artifacts_imported += len(found)

            if len(repo_rows) >= BATCH_SIZE:
                flush_repos(repo_rows)
                repo_rows = []
            if len(artifact_rows) >= BATCH_SIZE:
                flush_artifacts(artifact_rows)
                artifact_rows = []

            progress.advance(task_id)

    # Flush whatever is left after the final partial batch.
    if repo_rows:
        flush_repos(repo_rows)
    if artifact_rows:
        flush_artifacts(artifact_rows)
228
+ )
229
+
230
+
231
def main(
    host: str = typer.Option('localhost', help='ClickHouse host'),
    port: int = typer.Option(8123, help='ClickHouse http port'),
    user: str = typer.Option('admin', help='ClickHouse user'),
    password: str = typer.Option('admin', help='ClickHouse password'),
    database: str = typer.Option('sbom', help='ClickHouse database'),
    clean: bool = typer.Option(False, help='Drop tables before importing'),
    language: list[Language] | None = typer.Option(
        None, help='Specific languages to import',
    ),
    input_file: Path | None = typer.Option(
        None, help='Specific file to import (ignoring language argument)',
    ),
):
    """Index SBOM data into the database."""

    # Make sure the target database exists; connect via 'default' to create it.
    try:
        bootstrap = clickhouse_connect.get_client(
            host=host, port=port, username=user, password=password, database='default',
        )
        bootstrap.command(f"CREATE DATABASE IF NOT EXISTS {database}")
    except Exception as e:
        console.print(
            f'[bold red]Error:[/] Failed to connect to ClickHouse at '
            f'[cyan]{host}:{port}[/]\n\n'
            f'Details: {e}\n\n'
            'Please ensure:\n'
            ' 1. ClickHouse is running: [cyan]docker compose up -d[/]\n'
            ' 2. Host and port are correct\n'
            ' 3. User credentials are valid',
        )
        raise typer.Exit(1)

    # Reconnect, this time against the actual target database.
    try:
        client = get_client(host, port, user, password, database)
    except Exception as e:
        console.print(f"[red]Failed to connect to ClickHouse: {e}[/red]")
        raise typer.Exit(code=1)

    if clean:
        console.print('[yellow]Dropping existing tables...[/yellow]')
        client.command('DROP TABLE IF EXISTS repositories')
        client.command('DROP TABLE IF EXISTS artifacts')

    init_db(client)
    console.print('[green]Database initialized.[/green]')

    # Resolve the list of jsonl files to import.
    files_to_process: list[Path] = []
    if input_file:
        if not input_file.exists():
            console.print(
                f"[red]Input file {input_file} does not exist.[/red]",
            )
            raise typer.Exit(code=1)
        files_to_process.append(input_file)
    else:
        selected_langs = language if language else list(Language)
        for lang in selected_langs:
            candidate = Path(f"{lang.value}.jsonl")
            if candidate.exists():
                files_to_process.append(candidate)
            elif language:
                # Only warn when the user explicitly asked for this language.
                console.print(
                    f"[yellow]File {candidate} for language {lang.value} not found.[/yellow]",
                )

    stats = ImportStats()

    with Progress(
        SpinnerColumn(),
        TextColumn('[bold blue]{task.description}'),
        BarColumn(),
        TaskProgressColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        console=console,
        transient=False,
    ) as progress:
        for path in files_to_process:
            import_file(client, str(path), progress, stats)

    # Render the run summary.
    elapsed = time.time() - stats.start_time
    summary = Table(title='Import Summary')
    summary.add_column('Metric', style='cyan')
    summary.add_column('Value', style='magenta')

    summary.add_row('Files Processed', str(stats.files_processed))
    summary.add_row('Repositories Processed', f"{stats.repos_processed:,}")
    summary.add_row('Artifacts Imported', f"{stats.artifacts_imported:,}")
    summary.add_row('Total Duration', f"{elapsed:.2f}s")

    console.print(summary)
@@ -0,0 +1,174 @@
1
+ from typing import Any
2
+
3
+ import typer
4
+ from rich.console import Console
5
+ from rich.prompt import Prompt
6
+ from rich.table import Table
7
+
8
+ from chatsbom.core.clickhouse import check_clickhouse_connection
9
+ from chatsbom.core.config import DatabaseConfig
10
+ from chatsbom.core.config import get_config
11
+ from chatsbom.core.repository import SBOMRepository
12
+
13
# Shared Rich console for all output of this command.
console = Console()
14
+
15
+
16
def main(
    library: str = typer.Argument(
        ...,
        help='Library name to search for (e.g. requests)',
    ),
    host: str = typer.Option(None, help='ClickHouse host'),
    port: int = typer.Option(None, help='ClickHouse http port'),
    user: str = typer.Option(None, help='ClickHouse user (default: guest)'),
    password: str = typer.Option(
        None, help='ClickHouse password (default: guest)',
    ),
    database: str = typer.Option(None, help='ClickHouse database'),
    limit: int = typer.Option(50, help='Max results to display'),
    language: str = typer.Option(
        None, help='Filter by programming language (e.g. python, go)',
    ),
):
    """
    Search for repositories that depend on a specific library.

    Query is performed using the read-only 'guest' user. Flow: fuzzy-match
    candidate library names, let the user pick one (auto-selected when exactly
    one matches), then list dependent repositories sorted by stars.
    """
    # Load config and override individual fields with CLI arguments.
    config = get_config()
    db_config = DatabaseConfig(
        host=host or config.database.host,
        port=port or config.database.port,
        user=user or config.database.user,
        password=password or config.database.password,
        database=database or config.database.database,
    )

    check_clickhouse_connection(
        host=db_config.host,
        port=db_config.port,
        user=db_config.user,
        password=db_config.password,
        database=db_config.database,
        console=console,
        require_database=True,
    )

    repo = SBOMRepository(db_config)

    # Step 1: Find candidates using fuzzy search.
    # Fix: message previously read "libraries match '...'" (grammar).
    console.print(f"[dim]Searching for libraries matching '{library}'...[/dim]")

    lang_filter = ''
    params: dict[str, Any] = {'pattern': f"%{library}%"}

    if language:
        lang_filter = 'AND r.language = {language:String}'
        params['language'] = language

    candidate_query = f"""
        SELECT a.name, count() as cnt
        FROM {db_config.artifacts_table} a
        JOIN {db_config.repositories_table} r ON a.repository_id = r.id
        WHERE a.name ILIKE {{pattern:String}} {lang_filter}
        GROUP BY a.name
        ORDER BY cnt DESC
        LIMIT 20
    """
    try:
        candidates_res = repo.client.query(candidate_query, parameters=params)
        candidates = candidates_res.result_rows
    except Exception as e:
        console.print(f"[red]Candidate search failed: {e}[/red]")
        raise typer.Exit(code=1)

    selected_library = library
    if not candidates:
        console.print(
            f"[yellow]No exact or partial matches found for '{library}'{' in ' + language if language else ''}. Using input as-is.[/yellow]",
        )
    elif len(candidates) == 1:
        # Fix: exactly one match used to still show a "Multiple libraries
        # match" table and prompt — select it directly instead.
        selected_library = candidates[0][0]
        console.print(f"[bold]Selected:[/bold] {selected_library}")
    else:
        ctable = Table(
            title=f"Multiple libraries match '{library}'. Please select one:",
        )
        ctable.add_column('No.', style='cyan', justify='right')
        ctable.add_column('Library Name', style='green')
        ctable.add_column('Projects Using', style='magenta')

        for idx, (name, cnt) in enumerate(candidates, 1):
            ctable.add_row(str(idx), name, str(cnt))

        console.print(ctable)

        choices = [str(i) for i in range(1, len(candidates) + 1)]
        choice = Prompt.ask('Select Library', choices=choices, default='1')

        selected_library = candidates[int(choice) - 1][0]
        console.print(f"[bold]Selected:[/bold] {selected_library}")

    # Count total results first so the table title can show the full total.
    count_query = f"""
        SELECT count()
        FROM {db_config.artifacts_table} a
        JOIN {db_config.repositories_table} r ON a.repository_id = r.id
        WHERE a.name = {{library:String}} {lang_filter}
    """

    # Fresh parameter dict: the fuzzy pattern is no longer needed.
    params = {'library': selected_library}
    if language:
        params['language'] = language

    try:
        total_count = repo.client.query(
            count_query, parameters=params,
        ).result_rows[0][0]
    except Exception as e:
        console.print(f"[red]Count query failed: {e}[/red]")
        raise typer.Exit(code=1)

    if total_count == 0:
        console.print(
            f"[yellow]No repositories found depending on '{selected_library}'.[/yellow]",
        )
        return

    query = f"""
        SELECT
            r.owner,
            r.repo,
            r.stars,
            a.version,
            r.url
        FROM {db_config.artifacts_table} AS a
        JOIN {db_config.repositories_table} AS r ON a.repository_id = r.id
        WHERE a.name = {{library:String}} {lang_filter}
        ORDER BY r.stars DESC
        LIMIT {{limit:UInt32}}
    """

    try:
        # Re-use params but add the display limit.
        params['limit'] = limit
        result = repo.client.query(query, parameters=params)
    except Exception as e:
        console.print(f"[red]Query failed: {e}[/red]")
        raise typer.Exit(code=1)

    rows = result.result_rows
    table = Table(
        title=f"Dependents of '{selected_library}' (Top {limit} of {total_count})",
    )
    table.add_column('Owner', style='cyan')
    table.add_column('Repo', style='green')
    table.add_column('Stars', style='magenta', justify='right')
    table.add_column('Version', style='yellow')
    table.add_column('URL', style='blue')

    for row in rows:
        owner, repo_name, stars, version, url = row
        table.add_row(owner, repo_name, str(stars), version, url)

    console.print(table)
    console.print(
        f"[dim]Note: Shown top {len(rows)} results of {total_count} total, sorted by stars.[/dim]",
    )