flybase-cli 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
flybase_cli/schema.py ADDED
@@ -0,0 +1,671 @@
1
+ from __future__ import annotations
2
+
3
+ import itertools
4
+ import re
5
+ import sqlite3
6
+ from pathlib import Path
7
+
8
+ from .config import SEARCH_ID_CANDIDATES
9
+ from .core import ensure_registry, list_registry_table_names, open_db, write_json
10
+ from .semantics import RELATIONSHIP_GROUPS, describe_column_name, infer_table_tags
11
+
12
+
13
def sample_column_values(
    conn: sqlite3.Connection,
    table_name: str,
    column_name: str,
    limit: int,
) -> list[str]:
    """Return up to ``limit`` non-blank values of one column, as strings.

    Rows whose value is NULL, or whose text form is empty after ``TRIM``,
    are skipped. A non-positive ``limit`` returns an empty list without
    touching the database.

    Args:
        conn: Open SQLite connection.
        table_name: Table to sample from.
        column_name: Column to sample.
        limit: Maximum number of values to return.

    Returns:
        The sampled values, each converted with ``str``.
    """
    if limit <= 0:
        return []
    # Identifiers cannot be bound as parameters, so they are interpolated.
    # Doubling embedded double quotes keeps unusual names from breaking
    # (or altering) the statement.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    quoted_column = '"' + column_name.replace('"', '""') + '"'
    rows = conn.execute(
        f'''
        SELECT {quoted_column}
        FROM {quoted_table}
        WHERE {quoted_column} IS NOT NULL AND TRIM(CAST({quoted_column} AS TEXT)) != ''
        LIMIT ?
        ''',
        (limit,),
    ).fetchall()
    return [str(row[0]) for row in rows]
31
+
32
+
33
def describe_table(
    conn: sqlite3.Connection,
    table_name: str,
    sample_values: int = 3,
) -> dict[str, object] | None:
    """Describe one registered table: registry metadata, columns and samples.

    Args:
        conn: Open SQLite connection containing ``fb_ingest_registry``.
        table_name: Name of the table to describe.
        sample_values: Maximum number of sample values to collect per column.

    Returns:
        A dict with ``table_name``, ``source_path``, ``row_count``,
        ``column_count``, ``semantic_tags`` and ``columns``, or ``None`` when
        the table has no row in ``fb_ingest_registry``.
    """
    registry_row = conn.execute(
        """
        SELECT source_path, row_count
        FROM fb_ingest_registry
        WHERE table_name = ?
        """,
        (table_name,),
    ).fetchone()
    if registry_row is None:
        return None
    source_path, row_count = registry_row

    # The table name is interpolated into the PRAGMA, so escape it the same
    # way every generated query in this module does.
    columns_meta = conn.execute(
        f"PRAGMA table_info({quote_identifier(table_name)})"
    ).fetchall()
    # The second field of each table_info row is the column name.
    column_names = [column_name for _, column_name, *_ in columns_meta]

    columns: list[dict[str, object]] = []
    for column_name in column_names:
        entry: dict[str, object] = {"name": column_name}
        # Only emit a description key when one is actually available.
        if description := describe_column_name(column_name):
            entry["description"] = description
        entry["sample_values"] = sample_column_values(
            conn,
            table_name,
            column_name,
            sample_values,
        )
        columns.append(entry)

    return {
        "table_name": table_name,
        "source_path": source_path,
        "row_count": row_count,
        "column_count": len(column_names),
        "semantic_tags": infer_table_tags(table_name, source_path, column_names),
        "columns": columns,
    }
76
+
77
+
78
def describe_tables(
    db_path: Path,
    table_names: list[str] | None = None,
    sample_values: int = 3,
) -> list[dict[str, object]]:
    """Describe the selected tables of a database, skipping unregistered ones.

    Args:
        db_path: Path of the database to open.
        table_names: Tables to describe; when empty or ``None`` every table
            listed by the registry is described.
        sample_values: Maximum number of sample values per column.

    Returns:
        One description dict per table for which ``describe_table`` returned
        a result, in selection order.
    """
    conn = open_db(db_path)
    try:
        # Kept inside the try block so the connection is closed even when
        # the registry check itself fails.
        ensure_registry(conn)
        selected = table_names or list_registry_table_names(conn)
        described = (
            describe_table(conn, table_name, sample_values=sample_values)
            for table_name in selected
        )
        return [description for description in described if description is not None]
    finally:
        conn.close()
95
+
96
+
97
def table_column_names(table: dict[str, object]) -> list[str]:
    """Return the ``name`` of every entry in ``table["columns"]`` as a string."""
    names: list[str] = []
    for column in table["columns"]:
        names.append(str(column["name"]))
    return names
99
+
100
+
101
def lineage_reference_columns(columns: list[str]) -> list[str]:
    """List the columns that point at ancestors, outermost first.

    ``ancestor_ordinal_<n>`` columns are returned in ascending numeric order
    of ``<n>``, followed by ``parent_ordinal`` when that column is present.
    """
    pattern = re.compile(r"ancestor_ordinal_(\d+)")
    numbered = sorted(
        (int(found.group(1)), name)
        for name in columns
        if (found := pattern.fullmatch(name))
    )
    references = [name for _, name in numbered]
    if "parent_ordinal" in columns:
        references.append("parent_ordinal")
    return references
111
+
112
+
113
def lineage_key_columns(columns: list[str]) -> list[str]:
    """Return the lineage reference columns plus ``ordinal`` when present."""
    own_ordinal = ["ordinal"] if "ordinal" in columns else []
    return [*lineage_reference_columns(columns), *own_ordinal]
118
+
119
+
120
def infer_lineage_relationship(
    child: dict[str, object],
    tables_by_name: dict[str, dict[str, object]],
) -> dict[str, object] | None:
    """Infer how a nested child table joins to its direct parent table.

    The parent is the longest table name ``P`` such that the child's name
    starts with ``P_``. Two join shapes are recognised:

    * the parent has ``record_id`` and the child carries no lineage reference
      columns: join ``parent_record_id`` to ``record_id``;
    * the parent is itself nested (has ``parent_record_id``): join on
      ``parent_record_id`` plus the child's lineage reference columns paired
      with the parent's lineage key columns.

    Returns:
        A relationship dict, or ``None`` when the child has no
        ``parent_record_id``, no parent candidate exists, or the columns do
        not line up.
    """
    child_name = str(child["table_name"])
    child_columns = table_column_names(child)
    if "parent_record_id" not in child_columns:
        return None

    # Longest matching prefix wins, i.e. the most deeply nested ancestor.
    parent_name = max(
        (
            name
            for name in tables_by_name
            if name != child_name and child_name.startswith(f"{name}_")
        ),
        key=len,
        default=None,
    )
    if parent_name is None:
        return None

    parent_columns = table_column_names(tables_by_name[parent_name])
    child_refs = lineage_reference_columns(child_columns)
    endpoints = {
        "kind": "lineage",
        "from_table": child_name,
        "to_table": parent_name,
    }

    if "record_id" in parent_columns and not child_refs:
        return {
            **endpoints,
            "column_pairs": [{"from": "parent_record_id", "to": "record_id"}],
            "confidence": "high",
            "description": "Nested child table joins to its parent record_id.",
        }

    if "parent_record_id" not in parent_columns:
        return None
    parent_keys = lineage_key_columns(parent_columns)
    if len(child_refs) != len(parent_keys):
        return None

    pairs = [{"from": "parent_record_id", "to": "parent_record_id"}]
    pairs.extend(
        {"from": child_column, "to": parent_column}
        for child_column, parent_column in zip(child_refs, parent_keys, strict=True)
    )
    return {
        **endpoints,
        "column_pairs": pairs,
        "confidence": "high",
        "description": "Nested child table joins to its direct parent via lineage columns.",
    }
170
+
171
+
172
def infer_id_alias_relationships(tables: list[dict[str, object]]) -> list[dict[str, object]]:
    """Infer relationships between tables that share an identifier column.

    For every group in ``RELATIONSHIP_GROUPS`` each table contributes at most
    one column: the first of the group's aliases that the table contains.
    Every pair of contributing tables yields one relationship. Confidence is
    ``"high"`` when both tables use the same column name and ``"medium"``
    otherwise.

    Args:
        tables: Table description dicts with ``table_name`` and ``columns``.

    Returns:
        Relationship dicts, ordered by group and then by table pair.
    """
    relationships: list[dict[str, object]] = []
    seen: set[tuple[str, ...]] = set()

    # A table's column set does not depend on the group, so build it once
    # instead of once per group.
    columns_by_table = [
        (str(table["table_name"]), set(table_column_names(table)))
        for table in tables
    ]

    for group, config in RELATIONSHIP_GROUPS.items():
        aliases = tuple(str(alias) for alias in config["aliases"])
        matches: list[tuple[str, str]] = []
        for table_name, columns in columns_by_table:
            alias = next((candidate for candidate in aliases if candidate in columns), None)
            if alias is not None:
                matches.append((table_name, alias))

        for (left_table, left_column), (right_table, right_column) in itertools.combinations(matches, 2):
            key = tuple(sorted((left_table, right_table))) + (group, left_column, right_column)
            if key in seen:
                continue
            seen.add(key)
            # The description wording is the same for every relationship
            # kind, so no branching on the kind is needed.
            relationships.append(
                {
                    "kind": str(config["kind"]),
                    "entity": group,
                    "from_table": left_table,
                    "to_table": right_table,
                    "column_pairs": [{"from": left_column, "to": right_column}],
                    "confidence": "high" if left_column == right_column else "medium",
                    "description": f"Shared {config['label']} identifiers inferred from column names.",
                }
            )
    return relationships
208
+
209
+
210
def infer_schema_relationships(tables: list[dict[str, object]]) -> list[dict[str, object]]:
    """Collect lineage relationships first, then shared-identifier ones."""
    tables_by_name = {str(table["table_name"]): table for table in tables}
    lineage = (infer_lineage_relationship(table, tables_by_name) for table in tables)
    relationships = [relationship for relationship in lineage if relationship is not None]
    relationships.extend(infer_id_alias_relationships(tables))
    return relationships
219
+
220
+
221
def quote_identifier(identifier: str) -> str:
    """Wrap ``identifier`` in double quotes, doubling any embedded ones."""
    escaped = identifier.replace('"', '""')
    return f'"{escaped}"'
223
+
224
+
225
def safe_label(identifier: str) -> str:
    """Reduce ``identifier`` to ASCII letters, digits and underscores.

    Each run of other characters becomes one underscore, leading and
    trailing underscores are stripped, and ``"value"`` is returned when
    nothing is left.
    """
    collapsed = re.sub(r"[^A-Za-z0-9_]+", "_", identifier)
    trimmed = collapsed.strip("_")
    if trimmed:
        return trimmed
    return "value"
228
+
229
+
230
def select_alias_lines(alias: str, columns: list[str]) -> list[str]:
    """Build one ``<alias>."col" AS "<alias>_<label>"`` select line per column."""
    lines: list[str] = []
    for column in columns:
        source = f"{alias}.{quote_identifier(column)}"
        label = quote_identifier(alias + "_" + safe_label(column))
        lines.append(f" {source} AS {label}")
    return lines
235
+
236
+
237
def projection_priority(column_name: str) -> tuple[int, str]:
    """Return a sort key ranking how useful a column is in a projection.

    Lower ranks sort first; the column name is the tie-breaker. The checks
    run in order, so the first matching rule decides the rank.
    """
    if column_name == "payload_json":
        rank = 999
    elif column_name == "record_id":
        rank = 0
    elif column_name in {"parent_record_id", "parent_ordinal", "ordinal"}:
        rank = 1
    elif column_name in SEARCH_ID_CANDIDATES:
        rank = 2
    elif column_name in {"symbol", "gene_symbol", "annotation_id", "feature_id"}:
        rank = 3
    elif column_name.endswith(("_id", "Id")):
        rank = 4
    else:
        rank = 10
    return (rank, column_name)
251
+
252
+
253
def pick_projection_columns(table: dict[str, object], limit: int = 3) -> list[str]:
    """Pick up to ``limit`` columns of ``table``, best-ranked first.

    ``payload_json`` is never picked by ranking; if that leaves nothing, the
    first ``limit`` columns in table order are returned instead.
    """
    columns = table_column_names(table)
    candidates = sorted(
        (column for column in columns if column != "payload_json"),
        key=projection_priority,
    )
    return candidates[:limit] or columns[:limit]
258
+
259
+
260
def build_table_sample_query(table: dict[str, object], limit: int) -> dict[str, object]:
    """Build the ``table-sample`` template selecting ``limit`` rows of a table."""
    table_name = str(table["table_name"])
    sql_lines = (
        "SELECT *",
        f"FROM {quote_identifier(table_name)}",
        f"LIMIT {limit}",
    )
    template: dict[str, object] = {
        "id": f"table-sample:{table_name}",
        "name": f"{table_name}-sample",
        "kind": "table-sample",
        "tables": [table_name],
        "description": f"Sample rows from {table_name}.",
    }
    template["sql"] = "\n".join(sql_lines)
    template["parameters"] = []
    template["semantic_tags"] = list(table.get("semantic_tags", []))
    return template
272
+
273
+
274
def build_relationship_query(
    relationship: dict[str, object],
    tables_by_name: dict[str, dict[str, object]],
    limit: int,
) -> dict[str, object] | None:
    """Build a ``join`` template for one inferred relationship.

    The query joins the relationship's ``from_table`` (alias ``src``) to its
    ``to_table`` (alias ``dst``) on every column pair and projects a few
    columns from each side.

    Returns:
        The template dict, or ``None`` when either table is missing from
        ``tables_by_name``.
    """
    source_name = str(relationship["from_table"])
    target_name = str(relationship["to_table"])
    source = tables_by_name.get(source_name)
    target = tables_by_name.get(target_name)
    if source is None or target is None:
        return None

    projected = select_alias_lines("src", pick_projection_columns(source))
    projected += select_alias_lines("dst", pick_projection_columns(target))

    conditions: list[str] = []
    for pair in relationship["column_pairs"]:
        left = quote_identifier(str(pair["from"]))
        right = quote_identifier(str(pair["to"]))
        conditions.append(f" src.{left} = dst.{right}")

    sql_lines = [
        "SELECT",
        ",\n".join(projected),
        f"FROM {quote_identifier(source_name)} AS src",
        f"JOIN {quote_identifier(target_name)} AS dst",
        "ON",
        " AND\n".join(conditions),
        f"LIMIT {limit}",
    ]

    # Union of both tables' tags, in sorted order.
    merged_tags = set(source.get("semantic_tags", []))
    merged_tags.update(target.get("semantic_tags", []))

    return {
        "id": f"join:{relationship['kind']}:{source_name}:{target_name}",
        "name": f"{source_name}-to-{target_name}",
        "kind": "join",
        "relationship_kind": relationship["kind"],
        "tables": [source_name, target_name],
        "description": str(relationship["description"]),
        "sql": "\n".join(sql_lines),
        "parameters": [],
        "semantic_tags": sorted(merged_tags),
    }
319
+
320
+
321
def pick_first_column(table: dict[str, object], candidates: tuple[str, ...]) -> str | None:
    """Return the first candidate that is a column of ``table``, else ``None``."""
    available = set(table_column_names(table))
    for candidate in candidates:
        if candidate in available:
            return candidate
    return None
324
+
325
+
326
def select_projection(table: dict[str, object], preferred: list[str]) -> list[str]:
    """Keep the preferred columns that exist in ``table``, in the given order.

    When none of them exist, fall back to up to four columns chosen by
    ``pick_projection_columns``.
    """
    available = table_column_names(table)
    chosen = [name for name in preferred if name in available]
    if chosen:
        return chosen
    return pick_projection_columns(table, limit=4)
332
+
333
+
334
def quoted_projection(columns: list[str]) -> str:
    """Join the quoted column names with ``", "`` for use in a SELECT list."""
    quoted = [quote_identifier(name) for name in columns]
    return ", ".join(quoted)
336
+
337
+
338
def build_named_template(
    *,
    template_id: str,
    name: str,
    table: dict[str, object],
    description: str,
    sql: str,
    parameters: list[dict[str, str]],
    semantic_tags: list[str],
) -> dict[str, object]:
    """Assemble a ``named`` query template dict for a single table.

    All arguments are keyword-only and are stored as given; ``tables``
    holds the one table name taken from ``table["table_name"]``.
    """
    template: dict[str, object] = {
        "id": template_id,
        "name": name,
        "kind": "named",
    }
    template["tables"] = [str(table["table_name"])]
    template.update(
        description=description,
        sql=sql,
        parameters=parameters,
        semantic_tags=semantic_tags,
    )
    return template
358
+
359
+
360
def build_gene_summary_templates(table: dict[str, object], limit: int) -> list[dict[str, object]]:
    """Build gene-summary lookup templates for a table, if it supports them.

    The table needs a gene identifier column (one of the ``fbgn`` aliases)
    and a column whose name contains "summary". A lookup by gene identifier
    is always produced; a lookup by symbol is added when the table also has
    a ``gene_symbol`` or ``symbol`` column.

    Returns:
        Zero, one or two template dicts.
    """
    table_name = str(table["table_name"])
    fbgn_aliases = tuple(str(alias) for alias in RELATIONSHIP_GROUPS["fbgn"]["aliases"])
    gene_column = pick_first_column(table, fbgn_aliases)
    summary_column = None
    for column in table_column_names(table):
        if "summary" in column.lower():
            summary_column = column
            break
    symbol_column = pick_first_column(table, ("gene_symbol", "symbol"))
    if gene_column is None or summary_column is None:
        return []

    # An empty string stands in for a missing symbol column; it never matches
    # a real column name, so select_projection drops it.
    projection = select_projection(table, [gene_column, symbol_column or "", summary_column])

    # (template name, "by ..." wording, filter column, parameter, example, parameter description)
    lookups = [
        (
            "gene-summary-by-fbgn",
            "FlyBase gene identifier",
            gene_column,
            "fbgn_id",
            "FBgn0002121",
            "FlyBase gene identifier.",
        )
    ]
    if symbol_column is not None:
        lookups.append(
            (
                "gene-summary-by-symbol",
                "gene symbol",
                symbol_column,
                "gene_symbol",
                "amx",
                "Gene symbol.",
            )
        )

    templates: list[dict[str, object]] = []
    for name, wording, filter_column, parameter, example, parameter_description in lookups:
        sql_lines = [
            f"SELECT {quoted_projection(projection)}",
            f"FROM {quote_identifier(table_name)}",
            f"WHERE {quote_identifier(filter_column)} = :{parameter}",
            f"LIMIT {limit}",
        ]
        templates.append(
            build_named_template(
                template_id=f"named:{name}:{table_name}",
                name=name,
                table=table,
                description=f"Look up gene summaries in {table_name} by {wording}.",
                sql="\n".join(sql_lines),
                parameters=[
                    {
                        "name": parameter,
                        "example": example,
                        "description": parameter_description,
                    }
                ],
                semantic_tags=sorted(set(table.get("semantic_tags", [])) | {"gene", "summary"}),
            )
        )
    return templates
421
+
422
+
423
def build_link_templates(table: dict[str, object], limit: int) -> list[dict[str, object]]:
    """Build transcript/protein link templates for a table, if it supports them.

    The table needs both a transcript column (``fbtr`` aliases) and a protein
    column (``fbpp`` aliases). A lookup by transcript is always produced; a
    lookup by gene comes first when a gene column (``fbgn`` aliases) exists.

    Returns:
        Zero, one or two template dicts.
    """
    table_name = str(table["table_name"])

    def first_alias(group: str) -> str | None:
        # First alias of the given relationship group present in the table.
        aliases = tuple(str(alias) for alias in RELATIONSHIP_GROUPS[group]["aliases"])
        return pick_first_column(table, aliases)

    gene_column = first_alias("fbgn")
    transcript_column = first_alias("fbtr")
    protein_column = first_alias("fbpp")
    if transcript_column is None or protein_column is None:
        return []

    projection = select_projection(
        table,
        [gene_column or "", transcript_column, protein_column],
    )

    # (template name, filter column, parameter name, example value)
    filters: list[tuple[str, str, str, str]] = [
        ("transcript-protein-links-by-transcript", transcript_column, "fbtr_id", "FBtr0080001"),
    ]
    if gene_column is not None:
        filters.insert(0, ("transcript-protein-links", gene_column, "fbgn_id", "FBgn0002121"))

    return [
        build_named_template(
            template_id=f"named:{name}:{table_name}",
            name=name,
            table=table,
            description=f"Resolve linked transcript/protein rows in {table_name}.",
            sql="\n".join(
                (
                    f"SELECT {quoted_projection(projection)}",
                    f"FROM {quote_identifier(table_name)}",
                    f"WHERE {quote_identifier(column)} = :{parameter}",
                    f"LIMIT {limit}",
                )
            ),
            parameters=[
                {
                    "name": parameter,
                    "example": example,
                    "description": f"Filter value for {column}.",
                }
            ],
            semantic_tags=sorted(set(table.get("semantic_tags", [])) | {"transcript", "protein"}),
        )
        for name, column, parameter, example in filters
    ]
465
+
466
+
467
def build_publication_templates(table: dict[str, object], limit: int) -> list[dict[str, object]]:
    """Build the ``publications-for-gene`` template for a table, if possible.

    The table needs both a gene column (``fbgn`` aliases) and a publication
    column (``publication`` aliases); otherwise no template is produced.
    """
    table_name = str(table["table_name"])
    gene_aliases = tuple(str(alias) for alias in RELATIONSHIP_GROUPS["fbgn"]["aliases"])
    gene_column = pick_first_column(table, gene_aliases)
    publication_aliases = tuple(
        str(alias) for alias in RELATIONSHIP_GROUPS["publication"]["aliases"]
    )
    publication_column = pick_first_column(table, publication_aliases)
    if gene_column is None or publication_column is None:
        return []

    projection = select_projection(table, [gene_column, publication_column, "record_id"])
    sql_lines = [
        f"SELECT {quoted_projection(projection)}",
        f"FROM {quote_identifier(table_name)}",
        f"WHERE {quote_identifier(gene_column)} = :fbgn_id",
        f"LIMIT {limit}",
    ]
    fbgn_parameter = {
        "name": "fbgn_id",
        "example": "FBgn0002121",
        "description": "FlyBase gene identifier.",
    }
    template = build_named_template(
        template_id=f"named:publications-for-gene:{table_name}",
        name="publications-for-gene",
        table=table,
        description=f"Find publication-linked rows in {table_name} for a gene.",
        sql="\n".join(sql_lines),
        parameters=[fbgn_parameter],
        semantic_tags=sorted(set(table.get("semantic_tags", [])) | {"gene", "publication"}),
    )
    return [template]
501
+
502
+
503
def build_coordinate_templates(table: dict[str, object], limit: int) -> list[dict[str, object]]:
    """Build a coordinate lookup template for a table, if it supports one.

    The table needs at least one coordinate column (``seqid``, ``start``,
    ``end``, ``strand``, ``startPosition``, ``endPosition``) and a column to
    filter on. A transcript column is preferred as the filter
    (``coordinates-for-transcript``); otherwise a feature column is used
    (``coordinates-for-feature``).

    Returns:
        A list with one template dict, or an empty list when the table has
        no coordinate columns or no usable filter column.
    """
    table_name = str(table["table_name"])
    # Built once: membership is tested for every candidate coordinate column.
    available = set(table_column_names(table))
    transcript_column = pick_first_column(table, ("transcript_id", "fbtr_id", "primary_fbtr", "flybase_fbtr"))
    feature_column = pick_first_column(table, ("feature_id", "parent_id", "record_id"))
    coordinate_columns = [
        column
        for column in ("seqid", "start", "end", "strand", "startPosition", "endPosition")
        if column in available
    ]
    if not coordinate_columns:
        return []

    if transcript_column is not None:
        filter_column = transcript_column
        parameter_name, example = "transcript_id", "FBtr0080001"
        name = "coordinates-for-transcript"
    elif feature_column is not None:
        filter_column = feature_column
        parameter_name, example = "feature_id", "FBgn0002121"
        name = "coordinates-for-feature"
    else:
        return []

    projection = select_projection(table, [filter_column, *coordinate_columns])
    return [
        build_named_template(
            template_id=f"named:{name}:{table_name}",
            name=name,
            table=table,
            description=f"Fetch coordinate rows from {table_name}.",
            sql="\n".join(
                [
                    f"SELECT {quoted_projection(projection)}",
                    f"FROM {quote_identifier(table_name)}",
                    f"WHERE {quote_identifier(filter_column)} = :{parameter_name}",
                    f"LIMIT {limit}",
                ]
            ),
            parameters=[
                {
                    "name": parameter_name,
                    "example": example,
                    "description": f"Filter value for {filter_column}.",
                }
            ],
            semantic_tags=sorted(set(table.get("semantic_tags", [])) | {"coordinates"}),
        )
    ]
549
+
550
+
551
def build_biological_query_templates(
    tables: list[dict[str, object]],
    limit: int,
) -> list[dict[str, object]]:
    """Run every named-template builder over every table and flatten the results.

    Templates are grouped by table; within a table they follow builder order:
    gene summary, transcript/protein links, publications, coordinates.
    """
    builders = (
        build_gene_summary_templates,
        build_link_templates,
        build_publication_templates,
        build_coordinate_templates,
    )
    return [
        template
        for table in tables
        for builder in builders
        for template in builder(table, limit)
    ]
562
+
563
+
564
def dedupe_templates(templates: list[dict[str, object]]) -> list[dict[str, object]]:
    """Keep one template per ``id``.

    When an id repeats, the last template with that id is kept, at the
    position where the id first appeared.
    """
    by_id = {str(template["id"]): template for template in templates}
    return [*by_id.values()]
569
+
570
+
571
def build_query_templates(
    tables: list[dict[str, object]],
    relationships: list[dict[str, object]],
    limit: int = 5,
) -> list[dict[str, object]]:
    """Build all query templates: table samples, joins, then named lookups.

    Relationships whose join template cannot be built are skipped, and the
    combined list is de-duplicated by template id.
    """
    tables_by_name = {str(table["table_name"]): table for table in tables}
    sample_templates = [build_table_sample_query(table, limit) for table in tables]
    join_candidates = (
        build_relationship_query(relationship, tables_by_name, limit)
        for relationship in relationships
    )
    join_templates = [template for template in join_candidates if template is not None]
    named_templates = build_biological_query_templates(tables, limit)
    return dedupe_templates([*sample_templates, *join_templates, *named_templates])
584
+
585
+
586
def collect_schema_details(
    db_path: Path,
    table_names: list[str] | None = None,
    sample_values: int = 3,
    query_limit: int = 5,
) -> tuple[list[dict[str, object]], list[dict[str, object]], list[dict[str, object]]]:
    """Gather table descriptions, inferred relationships and query templates.

    Returns:
        A ``(tables, relationships, query_templates)`` tuple.
    """
    tables = describe_tables(db_path, table_names, sample_values)
    relationships = infer_schema_relationships(tables)
    return (
        tables,
        relationships,
        build_query_templates(tables, relationships, query_limit),
    )
600
+
601
+
602
def semantic_summary(tables: list[dict[str, object]]) -> dict[str, object]:
    """Count how many times each semantic tag occurs across the tables.

    Returns:
        A dict with ``tags`` (sorted tag names) and ``tag_counts`` (tag to
        occurrence count, in the same sorted order).
    """
    counts: dict[str, int] = {}
    all_tags = (
        str(tag)
        for table in tables
        for tag in table.get("semantic_tags", [])
    )
    for tag in all_tags:
        counts[tag] = counts.setdefault(tag, 0) + 1
    ordered = sorted(counts.items())
    return {
        "tags": [tag for tag, _ in ordered],
        "tag_counts": dict(ordered),
    }
612
+
613
+
614
def build_schema_summary(
    db_path: Path,
    table_names: list[str] | None = None,
    sample_values: int = 3,
    query_limit: int = 5,
) -> dict[str, object]:
    """Build the full schema summary payload for a database.

    The payload holds the database path, the table count, the table
    descriptions, the semantic tag summary, the inferred relationships and
    the query templates.
    """
    details = collect_schema_details(
        db_path,
        table_names=table_names,
        sample_values=sample_values,
        query_limit=query_limit,
    )
    tables, relationships, query_templates = details
    summary: dict[str, object] = {
        "db_path": str(db_path),
        "table_count": len(tables),
        "tables": tables,
    }
    summary["semantic_summary"] = semantic_summary(tables)
    summary["relationships"] = relationships
    summary["query_templates"] = query_templates
    return summary
634
+
635
+
636
def export_schema_summary(
    db_path: Path,
    output_path: Path,
    table_names: list[str] | None = None,
    sample_values: int = 3,
    query_limit: int = 5,
) -> dict[str, object]:
    """Build the schema summary, write it to ``output_path``, and return it."""
    summary = build_schema_summary(db_path, table_names, sample_values, query_limit)
    write_json(output_path, summary)
    return summary
651
+
652
+
653
def build_query_plan(
    db_path: Path,
    table_names: list[str] | None = None,
    sample_values: int = 3,
    limit: int = 5,
) -> dict[str, object]:
    """Build a query plan: counts, semantic tag summary and query templates.

    Unlike the schema summary, the table descriptions and relationships are
    reported only as counts; the templates are returned under ``queries``.
    """
    details = collect_schema_details(
        db_path,
        table_names=table_names,
        sample_values=sample_values,
        query_limit=limit,
    )
    tables, relationships, query_templates = details
    plan: dict[str, object] = {"db_path": str(db_path)}
    plan["table_count"] = len(tables)
    plan["relationship_count"] = len(relationships)
    plan["semantic_summary"] = semantic_summary(tables)
    plan["queries"] = query_templates
    return plan