bdsc-cli 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bdsc_cli/core.py ADDED
@@ -0,0 +1,2661 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import difflib
5
+ import hashlib
6
+ import json
7
+ import os
8
+ import re
9
+ import sqlite3
10
+ from dataclasses import dataclass
11
+ from datetime import datetime, timezone
12
+ from pathlib import Path
13
+ from typing import Any, Iterator
14
+ from urllib import error, parse, request
15
+
16
# Identifies this client to the BDSC server on every HTTP request.
# NOTE(review): the version here says 0.1 while the wheel is 0.2.1 — confirm
# whether this string should track the package version.
USER_AGENT = "bdsc-cli/0.1 (+https://bdsc.indiana.edu/)"
# Where cached CSVs, the manifest and the SQLite index live; overridable via
# the BDSC_CLI_HOME environment variable.
DEFAULT_STATE_DIR = Path(
    os.environ.get("BDSC_CLI_HOME", Path.home() / ".local" / "share" / "bdsc-cli")
)
# File name of the SQLite index inside the state directory.
DB_NAME = "bdsc.sqlite3"
# File name of the JSON manifest (HTTP validators, hashes, index metadata).
MANIFEST_NAME = "manifest.json"

# Raw CSV exports published by the Bloomington Drosophila Stock Center,
# keyed by the local dataset name used for cache file names and DB loading.
DATASETS = {
    "bloomington": "https://bdsc.indiana.edu/pdf/bloomington.csv",
    "stockcomps_map_comments": "https://bdsc.indiana.edu/pdf/stockcomps_map_comments.csv",
    "stockgenes": "https://bdsc.indiana.edu/pdf/stockgenes.csv",
    "stockgenes_compgenes": "https://bdsc.indiana.edu/pdf/stockgenes_compgenes.csv",
    "stockgenes_compprops": "https://bdsc.indiana.edu/pdf/stockgenes_compprops.csv",
}
30
+
31
+
32
@dataclass
class SyncResult:
    """Outcome of syncing one raw dataset (see sync_datasets)."""

    name: str  # dataset key from DATASETS
    path: Path  # cached CSV location under <state_dir>/raw
    status: str  # "downloaded" or "not-modified"
    bytes_downloaded: int  # 0 when the server answered 304 Not Modified
    metadata: dict[str, Any]  # manifest entry: url, validators, sha256, timestamps
39
+
40
+
41
@dataclass
class QueryCriterion:
    """One lookup clause used by report specs: a lookup kind plus its query."""

    kind: str  # one of LOOKUP_KINDS, e.g. "gene" or "driver-family"
    query: str  # the term passed to that lookup
45
+
46
+
47
@dataclass(frozen=True)
class ReportSpec:
    """Immutable definition of a built-in report (see REPORT_SPECS)."""

    name: str  # report identifier, also the key in REPORT_SPECS
    description: str  # human-readable summary of what the report covers
    default_dataset: str  # one of EXPORT_DATASETS
    # Query groups: each inner tuple is one group of criteria. Empty means the
    # report's queries are produced by logic elsewhere (not visible here).
    groups: tuple[tuple[QueryCriterion, ...], ...] = ()
53
+
54
+
55
# Closed choice lists, presumably exposed as CLI option values — the CLI
# module is not in view; confirm there.
LOOKUP_KINDS = (
    "auto",
    "stock",
    "rrid",
    "gene",
    "fbid",
    "component",
    "property",
    "property-exact",
    "driver-family",
    "relationship",
    "search",
)
# Dataset names accepted for export.
EXPORT_DATASETS = ("stocks", "components", "genes", "properties")
# Scopes for term listings.
TERM_SCOPES = ("properties", "property-descriptions", "relationships")
# Built-in report identifiers; must match the keys of REPORT_SPECS.
REPORT_NAMES = ("olfactory", "drivers", "optogenetics")
71
+
72
# Built-in report definitions. A spec with empty groups (olfactory) relies on
# report-specific query logic defined elsewhere in the package.
REPORT_SPECS = {
    "olfactory": ReportSpec(
        name="olfactory",
        description="olfactory receptor and odorant-binding gene families",
        default_dataset="components",
    ),
    "drivers": ReportSpec(
        name="drivers",
        description="expression-driver and recombinase components",
        default_dataset="components",
        # One single-criterion group per driver family.
        groups=(
            (QueryCriterion(kind="driver-family", query="GAL4"),),
            (QueryCriterion(kind="driver-family", query="lexA"),),
            (QueryCriterion(kind="driver-family", query="QF"),),
            (QueryCriterion(kind="driver-family", query="split"),),
            (QueryCriterion(kind="driver-family", query="FLP"),),
        ),
    ),
    "optogenetics": ReportSpec(
        name="optogenetics",
        description="common optogenetic effectors and optogenetic-tagged components",
        default_dataset="components",
        # Gene lookups for the usual opsins, plus a catch-all property search.
        groups=(
            (QueryCriterion(kind="gene", query="Chronos"),),
            (QueryCriterion(kind="gene", query="CsChrimson"),),
            (QueryCriterion(kind="gene", query="Chrimson"),),
            (QueryCriterion(kind="gene", query="GtACR"),),
            (QueryCriterion(kind="gene", query="ReaChR"),),
            (QueryCriterion(kind="gene", query="ChR2"),),
            (QueryCriterion(kind="gene", query="eNpHR"),),
            (QueryCriterion(kind="property", query="optogen"),),
        ),
    ),
}

# Table alias used per export dataset. NOTE(review): inferred from the SQL
# aliases s/cc/sg used in this module — confirm against the report code.
REPORT_DATASET_SYMBOLS = {
    "stocks": "s",
    "components": "cc",
    "genes": "sg",
    "properties": "cc",
}

# Lower-cased synonyms accepted for each driver family in driver-family lookups.
DRIVER_FAMILY_ALIASES = {
    "gal4": ("gal4", "gawb"),
    "lexa": ("lexa",),
    "qf": ("qf",),
    "flp": ("flp", "flpo", "flp recombinase"),
    "split": ("split zip hemi driver", "split intein hemi driver"),
}
121
+
122
+
123
def resolve_state_dir(value: str | Path | None) -> Path:
    """Return the state directory to use: *value* with ~ expanded, else the default."""
    if value:
        return Path(value).expanduser()
    return DEFAULT_STATE_DIR
125
+
126
+
127
def ensure_state_dir(state_dir: Path) -> None:
    """Create the state directory and its raw/ cache subdirectory if absent."""
    for directory in (state_dir, state_dir / "raw"):
        directory.mkdir(parents=True, exist_ok=True)
130
+
131
+
132
def manifest_file(state_dir: Path) -> Path:
    """Path of the JSON manifest inside *state_dir*."""
    return state_dir.joinpath(MANIFEST_NAME)
134
+
135
+
136
def db_file(state_dir: Path) -> Path:
    """Path of the SQLite index inside *state_dir*."""
    return state_dir.joinpath(DB_NAME)
138
+
139
+
140
def load_manifest(state_dir: Path) -> dict[str, Any]:
    """Load the manifest JSON; an absent file yields an empty skeleton."""
    path = manifest_file(state_dir)
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return {"datasets": {}}
145
+
146
+
147
def save_manifest(state_dir: Path, manifest: dict[str, Any]) -> None:
    """Write *manifest* as stable, human-diffable JSON (sorted keys, 2-space indent)."""
    payload = json.dumps(manifest, indent=2, sort_keys=True) + "\n"
    manifest_file(state_dir).write_text(payload, encoding="utf-8")
152
+
153
+
154
def raw_file(state_dir: Path, name: str) -> Path:
    """Path of the cached raw CSV for dataset *name*."""
    return state_dir / "raw" / (name + ".csv")
156
+
157
+
158
+ def _now_iso() -> str:
159
+ return datetime.now(timezone.utc).replace(microsecond=0).isoformat()
160
+
161
+
162
+ def _hash_file(path: Path) -> str:
163
+ digest = hashlib.sha256()
164
+ with path.open("rb") as handle:
165
+ for chunk in iter(lambda: handle.read(1024 * 1024), b""):
166
+ digest.update(chunk)
167
+ return digest.hexdigest()
168
+
169
+
170
def sync_datasets(state_dir: Path, force: bool = False) -> list[SyncResult]:
    """Fetch every dataset in DATASETS into the raw/ cache directory.

    Issues conditional requests using the ETag / Last-Modified validators
    recorded in the manifest (skipped when *force* is true), streams each
    payload to a temporary sibling file before atomically replacing the
    cached CSV, and writes validators, SHA-256 and timestamps back into the
    manifest.

    Returns one SyncResult per dataset: status "downloaded" on a fresh
    fetch, "not-modified" on HTTP 304 with a cached file present.
    Raises RuntimeError on any other HTTP error.
    """
    ensure_state_dir(state_dir)
    manifest = load_manifest(state_dir)
    results: list[SyncResult] = []

    for name, url in DATASETS.items():
        path = raw_file(state_dir, name)
        entry = manifest.setdefault("datasets", {}).get(name, {})
        headers = {"User-Agent": USER_AGENT}
        # Conditional headers let the server answer 304 instead of resending.
        if not force:
            if entry.get("etag"):
                headers["If-None-Match"] = entry["etag"]
            if entry.get("last_modified"):
                headers["If-Modified-Since"] = entry["last_modified"]

        req = request.Request(url, headers=headers)
        try:
            with request.urlopen(req) as response:
                # Stream to a .tmp sibling, hashing as we go, then replace
                # atomically so a partial download never clobbers a good copy.
                temp_path = path.with_suffix(".csv.tmp")
                size = 0
                digest = hashlib.sha256()
                with temp_path.open("wb") as handle:
                    for chunk in iter(lambda: response.read(1024 * 1024), b""):
                        size += len(chunk)
                        digest.update(chunk)
                        handle.write(chunk)
                temp_path.replace(path)
                metadata = {
                    "url": url,
                    "etag": response.headers.get("ETag"),
                    "last_modified": response.headers.get("Last-Modified"),
                    "content_length": response.headers.get("Content-Length"),
                    "sha256": digest.hexdigest(),
                    "fetched_at": _now_iso(),
                }
                manifest["datasets"][name] = metadata
                results.append(
                    SyncResult(
                        name=name,
                        path=path,
                        status="downloaded",
                        bytes_downloaded=size,
                        metadata=metadata,
                    )
                )
        except error.HTTPError as exc:
            # 304 Not Modified: keep the cached file, refresh bookkeeping;
            # backfill the hash if an older manifest lacked it.
            if exc.code == 304 and path.exists():
                metadata = {
                    **entry,
                    "checked_at": _now_iso(),
                    "sha256": entry.get("sha256") or _hash_file(path),
                }
                manifest["datasets"][name] = metadata
                results.append(
                    SyncResult(
                        name=name,
                        path=path,
                        status="not-modified",
                        bytes_downloaded=0,
                        metadata=metadata,
                    )
                )
                continue
            raise RuntimeError(f"failed to download {url}: {exc}") from exc

    manifest["updated_at"] = _now_iso()
    save_manifest(state_dir, manifest)
    return results
238
+
239
+
240
+ def _iter_csv_rows(path: Path) -> list[dict[str, str]]:
241
+ rows: list[dict[str, str]] = []
242
+ for encoding_errors in ("strict", "replace"):
243
+ try:
244
+ with path.open(
245
+ "r",
246
+ encoding="utf-8-sig",
247
+ errors=encoding_errors,
248
+ newline="",
249
+ ) as handle:
250
+ reader = csv.DictReader(handle)
251
+ for raw_row in reader:
252
+ row = {
253
+ (key or "").strip(): (value or "").strip()
254
+ for key, value in raw_row.items()
255
+ }
256
+ if any(row.values()):
257
+ rows.append(row)
258
+ return rows
259
+ except UnicodeDecodeError:
260
+ rows.clear()
261
+ continue
262
+ raise UnicodeDecodeError("utf-8", b"", 0, 1, f"could not decode {path}")
263
+ return rows
264
+
265
+
266
+ def _to_int(value: str) -> int | None:
267
+ value = value.strip()
268
+ if not value:
269
+ return None
270
+ try:
271
+ return int(value)
272
+ except ValueError:
273
+ return None
274
+
275
+
276
def _require_files(state_dir: Path) -> None:
    """Raise FileNotFoundError naming every dataset whose raw CSV is absent."""
    missing = [
        name for name in DATASETS if not raw_file(state_dir, name).exists()
    ]
    if not missing:
        return
    missing_list = ", ".join(missing)
    raise FileNotFoundError(
        f"missing raw datasets: {missing_list}. run `bdsc sync` first"
    )
283
+
284
+
285
def build_index(state_dir: Path) -> dict[str, int]:
    """Rebuild the local SQLite index from the cached raw CSVs.

    Parses all five datasets, recreates the database file from scratch,
    loads the relational tables, materialises one denormalised
    search_documents row per stock, and — when this SQLite build supports
    FTS5 — adds a prefix index (stock_fts) and a trigram index
    (stock_trigram).  Returns the row counts / feature flags, which are
    also recorded in the manifest under "index".

    Raises FileNotFoundError when a raw CSV is missing (run `bdsc sync`).
    """
    ensure_state_dir(state_dir)
    _require_files(state_dir)
    manifest = load_manifest(state_dir)

    # Parse every CSV before touching the database so a parse failure aborts
    # the rebuild without destroying the existing index file.
    bloomington_rows = _iter_csv_rows(raw_file(state_dir, "bloomington"))
    component_rows = _iter_csv_rows(raw_file(state_dir, "stockcomps_map_comments"))
    stockgene_rows = _iter_csv_rows(raw_file(state_dir, "stockgenes"))
    compgene_rows = _iter_csv_rows(raw_file(state_dir, "stockgenes_compgenes"))
    compprop_rows = _iter_csv_rows(raw_file(state_dir, "stockgenes_compprops"))

    db_path = db_file(state_dir)
    if db_path.exists():
        db_path.unlink()

    conn = sqlite3.connect(db_path)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        # Core schema: one row per stock, child tables keyed by stknum or by
        # the BDSC symbol/gene ids, plus the denormalised search table.
        conn.executescript(
            """
            CREATE TABLE stocks (
                stknum INTEGER PRIMARY KEY,
                genotype TEXT NOT NULL,
                chromosomes TEXT,
                aka TEXT,
                date_added TEXT,
                donor_info TEXT,
                stock_comments TEXT
            );

            CREATE TABLE component_comments (
                stknum INTEGER NOT NULL,
                genotype TEXT,
                component_symbol TEXT,
                fbid TEXT,
                mapstatement TEXT,
                comment1 TEXT,
                comment2 TEXT,
                comment3 TEXT
            );

            CREATE TABLE stockgenes (
                stknum INTEGER NOT NULL,
                genotype TEXT,
                component_symbol TEXT,
                gene_symbol TEXT,
                fbgn TEXT,
                bdsc_symbol_id INTEGER,
                bdsc_gene_id INTEGER
            );

            CREATE TABLE compgenes (
                bdsc_symbol_id INTEGER,
                bdsc_gene_id INTEGER,
                compgeneprop_id INTEGER,
                prop_syn TEXT
            );

            CREATE TABLE compprops (
                bdsc_symbol_id INTEGER,
                property_id INTEGER,
                property_descrip TEXT,
                prop_syn TEXT
            );

            CREATE TABLE search_documents (
                stknum INTEGER PRIMARY KEY,
                genotype TEXT,
                aka TEXT,
                donor_info TEXT,
                stock_comments TEXT,
                component_symbols TEXT,
                fbids TEXT,
                gene_symbols TEXT,
                fbgns TEXT,
                property_terms TEXT,
                relationship_terms TEXT,
                search_text TEXT
            );

            CREATE INDEX idx_component_comments_stknum ON component_comments(stknum);
            CREATE INDEX idx_stockgenes_stknum ON stockgenes(stknum);
            CREATE INDEX idx_stockgenes_gene_symbol ON stockgenes(gene_symbol);
            CREATE INDEX idx_stockgenes_fbgn ON stockgenes(fbgn);
            CREATE INDEX idx_compgenes_symbol_id ON compgenes(bdsc_symbol_id);
            CREATE INDEX idx_compprops_symbol_id ON compprops(bdsc_symbol_id);
            """
        )

        # Rows without a parseable stock number are skipped for the
        # stknum-keyed tables.
        conn.executemany(
            """
            INSERT INTO stocks (
                stknum, genotype, chromosomes, aka, date_added, donor_info, stock_comments
            ) VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    _to_int(row["Stk #"]),
                    row["Genotype"],
                    row["Ch # all"],
                    row["A.K.A"],
                    row["Date added"],
                    row["Donor info"],
                    row["Stock comments"],
                )
                for row in bloomington_rows
                if _to_int(row["Stk #"]) is not None
            ],
        )

        conn.executemany(
            """
            INSERT INTO component_comments (
                stknum, genotype, component_symbol, fbid, mapstatement, comment1, comment2, comment3
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    _to_int(row["Stk #"]),
                    row["Genotype"],
                    row["component_symbol"],
                    row["fbid"],
                    row["mapstatement"],
                    row["comment1"],
                    row["comment2"],
                    row["comment3"],
                )
                for row in component_rows
                if _to_int(row["Stk #"]) is not None
            ],
        )

        conn.executemany(
            """
            INSERT INTO stockgenes (
                stknum, genotype, component_symbol, gene_symbol, fbgn, bdsc_symbol_id, bdsc_gene_id
            ) VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    _to_int(row["stknum"]),
                    row["genotype"],
                    row["component_symbol"],
                    row["gene_symbol"],
                    row["fbgn"],
                    _to_int(row["bdsc_symbol_id"]),
                    _to_int(row["bdsc_gene_id"]),
                )
                for row in stockgene_rows
                if _to_int(row["stknum"]) is not None
            ],
        )

        conn.executemany(
            """
            INSERT INTO compgenes (
                bdsc_symbol_id, bdsc_gene_id, compgeneprop_id, prop_syn
            ) VALUES (?, ?, ?, ?)
            """,
            [
                (
                    _to_int(row["bdsc_symbol_id"]),
                    _to_int(row["bdsc_gene_id"]),
                    _to_int(row["compgeneprop_id"]),
                    row["prop_syn"],
                )
                for row in compgene_rows
            ],
        )

        conn.executemany(
            """
            INSERT INTO compprops (
                bdsc_symbol_id, property_id, property_descrip, prop_syn
            ) VALUES (?, ?, ?, ?)
            """,
            [
                (
                    _to_int(row["bdsc_symbol_id"]),
                    _to_int(row["property_id"]),
                    row["property_descrip"],
                    row["prop_syn"],
                )
                for row in compprop_rows
            ],
        )

        # One denormalised document per stock: distinct, sorted, space-joined
        # term lists per field, plus a catch-all search_text blob that also
        # folds in property descriptions and all component comment columns.
        conn.execute(
            """
            INSERT INTO search_documents (
                stknum, genotype, aka, donor_info, stock_comments,
                component_symbols, fbids, gene_symbols, fbgns,
                property_terms, relationship_terms, search_text
            )
            SELECT
                s.stknum,
                s.genotype,
                COALESCE(s.aka, ''),
                COALESCE(s.donor_info, ''),
                COALESCE(s.stock_comments, ''),
                COALESCE((
                    SELECT group_concat(component_symbol, ' ')
                    FROM (
                        SELECT DISTINCT sg.component_symbol AS component_symbol
                        FROM stockgenes sg
                        WHERE sg.stknum = s.stknum AND sg.component_symbol != ''
                        ORDER BY sg.component_symbol
                    )
                ), ''),
                COALESCE((
                    SELECT group_concat(fbid, ' ')
                    FROM (
                        SELECT DISTINCT cc.fbid AS fbid
                        FROM component_comments cc
                        WHERE cc.stknum = s.stknum AND cc.fbid != ''
                        ORDER BY cc.fbid
                    )
                ), ''),
                COALESCE((
                    SELECT group_concat(gene_symbol, ' ')
                    FROM (
                        SELECT DISTINCT sg.gene_symbol AS gene_symbol
                        FROM stockgenes sg
                        WHERE sg.stknum = s.stknum AND sg.gene_symbol != ''
                        ORDER BY sg.gene_symbol
                    )
                ), ''),
                COALESCE((
                    SELECT group_concat(fbgn, ' ')
                    FROM (
                        SELECT DISTINCT sg.fbgn AS fbgn
                        FROM stockgenes sg
                        WHERE sg.stknum = s.stknum AND sg.fbgn != ''
                        ORDER BY sg.fbgn
                    )
                ), ''),
                COALESCE((
                    SELECT group_concat(prop_syn, ' ')
                    FROM (
                        SELECT DISTINCT cp.prop_syn AS prop_syn
                        FROM stockgenes sg
                        JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id
                        WHERE sg.stknum = s.stknum AND cp.prop_syn != ''
                        ORDER BY cp.prop_syn
                    )
                ), ''),
                COALESCE((
                    SELECT group_concat(prop_syn, ' ')
                    FROM (
                        SELECT DISTINCT cg.prop_syn AS prop_syn
                        FROM stockgenes sg
                        JOIN compgenes cg
                            ON cg.bdsc_symbol_id = sg.bdsc_symbol_id
                            AND cg.bdsc_gene_id = sg.bdsc_gene_id
                        WHERE sg.stknum = s.stknum AND cg.prop_syn != ''
                        ORDER BY cg.prop_syn
                    )
                ), ''),
                trim(
                    s.stknum || ' ' ||
                    COALESCE(s.genotype, '') || ' ' ||
                    COALESCE(s.aka, '') || ' ' ||
                    COALESCE(s.donor_info, '') || ' ' ||
                    COALESCE(s.stock_comments, '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(fbid, ' ')
                        FROM (
                            SELECT DISTINCT cc.fbid AS fbid
                            FROM component_comments cc
                            WHERE cc.stknum = s.stknum AND cc.fbid != ''
                        )
                    ), '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(component_symbol, ' ')
                        FROM (
                            SELECT DISTINCT sg.component_symbol AS component_symbol
                            FROM stockgenes sg
                            WHERE sg.stknum = s.stknum AND sg.component_symbol != ''
                        )
                    ), '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(gene_symbol, ' ')
                        FROM (
                            SELECT DISTINCT sg.gene_symbol AS gene_symbol
                            FROM stockgenes sg
                            WHERE sg.stknum = s.stknum AND sg.gene_symbol != ''
                        )
                    ), '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(fbgn, ' ')
                        FROM (
                            SELECT DISTINCT sg.fbgn AS fbgn
                            FROM stockgenes sg
                            WHERE sg.stknum = s.stknum AND sg.fbgn != ''
                        )
                    ), '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(prop_syn, ' ')
                        FROM (
                            SELECT DISTINCT cp.prop_syn AS prop_syn
                            FROM stockgenes sg
                            JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id
                            WHERE sg.stknum = s.stknum AND cp.prop_syn != ''
                        )
                    ), '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(property_descrip, ' ')
                        FROM (
                            SELECT DISTINCT cp.property_descrip AS property_descrip
                            FROM stockgenes sg
                            JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id
                            WHERE sg.stknum = s.stknum AND cp.property_descrip != ''
                        )
                    ), '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(prop_syn, ' ')
                        FROM (
                            SELECT DISTINCT cg.prop_syn AS prop_syn
                            FROM stockgenes sg
                            JOIN compgenes cg
                                ON cg.bdsc_symbol_id = sg.bdsc_symbol_id
                                AND cg.bdsc_gene_id = sg.bdsc_gene_id
                            WHERE sg.stknum = s.stknum AND cg.prop_syn != ''
                        )
                    ), '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(comment_text, ' ')
                        FROM (
                            SELECT DISTINCT cc.comment1 AS comment_text
                            FROM component_comments cc
                            WHERE cc.stknum = s.stknum AND cc.comment1 != ''
                            UNION
                            SELECT DISTINCT cc.comment2 AS comment_text
                            FROM component_comments cc
                            WHERE cc.stknum = s.stknum AND cc.comment2 != ''
                            UNION
                            SELECT DISTINCT cc.comment3 AS comment_text
                            FROM component_comments cc
                            WHERE cc.stknum = s.stknum AND cc.comment3 != ''
                            UNION
                            SELECT DISTINCT cc.mapstatement AS comment_text
                            FROM component_comments cc
                            WHERE cc.stknum = s.stknum AND cc.mapstatement != ''
                        )
                    ), '')
                )
            FROM stocks s
            """
        )

        # FTS5 may be compiled out of this SQLite build; search code degrades
        # to a LIKE scan when stock_fts is absent.
        fts_enabled = True
        try:
            conn.execute(
                """
                CREATE VIRTUAL TABLE stock_fts USING fts5(
                    stknum UNINDEXED,
                    genotype,
                    aka,
                    donor_info,
                    stock_comments,
                    component_symbols,
                    fbids,
                    gene_symbols,
                    fbgns,
                    property_terms,
                    relationship_terms,
                    tokenize='porter unicode61'
                )
                """
            )
        except sqlite3.OperationalError:
            fts_enabled = False

        if fts_enabled:
            conn.execute(
                """
                INSERT INTO stock_fts (
                    stknum, genotype, aka, donor_info, stock_comments,
                    component_symbols, fbids, gene_symbols, fbgns,
                    property_terms, relationship_terms
                )
                SELECT
                    stknum, genotype, aka, donor_info, stock_comments,
                    component_symbols, fbids, gene_symbols, fbgns,
                    property_terms, relationship_terms
                FROM search_documents
                """
            )

        # The trigram tokenizer is newer than base FTS5 and may be missing
        # even when stock_fts works, so it gets its own fallback flag.
        trigram_enabled = True
        try:
            conn.execute(
                """
                CREATE VIRTUAL TABLE stock_trigram USING fts5(
                    stknum UNINDEXED,
                    search_text,
                    tokenize='trigram'
                )
                """
            )
        except sqlite3.OperationalError:
            trigram_enabled = False

        if trigram_enabled:
            conn.execute(
                """
                INSERT INTO stock_trigram (stknum, search_text)
                SELECT stknum, search_text
                FROM search_documents
                """
            )

        conn.commit()
        # NOTE: these are raw CSV row counts, not inserted-row counts — rows
        # skipped for a missing stock number still count here.
        counts = {
            "stocks": len(bloomington_rows),
            "component_comments": len(component_rows),
            "stockgenes": len(stockgene_rows),
            "compgenes": len(compgene_rows),
            "compprops": len(compprop_rows),
            "fts_enabled": int(fts_enabled),
            "trigram_enabled": int(trigram_enabled),
        }
        manifest["index"] = {
            "db_path": str(db_path),
            "built_at": _now_iso(),
            "counts": counts,
        }
        save_manifest(state_dir, manifest)
        return counts
    finally:
        conn.close()
717
+
718
+
719
def _connect(state_dir: Path) -> sqlite3.Connection:
    """Open the index database with Row access; fail clearly if never built."""
    path = db_file(state_dir)
    if not path.exists():
        raise FileNotFoundError(f"missing index: {path}. run `bdsc sync` or `bdsc build-index`")
    connection = sqlite3.connect(path)
    connection.row_factory = sqlite3.Row
    return connection
726
+
727
+
728
def build_fts_query(text: str) -> str:
    """Turn free text into an FTS5 prefix query.

    Each alphanumeric token becomes a lower-cased prefix term (``tok*``);
    input with no tokens is returned as a single quoted phrase instead.
    """
    tokens = re.findall(r"[A-Za-z0-9]+", text.lower())
    if tokens:
        return " ".join(token + "*" for token in tokens)
    escaped = text.replace('"', '""').strip()
    return f'"{escaped}"'
734
+
735
+
736
+ def _query_tokens(text: str) -> list[str]:
737
+ return re.findall(r"[A-Za-z0-9]+", text.lower())
738
+
739
+
740
+ def _is_free_text_query(text: str) -> bool:
741
+ return len(_query_tokens(text)) > 1
742
+
743
+
744
+ def _compact_text(text: str) -> str:
745
+ return "".join(_query_tokens(text))
746
+
747
+
748
+ def _trigrams(text: str) -> list[str]:
749
+ if len(text) < 3:
750
+ return []
751
+ return [text[index : index + 3] for index in range(len(text) - 2)]
752
+
753
+
754
def build_trigram_query(text: str) -> str | None:
    """OR-query of quoted, order-preserved unique trigrams for *text*.

    Trigrams come from each alphanumeric token and then from the compacted
    whole text (catching grams that span token boundaries).  Returns None
    when the input is too short to produce any trigram.
    """
    ordered: dict[str, None] = {}
    for source in [*_query_tokens(text), _compact_text(text)]:
        for gram in _trigrams(source):
            ordered.setdefault(gram, None)
    if not ordered:
        return None
    return " OR ".join(f'"{gram}"' for gram in ordered)
771
+
772
+
773
def _trigram_overlap_ratio(query: str, text: str) -> float:
    """Fraction of the query's compacted trigrams that also occur in *text*."""
    query_grams = set(_trigrams(_compact_text(query)))
    if not query_grams:
        return 0.0
    text_grams = set(_trigrams(_compact_text(text)))
    if not text_grams:
        return 0.0
    shared = query_grams.intersection(text_grams)
    return len(shared) / len(query_grams)
779
+
780
+
781
def _best_term_similarity(query: str, text: str) -> float:
    """Best per-term fuzzy score for *query* against the terms of *text*.

    Each term of at least 3 characters scores SequenceMatcher ratio plus
    trigram overlap; the maximum over all terms is returned (0.0 when the
    query compacts to nothing or no term qualifies).
    """
    query_compact = _compact_text(query)
    if not query_compact:
        return 0.0
    scores = (
        difflib.SequenceMatcher(None, query_compact, term).ratio()
        + _trigram_overlap_ratio(query, term)
        for term in _query_tokens(text)
        if len(term) >= 3
    )
    return max(scores, default=0.0)
795
+
796
+
797
def _score_search_document(query: str, row: sqlite3.Row | dict[str, Any]) -> float:
    """Heuristic relevance score of one search_documents row for *query*.

    Weights as written below: substring hit in the raw text (+8), substring
    hit in the compacted text (+10), exact token matches (x3), prefix token
    matches (x1.5), whole-document trigram overlap (x4); matches in the
    gene/component symbol fields add extra trigram overlap (x8) and fuzzy
    term similarity (x12).
    """
    query_value = query.strip().lower()
    query_tokens = _query_tokens(query)
    query_compact = _compact_text(query)
    search_text = row["search_text"]
    haystack = search_text.lower()
    compact_haystack = _compact_text(search_text)
    document_tokens = set(_query_tokens(search_text))

    score = 0.0
    # Raw and punctuation-insensitive substring hits.
    if query_value and query_value in haystack:
        score += 8.0
    if query_compact and query_compact in compact_haystack:
        score += 10.0

    # Token hits: exact matches count double a prefix match.
    exact_matches = sum(1 for token in query_tokens if token in document_tokens)
    prefix_matches = sum(
        1
        for token in query_tokens
        if token not in document_tokens and any(doc.startswith(token) for doc in document_tokens)
    )
    score += exact_matches * 3.0
    score += prefix_matches * 1.5

    overlap = _trigram_overlap_ratio(query, search_text)
    score += overlap * 4.0

    # Gene and component symbols are the primary fields and count far more
    # than body-text matches.
    gene_symbols = row["gene_symbols"] or ""
    component_symbols = row["component_symbols"] or ""
    primary_fields = f"{gene_symbols} {component_symbols}".strip()
    if primary_fields:
        score += _trigram_overlap_ratio(query, primary_fields) * 8.0
        score += _best_term_similarity(query, primary_fields) * 12.0

    return score
832
+
833
+
834
+ def _search_result_payload(row: sqlite3.Row | dict[str, Any]) -> dict[str, Any]:
835
+ return {
836
+ "stknum": row["stknum"],
837
+ "genotype": row["genotype"],
838
+ "gene_symbols": row["gene_symbols"],
839
+ "fbgns": row["fbgns"],
840
+ "component_symbols": row["component_symbols"],
841
+ }
842
+
843
+
844
+ def _merge_ranked_matches(
845
+ matches: list[dict[str, Any]],
846
+ key_fn,
847
+ ) -> list[dict[str, Any]]:
848
+ merged: dict[Any, dict[str, Any]] = {}
849
+ for match in matches:
850
+ key = key_fn(match["row"])
851
+ existing = merged.get(key)
852
+ if existing is None or match["score"] > existing["score"]:
853
+ merged[key] = match
854
+ return sorted(
855
+ merged.values(),
856
+ key=lambda item: (-item["score"], item["row"]["stknum"]),
857
+ )
858
+
859
+
860
def _search_candidates_from_prefix_fts(
    conn: sqlite3.Connection,
    query: str,
    limit: int,
) -> list[dict[str, Any]]:
    """Candidate matches for *query* from the prefix FTS index.

    Falls back to a plain LIKE substring scan over search_text when the
    stock_fts table is absent (FTS5 unavailable at index-build time).  Each
    result is {"row": ..., "score": ...}; a flat bonus (+40 for FTS, +20 for
    the LIKE fallback) ranks these above trigram candidates.
    """
    has_fts = bool(
        conn.execute(
            "SELECT 1 FROM sqlite_master WHERE type='table' AND name='stock_fts'"
        ).fetchone()
    )
    if not has_fts:
        rows = conn.execute(
            """
            SELECT
                s.stknum,
                s.genotype,
                sd.gene_symbols,
                sd.fbgns,
                sd.component_symbols,
                sd.search_text
            FROM search_documents sd
            JOIN stocks s ON s.stknum = sd.stknum
            WHERE sd.search_text LIKE ?
            ORDER BY s.stknum
            LIMIT ?
            """,
            (f"%{query}%", limit),
        ).fetchall()
        return [{"row": row, "score": _score_search_document(query, row) + 20.0} for row in rows]

    rows = conn.execute(
        """
        SELECT
            s.stknum,
            s.genotype,
            sd.gene_symbols,
            sd.fbgns,
            sd.component_symbols,
            sd.search_text,
            bm25(stock_fts) AS rank
        FROM stock_fts f
        JOIN stocks s ON s.stknum = f.stknum
        JOIN search_documents sd ON sd.stknum = s.stknum
        WHERE stock_fts MATCH ?
        ORDER BY bm25(stock_fts), s.stknum
        LIMIT ?
        """,
        (build_fts_query(query), limit),
    ).fetchall()
    # bm25() is negative-is-better, hence the abs(); the *1e6 scaling capped
    # at 10 folds a small rank-derived component into the heuristic score.
    return [
        {
            "row": row,
            "score": _score_search_document(query, row) + 40.0 + min(10.0, abs(row["rank"]) * 1000000.0),
        }
        for row in rows
    ]
916
+
917
+
918
def _search_candidates_from_trigram_fts(
    conn: sqlite3.Connection,
    query: str,
    limit: int,
) -> list[dict[str, Any]]:
    """Fuzzy candidate matches for *query* from the trigram FTS index.

    Returns [] when the query yields no trigrams or the stock_trigram table
    was not built.  Unlike the prefix pass, results are filtered: only
    candidates scoring at least 4.5 survive, since trigram hits are noisy.
    """
    trigram_query = build_trigram_query(query)
    if not trigram_query:
        return []

    has_trigram = bool(
        conn.execute(
            "SELECT 1 FROM sqlite_master WHERE type='table' AND name='stock_trigram'"
        ).fetchone()
    )
    if not has_trigram:
        return []

    rows = conn.execute(
        """
        SELECT
            s.stknum,
            s.genotype,
            sd.gene_symbols,
            sd.fbgns,
            sd.component_symbols,
            sd.search_text,
            bm25(stock_trigram) AS rank
        FROM stock_trigram t
        JOIN stocks s ON s.stknum = t.stknum
        JOIN search_documents sd ON sd.stknum = s.stknum
        WHERE stock_trigram MATCH ?
        ORDER BY bm25(stock_trigram), s.stknum
        LIMIT ?
        """,
        (trigram_query, limit),
    ).fetchall()

    matches: list[dict[str, Any]] = []
    for row in rows:
        # Heuristic score plus a capped rank-derived component (bm25 is
        # negative-is-better, hence the abs()).
        score = _score_search_document(query, row) + min(6.0, abs(row["rank"]) * 1000000.0)
        if score >= 4.5:
            matches.append({"row": row, "score": score})
    return matches
961
+
962
+
963
def _candidate_stock_ids_for_query(
    conn: sqlite3.Connection,
    query: str,
    limit: int,
) -> list[int]:
    """Top candidate stock numbers for *query*.

    Merges the prefix-FTS and trigram candidate pools, keeping each stock's
    best score, and returns up to *limit* stock numbers ordered by
    descending score then ascending stock number.
    """
    best_scores: dict[int, float] = {}
    prefix_pool = _search_candidates_from_prefix_fts(conn, query, max(limit * 2, 20))
    trigram_pool = _search_candidates_from_trigram_fts(conn, query, max(limit * 6, 60))
    for match in prefix_pool + trigram_pool:
        stknum = match["row"]["stknum"]
        score = match["score"]
        if stknum not in best_scores or score > best_scores[stknum]:
            best_scores[stknum] = score
    ordered = sorted(best_scores.items(), key=lambda item: (-item[1], item[0]))
    return [stknum for stknum, _ in ordered[:limit]]
981
+
982
+
983
def _score_field_match(query: str, text: str) -> float:
    """Heuristic relevance of a single field value *text* for *query*.

    Sums fixed-weight contributions: exact/substring matches on the raw and
    compacted forms, trigram overlap, fuzzy term similarity, and exact or
    prefix token hits.  Empty text scores 0.0.
    """
    if not text:
        return 0.0

    q_lower = query.strip().lower()
    q_compact = _compact_text(query)
    t_lower = text.lower()
    t_compact = _compact_text(text)
    t_tokens = set(_query_tokens(text))
    q_tokens = _query_tokens(query)

    total = 0.0
    # Exact equality outranks a mere substring hit, on both raw and
    # compacted forms.
    if q_lower:
        if q_lower == t_lower:
            total += 12.0
        elif q_lower in t_lower:
            total += 8.0
    if q_compact:
        if q_compact == t_compact:
            total += 14.0
        elif q_compact in t_compact:
            total += 10.0

    total += 6.0 * _trigram_overlap_ratio(query, text)
    total += 8.0 * _best_term_similarity(query, text)

    total += 1.5 * sum(1 for token in q_tokens if token in t_tokens)
    # Prefix-only token hits are worth less than exact ones.
    for token in q_tokens:
        if token in t_tokens:
            continue
        if any(candidate.startswith(token) for candidate in t_tokens):
            total += 1
    return total
1011
+
1012
+
1013
+ def _rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]:
1014
+ return [dict(row) for row in rows]
1015
+
1016
+
1017
+ def _default_row_key(row: sqlite3.Row) -> tuple[Any, ...]:
1018
+ return tuple(row[key] for key in row.keys())
1019
+
1020
+
1021
+ def _component_result_key(row: sqlite3.Row | dict[str, Any]) -> tuple[Any, ...]:
1022
+ return (row["stknum"], row["component_symbol"], row["fbid"])
1023
+
1024
+
1025
+ def _gene_result_key(row: sqlite3.Row | dict[str, Any]) -> tuple[Any, ...]:
1026
+ return (row["stknum"], row["component_symbol"], row["gene_symbol"], row["fbgn"])
1027
+
1028
+
1029
def _rank_direct_rows(
    query: str,
    rows: list[sqlite3.Row],
    *,
    field_names: list[str],
    limit: int,
    min_score: float = 5.0,
    key_fn=None,
) -> list[dict[str, Any]]:
    """Score each row on its best-matching field and return the top results.

    Rows scoring below *min_score* are dropped; survivors are deduplicated
    via *key_fn* (whole-row identity by default) keeping the best score per
    key, and the first *limit* are returned as plain dicts.
    """
    candidates: list[dict[str, Any]] = []
    for row in rows:
        best = max(
            _score_field_match(query, row[field] or "") for field in field_names
        )
        if best >= min_score:
            candidates.append({"row": row, "score": best})

    ranked = _merge_ranked_matches(candidates, key_fn or _default_row_key)
    return [dict(entry["row"]) for entry in ranked[:limit]]
1048
+
1049
+
1050
def search_local(state_dir: Path, query: str, limit: int = 10) -> list[dict[str, Any]]:
    """Free-text search over the local index.

    An all-digit query is treated as a direct stock-number lookup (via
    get_stock, defined elsewhere in this module).  Otherwise prefix-FTS
    candidates are gathered first, and the fuzzier trigram pass only runs
    when that yields nothing.  Returns up to *limit* result payload dicts
    ordered by descending score, then stock number.
    """
    query = query.strip()
    if not query:
        return []

    conn = _connect(state_dir)
    try:
        if query.isdigit():
            stock = get_stock(state_dir, int(query))
            return [stock] if stock else []

        # Keep only the best-scoring candidate per stock number.
        candidates: dict[int, dict[str, Any]] = {}
        for match in _search_candidates_from_prefix_fts(conn, query, max(limit * 3, 20)):
            stknum = match["row"]["stknum"]
            existing = candidates.get(stknum)
            if existing is None or match["score"] > existing["score"]:
                candidates[stknum] = match

        if not candidates:
            for match in _search_candidates_from_trigram_fts(conn, query, max(limit * 12, 60)):
                stknum = match["row"]["stknum"]
                existing = candidates.get(stknum)
                if existing is None or match["score"] > existing["score"]:
                    candidates[stknum] = match

        ranked = sorted(
            candidates.values(),
            key=lambda item: (-item["score"], item["row"]["stknum"]),
        )
        return [_search_result_payload(item["row"]) for item in ranked[:limit]]
    finally:
        conn.close()
1082
+
1083
+
1084
def search_gene(state_dir: Path, query: str, limit: int = 20) -> list[dict[str, Any]]:
    """Find stock/gene rows whose gene matches *query*.

    Resolution order:
      1. Queries starting with "FBgn" (any case) match ``sg.fbgn`` exactly.
      2. Otherwise, exact or prefix match on ``sg.gene_symbol`` (exact
         matches sort first).
      3. If neither yields rows, fall back to fuzzy ranking over the
         stockgenes rows of candidate stocks suggested by the search
         indexes.
    """
    query = query.strip()
    if not query:
        return []

    conn = _connect(state_dir)
    try:
        if query.upper().startswith("FBGN"):
            # FlyBase gene IDs are unambiguous: exact, case-insensitive match.
            rows = conn.execute(
                """
                SELECT DISTINCT
                    sg.stknum,
                    sg.genotype,
                    sg.component_symbol,
                    sg.gene_symbol,
                    sg.fbgn
                FROM stockgenes sg
                WHERE UPPER(sg.fbgn) = UPPER(?)
                ORDER BY sg.stknum, sg.component_symbol, sg.gene_symbol
                LIMIT ?
                """,
                (query, limit),
            ).fetchall()
        else:
            # Exact symbol matches rank before prefix matches via the CASE key.
            rows = conn.execute(
                """
                SELECT DISTINCT
                    sg.stknum,
                    sg.genotype,
                    sg.component_symbol,
                    sg.gene_symbol,
                    sg.fbgn
                FROM stockgenes sg
                WHERE LOWER(sg.gene_symbol) = LOWER(?)
                   OR LOWER(sg.gene_symbol) LIKE LOWER(?)
                ORDER BY
                    CASE WHEN LOWER(sg.gene_symbol) = LOWER(?) THEN 0 ELSE 1 END,
                    sg.stknum,
                    sg.component_symbol,
                    sg.gene_symbol
                LIMIT ?
                """,
                (query, f"{query}%", query, limit),
            ).fetchall()
        if rows:
            return _rows_to_dicts(rows)

        # Fuzzy fallback: fetch stockgenes rows for index-suggested stocks
        # and rank them by similarity to the query.
        stock_ids = _candidate_stock_ids_for_query(conn, query, max(limit * 4, 40))
        if not stock_ids:
            return []
        placeholders = ", ".join("?" for _ in stock_ids)
        fuzzy_rows = conn.execute(
            f"""
            SELECT DISTINCT
                sg.stknum,
                sg.genotype,
                sg.component_symbol,
                sg.gene_symbol,
                sg.fbgn
            FROM stockgenes sg
            WHERE sg.stknum IN ({placeholders})
            """,
            stock_ids,
        ).fetchall()
        return _rank_direct_rows(
            query,
            fuzzy_rows,
            field_names=["gene_symbol", "fbgn"],
            limit=limit,
            key_fn=_gene_result_key,
        )
    finally:
        conn.close()
1157
+
1158
+
1159
def _component_metadata_subqueries(
    stock_num_expr: str,
    component_symbol_expr: str,
    symbol_id_expr: str,
) -> str:
    """Return SQL select-list fragments aggregating component metadata.

    Produces five correlated ``COALESCE(group_concat(...), '')`` columns:
    gene_symbols, fbgns, property_syns, property_descriptions and
    gene_relationships, keyed off the caller-supplied SQL expressions for
    the stock number, component symbol, and bdsc_symbol_id.

    NOTE: the expressions are interpolated directly into the SQL text, so
    callers must pass trusted column/subquery expressions — never user
    input.
    """
    return f"""
        COALESCE((
            SELECT group_concat(gene_symbol, ' ')
            FROM (
                SELECT DISTINCT sg.gene_symbol AS gene_symbol
                FROM stockgenes sg
                WHERE sg.stknum = {stock_num_expr}
                  AND sg.component_symbol = {component_symbol_expr}
                  AND sg.gene_symbol != ''
                ORDER BY sg.gene_symbol
            )
        ), '') AS gene_symbols,
        COALESCE((
            SELECT group_concat(fbgn, ' ')
            FROM (
                SELECT DISTINCT sg.fbgn AS fbgn
                FROM stockgenes sg
                WHERE sg.stknum = {stock_num_expr}
                  AND sg.component_symbol = {component_symbol_expr}
                  AND sg.fbgn != ''
                ORDER BY sg.fbgn
            )
        ), '') AS fbgns,
        COALESCE((
            SELECT group_concat(prop_syn, ' | ')
            FROM (
                SELECT DISTINCT cp.prop_syn AS prop_syn
                FROM compprops cp
                WHERE cp.bdsc_symbol_id = {symbol_id_expr}
                  AND cp.prop_syn != ''
                ORDER BY cp.prop_syn
            )
        ), '') AS property_syns,
        COALESCE((
            SELECT group_concat(property_descrip, ' | ')
            FROM (
                SELECT DISTINCT cp.property_descrip AS property_descrip
                FROM compprops cp
                WHERE cp.bdsc_symbol_id = {symbol_id_expr}
                  AND cp.property_descrip != ''
                ORDER BY cp.property_descrip
            )
        ), '') AS property_descriptions,
        COALESCE((
            SELECT group_concat(prop_syn, ' | ')
            FROM (
                SELECT DISTINCT cg.prop_syn AS prop_syn
                FROM compgenes cg
                WHERE cg.bdsc_symbol_id = {symbol_id_expr}
                  AND cg.prop_syn != ''
                ORDER BY cg.prop_syn
            )
        ), '') AS gene_relationships
    """
1218
+
1219
+
1220
def _search_component_table(
    state_dir: Path,
    *,
    conn: sqlite3.Connection | None = None,
    column: str,
    query: str,
    limit: int,
) -> list[dict[str, Any]]:
    """Search component_comments on a single column (fbid or component_symbol).

    Tries exact/prefix matches first (exact sorting first); on a miss, falls
    back to fuzzy ranking over the components of index-suggested stocks.
    An externally supplied *conn* is left open; a locally opened one is
    closed before returning.

    Raises:
        ValueError: if *column* is not "fbid" or "component_symbol"
            (the column name is interpolated into SQL, so it is whitelisted).
    """
    query = query.strip()
    if not query:
        return []

    if column not in {"fbid", "component_symbol"}:
        raise ValueError(f"unsupported component search column: {column}")

    close_conn = conn is None
    conn = conn or _connect(state_dir)
    try:
        rows = conn.execute(
            f"""
            SELECT
                cc.stknum,
                cc.genotype,
                cc.component_symbol,
                cc.fbid,
                cc.mapstatement,
                {_component_metadata_subqueries(
                    "cc.stknum",
                    "cc.component_symbol",
                    "(SELECT MIN(sg.bdsc_symbol_id) FROM stockgenes sg WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol)",
                )}
            FROM component_comments cc
            WHERE LOWER(cc.{column}) = LOWER(?)
               OR LOWER(cc.{column}) LIKE LOWER(?)
            ORDER BY
                CASE WHEN LOWER(cc.{column}) = LOWER(?) THEN 0 ELSE 1 END,
                cc.stknum,
                cc.component_symbol
            LIMIT ?
            """,
            (query, f"{query}%", query, limit),
        ).fetchall()
        if rows:
            return _rows_to_dicts(rows)

        # Fuzzy fallback across the components of candidate stocks.
        stock_ids = _candidate_stock_ids_for_query(conn, query, max(limit * 4, 40))
        if not stock_ids:
            return []
        placeholders = ", ".join("?" for _ in stock_ids)
        fuzzy_rows = conn.execute(
            f"""
            SELECT
                cc.stknum,
                cc.genotype,
                cc.component_symbol,
                cc.fbid,
                cc.mapstatement,
                {_component_metadata_subqueries(
                    "cc.stknum",
                    "cc.component_symbol",
                    "(SELECT MIN(sg.bdsc_symbol_id) FROM stockgenes sg WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol)",
                )}
            FROM component_comments cc
            WHERE cc.stknum IN ({placeholders})
            """,
            stock_ids,
        ).fetchall()
        # Field priority depends on which column was searched.
        field_names = ["fbid", "component_symbol", "gene_symbols", "genotype", "property_syns"]
        if column == "component_symbol":
            field_names = ["component_symbol", "gene_symbols", "fbid", "property_syns", "genotype"]
        return _rank_direct_rows(
            query,
            fuzzy_rows,
            field_names=field_names,
            limit=limit,
            key_fn=_component_result_key,
        )
    finally:
        if close_conn:
            conn.close()
1300
+
1301
+
1302
def _fetch_component_domain_rows(
    conn: sqlite3.Connection,
    query: str,
    limit: int,
    *,
    cte_sql: str,
    cte_params: list[Any],
) -> list[sqlite3.Row]:
    """Fetch component rows restricted by a caller-supplied CTE.

    *cte_sql* must define a ``matching_rows`` CTE exposing ``bdsc_symbol_id``
    (and is interpolated directly into the statement, so it must be trusted
    SQL). When the CTE-driven query returns nothing, falls back to every
    component of the index-suggested candidate stocks; the fallback has no
    LIMIT because callers re-rank and truncate the rows afterwards.
    """
    rows = conn.execute(
        f"""
        {cte_sql}
        SELECT
            cc.stknum,
            cc.genotype,
            cc.component_symbol,
            cc.fbid,
            cc.mapstatement,
            {_component_metadata_subqueries("cc.stknum", "cc.component_symbol", "sg0.bdsc_symbol_id")}
        FROM component_comments cc
        JOIN stockgenes sg0
          ON sg0.stknum = cc.stknum
         AND sg0.component_symbol = cc.component_symbol
        JOIN matching_rows mr
          ON mr.bdsc_symbol_id = sg0.bdsc_symbol_id
        GROUP BY
            cc.stknum,
            cc.genotype,
            cc.component_symbol,
            cc.fbid,
            cc.mapstatement,
            sg0.bdsc_symbol_id
        ORDER BY cc.stknum, cc.component_symbol
        LIMIT ?
        """,
        (*cte_params, limit),
    ).fetchall()
    if rows:
        return rows

    stock_ids = _candidate_stock_ids_for_query(conn, query, max(limit * 4, 40))
    if not stock_ids:
        return []
    placeholders = ", ".join("?" for _ in stock_ids)
    return conn.execute(
        f"""
        SELECT
            cc.stknum,
            cc.genotype,
            cc.component_symbol,
            cc.fbid,
            cc.mapstatement,
            {_component_metadata_subqueries("cc.stknum", "cc.component_symbol", "sg0.bdsc_symbol_id")}
        FROM component_comments cc
        JOIN stockgenes sg0
          ON sg0.stknum = cc.stknum
         AND sg0.component_symbol = cc.component_symbol
        WHERE cc.stknum IN ({placeholders})
        GROUP BY
            cc.stknum,
            cc.genotype,
            cc.component_symbol,
            cc.fbid,
            cc.mapstatement,
            sg0.bdsc_symbol_id
        """,
        stock_ids,
    ).fetchall()
1369
+
1370
def _search_component_domain(
    state_dir: Path,
    query: str,
    limit: int,
    *,
    cte_sql: str,
    cte_params: list[Any],
    field_names: list[str],
) -> list[dict[str, Any]]:
    """Run a component-domain search: fetch CTE-filtered rows, then rank.

    Opens its own connection, delegates row retrieval to
    `_fetch_component_domain_rows`, and ranks the result with
    `_rank_direct_rows` keyed by component identity.
    """
    query = query.strip()
    if not query:
        return []

    connection = _connect(state_dir)
    try:
        candidate_rows = _fetch_component_domain_rows(
            connection,
            query,
            limit,
            cte_sql=cte_sql,
            cte_params=cte_params,
        )
        ranked = _rank_direct_rows(
            query,
            candidate_rows,
            field_names=field_names,
            limit=limit,
            key_fn=_component_result_key,
        )
        return ranked
    finally:
        connection.close()
1401
+
1402
+
1403
def search_property(state_dir: Path, query: str, limit: int = 20) -> list[dict[str, Any]]:
    """Search components by property: exact/prefix synonym or substring description."""
    query = query.strip()
    return _search_component_domain(
        state_dir,
        query,
        limit,
        cte_sql="""
            WITH matching_rows AS (
                SELECT DISTINCT bdsc_symbol_id
                FROM compprops
                WHERE LOWER(prop_syn) = LOWER(?)
                   OR LOWER(prop_syn) LIKE LOWER(?)
                   OR LOWER(property_descrip) LIKE LOWER(?)
            )
        """,
        cte_params=[query, f"{query}%", f"%{query}%"],
        field_names=["property_syns", "property_descriptions", "component_symbol", "gene_symbols"],
    )
1421
+
1422
+
1423
def search_property_exact(state_dir: Path, query: str, limit: int = 20) -> list[dict[str, Any]]:
    """Search components by property, matching synonym or description exactly (case-insensitive)."""
    query = query.strip()
    return _search_component_domain(
        state_dir,
        query,
        limit,
        cte_sql="""
            WITH matching_rows AS (
                SELECT DISTINCT bdsc_symbol_id
                FROM compprops
                WHERE LOWER(prop_syn) = LOWER(?)
                   OR LOWER(property_descrip) = LOWER(?)
            )
        """,
        cte_params=[query, query],
        field_names=["property_syns", "property_descriptions", "component_symbol", "gene_symbols"],
    )
1440
+
1441
+
1442
def search_driver_family(state_dir: Path, query: str, limit: int = 20) -> list[dict[str, Any]]:
    """Search components by driver family (e.g. GAL4/LexA alias groups).

    Normalizes *query* into alias tokens via normalize_driver_family(), then
    matches those tokens against component symbols, gene symbols, and
    property synonyms.
    """
    query = query.strip()
    _, tokens = normalize_driver_family(query)
    clause, params = _driver_family_clause(
        tokens,
        "cc.component_symbol",
        "sg.gene_symbol",
        "cp.prop_syn",
    )
    return _search_component_domain(
        state_dir,
        query,
        limit,
        cte_sql=f"""
            WITH matching_rows AS (
                SELECT DISTINCT sg.bdsc_symbol_id
                FROM stockgenes sg
                JOIN component_comments cc
                  ON cc.stknum = sg.stknum
                 AND cc.component_symbol = sg.component_symbol
                LEFT JOIN compprops cp
                  ON cp.bdsc_symbol_id = sg.bdsc_symbol_id
                WHERE {clause}
            )
        """,
        cte_params=params,
        field_names=["component_symbol", "property_syns", "gene_symbols"],
    )
1470
+
1471
+
1472
def search_relationship(state_dir: Path, query: str, limit: int = 20) -> list[dict[str, Any]]:
    """Search components by gene-relationship synonym (exact or prefix match)."""
    query = query.strip()
    return _search_component_domain(
        state_dir,
        query,
        limit,
        cte_sql="""
            WITH matching_rows AS (
                SELECT DISTINCT bdsc_symbol_id
                FROM compgenes
                WHERE LOWER(prop_syn) = LOWER(?)
                   OR LOWER(prop_syn) LIKE LOWER(?)
            )
        """,
        cte_params=[query, f"{query}%"],
        field_names=["gene_relationships", "gene_symbols", "component_symbol", "property_syns"],
    )
1489
+
1490
+
1491
+ def resolve_rrid_to_stknum(query: str) -> int | None:
1492
+ match = re.fullmatch(r"(?:RRID:)?BDSC_(\d+)", query.strip(), flags=re.IGNORECASE)
1493
+ if match:
1494
+ return int(match.group(1))
1495
+ if query.strip().isdigit():
1496
+ return int(query.strip())
1497
+ return None
1498
+
1499
+
1500
def get_stock_by_rrid(state_dir: Path, query: str) -> dict[str, Any] | None:
    """Look up a full stock record from an RRID (or stock-number) string."""
    stknum = resolve_rrid_to_stknum(query)
    return None if stknum is None else get_stock(state_dir, stknum)
1505
+
1506
+
1507
def search_fbid(state_dir: Path, query: str, limit: int = 20) -> list[dict[str, Any]]:
    """Search components by FlyBase ID (fbid column)."""
    return _search_component_table(
        state_dir,
        column="fbid",
        query=query,
        limit=limit,
    )
1509
+
1510
+
1511
def search_component(state_dir: Path, query: str, limit: int = 20) -> list[dict[str, Any]]:
    """Search components by their component symbol."""
    return _search_component_table(state_dir, column="component_symbol", query=query, limit=limit)
1518
+
1519
+
1520
def detect_query_kind(query: str) -> str:
    """Classify *query* into a lookup kind.

    Order of precedence: empty -> "search"; all digits -> "stock";
    RRID forms -> "rrid"; FBgn IDs -> "gene"; other FlyBase IDs -> "fbid";
    genotype-ish punctuation -> "component"; free text -> "search";
    anything else -> "gene".
    """
    value = query.strip()
    if not value:
        return "search"
    if value.isdigit():
        return "stock"
    # Only non-digit strings reach this point, so an RRID match is a true
    # RRID. (The original additionally re-checked `not value.isdigit()`,
    # which was always true here — dead condition removed.)
    if resolve_rrid_to_stknum(value) is not None:
        return "rrid"
    if re.fullmatch(r"FBgn\d+", value, flags=re.IGNORECASE):
        return "gene"
    if re.fullmatch(r"FB[a-z]{2}\d+", value, flags=re.IGNORECASE):
        return "fbid"
    # Genotype/transgene punctuation strongly suggests a component symbol.
    if any(token in value for token in ("P{", "}", "[", "]", "attP", "CyO")):
        return "component"
    if _is_free_text_query(value):
        return "search"
    return "gene"
1537
+
1538
+
1539
+ def _prefix_match_clause(expr: str, query: str) -> tuple[str, list[Any]]:
1540
+ return f"(LOWER({expr}) = LOWER(?) OR LOWER({expr}) LIKE LOWER(?))", [query, f"{query}%"]
1541
+
1542
+
1543
+ def _contains_match_clause(expr: str, query: str) -> tuple[str, list[Any]]:
1544
+ return f"LOWER({expr}) LIKE LOWER(?)", [f"%{query}%"]
1545
+
1546
+
1547
def _search_text_match_clause(expr: str, query: str) -> tuple[str, list[Any]]:
    """AND-of-substring predicate per query token; single token = contains."""
    tokens = _query_tokens(query)
    if len(tokens) > 1:
        predicate = f"LOWER({expr}) LIKE LOWER(?)"
        combined = " AND ".join([predicate] * len(tokens))
        return combined, [f"%{token}%" for token in tokens]
    return _contains_match_clause(expr, query)
1555
+
1556
+
1557
+ def _exact_match_clause(expr: str, query: str) -> tuple[str, list[Any]]:
1558
+ return f"LOWER({expr}) = LOWER(?)", [query]
1559
+
1560
+
1561
def _property_match_clause(query: str, *, exact: bool) -> tuple[str, list[Any]]:
    """Combined predicate over compprops synonym and description columns.

    In exact mode both columns must match exactly; otherwise the synonym
    is prefix-matched and the description is substring-matched.
    """
    if exact:
        syn_sql, syn_params = _exact_match_clause("cp.prop_syn", query)
        desc_sql, desc_params = _exact_match_clause("cp.property_descrip", query)
    else:
        syn_sql, syn_params = _prefix_match_clause("cp.prop_syn", query)
        desc_sql, desc_params = _contains_match_clause("cp.property_descrip", query)
    return f"({syn_sql} OR {desc_sql})", syn_params + desc_params
1576
+
1577
+
1578
def _gene_match_clause(fbgn_expr: str, gene_expr: str, query: str) -> tuple[str, list[Any]]:
    """FBgn IDs match the fbgn column exactly; other queries prefix-match the symbol."""
    if query.upper().startswith("FBGN"):
        return f"UPPER({fbgn_expr}) = UPPER(?)", [query]
    return _prefix_match_clause(gene_expr, query)
1583
+
1584
+
1585
def normalize_driver_family(query: str) -> tuple[str, tuple[str, ...]]:
    """Map a driver-family query to its canonical family name and alias tokens.

    Unknown families fall through to the lowered query as its own family
    with a single-token alias tuple.
    """
    lowered = query.strip().lower()
    for family_name, alias_tokens in DRIVER_FAMILY_ALIASES.items():
        if lowered == family_name or lowered in alias_tokens:
            return family_name, alias_tokens
    return lowered, (lowered,)
1591
+
1592
+
1593
+ def _driver_family_clause(tokens: tuple[str, ...], *exprs: str) -> tuple[str, list[Any]]:
1594
+ predicates: list[str] = []
1595
+ params: list[Any] = []
1596
+ for expr in exprs:
1597
+ for token in tokens:
1598
+ if token == "lexa":
1599
+ lowered_expr = f"LOWER({expr})"
1600
+ predicates.append(
1601
+ f"(({lowered_expr} GLOB ? OR {lowered_expr} GLOB ?) "
1602
+ f"AND NOT ({lowered_expr} GLOB ? OR {lowered_expr} GLOB ?))"
1603
+ )
1604
+ params.extend(
1605
+ ("*[^a-z0-9]lexa*", "lexa*", "*[^a-z0-9]lexaop*", "lexaop*")
1606
+ )
1607
+ continue
1608
+ predicates.append(f"LOWER({expr}) LIKE LOWER(?)")
1609
+ params.append(f"%{token}%")
1610
+ return "(" + " OR ".join(predicates) + ")", params
1611
+
1612
+
1613
def _driver_family_criterion(dataset: str, query: str) -> tuple[str, list[Any], str]:
    """Build a driver-family WHERE predicate for the given export *dataset*.

    Returns (clause, params, "driver-family"). Each dataset branch anchors
    the alias-token match differently:
      - "stocks":     EXISTS over components joined to genes/properties,
                      correlated on s.stknum.
      - "components": component symbol directly, or EXISTS over that
                      component's genes/properties.
      - "genes":      component/gene symbol directly, or EXISTS over the
                      gene's properties.
      - default       (properties dataset): component symbol / property
                      synonym directly, or EXISTS over the component's
                      gene symbols.
    """
    _, tokens = normalize_driver_family(query)

    if dataset == "stocks":
        clause, params = _driver_family_clause(
            tokens,
            "cc.component_symbol",
            "cc.genotype",
            "sg.gene_symbol",
            "cp.prop_syn",
        )
        return (
            "EXISTS ("
            "SELECT 1 FROM component_comments cc "
            "LEFT JOIN stockgenes sg ON sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol "
            "LEFT JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id "
            f"WHERE cc.stknum = s.stknum AND {clause}"
            ")",
            params,
            "driver-family",
        )

    if dataset == "components":
        component_clause, component_params = _driver_family_clause(
            tokens,
            "cc.component_symbol",
        )
        gene_clause, gene_params = _driver_family_clause(tokens, "sg.gene_symbol")
        property_clause, property_params = _driver_family_clause(tokens, "cp.prop_syn")
        return (
            "("
            f"{component_clause} OR EXISTS ("
            "SELECT 1 FROM stockgenes sg "
            "LEFT JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id "
            "WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol "
            f"AND ({gene_clause} OR {property_clause})"
            ")"
            ")",
            component_params + gene_params + property_params,
            "driver-family",
        )

    if dataset == "genes":
        component_clause, component_params = _driver_family_clause(
            tokens,
            "sg.component_symbol",
            "sg.gene_symbol",
        )
        property_clause, property_params = _driver_family_clause(tokens, "cp.prop_syn")
        return (
            "("
            f"{component_clause} OR EXISTS ("
            "SELECT 1 FROM compprops cp "
            "WHERE cp.bdsc_symbol_id = sg.bdsc_symbol_id "
            f"AND {property_clause}"
            ")"
            ")",
            component_params + property_params,
            "driver-family",
        )

    # Fallback branch (properties dataset): anchored on component_comments
    # with a correlated stockgenes lookup for gene symbols.
    component_clause, component_params = _driver_family_clause(
        tokens,
        "cc.component_symbol",
        "cp.prop_syn",
    )
    gene_clause, gene_params = _driver_family_clause(tokens, "sg2.gene_symbol")
    return (
        "("
        f"{component_clause} OR EXISTS ("
        "SELECT 1 FROM stockgenes sg2 "
        "WHERE sg2.stknum = cc.stknum AND sg2.component_symbol = cc.component_symbol "
        f"AND {gene_clause}"
        ")"
        ")",
        component_params + gene_params,
        "driver-family",
    )
1691
+
1692
+
1693
def _single_criterion(
    dataset: str,
    query: str,
    kind: str,
) -> tuple[str, list[Any], str | None]:
    """Translate one (query, kind) pair into a SQL predicate for *dataset*.

    *dataset* is one of "stocks", "components", "genes", "properties",
    each with its own table aliases (s, cc, sg, cc respectively). When
    *kind* is "auto" the kind is inferred via detect_query_kind(). Returns
    (clause, params, resolved_kind).

    Raises:
        ValueError: for an unrecognized *kind*.
    """
    resolved_kind = detect_query_kind(query) if kind == "auto" else kind
    params: list[Any] = []

    if resolved_kind == "stock":
        # Direct stock-number equality on the dataset's anchor alias.
        clause = {
            "stocks": "s.stknum = ?",
            "components": "cc.stknum = ?",
            "genes": "sg.stknum = ?",
            "properties": "cc.stknum = ?",
        }[dataset]
        params.append(int(query.strip()))
        return clause, params, resolved_kind

    if resolved_kind == "rrid":
        stknum = resolve_rrid_to_stknum(query)
        if stknum is None:
            # Unparseable RRID: "0" is an always-false predicate.
            return "0", [], resolved_kind
        clause = {
            "stocks": "s.stknum = ?",
            "components": "cc.stknum = ?",
            "genes": "sg.stknum = ?",
            "properties": "cc.stknum = ?",
        }[dataset]
        params.append(stknum)
        return clause, params, resolved_kind

    if resolved_kind == "gene":
        # Anchor the gene match on stockgenes, correlated per dataset.
        if dataset == "stocks":
            clause, params = _gene_match_clause("sg.fbgn", "sg.gene_symbol", query)
            return (
                f"EXISTS (SELECT 1 FROM stockgenes sg WHERE sg.stknum = s.stknum AND {clause})",
                params,
                resolved_kind,
            )
        if dataset == "components":
            clause, params = _gene_match_clause("sg.fbgn", "sg.gene_symbol", query)
            return (
                f"EXISTS (SELECT 1 FROM stockgenes sg WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol AND {clause})",
                params,
                resolved_kind,
            )
        if dataset == "genes":
            clause, params = _gene_match_clause("sg.fbgn", "sg.gene_symbol", query)
            return clause, params, resolved_kind
        # properties dataset: sg2 avoids colliding with outer aliases.
        clause, params = _gene_match_clause("sg2.fbgn", "sg2.gene_symbol", query)
        return (
            f"EXISTS (SELECT 1 FROM stockgenes sg2 WHERE sg2.stknum = cc.stknum AND sg2.component_symbol = cc.component_symbol AND {clause})",
            params,
            resolved_kind,
        )

    if resolved_kind == "component":
        clause, params = _prefix_match_clause(
            {"stocks": "sg.component_symbol", "components": "cc.component_symbol", "genes": "sg.component_symbol", "properties": "cc.component_symbol"}[dataset],
            query,
        )
        if dataset == "stocks":
            return (
                f"EXISTS (SELECT 1 FROM stockgenes sg WHERE sg.stknum = s.stknum AND {clause})",
                params,
                resolved_kind,
            )
        return clause, params, resolved_kind

    if resolved_kind == "fbid":
        # NOTE(review): both arms of this conditional yield "cc.fbid", so
        # the expression always evaluates to "cc.fbid".
        clause, params = _prefix_match_clause(
            {"components": "cc.fbid", "properties": "cc.fbid"}[dataset]
            if dataset in {"components", "properties"}
            else "cc.fbid",
            query,
        )
        if dataset == "stocks":
            return (
                f"EXISTS (SELECT 1 FROM component_comments cc WHERE cc.stknum = s.stknum AND {clause})",
                params,
                resolved_kind,
            )
        if dataset == "genes":
            return (
                f"EXISTS (SELECT 1 FROM component_comments cc WHERE cc.stknum = sg.stknum AND cc.component_symbol = sg.component_symbol AND {clause})",
                params,
                resolved_kind,
            )
        return clause, params, resolved_kind

    if resolved_kind == "property":
        # Fuzzy property match (prefix synonym / substring description).
        clause, params = _property_match_clause(query, exact=False)
        if dataset == "stocks":
            return (
                "EXISTS ("
                "SELECT 1 FROM stockgenes sg "
                "JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id "
                f"WHERE sg.stknum = s.stknum AND {clause}"
                ")",
                params,
                resolved_kind,
            )
        if dataset == "components":
            return (
                "EXISTS ("
                "SELECT 1 FROM stockgenes sg "
                "JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id "
                "WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol "
                f"AND {clause}"
                ")",
                params,
                resolved_kind,
            )
        if dataset == "genes":
            return (
                f"EXISTS (SELECT 1 FROM compprops cp WHERE cp.bdsc_symbol_id = sg.bdsc_symbol_id AND {clause})",
                params,
                resolved_kind,
            )
        return clause, params, resolved_kind

    if resolved_kind == "property-exact":
        # Same correlation structure as "property", but exact matching.
        clause, params = _property_match_clause(query, exact=True)
        if dataset == "stocks":
            return (
                "EXISTS ("
                "SELECT 1 FROM stockgenes sg "
                "JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id "
                f"WHERE sg.stknum = s.stknum AND {clause}"
                ")",
                params,
                resolved_kind,
            )
        if dataset == "components":
            return (
                "EXISTS ("
                "SELECT 1 FROM stockgenes sg "
                "JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id "
                "WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol "
                f"AND {clause}"
                ")",
                params,
                resolved_kind,
            )
        if dataset == "genes":
            return (
                f"EXISTS (SELECT 1 FROM compprops cp WHERE cp.bdsc_symbol_id = sg.bdsc_symbol_id AND {clause})",
                params,
                resolved_kind,
            )
        return clause, params, resolved_kind

    if resolved_kind == "driver-family":
        return _driver_family_criterion(dataset, query)

    if resolved_kind == "relationship":
        # Prefix match on compgenes.prop_syn, correlated per dataset.
        if dataset == "stocks":
            clause, params = _prefix_match_clause("cg.prop_syn", query)
            return (
                "EXISTS ("
                "SELECT 1 FROM stockgenes sg "
                "JOIN compgenes cg ON cg.bdsc_symbol_id = sg.bdsc_symbol_id "
                f"WHERE sg.stknum = s.stknum AND {clause}"
                ")",
                params,
                resolved_kind,
            )
        if dataset == "components":
            clause, params = _prefix_match_clause("cg.prop_syn", query)
            return (
                "EXISTS ("
                "SELECT 1 FROM stockgenes sg "
                "JOIN compgenes cg ON cg.bdsc_symbol_id = sg.bdsc_symbol_id "
                "WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol "
                f"AND {clause}"
                ")",
                params,
                resolved_kind,
            )
        if dataset == "genes":
            clause, params = _prefix_match_clause("cg.prop_syn", query)
            return (
                f"EXISTS (SELECT 1 FROM compgenes cg WHERE cg.bdsc_symbol_id = sg.bdsc_symbol_id AND cg.bdsc_gene_id = sg.bdsc_gene_id AND {clause})",
                params,
                resolved_kind,
            )
        clause, params = _prefix_match_clause("cg.prop_syn", query)
        return (
            "EXISTS ("
            "SELECT 1 FROM stockgenes sg2 "
            "JOIN compgenes cg ON cg.bdsc_symbol_id = sg2.bdsc_symbol_id "
            "WHERE sg2.stknum = cc.stknum AND sg2.component_symbol = cc.component_symbol "
            f"AND {clause}"
            ")",
            params,
            resolved_kind,
        )

    if resolved_kind == "search":
        # Free-text match against the per-stock search_documents text.
        if dataset == "stocks":
            clause, params = _search_text_match_clause("sd.search_text", query)
            return clause, params, resolved_kind
        if dataset == "components":
            clause, params = _search_text_match_clause("sd.search_text", query)
            return (
                "EXISTS (SELECT 1 FROM search_documents sd "
                f"WHERE sd.stknum = cc.stknum AND {clause})",
                params,
                resolved_kind,
            )
        if dataset == "genes":
            clause, params = _search_text_match_clause("sd.search_text", query)
            return (
                "EXISTS (SELECT 1 FROM search_documents sd "
                f"WHERE sd.stknum = sg.stknum AND {clause})",
                params,
                resolved_kind,
            )
        clause, params = _search_text_match_clause("sd.search_text", query)
        return (
            "EXISTS (SELECT 1 FROM search_documents sd "
            f"WHERE sd.stknum = cc.stknum AND {clause})",
            params,
            resolved_kind,
        )

    raise ValueError(f"unsupported export filter kind: {kind}")
1920
+
1921
+
1922
def _normalize_criteria(
    criteria: list[QueryCriterion] | None,
    query: str | None,
    kind: str,
) -> list[QueryCriterion]:
    """Strip criterion queries, drop empties, and append *query* (as *kind*)."""
    result: list[QueryCriterion] = []
    for item in criteria or []:
        trimmed = item.query.strip()
        if trimmed:
            result.append(QueryCriterion(kind=item.kind, query=trimmed))
    if query is not None:
        trimmed_query = query.strip()
        if trimmed_query:
            result.append(QueryCriterion(kind=kind, query=trimmed_query))
    return result
1935
+
1936
+
1937
def _compose_where_clause(
    dataset: str,
    criteria: list[QueryCriterion] | None,
    *,
    query: str | None = None,
    kind: str = "auto",
) -> tuple[str, list[Any]]:
    """Build an AND-composed WHERE clause (plus bound params) for *dataset*.

    Returns ("", []) when no non-empty criteria remain after normalization.
    """
    active = _normalize_criteria(criteria, query, kind)
    if not active:
        return "", []

    clauses: list[str] = []
    bound: list[Any] = []
    for criterion in active:
        sql, sql_params, _ = _single_criterion(dataset, criterion.query, criterion.kind)
        clauses.append(f"({sql})")
        bound.extend(sql_params)
    return "WHERE " + " AND ".join(clauses), bound
1959
+
1960
+
1961
def lookup_query(
    state_dir: Path,
    query: str,
    *,
    kind: str = "auto",
    limit: int = 20,
) -> dict[str, Any]:
    """Dispatch *query* to the appropriate search function.

    With kind="auto" the kind is inferred via detect_query_kind(); an
    auto-detected "gene" query that finds nothing falls back to free-text
    search (and the reported kind becomes "search"). Returns an envelope
    dict with the query, requested/resolved kinds, result_count, and
    results.

    Raises:
        ValueError: for an unrecognized *kind*.
    """
    requested_kind = kind
    resolved_kind = detect_query_kind(query) if kind == "auto" else kind

    if resolved_kind == "stock":
        result = get_stock(state_dir, int(query.strip()))
        results = [result] if result else []
    elif resolved_kind == "rrid":
        result = get_stock_by_rrid(state_dir, query)
        results = [result] if result else []
    elif resolved_kind == "gene":
        results = search_gene(state_dir, query, limit=limit)
        # Auto-detected gene queries degrade to free-text search on a miss.
        if kind == "auto" and not results:
            resolved_kind = "search"
            results = search_local(state_dir, query, limit=limit)
    elif resolved_kind == "fbid":
        results = search_fbid(state_dir, query, limit=limit)
    elif resolved_kind == "component":
        results = search_component(state_dir, query, limit=limit)
    elif resolved_kind == "property":
        results = search_property(state_dir, query, limit=limit)
    elif resolved_kind == "property-exact":
        results = search_property_exact(state_dir, query, limit=limit)
    elif resolved_kind == "driver-family":
        results = search_driver_family(state_dir, query, limit=limit)
    elif resolved_kind == "relationship":
        results = search_relationship(state_dir, query, limit=limit)
    elif resolved_kind == "search":
        results = search_local(state_dir, query, limit=limit)
    else:
        raise ValueError(f"unsupported lookup kind: {kind}")

    return {
        "query": query,
        "requested_kind": requested_kind,
        "kind": resolved_kind,
        "result_count": len(results),
        "results": results,
    }
2006
+
2007
+
2008
def get_stock(state_dir: Path, stknum: int) -> dict[str, Any] | None:
    """Assemble the full record for one stock, or None if it doesn't exist.

    The returned dict contains the stocks row (plus the search-document
    aggregates), a synthesized "rrid" string, and nested "components" and
    "genes" lists.
    """
    conn = _connect(state_dir)
    try:
        stock_row = conn.execute(
            """
            SELECT
                s.stknum,
                s.genotype,
                s.chromosomes,
                s.aka,
                s.date_added,
                s.donor_info,
                s.stock_comments,
                sd.component_symbols,
                sd.gene_symbols,
                sd.fbgns
            FROM stocks s
            LEFT JOIN search_documents sd ON sd.stknum = s.stknum
            WHERE s.stknum = ?
            """,
            (stknum,),
        ).fetchone()
        if stock_row is None:
            return None

        # Components with their aggregated gene/property metadata.
        component_rows = conn.execute(
            f"""
            SELECT
                component_symbol,
                fbid,
                mapstatement,
                comment1,
                comment2,
                comment3,
                {_component_metadata_subqueries(
                    "component_comments.stknum",
                    "component_comments.component_symbol",
                    "(SELECT MIN(sg.bdsc_symbol_id) FROM stockgenes sg WHERE sg.stknum = component_comments.stknum AND sg.component_symbol = component_comments.component_symbol)",
                )}
            FROM component_comments
            WHERE stknum = ?
            ORDER BY component_symbol
            """,
            (stknum,),
        ).fetchall()

        gene_rows = conn.execute(
            """
            SELECT DISTINCT
                component_symbol,
                gene_symbol,
                fbgn
            FROM stockgenes
            WHERE stknum = ?
            ORDER BY component_symbol, gene_symbol, fbgn
            """,
            (stknum,),
        ).fetchall()

        stock = dict(stock_row)
        # RRID is derived, not stored.
        stock["rrid"] = f"RRID:BDSC_{stknum}"
        stock["components"] = [dict(row) for row in component_rows]
        stock["genes"] = [dict(row) for row in gene_rows]
        return stock
    finally:
        conn.close()
2074
+
2075
+
2076
def live_search(query: str, limit: int = 10, *, timeout: float = 30.0) -> list[dict[str, Any]]:
    """Query the BDSC website's search endpoints directly (no local DB).

    Posts the simple "contains" search first; when it returns no rows,
    falls back to the advanced genotype search. Returns at most *limit*
    raw result dicts from the site's JSON "Data" payload.

    Args:
        query: Free-text search string.
        limit: Maximum number of result rows to return.
        timeout: Per-request socket timeout in seconds. (Fix: the original
            urlopen calls had no timeout and could block indefinitely.)
    """
    # Both endpoints take the same AJAX-style headers.
    headers = {
        "User-Agent": USER_AGENT,
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "X-Requested-With": "XMLHttpRequest",
    }

    simple_payload = parse.urlencode({"presearch": query, "type": "contains"}).encode("utf-8")
    req = request.Request(
        "https://bdsc.indiana.edu/Home/GetSearchResults",
        data=simple_payload,
        headers=headers,
        method="POST",
    )
    with request.urlopen(req, timeout=timeout) as response:
        data = json.loads(response.read().decode("utf-8"))
    rows = data.get("Data") or []
    if rows:
        return rows[:limit]

    # Fallback: advanced search with only the first genotype field populated.
    advanced_payload = parse.urlencode(
        {
            "selectedGenotypeMatches": "any genotype",
            "selectedGenotypeContains1": "contains",
            "genotype1": query,
            "selectedGenotypeContains2": "contains",
            "genotype2": "",
            "selectedGenotypeContains3": "contains",
            "genotype3": "",
            "selectedCommentContains": "contains",
            "stockComment": "",
            "selectedDonorContains": "contains",
            "donor": "",
            "selectedAffectedChromosomes": "any",
        }
    ).encode("utf-8")
    advanced_req = request.Request(
        "https://bdsc.indiana.edu/Home/GetAdvancedSearchResults",
        data=advanced_payload,
        headers=headers,
        method="POST",
    )
    with request.urlopen(advanced_req, timeout=timeout) as response:
        advanced_data = json.loads(response.read().decode("utf-8"))
    return (advanced_data.get("Data") or [])[:limit]
2123
+
2124
+
2125
def get_status(state_dir: Path) -> dict[str, Any]:
    """Summarize local CLI state: manifest datasets, DB presence, index info.

    When the manifest has no "index" record but the database file exists,
    the database itself is probed for table counts and search-index tables.
    """
    state_dir = resolve_state_dir(state_dir)
    manifest = load_manifest(state_dir)
    datasets = manifest.get("datasets", {})
    db_path = db_file(state_dir)

    index_info = manifest.get("index")
    if index_info is None and db_path.exists():
        conn = sqlite3.connect(db_path)
        try:

            def has_table(table: str) -> bool:
                # Probe sqlite_master for an (optional) search-index table.
                row = conn.execute(
                    f"SELECT 1 FROM sqlite_master WHERE type='table' AND name='{table}'"
                ).fetchone()
                return row is not None

            counts: dict[str, Any] = {
                table: conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
                for table in (
                    "stocks",
                    "component_comments",
                    "stockgenes",
                    "compgenes",
                    "compprops",
                )
            }
            counts["fts_enabled"] = int(has_table("stock_fts"))
            counts["trigram_enabled"] = int(has_table("stock_trigram"))
            index_info = {
                "db_path": str(db_path),
                "built_at": None,  # unknown: index predates manifest tracking
                "counts": counts,
            }
        finally:
            conn.close()

    return {
        "state_dir": str(state_dir),
        "db_path": str(db_path),
        "db_exists": db_path.exists(),
        "dataset_count": len(datasets),
        "datasets": datasets,
        "index": index_info,
        "updated_at": manifest.get("updated_at"),
    }
2170
+
2171
+
2172
+ def _dataset_sort_clause(dataset: str) -> str:
2173
+ if dataset == "stocks":
2174
+ return "ORDER BY s.stknum"
2175
+ if dataset == "components":
2176
+ return "ORDER BY cc.stknum, cc.component_symbol"
2177
+ if dataset == "genes":
2178
+ return "ORDER BY sg.stknum, sg.component_symbol, sg.gene_symbol, sg.fbgn"
2179
+ if dataset == "properties":
2180
+ return "ORDER BY cc.stknum, cc.component_symbol, cp.prop_syn, cp.property_id"
2181
+ raise ValueError(f"unsupported export dataset: {dataset}")
2182
+
2183
+
2184
def _dataset_select_sql(dataset: str) -> str:
    """Return the SELECT statement used to export *dataset* rows.

    The returned SQL carries no WHERE / ORDER BY / LIMIT; callers append
    those separately (see iter_dataset_rows). Table aliases here must stay
    in sync with the columns referenced by _dataset_sort_clause
    ("s", "cc", "sg", "cp").

    Raises:
        ValueError: If *dataset* is not a supported export dataset.
    """
    if dataset == "stocks":
        # One row per stock; search_documents holds denormalized rollups of
        # component/gene/FBgn symbols (may be absent, hence LEFT JOIN + COALESCE).
        return """
        SELECT
            s.stknum,
            'RRID:BDSC_' || s.stknum AS rrid,
            s.genotype,
            s.chromosomes,
            s.aka,
            s.date_added,
            s.donor_info,
            s.stock_comments,
            COALESCE(sd.component_symbols, '') AS component_symbols,
            COALESCE(sd.gene_symbols, '') AS gene_symbols,
            COALESCE(sd.fbgns, '') AS fbgns
        FROM stocks s
        LEFT JOIN search_documents sd ON sd.stknum = s.stknum
        """
    if dataset == "components":
        # _component_metadata_subqueries splices in extra correlated-subquery
        # columns keyed on (stknum, component_symbol) — presumably
        # property_syns / gene_relationships; confirm against its definition.
        return f"""
        SELECT
            cc.stknum,
            cc.genotype,
            cc.component_symbol,
            cc.fbid,
            cc.mapstatement,
            cc.comment1,
            cc.comment2,
            cc.comment3,
            {_component_metadata_subqueries(
                "cc.stknum",
                "cc.component_symbol",
                "(SELECT MIN(sg.bdsc_symbol_id) FROM stockgenes sg WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol)",
            )}
        FROM component_comments cc
        """
    if dataset == "genes":
        # DISTINCT because stockgenes can repeat (stknum, component, gene)
        # tuples; the nested subquery aggregates compgenes relationship
        # synonyms into one ' | '-joined column.
        return """
        SELECT DISTINCT
            sg.stknum,
            sg.genotype,
            sg.component_symbol,
            cc.fbid,
            sg.gene_symbol,
            sg.fbgn,
            sg.bdsc_symbol_id,
            sg.bdsc_gene_id,
            COALESCE((
                SELECT group_concat(prop_syn, ' | ')
                FROM (
                    SELECT DISTINCT cg.prop_syn AS prop_syn
                    FROM compgenes cg
                    WHERE cg.bdsc_symbol_id = sg.bdsc_symbol_id
                        AND cg.bdsc_gene_id = sg.bdsc_gene_id
                        AND cg.prop_syn != ''
                    ORDER BY cg.prop_syn
                )
            ), '') AS gene_relationships
        FROM stockgenes sg
        LEFT JOIN component_comments cc
            ON cc.stknum = sg.stknum
            AND cc.component_symbol = sg.component_symbol
        """
    if dataset == "properties":
        # Properties attach to components via stockgenes.bdsc_symbol_id;
        # DISTINCT collapses duplicates introduced by the double join.
        return """
        SELECT DISTINCT
            cc.stknum,
            cc.genotype,
            cc.component_symbol,
            cc.fbid,
            cp.property_id,
            cp.prop_syn,
            cp.property_descrip
        FROM component_comments cc
        JOIN stockgenes sg
            ON sg.stknum = cc.stknum
            AND sg.component_symbol = cc.component_symbol
        JOIN compprops cp
            ON cp.bdsc_symbol_id = sg.bdsc_symbol_id
        """
    raise ValueError(f"unsupported export dataset: {dataset}")
2265
+
2266
+
2267
def iter_dataset_rows(
    state_dir: Path,
    dataset: str,
    *,
    where_clause: str = "",
    params: tuple[Any, ...] = (),
    limit: int | None = None,
) -> Iterator[dict[str, Any]]:
    """Stream rows for an export dataset as plain dicts.

    Assembles SELECT + optional WHERE + ORDER BY (+ LIMIT) and fetches in
    batches of 1000 so large exports never materialize fully in memory.

    Raises:
        ValueError: If *dataset* is not a supported export dataset.
    """
    if dataset not in EXPORT_DATASETS:
        raise ValueError(f"unsupported export dataset: {dataset}")

    conn = _connect(state_dir)
    try:
        pieces = [_dataset_select_sql(dataset)]
        if where_clause:
            pieces.append(where_clause)
        pieces.append(_dataset_sort_clause(dataset))

        bind: tuple[Any, ...] = params
        if limit is not None:
            pieces.append("LIMIT ?")
            bind = (*params, limit)
        cursor = conn.execute("\n".join(pieces), bind)

        columns = [col[0] for col in cursor.description]
        try:
            # fetchmany returns an empty list once the result set is drained.
            while batch := cursor.fetchmany(1000):
                for record in batch:
                    yield dict(zip(columns, record, strict=False))
        finally:
            cursor.close()
    finally:
        conn.close()
2303
+
2304
+
2305
def iter_export_rows(
    state_dir: Path,
    dataset: str,
    *,
    limit: int | None = None,
    criteria: list[QueryCriterion] | None = None,
    query: str | None = None,
    kind: str = "auto",
) -> Iterator[dict[str, Any]]:
    """Stream export rows for *dataset*, filtered by criteria or one query."""
    # Translate the high-level filters into a SQL WHERE clause once, then
    # delegate all streaming/batching to iter_dataset_rows.
    clause, bind_params = _compose_where_clause(
        dataset,
        criteria,
        query=query,
        kind=kind,
    )
    yield from iter_dataset_rows(
        state_dir,
        dataset,
        where_clause=clause,
        params=tuple(bind_params),
        limit=limit,
    )
2327
+
2328
+
2329
+ def _report_olfactory_where(dataset: str) -> str:
2330
+ component_clause = (
2331
+ "component_symbol GLOB '*Or[0-9]*' "
2332
+ "OR component_symbol GLOB '*Orco*' "
2333
+ "OR component_symbol GLOB '*Ir[0-9]*' "
2334
+ "OR component_symbol GLOB '*Obp[0-9]*'"
2335
+ )
2336
+ if dataset == "stocks":
2337
+ return (
2338
+ "WHERE EXISTS (SELECT 1 FROM component_comments cc "
2339
+ f"WHERE cc.stknum = s.stknum AND ({component_clause}))"
2340
+ )
2341
+ symbol = REPORT_DATASET_SYMBOLS.get(dataset)
2342
+ if symbol is None:
2343
+ raise ValueError(f"unsupported report dataset: {dataset}")
2344
+ return f"WHERE {component_clause.replace('component_symbol', f'{symbol}.component_symbol')}"
2345
+
2346
+
2347
+ def _report_row_key(dataset: str, row: dict[str, Any]) -> tuple[Any, ...]:
2348
+ if dataset == "stocks":
2349
+ return (row["stknum"],)
2350
+ if dataset == "components":
2351
+ return _component_result_key(row)
2352
+ if dataset == "genes":
2353
+ return _gene_result_key(row)
2354
+ if dataset == "properties":
2355
+ return (row["stknum"], row["component_symbol"], row["property_id"], row["prop_syn"])
2356
+ raise ValueError(f"unsupported report dataset: {dataset}")
2357
+
2358
+
2359
def _merge_report_rows(dataset: str, rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Drop duplicate rows, keeping the first occurrence of each identity key."""
    seen: dict[tuple[Any, ...], dict[str, Any]] = {}
    for candidate in rows:
        identity = _report_row_key(dataset, candidate)
        if identity not in seen:
            seen[identity] = candidate
    # dict preserves insertion order, so first occurrences come out in order.
    return list(seen.values())
2365
+
2366
+
2367
def iter_report_rows(
    state_dir: Path,
    report_name: str,
    *,
    dataset: str | None = None,
    limit: int | None = None,
) -> Iterator[dict[str, Any]]:
    """Stream rows for a named canned report.

    Args:
        state_dir: CLI state directory containing the SQLite database.
        report_name: One of REPORT_NAMES.
        dataset: Output dataset shape; defaults to the report's
            default_dataset when omitted.
        limit: Maximum number of deduplicated rows to yield.

    Raises:
        ValueError: If *report_name* is not a known report.
    """
    if report_name not in REPORT_NAMES:
        raise ValueError(f"unsupported report: {report_name}")
    spec = REPORT_SPECS[report_name]
    resolved_dataset = dataset or spec.default_dataset

    # "olfactory" is defined by a hand-written WHERE clause rather than
    # criterion groups, so it streams straight from the dataset query.
    if report_name == "olfactory":
        yield from iter_dataset_rows(
            state_dir,
            resolved_dataset,
            where_clause=_report_olfactory_where(resolved_dataset),
            limit=limit,
        )
        return

    # Union the rows matched by each criterion group, then deduplicate on
    # the dataset's identity key (first occurrence wins).
    merged_rows: list[dict[str, Any]] = []
    for group in spec.groups:
        rows = list(
            iter_export_rows(
                state_dir,
                resolved_dataset,
                criteria=list(group),
                limit=limit,  # per-group cap; the final cap is applied after dedup
            )
        )
        merged_rows.extend(rows)
        # Stop querying further groups once the deduplicated union already
        # satisfies the requested limit.
        if limit is not None and len(_merge_report_rows(resolved_dataset, merged_rows)) >= limit:
            break

    deduped = _merge_report_rows(resolved_dataset, merged_rows)
    if limit is not None:
        deduped = deduped[:limit]
    for row in deduped:
        yield row
2407
+
2408
+
2409
def list_terms(
    state_dir: Path,
    scope: str,
    *,
    query: str | None = None,
    limit: int = 50,
) -> list[dict[str, Any]]:
    """List distinct vocabulary terms with usage counts.

    Args:
        state_dir: CLI state directory containing the SQLite database.
        scope: One of TERM_SCOPES. "properties" lists compprops.prop_syn
            (prefix-filtered by *query*); "property-descriptions" lists
            compprops.property_descrip (substring-filtered); any other
            allowed scope lists compgenes.prop_syn (prefix-filtered).
        query: Optional case-insensitive filter; blank/None means no filter.
        limit: Maximum number of terms returned.

    Returns:
        Rows as dicts, ordered by descending count, then term.

    Raises:
        ValueError: If *scope* is not in TERM_SCOPES.
    """
    if scope not in TERM_SCOPES:
        raise ValueError(f"unsupported term scope: {scope}")

    query = (query or "").strip()
    conn = _connect(state_dir)
    try:
        if scope == "properties":
            sql = """
            SELECT
                prop_syn AS term,
                MIN(property_descrip) AS description,
                COUNT(*) AS count
            FROM compprops
            WHERE prop_syn != ''
            """
            params: list[Any] = []
            if query:
                # Prefix match so e.g. "GAL" finds GAL4-style synonyms.
                sql += " AND LOWER(prop_syn) LIKE LOWER(?)"
                params.append(f"{query}%")
            sql += """
            GROUP BY prop_syn
            ORDER BY count DESC, term
            LIMIT ?
            """
        elif scope == "property-descriptions":
            sql = """
            SELECT
                property_descrip AS term,
                MIN(prop_syn) AS synonym,
                COUNT(*) AS count
            FROM compprops
            WHERE property_descrip != ''
            """
            params = []
            if query:
                # Descriptions are free text, so match anywhere in the string.
                sql += " AND LOWER(property_descrip) LIKE LOWER(?)"
                params.append(f"%{query}%")
            sql += """
            GROUP BY property_descrip
            ORDER BY count DESC, term
            LIMIT ?
            """
        else:
            sql = """
            SELECT
                prop_syn AS term,
                COUNT(*) AS count
            FROM compgenes
            WHERE prop_syn != ''
            """
            params = []
            if query:
                sql += " AND LOWER(prop_syn) LIKE LOWER(?)"
                params.append(f"{query}%")
            sql += """
            GROUP BY prop_syn
            ORDER BY count DESC, term
            LIMIT ?
            """

        # NOTE(review): dict(row) assumes _connect sets sqlite3.Row (or a
        # mapping-capable row factory) — confirm against _connect's definition.
        rows = conn.execute(sql, (*params, limit)).fetchall()
        return [dict(row) for row in rows]
    finally:
        conn.close()
2480
+
2481
+
2482
def format_sync_results(results: list[SyncResult]) -> str:
    """Render one 'name: status <bytes>B <path>' line per sync result."""
    return "\n".join(
        f"{item.name}: {item.status} {item.bytes_downloaded}B {item.path}"
        for item in results
    )
2489
+
2490
+
2491
def format_search_results(results: list[dict[str, Any]]) -> str:
    """Render stock rows as 'stknum | genotype [| genes=...]' lines."""
    if not results:
        return "no results"

    def _render(row: dict[str, Any]) -> str:
        # Local rows use lowercase keys; live-search payloads use PascalCase.
        parts = [
            str(row.get("stknum", row.get("Stknum"))),
            row.get("genotype", row.get("Genotype")),
        ]
        gene_text = row.get("gene_symbols") or row.get("fbgns") or row.get("SearchText") or ""
        if gene_text:
            parts.append(f"genes={gene_text}")
        return " | ".join(parts)

    return "\n".join(_render(row) for row in results)
2504
+
2505
+
2506
def format_gene_results(results: list[dict[str, Any]]) -> str:
    """Render 'stknum | gene | FBgn | component | genotype' lines."""
    if not results:
        return "no results"
    columns = ("gene_symbol", "fbgn", "component_symbol", "genotype")
    rendered = [
        " | ".join([str(row["stknum"]), *(row[col] for col in columns)])
        for row in results
    ]
    return "\n".join(rendered)
2523
+
2524
+
2525
def format_component_results(results: list[dict[str, Any]]) -> str:
    """Render component rows with optional genes/props/rels columns."""
    if not results:
        return "no results"
    rendered = []
    for row in results:
        fields = [str(row["stknum"]), row["component_symbol"], row["fbid"]]
        # Optional annotations appear only when non-empty, in fixed order.
        optional = (
            ("genes", row.get("gene_symbols") or row.get("fbgns") or ""),
            ("props", row.get("property_syns") or ""),
            ("rels", row.get("gene_relationships") or ""),
        )
        fields.extend(f"{label}={value}" for label, value in optional if value)
        fields.append(row["genotype"])
        rendered.append(" | ".join(fields))
    return "\n".join(rendered)
2546
+
2547
+
2548
def format_property_results(results: list[dict[str, Any]]) -> str:
    """Render property rows, inserting the description when present."""
    if not results:
        return "no results"
    rendered = []
    for row in results:
        fields = [
            str(row["stknum"]),
            row["component_symbol"],
            row["fbid"],
            row["prop_syn"],
        ]
        description = row.get("property_descrip")
        if description:
            fields.append(description)
        fields.append(row["genotype"])
        rendered.append(" | ".join(fields))
    return "\n".join(rendered)
2563
+
2564
+
2565
def format_dataset_results(dataset: str, results: list[dict[str, Any]]) -> str:
    """Dispatch to the dataset-specific plain-text formatter.

    Raises:
        ValueError: If *dataset* has no registered formatter.
    """
    formatters = {
        "stocks": format_search_results,
        "components": format_component_results,
        "genes": format_gene_results,
        "properties": format_property_results,
    }
    formatter = formatters.get(dataset)
    if formatter is None:
        raise ValueError(f"unsupported dataset formatter: {dataset}")
    return formatter(results)
2575
+
2576
+
2577
def format_term_results(results: list[dict[str, Any]]) -> str:
    """Render term/count rows, appending description and synonym if present."""
    if not results:
        return "no results"
    rendered = []
    for row in results:
        extras = [
            row.get("description") or "",
            f"synonym={row['synonym']}" if row.get("synonym") else "",
        ]
        fields = [row["term"], f"count={row['count']}", *[e for e in extras if e]]
        rendered.append(" | ".join(fields))
    return "\n".join(rendered)
2589
+
2590
+
2591
def format_lookup_result(result: dict[str, Any]) -> str:
    """Render a lookup result: query/kind header followed by the row body."""
    kind = result["kind"]
    rows = result["results"]
    component_like = {
        "component",
        "fbid",
        "property",
        "property-exact",
        "driver-family",
        "relationship",
    }
    if kind in ("stock", "rrid"):
        # Stock lookups resolve to at most one record.
        body = format_stock(rows[0] if rows else None)
    elif kind == "gene":
        body = format_gene_results(rows)
    elif kind in component_like:
        body = format_component_results(rows)
    else:
        body = format_search_results(rows)
    return "\n".join([f"query: {result['query']}", f"kind: {kind}", body])
2612
+
2613
+
2614
def format_stock(stock: dict[str, Any] | None) -> str:
    """Render a full stock record (summary, components, genes) as plain text."""
    if stock is None:
        return "not found"

    out = [
        f"stknum: {stock['stknum']}",
        f"rrid: {stock['rrid']}",
        f"genotype: {stock['genotype']}",
    ]
    # Optional summary fields fall back to '-' when empty.
    for field in (
        "chromosomes",
        "aka",
        "date_added",
        "donor_info",
        "stock_comments",
        "component_symbols",
        "gene_symbols",
        "fbgns",
    ):
        out.append(f"{field}: {stock[field] or '-'}")

    if stock["components"]:
        out.append("components:")
        for row in stock["components"][:20]:  # cap output for very large stocks
            parts = [row[key] for key in ("fbid", "mapstatement", "comment1", "comment2", "comment3")]
            detail = "; ".join(part for part in parts if part)
            suffix = f": {detail}" if detail else ""
            out.append(f"  - {row['component_symbol']}{suffix}")
            if row.get("property_syns"):
                out.append(f"      properties: {row['property_syns']}")
            if row.get("gene_relationships"):
                out.append(f"      gene_relationships: {row['gene_relationships']}")

    if stock["genes"]:
        out.append("genes:")
        for row in stock["genes"][:40]:
            cells = (row["component_symbol"], row["gene_symbol"], row["fbgn"])
            out.append(f"  - {' | '.join(cell for cell in cells if cell)}")

    return "\n".join(out)
+ return "\n".join(lines)