bdsc-cli 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bdsc_cli/__init__.py +3 -0
- bdsc_cli/cli.py +612 -0
- bdsc_cli/core.py +2661 -0
- bdsc_cli-0.2.1.dist-info/METADATA +362 -0
- bdsc_cli-0.2.1.dist-info/RECORD +9 -0
- bdsc_cli-0.2.1.dist-info/WHEEL +5 -0
- bdsc_cli-0.2.1.dist-info/entry_points.txt +2 -0
- bdsc_cli-0.2.1.dist-info/licenses/LICENSE +21 -0
- bdsc_cli-0.2.1.dist-info/top_level.txt +1 -0
bdsc_cli/core.py
ADDED
|
@@ -0,0 +1,2661 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import difflib
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import sqlite3
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, Iterator
|
|
14
|
+
from urllib import error, parse, request
|
|
15
|
+
|
|
16
|
+
# Identifies this client in HTTP requests to the BDSC server.
USER_AGENT = "bdsc-cli/0.1 (+https://bdsc.indiana.edu/)"
# Root of all local state (raw CSVs, manifest, SQLite index); override
# with the BDSC_CLI_HOME environment variable.
DEFAULT_STATE_DIR = Path(
    os.environ.get("BDSC_CLI_HOME", Path.home() / ".local" / "share" / "bdsc-cli")
)
# File name of the SQLite index inside the state directory.
DB_NAME = "bdsc.sqlite3"
# File name of the JSON manifest that records download metadata.
MANIFEST_NAME = "manifest.json"

# Publicly downloadable BDSC CSV exports, keyed by the local dataset name
# used for the cached file under <state_dir>/raw/<name>.csv.
DATASETS = {
    "bloomington": "https://bdsc.indiana.edu/pdf/bloomington.csv",
    "stockcomps_map_comments": "https://bdsc.indiana.edu/pdf/stockcomps_map_comments.csv",
    "stockgenes": "https://bdsc.indiana.edu/pdf/stockgenes.csv",
    "stockgenes_compgenes": "https://bdsc.indiana.edu/pdf/stockgenes_compgenes.csv",
    "stockgenes_compprops": "https://bdsc.indiana.edu/pdf/stockgenes_compprops.csv",
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class SyncResult:
    """Outcome of syncing one remote dataset in `sync_datasets`."""

    name: str  # dataset key from DATASETS
    path: Path  # local CSV path under <state_dir>/raw
    status: str  # "downloaded" or "not-modified" (HTTP 304)
    bytes_downloaded: int  # 0 when the server answered 304
    metadata: dict[str, Any]  # manifest entry (url, etag, sha256, timestamps)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class QueryCriterion:
    """A single lookup term paired with how it should be matched."""

    kind: str  # matching strategy; values seen here match LOOKUP_KINDS entries
    query: str  # the search term itself
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass(frozen=True)
class ReportSpec:
    """Declarative description of a canned report (see REPORT_SPECS)."""

    name: str  # report identifier, also the REPORT_SPECS key
    description: str  # human-readable summary shown to users
    default_dataset: str  # dataset the report targets by default
    # Groups of criteria; presumably each inner tuple is evaluated as one
    # query group by the report runner — confirm against the CLI module.
    groups: tuple[tuple[QueryCriterion, ...], ...] = ()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Accepted lookup strategies for the CLI's lookup command.
LOOKUP_KINDS = (
    "auto",
    "stock",
    "rrid",
    "gene",
    "fbid",
    "component",
    "property",
    "property-exact",
    "driver-family",
    "relationship",
    "search",
)
# Datasets that can be exported.
EXPORT_DATASETS = ("stocks", "components", "genes", "properties")
# Vocabulary scopes for term listing.
TERM_SCOPES = ("properties", "property-descriptions", "relationships")
# Names of the canned reports; must stay in sync with REPORT_SPECS keys.
REPORT_NAMES = ("olfactory", "drivers", "optogenetics")

# Canned report definitions. "olfactory" has no criteria groups here;
# its selection logic presumably lives elsewhere — confirm in the CLI.
REPORT_SPECS = {
    "olfactory": ReportSpec(
        name="olfactory",
        description="olfactory receptor and odorant-binding gene families",
        default_dataset="components",
    ),
    "drivers": ReportSpec(
        name="drivers",
        description="expression-driver and recombinase components",
        default_dataset="components",
        groups=(
            (QueryCriterion(kind="driver-family", query="GAL4"),),
            (QueryCriterion(kind="driver-family", query="lexA"),),
            (QueryCriterion(kind="driver-family", query="QF"),),
            (QueryCriterion(kind="driver-family", query="split"),),
            (QueryCriterion(kind="driver-family", query="FLP"),),
        ),
    ),
    "optogenetics": ReportSpec(
        name="optogenetics",
        description="common optogenetic effectors and optogenetic-tagged components",
        default_dataset="components",
        groups=(
            (QueryCriterion(kind="gene", query="Chronos"),),
            (QueryCriterion(kind="gene", query="CsChrimson"),),
            (QueryCriterion(kind="gene", query="Chrimson"),),
            (QueryCriterion(kind="gene", query="GtACR"),),
            (QueryCriterion(kind="gene", query="ReaChR"),),
            (QueryCriterion(kind="gene", query="ChR2"),),
            (QueryCriterion(kind="gene", query="eNpHR"),),
            (QueryCriterion(kind="property", query="optogen"),),
        ),
    ),
}

# SQL table alias used for each report dataset — NOTE(review): "stocks"/"s",
# "genes"/"sg" match the aliases used in this module's queries; verify the
# "properties" -> "cc" mapping against the report SQL builder.
REPORT_DATASET_SYMBOLS = {
    "stocks": "s",
    "components": "cc",
    "genes": "sg",
    "properties": "cc",
}

# Lower-cased synonyms recognized for each driver family name.
DRIVER_FAMILY_ALIASES = {
    "gal4": ("gal4", "gawb"),
    "lexa": ("lexa",),
    "qf": ("qf",),
    "flp": ("flp", "flpo", "flp recombinase"),
    "split": ("split zip hemi driver", "split intein hemi driver"),
}
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def resolve_state_dir(value: str | Path | None) -> Path:
|
|
124
|
+
return Path(value).expanduser() if value else DEFAULT_STATE_DIR
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def ensure_state_dir(state_dir: Path) -> None:
    """Create the state directory and its raw/ download area if missing."""
    for directory in (state_dir, state_dir / "raw"):
        directory.mkdir(parents=True, exist_ok=True)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def manifest_file(state_dir: Path) -> Path:
    """Path of the JSON sync manifest inside *state_dir*."""
    return state_dir.joinpath(MANIFEST_NAME)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def db_file(state_dir: Path) -> Path:
    """Path of the SQLite index inside *state_dir*."""
    return state_dir.joinpath(DB_NAME)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def load_manifest(state_dir: Path) -> dict[str, Any]:
    """Read the sync manifest, or return an empty skeleton when absent."""
    path = manifest_file(state_dir)
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return {"datasets": {}}
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def save_manifest(state_dir: Path, manifest: dict[str, Any]) -> None:
    """Write *manifest* as pretty-printed, key-sorted JSON with a trailing newline."""
    payload = json.dumps(manifest, indent=2, sort_keys=True) + "\n"
    manifest_file(state_dir).write_text(payload, encoding="utf-8")
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def raw_file(state_dir: Path, name: str) -> Path:
    """Path of the cached CSV download for dataset *name*."""
    return state_dir / "raw" / (name + ".csv")
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _now_iso() -> str:
|
|
159
|
+
return datetime.now(timezone.utc).replace(microsecond=0).isoformat()
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _hash_file(path: Path) -> str:
|
|
163
|
+
digest = hashlib.sha256()
|
|
164
|
+
with path.open("rb") as handle:
|
|
165
|
+
for chunk in iter(lambda: handle.read(1024 * 1024), b""):
|
|
166
|
+
digest.update(chunk)
|
|
167
|
+
return digest.hexdigest()
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def sync_datasets(state_dir: Path, force: bool = False) -> list[SyncResult]:
    """Download every BDSC CSV in DATASETS into <state_dir>/raw.

    Unless *force* is true, conditional headers (If-None-Match /
    If-Modified-Since) are sent from the manifest's cached validators so an
    unchanged dataset costs only a 304 round trip. Each download is streamed
    to a temp file and atomically renamed into place, so a failed transfer
    never clobbers a good cached copy. The manifest is updated on disk.

    Returns one SyncResult per dataset.
    Raises RuntimeError on any HTTP failure other than 304 Not Modified.
    """
    ensure_state_dir(state_dir)
    manifest = load_manifest(state_dir)
    results: list[SyncResult] = []

    for name, url in DATASETS.items():
        path = raw_file(state_dir, name)
        entry = manifest.setdefault("datasets", {}).get(name, {})
        headers = {"User-Agent": USER_AGENT}
        if not force:
            # Reuse validators recorded from the last successful fetch.
            if entry.get("etag"):
                headers["If-None-Match"] = entry["etag"]
            if entry.get("last_modified"):
                headers["If-Modified-Since"] = entry["last_modified"]

        req = request.Request(url, headers=headers)
        try:
            with request.urlopen(req) as response:
                # Stream to a sibling temp file while hashing, then rename.
                temp_path = path.with_suffix(".csv.tmp")
                size = 0
                digest = hashlib.sha256()
                with temp_path.open("wb") as handle:
                    for chunk in iter(lambda: response.read(1024 * 1024), b""):
                        size += len(chunk)
                        digest.update(chunk)
                        handle.write(chunk)
                temp_path.replace(path)
                metadata = {
                    "url": url,
                    "etag": response.headers.get("ETag"),
                    "last_modified": response.headers.get("Last-Modified"),
                    "content_length": response.headers.get("Content-Length"),
                    "sha256": digest.hexdigest(),
                    "fetched_at": _now_iso(),
                }
                manifest["datasets"][name] = metadata
                results.append(
                    SyncResult(
                        name=name,
                        path=path,
                        status="downloaded",
                        bytes_downloaded=size,
                        metadata=metadata,
                    )
                )
        except error.HTTPError as exc:
            if exc.code == 304 and path.exists():
                # Not modified: keep the cached file, refresh bookkeeping.
                # Hash lazily in case an older manifest lacked "sha256".
                metadata = {
                    **entry,
                    "checked_at": _now_iso(),
                    "sha256": entry.get("sha256") or _hash_file(path),
                }
                manifest["datasets"][name] = metadata
                results.append(
                    SyncResult(
                        name=name,
                        path=path,
                        status="not-modified",
                        bytes_downloaded=0,
                        metadata=metadata,
                    )
                )
                continue
            raise RuntimeError(f"failed to download {url}: {exc}") from exc

    manifest["updated_at"] = _now_iso()
    save_manifest(state_dir, manifest)
    return results
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _iter_csv_rows(path: Path) -> list[dict[str, str]]:
|
|
241
|
+
rows: list[dict[str, str]] = []
|
|
242
|
+
for encoding_errors in ("strict", "replace"):
|
|
243
|
+
try:
|
|
244
|
+
with path.open(
|
|
245
|
+
"r",
|
|
246
|
+
encoding="utf-8-sig",
|
|
247
|
+
errors=encoding_errors,
|
|
248
|
+
newline="",
|
|
249
|
+
) as handle:
|
|
250
|
+
reader = csv.DictReader(handle)
|
|
251
|
+
for raw_row in reader:
|
|
252
|
+
row = {
|
|
253
|
+
(key or "").strip(): (value or "").strip()
|
|
254
|
+
for key, value in raw_row.items()
|
|
255
|
+
}
|
|
256
|
+
if any(row.values()):
|
|
257
|
+
rows.append(row)
|
|
258
|
+
return rows
|
|
259
|
+
except UnicodeDecodeError:
|
|
260
|
+
rows.clear()
|
|
261
|
+
continue
|
|
262
|
+
raise UnicodeDecodeError("utf-8", b"", 0, 1, f"could not decode {path}")
|
|
263
|
+
return rows
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _to_int(value: str) -> int | None:
|
|
267
|
+
value = value.strip()
|
|
268
|
+
if not value:
|
|
269
|
+
return None
|
|
270
|
+
try:
|
|
271
|
+
return int(value)
|
|
272
|
+
except ValueError:
|
|
273
|
+
return None
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _require_files(state_dir: Path) -> None:
    """Raise FileNotFoundError unless every raw dataset CSV is present."""
    missing = [name for name in DATASETS if not raw_file(state_dir, name).exists()]
    if not missing:
        return
    missing_list = ", ".join(missing)
    raise FileNotFoundError(
        f"missing raw datasets: {missing_list}. run `bdsc sync` first"
    )
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def build_index(state_dir: Path) -> dict[str, int]:
    """Rebuild the SQLite index from the raw CSV downloads.

    Drops any existing database, loads the five BDSC CSVs into relational
    tables, denormalizes every stock into one `search_documents` row, and
    — when the local SQLite supports FTS5 — builds a porter-stemmed prefix
    index and a trigram index over those documents. Row/feature counts are
    returned and also recorded in the manifest under "index".

    Raises FileNotFoundError (via _require_files) if a raw CSV is missing,
    and KeyError if a CSV lacks one of the expected column headers.
    """
    ensure_state_dir(state_dir)
    _require_files(state_dir)
    manifest = load_manifest(state_dir)

    # Parse all five CSVs up front; header names used below must match the
    # columns BDSC publishes in each export.
    bloomington_rows = _iter_csv_rows(raw_file(state_dir, "bloomington"))
    component_rows = _iter_csv_rows(raw_file(state_dir, "stockcomps_map_comments"))
    stockgene_rows = _iter_csv_rows(raw_file(state_dir, "stockgenes"))
    compgene_rows = _iter_csv_rows(raw_file(state_dir, "stockgenes_compgenes"))
    compprop_rows = _iter_csv_rows(raw_file(state_dir, "stockgenes_compprops"))

    # Rebuild from scratch: delete any previous index.
    db_path = db_file(state_dir)
    if db_path.exists():
        db_path.unlink()

    conn = sqlite3.connect(db_path)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        # Base relational schema plus the denormalized search table.
        conn.executescript(
            """
            CREATE TABLE stocks (
                stknum INTEGER PRIMARY KEY,
                genotype TEXT NOT NULL,
                chromosomes TEXT,
                aka TEXT,
                date_added TEXT,
                donor_info TEXT,
                stock_comments TEXT
            );

            CREATE TABLE component_comments (
                stknum INTEGER NOT NULL,
                genotype TEXT,
                component_symbol TEXT,
                fbid TEXT,
                mapstatement TEXT,
                comment1 TEXT,
                comment2 TEXT,
                comment3 TEXT
            );

            CREATE TABLE stockgenes (
                stknum INTEGER NOT NULL,
                genotype TEXT,
                component_symbol TEXT,
                gene_symbol TEXT,
                fbgn TEXT,
                bdsc_symbol_id INTEGER,
                bdsc_gene_id INTEGER
            );

            CREATE TABLE compgenes (
                bdsc_symbol_id INTEGER,
                bdsc_gene_id INTEGER,
                compgeneprop_id INTEGER,
                prop_syn TEXT
            );

            CREATE TABLE compprops (
                bdsc_symbol_id INTEGER,
                property_id INTEGER,
                property_descrip TEXT,
                prop_syn TEXT
            );

            CREATE TABLE search_documents (
                stknum INTEGER PRIMARY KEY,
                genotype TEXT,
                aka TEXT,
                donor_info TEXT,
                stock_comments TEXT,
                component_symbols TEXT,
                fbids TEXT,
                gene_symbols TEXT,
                fbgns TEXT,
                property_terms TEXT,
                relationship_terms TEXT,
                search_text TEXT
            );

            CREATE INDEX idx_component_comments_stknum ON component_comments(stknum);
            CREATE INDEX idx_stockgenes_stknum ON stockgenes(stknum);
            CREATE INDEX idx_stockgenes_gene_symbol ON stockgenes(gene_symbol);
            CREATE INDEX idx_stockgenes_fbgn ON stockgenes(fbgn);
            CREATE INDEX idx_compgenes_symbol_id ON compgenes(bdsc_symbol_id);
            CREATE INDEX idx_compprops_symbol_id ON compprops(bdsc_symbol_id);
            """
        )

        # Rows without a parseable stock number are skipped for the
        # stock-keyed tables (filters below).
        conn.executemany(
            """
            INSERT INTO stocks (
                stknum, genotype, chromosomes, aka, date_added, donor_info, stock_comments
            ) VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    _to_int(row["Stk #"]),
                    row["Genotype"],
                    row["Ch # all"],
                    row["A.K.A"],
                    row["Date added"],
                    row["Donor info"],
                    row["Stock comments"],
                )
                for row in bloomington_rows
                if _to_int(row["Stk #"]) is not None
            ],
        )

        conn.executemany(
            """
            INSERT INTO component_comments (
                stknum, genotype, component_symbol, fbid, mapstatement, comment1, comment2, comment3
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    _to_int(row["Stk #"]),
                    row["Genotype"],
                    row["component_symbol"],
                    row["fbid"],
                    row["mapstatement"],
                    row["comment1"],
                    row["comment2"],
                    row["comment3"],
                )
                for row in component_rows
                if _to_int(row["Stk #"]) is not None
            ],
        )

        conn.executemany(
            """
            INSERT INTO stockgenes (
                stknum, genotype, component_symbol, gene_symbol, fbgn, bdsc_symbol_id, bdsc_gene_id
            ) VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    _to_int(row["stknum"]),
                    row["genotype"],
                    row["component_symbol"],
                    row["gene_symbol"],
                    row["fbgn"],
                    _to_int(row["bdsc_symbol_id"]),
                    _to_int(row["bdsc_gene_id"]),
                )
                for row in stockgene_rows
                if _to_int(row["stknum"]) is not None
            ],
        )

        conn.executemany(
            """
            INSERT INTO compgenes (
                bdsc_symbol_id, bdsc_gene_id, compgeneprop_id, prop_syn
            ) VALUES (?, ?, ?, ?)
            """,
            [
                (
                    _to_int(row["bdsc_symbol_id"]),
                    _to_int(row["bdsc_gene_id"]),
                    _to_int(row["compgeneprop_id"]),
                    row["prop_syn"],
                )
                for row in compgene_rows
            ],
        )

        conn.executemany(
            """
            INSERT INTO compprops (
                bdsc_symbol_id, property_id, property_descrip, prop_syn
            ) VALUES (?, ?, ?, ?)
            """,
            [
                (
                    _to_int(row["bdsc_symbol_id"]),
                    _to_int(row["property_id"]),
                    row["property_descrip"],
                    row["prop_syn"],
                )
                for row in compprop_rows
            ],
        )

        # Denormalize: one row per stock, with distinct related terms
        # collected by correlated subqueries, plus a concatenated
        # search_text blob used by the LIKE fallback and trigram index.
        conn.execute(
            """
            INSERT INTO search_documents (
                stknum, genotype, aka, donor_info, stock_comments,
                component_symbols, fbids, gene_symbols, fbgns,
                property_terms, relationship_terms, search_text
            )
            SELECT
                s.stknum,
                s.genotype,
                COALESCE(s.aka, ''),
                COALESCE(s.donor_info, ''),
                COALESCE(s.stock_comments, ''),
                COALESCE((
                    SELECT group_concat(component_symbol, ' ')
                    FROM (
                        SELECT DISTINCT sg.component_symbol AS component_symbol
                        FROM stockgenes sg
                        WHERE sg.stknum = s.stknum AND sg.component_symbol != ''
                        ORDER BY sg.component_symbol
                    )
                ), ''),
                COALESCE((
                    SELECT group_concat(fbid, ' ')
                    FROM (
                        SELECT DISTINCT cc.fbid AS fbid
                        FROM component_comments cc
                        WHERE cc.stknum = s.stknum AND cc.fbid != ''
                        ORDER BY cc.fbid
                    )
                ), ''),
                COALESCE((
                    SELECT group_concat(gene_symbol, ' ')
                    FROM (
                        SELECT DISTINCT sg.gene_symbol AS gene_symbol
                        FROM stockgenes sg
                        WHERE sg.stknum = s.stknum AND sg.gene_symbol != ''
                        ORDER BY sg.gene_symbol
                    )
                ), ''),
                COALESCE((
                    SELECT group_concat(fbgn, ' ')
                    FROM (
                        SELECT DISTINCT sg.fbgn AS fbgn
                        FROM stockgenes sg
                        WHERE sg.stknum = s.stknum AND sg.fbgn != ''
                        ORDER BY sg.fbgn
                    )
                ), ''),
                COALESCE((
                    SELECT group_concat(prop_syn, ' ')
                    FROM (
                        SELECT DISTINCT cp.prop_syn AS prop_syn
                        FROM stockgenes sg
                        JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id
                        WHERE sg.stknum = s.stknum AND cp.prop_syn != ''
                        ORDER BY cp.prop_syn
                    )
                ), ''),
                COALESCE((
                    SELECT group_concat(prop_syn, ' ')
                    FROM (
                        SELECT DISTINCT cg.prop_syn AS prop_syn
                        FROM stockgenes sg
                        JOIN compgenes cg
                            ON cg.bdsc_symbol_id = sg.bdsc_symbol_id
                            AND cg.bdsc_gene_id = sg.bdsc_gene_id
                        WHERE sg.stknum = s.stknum AND cg.prop_syn != ''
                        ORDER BY cg.prop_syn
                    )
                ), ''),
                trim(
                    s.stknum || ' ' ||
                    COALESCE(s.genotype, '') || ' ' ||
                    COALESCE(s.aka, '') || ' ' ||
                    COALESCE(s.donor_info, '') || ' ' ||
                    COALESCE(s.stock_comments, '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(fbid, ' ')
                        FROM (
                            SELECT DISTINCT cc.fbid AS fbid
                            FROM component_comments cc
                            WHERE cc.stknum = s.stknum AND cc.fbid != ''
                        )
                    ), '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(component_symbol, ' ')
                        FROM (
                            SELECT DISTINCT sg.component_symbol AS component_symbol
                            FROM stockgenes sg
                            WHERE sg.stknum = s.stknum AND sg.component_symbol != ''
                        )
                    ), '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(gene_symbol, ' ')
                        FROM (
                            SELECT DISTINCT sg.gene_symbol AS gene_symbol
                            FROM stockgenes sg
                            WHERE sg.stknum = s.stknum AND sg.gene_symbol != ''
                        )
                    ), '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(fbgn, ' ')
                        FROM (
                            SELECT DISTINCT sg.fbgn AS fbgn
                            FROM stockgenes sg
                            WHERE sg.stknum = s.stknum AND sg.fbgn != ''
                        )
                    ), '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(prop_syn, ' ')
                        FROM (
                            SELECT DISTINCT cp.prop_syn AS prop_syn
                            FROM stockgenes sg
                            JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id
                            WHERE sg.stknum = s.stknum AND cp.prop_syn != ''
                        )
                    ), '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(property_descrip, ' ')
                        FROM (
                            SELECT DISTINCT cp.property_descrip AS property_descrip
                            FROM stockgenes sg
                            JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id
                            WHERE sg.stknum = s.stknum AND cp.property_descrip != ''
                        )
                    ), '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(prop_syn, ' ')
                        FROM (
                            SELECT DISTINCT cg.prop_syn AS prop_syn
                            FROM stockgenes sg
                            JOIN compgenes cg
                                ON cg.bdsc_symbol_id = sg.bdsc_symbol_id
                                AND cg.bdsc_gene_id = sg.bdsc_gene_id
                            WHERE sg.stknum = s.stknum AND cg.prop_syn != ''
                        )
                    ), '') || ' ' ||
                    COALESCE((
                        SELECT group_concat(comment_text, ' ')
                        FROM (
                            SELECT DISTINCT cc.comment1 AS comment_text
                            FROM component_comments cc
                            WHERE cc.stknum = s.stknum AND cc.comment1 != ''
                            UNION
                            SELECT DISTINCT cc.comment2 AS comment_text
                            FROM component_comments cc
                            WHERE cc.stknum = s.stknum AND cc.comment2 != ''
                            UNION
                            SELECT DISTINCT cc.comment3 AS comment_text
                            FROM component_comments cc
                            WHERE cc.stknum = s.stknum AND cc.comment3 != ''
                            UNION
                            SELECT DISTINCT cc.mapstatement AS comment_text
                            FROM component_comments cc
                            WHERE cc.stknum = s.stknum AND cc.mapstatement != ''
                        )
                    ), '')
                )
            FROM stocks s
            """
        )

        # FTS5 may be absent from the local SQLite build; degrade gracefully
        # and record availability in the returned counts.
        fts_enabled = True
        try:
            conn.execute(
                """
                CREATE VIRTUAL TABLE stock_fts USING fts5(
                    stknum UNINDEXED,
                    genotype,
                    aka,
                    donor_info,
                    stock_comments,
                    component_symbols,
                    fbids,
                    gene_symbols,
                    fbgns,
                    property_terms,
                    relationship_terms,
                    tokenize='porter unicode61'
                )
                """
            )
        except sqlite3.OperationalError:
            fts_enabled = False

        if fts_enabled:
            conn.execute(
                """
                INSERT INTO stock_fts (
                    stknum, genotype, aka, donor_info, stock_comments,
                    component_symbols, fbids, gene_symbols, fbgns,
                    property_terms, relationship_terms
                )
                SELECT
                    stknum, genotype, aka, donor_info, stock_comments,
                    component_symbols, fbids, gene_symbols, fbgns,
                    property_terms, relationship_terms
                FROM search_documents
                """
            )

        # The trigram tokenizer requires a newer FTS5 than the base module,
        # so probe for it separately.
        trigram_enabled = True
        try:
            conn.execute(
                """
                CREATE VIRTUAL TABLE stock_trigram USING fts5(
                    stknum UNINDEXED,
                    search_text,
                    tokenize='trigram'
                )
                """
            )
        except sqlite3.OperationalError:
            trigram_enabled = False

        if trigram_enabled:
            conn.execute(
                """
                INSERT INTO stock_trigram (stknum, search_text)
                SELECT stknum, search_text
                FROM search_documents
                """
            )

        conn.commit()
        # NOTE: counts are raw CSV row counts, not inserted-row counts
        # (rows without a parseable stock number are excluded above).
        counts = {
            "stocks": len(bloomington_rows),
            "component_comments": len(component_rows),
            "stockgenes": len(stockgene_rows),
            "compgenes": len(compgene_rows),
            "compprops": len(compprop_rows),
            "fts_enabled": int(fts_enabled),
            "trigram_enabled": int(trigram_enabled),
        }
        manifest["index"] = {
            "db_path": str(db_path),
            "built_at": _now_iso(),
            "counts": counts,
        }
        save_manifest(state_dir, manifest)
        return counts
    finally:
        conn.close()
|
|
717
|
+
|
|
718
|
+
|
|
719
|
+
def _connect(state_dir: Path) -> sqlite3.Connection:
    """Open the prebuilt index with name-based row access, or fail loudly."""
    path = db_file(state_dir)
    if not path.exists():
        raise FileNotFoundError(f"missing index: {path}. run `bdsc sync` or `bdsc build-index`")
    connection = sqlite3.connect(path)
    connection.row_factory = sqlite3.Row
    return connection
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
def build_fts_query(text: str) -> str:
    """Turn free text into an FTS5 prefix query; fall back to a quoted phrase."""
    tokens = re.findall(r"[A-Za-z0-9]+", text.lower())
    if tokens:
        return " ".join(token + "*" for token in tokens)
    escaped = text.replace('"', '""').strip()
    return f'"{escaped}"'
|
|
734
|
+
|
|
735
|
+
|
|
736
|
+
def _query_tokens(text: str) -> list[str]:
|
|
737
|
+
return re.findall(r"[A-Za-z0-9]+", text.lower())
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
def _is_free_text_query(text: str) -> bool:
    """True when *text* contains more than one searchable token."""
    tokens = _query_tokens(text)
    return len(tokens) > 1
|
|
742
|
+
|
|
743
|
+
|
|
744
|
+
def _compact_text(text: str) -> str:
    """All searchable tokens of *text* concatenated with no separators."""
    return "".join(token for token in _query_tokens(text))
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
def _trigrams(text: str) -> list[str]:
|
|
749
|
+
if len(text) < 3:
|
|
750
|
+
return []
|
|
751
|
+
return [text[index : index + 3] for index in range(len(text) - 2)]
|
|
752
|
+
|
|
753
|
+
|
|
754
|
+
def build_trigram_query(text: str) -> str | None:
    """Build an OR-of-quoted-trigrams FTS5 query from *text*.

    Trigrams come from each token and from the whitespace-free form of the
    whole query (so matches can span token boundaries), deduplicated while
    preserving first-seen order. Returns None when no trigram can be formed.
    """
    ordered: dict[str, None] = {}
    for token in _query_tokens(text):
        for gram in _trigrams(token):
            ordered.setdefault(gram, None)
    for gram in _trigrams(_compact_text(text)):
        ordered.setdefault(gram, None)
    if not ordered:
        return None
    return " OR ".join(f'"{gram}"' for gram in ordered)
|
|
771
|
+
|
|
772
|
+
|
|
773
|
+
def _trigram_overlap_ratio(query: str, text: str) -> float:
    """Fraction of the query's trigrams that also occur in *text* (0..1)."""
    query_grams = set(_trigrams(_compact_text(query)))
    text_grams = set(_trigrams(_compact_text(text)))
    if query_grams and text_grams:
        shared = query_grams.intersection(text_grams)
        return len(shared) / len(query_grams)
    return 0.0
|
|
779
|
+
|
|
780
|
+
|
|
781
|
+
def _best_term_similarity(query: str, text: str) -> float:
    """Highest fuzzy-match score between *query* and any token of *text*.

    Each token of length >= 3 scores its SequenceMatcher ratio against the
    compacted query plus the query/token trigram overlap, so the result
    lies in [0, 2]. Returns 0.0 when the query has no searchable content.
    """
    compact_query = _compact_text(query)
    if not compact_query:
        return 0.0

    scores = [
        difflib.SequenceMatcher(None, compact_query, term).ratio()
        + _trigram_overlap_ratio(query, term)
        for term in _query_tokens(text)
        if len(term) >= 3
    ]
    return max(scores, default=0.0)
|
|
795
|
+
|
|
796
|
+
|
|
797
|
+
def _score_search_document(query: str, row: sqlite3.Row | dict[str, Any]) -> float:
    """Heuristically score how well *row* (a search_documents record) matches *query*.

    The score is an additive blend of hand-tuned signals; higher is better:
    whole-query substring hits, per-token exact/prefix hits, trigram
    overlap, and extra weight on the gene/component symbol fields.
    *row* must expose "search_text", "gene_symbols" and "component_symbols".
    """
    query_value = query.strip().lower()
    query_tokens = _query_tokens(query)
    query_compact = _compact_text(query)
    search_text = row["search_text"]
    haystack = search_text.lower()
    compact_haystack = _compact_text(search_text)
    document_tokens = set(_query_tokens(search_text))

    score = 0.0
    # Whole-query substring match (raw, then whitespace/punctuation-free).
    if query_value and query_value in haystack:
        score += 8.0
    if query_compact and query_compact in compact_haystack:
        score += 10.0

    # Per-token matches: exact token hits outrank prefix-only hits.
    exact_matches = sum(1 for token in query_tokens if token in document_tokens)
    prefix_matches = sum(
        1
        for token in query_tokens
        if token not in document_tokens and any(doc.startswith(token) for doc in document_tokens)
    )
    score += exact_matches * 3.0
    score += prefix_matches * 1.5

    # Fuzzy signal over the full document text.
    overlap = _trigram_overlap_ratio(query, search_text)
    score += overlap * 4.0

    # Gene and component symbols are the highest-value fields: boost both
    # trigram overlap and best single-term similarity against them.
    gene_symbols = row["gene_symbols"] or ""
    component_symbols = row["component_symbols"] or ""
    primary_fields = f"{gene_symbols} {component_symbols}".strip()
    if primary_fields:
        score += _trigram_overlap_ratio(query, primary_fields) * 8.0
        score += _best_term_similarity(query, primary_fields) * 12.0

    return score
|
|
832
|
+
|
|
833
|
+
|
|
834
|
+
def _search_result_payload(row: sqlite3.Row | dict[str, Any]) -> dict[str, Any]:
|
|
835
|
+
return {
|
|
836
|
+
"stknum": row["stknum"],
|
|
837
|
+
"genotype": row["genotype"],
|
|
838
|
+
"gene_symbols": row["gene_symbols"],
|
|
839
|
+
"fbgns": row["fbgns"],
|
|
840
|
+
"component_symbols": row["component_symbols"],
|
|
841
|
+
}
|
|
842
|
+
|
|
843
|
+
|
|
844
|
+
def _merge_ranked_matches(
|
|
845
|
+
matches: list[dict[str, Any]],
|
|
846
|
+
key_fn,
|
|
847
|
+
) -> list[dict[str, Any]]:
|
|
848
|
+
merged: dict[Any, dict[str, Any]] = {}
|
|
849
|
+
for match in matches:
|
|
850
|
+
key = key_fn(match["row"])
|
|
851
|
+
existing = merged.get(key)
|
|
852
|
+
if existing is None or match["score"] > existing["score"]:
|
|
853
|
+
merged[key] = match
|
|
854
|
+
return sorted(
|
|
855
|
+
merged.values(),
|
|
856
|
+
key=lambda item: (-item["score"], item["row"]["stknum"]),
|
|
857
|
+
)
|
|
858
|
+
|
|
859
|
+
|
|
860
|
+
def _search_candidates_from_prefix_fts(
    conn: sqlite3.Connection,
    query: str,
    limit: int,
) -> list[dict[str, Any]]:
    """Candidate stocks for *query* via the porter/prefix FTS index.

    Returns at most *limit* dicts of {"row": sqlite3.Row, "score": float}.
    When the index was built without FTS5 support, falls back to a plain
    LIKE substring scan over search_documents. Fallback hits get a flat
    +20 bonus; FTS hits get +40 plus a small bm25-derived term (bm25 is
    negative-better, hence abs(); capped at 10).
    """
    # Feature-detect: the stock_fts table only exists when the builder's
    # SQLite had FTS5.
    has_fts = bool(
        conn.execute(
            "SELECT 1 FROM sqlite_master WHERE type='table' AND name='stock_fts'"
        ).fetchone()
    )
    if not has_fts:
        rows = conn.execute(
            """
            SELECT
                s.stknum,
                s.genotype,
                sd.gene_symbols,
                sd.fbgns,
                sd.component_symbols,
                sd.search_text
            FROM search_documents sd
            JOIN stocks s ON s.stknum = sd.stknum
            WHERE sd.search_text LIKE ?
            ORDER BY s.stknum
            LIMIT ?
            """,
            (f"%{query}%", limit),
        ).fetchall()
        return [{"row": row, "score": _score_search_document(query, row) + 20.0} for row in rows]

    rows = conn.execute(
        """
        SELECT
            s.stknum,
            s.genotype,
            sd.gene_symbols,
            sd.fbgns,
            sd.component_symbols,
            sd.search_text,
            bm25(stock_fts) AS rank
        FROM stock_fts f
        JOIN stocks s ON s.stknum = f.stknum
        JOIN search_documents sd ON sd.stknum = s.stknum
        WHERE stock_fts MATCH ?
        ORDER BY bm25(stock_fts), s.stknum
        LIMIT ?
        """,
        (build_fts_query(query), limit),
    ).fetchall()
    return [
        {
            "row": row,
            "score": _score_search_document(query, row) + 40.0 + min(10.0, abs(row["rank"]) * 1000000.0),
        }
        for row in rows
    ]
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
def _search_candidates_from_trigram_fts(
    conn: sqlite3.Connection,
    query: str,
    limit: int,
) -> list[dict[str, Any]]:
    """Collect scored stock candidates from the trigram FTS index, if present.

    Returns an empty list when the query produces no usable trigram terms or
    when the ``stock_trigram`` table does not exist in this database build.
    Candidates scoring below 4.5 are discarded to suppress trigram noise.

    Returns a list of ``{"row": sqlite3.Row, "score": float}`` dicts.
    """
    trigram_query = build_trigram_query(query)
    if not trigram_query:
        return []

    # Feature-detect the trigram table rather than assuming a schema version.
    has_trigram = bool(
        conn.execute(
            "SELECT 1 FROM sqlite_master WHERE type='table' AND name='stock_trigram'"
        ).fetchone()
    )
    if not has_trigram:
        return []

    rows = conn.execute(
        """
        SELECT
            s.stknum,
            s.genotype,
            sd.gene_symbols,
            sd.fbgns,
            sd.component_symbols,
            sd.search_text,
            bm25(stock_trigram) AS rank
        FROM stock_trigram t
        JOIN stocks s ON s.stknum = t.stknum
        JOIN search_documents sd ON sd.stknum = s.stknum
        WHERE stock_trigram MATCH ?
        ORDER BY bm25(stock_trigram), s.stknum
        LIMIT ?
        """,
        (trigram_query, limit),
    ).fetchall()

    matches: list[dict[str, Any]] = []
    for row in rows:
        # Heuristic document score plus a capped bm25-derived bonus (max 6.0).
        score = _score_search_document(query, row) + min(6.0, abs(row["rank"]) * 1000000.0)
        if score >= 4.5:
            matches.append({"row": row, "score": score})
    return matches
|
|
961
|
+
|
|
962
|
+
|
|
963
|
+
def _candidate_stock_ids_for_query(
    conn: sqlite3.Connection,
    query: str,
    limit: int,
) -> list[int]:
    """Return up to *limit* candidate stock numbers for a fuzzy query.

    Merges prefix-FTS and trigram-FTS candidates, keeping the best score seen
    for each stock, then orders by descending score with stknum as tie-break.
    """
    best_scores: dict[int, float] = {}
    prefix_hits = _search_candidates_from_prefix_fts(conn, query, max(limit * 2, 20))
    trigram_hits = _search_candidates_from_trigram_fts(conn, query, max(limit * 6, 60))
    for hit in prefix_hits + trigram_hits:
        stknum = hit["row"]["stknum"]
        previous = best_scores.get(stknum)
        if previous is None or hit["score"] > previous:
            best_scores[stknum] = hit["score"]
    ordered = sorted(best_scores.items(), key=lambda pair: (-pair[1], pair[0]))
    return [stknum for stknum, _score in ordered[:limit]]
|
|
981
|
+
|
|
982
|
+
|
|
983
|
+
def _score_field_match(query: str, text: str) -> float:
    """Heuristic relevance score of *text* against *query*; 0.0 for empty text."""
    if not text:
        return 0.0

    q_lower = query.strip().lower()
    t_lower = text.lower()
    q_compact = _compact_text(query)
    t_compact = _compact_text(text)
    q_tokens = _query_tokens(query)
    t_tokens = set(_query_tokens(text))

    total = 0.0
    # Exact equality outranks substring containment, for both the raw lowered
    # form and the compacted (punctuation-stripped) form.
    if q_lower:
        if q_lower == t_lower:
            total += 12.0
        elif q_lower in t_lower:
            total += 8.0
    if q_compact:
        if q_compact == t_compact:
            total += 14.0
        elif q_compact in t_compact:
            total += 10.0

    total += _trigram_overlap_ratio(query, text) * 6.0
    total += _best_term_similarity(query, text) * 8.0

    exact_token_hits = sum(1 for token in q_tokens if token in t_tokens)
    prefix_token_hits = sum(
        1
        for token in q_tokens
        if token not in t_tokens and any(candidate.startswith(token) for candidate in t_tokens)
    )
    total += exact_token_hits * 1.5
    total += prefix_token_hits
    return total
|
|
1011
|
+
|
|
1012
|
+
|
|
1013
|
+
def _rows_to_dicts(rows: list[sqlite3.Row]) -> list[dict[str, Any]]:
|
|
1014
|
+
return [dict(row) for row in rows]
|
|
1015
|
+
|
|
1016
|
+
|
|
1017
|
+
def _default_row_key(row: sqlite3.Row) -> tuple[Any, ...]:
|
|
1018
|
+
return tuple(row[key] for key in row.keys())
|
|
1019
|
+
|
|
1020
|
+
|
|
1021
|
+
def _component_result_key(row: sqlite3.Row | dict[str, Any]) -> tuple[Any, ...]:
|
|
1022
|
+
return (row["stknum"], row["component_symbol"], row["fbid"])
|
|
1023
|
+
|
|
1024
|
+
|
|
1025
|
+
def _gene_result_key(row: sqlite3.Row | dict[str, Any]) -> tuple[Any, ...]:
|
|
1026
|
+
return (row["stknum"], row["component_symbol"], row["gene_symbol"], row["fbgn"])
|
|
1027
|
+
|
|
1028
|
+
|
|
1029
|
+
def _rank_direct_rows(
    query: str,
    rows: list[sqlite3.Row],
    *,
    field_names: list[str],
    limit: int,
    min_score: float = 5.0,
    key_fn=None,
) -> list[dict[str, Any]]:
    """Score rows against *query* over *field_names*, de-duplicate, and rank.

    A row's score is the best field score among *field_names*; rows scoring
    below *min_score* are dropped.  *key_fn* (default: whole-row key) controls
    de-duplication in the merge step.  Returns at most *limit* dicts.
    """
    matches = [
        {"row": row, "score": best}
        for row in rows
        if (best := max(_score_field_match(query, row[name] or "") for name in field_names)) >= min_score
    ]
    dedupe_key = key_fn if key_fn is not None else _default_row_key
    ordered = _merge_ranked_matches(matches, dedupe_key)
    return [dict(entry["row"]) for entry in ordered[:limit]]
|
|
1048
|
+
|
|
1049
|
+
|
|
1050
|
+
def search_local(state_dir: Path, query: str, limit: int = 10) -> list[dict[str, Any]]:
    """Free-text stock search against the local database.

    A purely numeric query is treated as a direct stock-number lookup.
    Otherwise prefix-FTS candidates are collected first, and the noisier
    trigram index is consulted only when the prefix pass finds nothing.

    Returns up to *limit* result payloads ordered by descending score
    (stknum breaks ties).
    """
    query = query.strip()
    if not query:
        return []

    conn = _connect(state_dir)
    try:
        if query.isdigit():
            stock = get_stock(state_dir, int(query))
            return [stock] if stock else []

        candidates: dict[int, dict[str, Any]] = {}

        def absorb(matches: list[dict[str, Any]]) -> None:
            # Keep only the best-scoring match per stock number.
            # (Factored out: the prefix and trigram passes previously
            # duplicated this loop verbatim.)
            for match in matches:
                stknum = match["row"]["stknum"]
                existing = candidates.get(stknum)
                if existing is None or match["score"] > existing["score"]:
                    candidates[stknum] = match

        absorb(_search_candidates_from_prefix_fts(conn, query, max(limit * 3, 20)))
        if not candidates:
            absorb(_search_candidates_from_trigram_fts(conn, query, max(limit * 12, 60)))

        ranked = sorted(
            candidates.values(),
            key=lambda item: (-item["score"], item["row"]["stknum"]),
        )
        return [_search_result_payload(item["row"]) for item in ranked[:limit]]
    finally:
        conn.close()
|
|
1082
|
+
|
|
1083
|
+
|
|
1084
|
+
def search_gene(state_dir: Path, query: str, limit: int = 20) -> list[dict[str, Any]]:
    """Search stockgenes by FlyBase gene ID (FBgn...) or gene symbol.

    FBgn queries match the fbgn column exactly (case-insensitively); other
    queries match the gene symbol exactly or as a prefix, with exact matches
    ordered first.  When the direct query returns nothing, falls back to
    fuzzy ranking over FTS-derived candidate stocks.
    """
    query = query.strip()
    if not query:
        return []

    conn = _connect(state_dir)
    try:
        if query.upper().startswith("FBGN"):
            rows = conn.execute(
                """
                SELECT DISTINCT
                    sg.stknum,
                    sg.genotype,
                    sg.component_symbol,
                    sg.gene_symbol,
                    sg.fbgn
                FROM stockgenes sg
                WHERE UPPER(sg.fbgn) = UPPER(?)
                ORDER BY sg.stknum, sg.component_symbol, sg.gene_symbol
                LIMIT ?
                """,
                (query, limit),
            ).fetchall()
        else:
            rows = conn.execute(
                """
                SELECT DISTINCT
                    sg.stknum,
                    sg.genotype,
                    sg.component_symbol,
                    sg.gene_symbol,
                    sg.fbgn
                FROM stockgenes sg
                WHERE LOWER(sg.gene_symbol) = LOWER(?)
                   OR LOWER(sg.gene_symbol) LIKE LOWER(?)
                ORDER BY
                    CASE WHEN LOWER(sg.gene_symbol) = LOWER(?) THEN 0 ELSE 1 END,
                    sg.stknum,
                    sg.component_symbol,
                    sg.gene_symbol
                LIMIT ?
                """,
                (query, f"{query}%", query, limit),
            ).fetchall()
        if rows:
            return _rows_to_dicts(rows)

        # Fuzzy fallback: widen the net via the FTS candidate indexes, then
        # rank those stocks' gene rows against the query.
        stock_ids = _candidate_stock_ids_for_query(conn, query, max(limit * 4, 40))
        if not stock_ids:
            return []
        placeholders = ", ".join("?" for _ in stock_ids)
        fuzzy_rows = conn.execute(
            f"""
            SELECT DISTINCT
                sg.stknum,
                sg.genotype,
                sg.component_symbol,
                sg.gene_symbol,
                sg.fbgn
            FROM stockgenes sg
            WHERE sg.stknum IN ({placeholders})
            """,
            stock_ids,
        ).fetchall()
        return _rank_direct_rows(
            query,
            fuzzy_rows,
            field_names=["gene_symbol", "fbgn"],
            limit=limit,
            key_fn=_gene_result_key,
        )
    finally:
        conn.close()
|
|
1157
|
+
|
|
1158
|
+
|
|
1159
|
+
def _component_metadata_subqueries(
    stock_num_expr: str,
    component_symbol_expr: str,
    symbol_id_expr: str,
) -> str:
    """Return a SQL select-list fragment of per-component aggregate columns.

    Produces five COALESCE'd columns — gene_symbols, fbgns, property_syns,
    property_descriptions, gene_relationships — each a group_concat of
    de-duplicated, sorted, non-empty values for one component.

    The ``*_expr`` arguments are spliced into the SQL verbatim, so they must
    be trusted SQL expressions (column references or subqueries), never
    user-supplied text.
    """
    return f"""
        COALESCE((
            SELECT group_concat(gene_symbol, ' ')
            FROM (
                SELECT DISTINCT sg.gene_symbol AS gene_symbol
                FROM stockgenes sg
                WHERE sg.stknum = {stock_num_expr}
                  AND sg.component_symbol = {component_symbol_expr}
                  AND sg.gene_symbol != ''
                ORDER BY sg.gene_symbol
            )
        ), '') AS gene_symbols,
        COALESCE((
            SELECT group_concat(fbgn, ' ')
            FROM (
                SELECT DISTINCT sg.fbgn AS fbgn
                FROM stockgenes sg
                WHERE sg.stknum = {stock_num_expr}
                  AND sg.component_symbol = {component_symbol_expr}
                  AND sg.fbgn != ''
                ORDER BY sg.fbgn
            )
        ), '') AS fbgns,
        COALESCE((
            SELECT group_concat(prop_syn, ' | ')
            FROM (
                SELECT DISTINCT cp.prop_syn AS prop_syn
                FROM compprops cp
                WHERE cp.bdsc_symbol_id = {symbol_id_expr}
                  AND cp.prop_syn != ''
                ORDER BY cp.prop_syn
            )
        ), '') AS property_syns,
        COALESCE((
            SELECT group_concat(property_descrip, ' | ')
            FROM (
                SELECT DISTINCT cp.property_descrip AS property_descrip
                FROM compprops cp
                WHERE cp.bdsc_symbol_id = {symbol_id_expr}
                  AND cp.property_descrip != ''
                ORDER BY cp.property_descrip
            )
        ), '') AS property_descriptions,
        COALESCE((
            SELECT group_concat(prop_syn, ' | ')
            FROM (
                SELECT DISTINCT cg.prop_syn AS prop_syn
                FROM compgenes cg
                WHERE cg.bdsc_symbol_id = {symbol_id_expr}
                  AND cg.prop_syn != ''
                ORDER BY cg.prop_syn
            )
        ), '') AS gene_relationships
    """
|
|
1218
|
+
|
|
1219
|
+
|
|
1220
|
+
def _search_component_table(
    state_dir: Path,
    *,
    conn: sqlite3.Connection | None = None,
    column: str,
    query: str,
    limit: int,
) -> list[dict[str, Any]]:
    """Search component_comments by one column (``fbid`` or ``component_symbol``).

    Tries an exact-or-prefix match first (exact matches ordered first); when
    that returns nothing, falls back to fuzzy ranking over FTS-derived
    candidate stocks.  A caller may pass an already-open *conn*, in which
    case this function leaves it open on return.
    """
    query = query.strip()
    if not query:
        return []

    # Whitelist the column name: it is interpolated into SQL below.
    if column not in {"fbid", "component_symbol"}:
        raise ValueError(f"unsupported component search column: {column}")

    close_conn = conn is None
    conn = conn or _connect(state_dir)
    try:
        rows = conn.execute(
            f"""
            SELECT
                cc.stknum,
                cc.genotype,
                cc.component_symbol,
                cc.fbid,
                cc.mapstatement,
                {_component_metadata_subqueries(
                    "cc.stknum",
                    "cc.component_symbol",
                    "(SELECT MIN(sg.bdsc_symbol_id) FROM stockgenes sg WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol)",
                )}
            FROM component_comments cc
            WHERE LOWER(cc.{column}) = LOWER(?)
               OR LOWER(cc.{column}) LIKE LOWER(?)
            ORDER BY
                CASE WHEN LOWER(cc.{column}) = LOWER(?) THEN 0 ELSE 1 END,
                cc.stknum,
                cc.component_symbol
            LIMIT ?
            """,
            (query, f"{query}%", query, limit),
        ).fetchall()
        if rows:
            return _rows_to_dicts(rows)

        # Fuzzy fallback over FTS-derived candidate stocks.
        stock_ids = _candidate_stock_ids_for_query(conn, query, max(limit * 4, 40))
        if not stock_ids:
            return []
        placeholders = ", ".join("?" for _ in stock_ids)
        fuzzy_rows = conn.execute(
            f"""
            SELECT
                cc.stknum,
                cc.genotype,
                cc.component_symbol,
                cc.fbid,
                cc.mapstatement,
                {_component_metadata_subqueries(
                    "cc.stknum",
                    "cc.component_symbol",
                    "(SELECT MIN(sg.bdsc_symbol_id) FROM stockgenes sg WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol)",
                )}
            FROM component_comments cc
            WHERE cc.stknum IN ({placeholders})
            """,
            stock_ids,
        ).fetchall()
        # Rank the searched column first; other fields break near-ties.
        field_names = ["fbid", "component_symbol", "gene_symbols", "genotype", "property_syns"]
        if column == "component_symbol":
            field_names = ["component_symbol", "gene_symbols", "fbid", "property_syns", "genotype"]
        return _rank_direct_rows(
            query,
            fuzzy_rows,
            field_names=field_names,
            limit=limit,
            key_fn=_component_result_key,
        )
    finally:
        if close_conn:
            conn.close()
|
|
1300
|
+
|
|
1301
|
+
|
|
1302
|
+
def _fetch_component_domain_rows(
    conn: sqlite3.Connection,
    query: str,
    limit: int,
    *,
    cte_sql: str,
    cte_params: list[Any],
) -> list[sqlite3.Row]:
    """Fetch component rows restricted by a caller-supplied matching CTE.

    *cte_sql* must define a ``matching_rows(bdsc_symbol_id)`` CTE; it is
    spliced into the statement verbatim, so it must be trusted SQL with
    placeholders bound via *cte_params*.  When the CTE matches nothing, falls
    back to rows for FTS-derived candidate stocks (unranked; callers are
    expected to rank them).
    """
    rows = conn.execute(
        f"""
        {cte_sql}
        SELECT
            cc.stknum,
            cc.genotype,
            cc.component_symbol,
            cc.fbid,
            cc.mapstatement,
            {_component_metadata_subqueries("cc.stknum", "cc.component_symbol", "sg0.bdsc_symbol_id")}
        FROM component_comments cc
        JOIN stockgenes sg0
            ON sg0.stknum = cc.stknum
            AND sg0.component_symbol = cc.component_symbol
        JOIN matching_rows mr
            ON mr.bdsc_symbol_id = sg0.bdsc_symbol_id
        GROUP BY
            cc.stknum,
            cc.genotype,
            cc.component_symbol,
            cc.fbid,
            cc.mapstatement,
            sg0.bdsc_symbol_id
        ORDER BY cc.stknum, cc.component_symbol
        LIMIT ?
        """,
        (*cte_params, limit),
    ).fetchall()
    if rows:
        return rows

    # Fuzzy fallback: same projection, restricted to candidate stocks.
    stock_ids = _candidate_stock_ids_for_query(conn, query, max(limit * 4, 40))
    if not stock_ids:
        return []
    placeholders = ", ".join("?" for _ in stock_ids)
    return conn.execute(
        f"""
        SELECT
            cc.stknum,
            cc.genotype,
            cc.component_symbol,
            cc.fbid,
            cc.mapstatement,
            {_component_metadata_subqueries("cc.stknum", "cc.component_symbol", "sg0.bdsc_symbol_id")}
        FROM component_comments cc
        JOIN stockgenes sg0
            ON sg0.stknum = cc.stknum
            AND sg0.component_symbol = cc.component_symbol
        WHERE cc.stknum IN ({placeholders})
        GROUP BY
            cc.stknum,
            cc.genotype,
            cc.component_symbol,
            cc.fbid,
            cc.mapstatement,
            sg0.bdsc_symbol_id
        """,
        stock_ids,
    ).fetchall()
|
|
1369
|
+
|
|
1370
|
+
def _search_component_domain(
    state_dir: Path,
    query: str,
    limit: int,
    *,
    cte_sql: str,
    cte_params: list[Any],
    field_names: list[str],
) -> list[dict[str, Any]]:
    """Run a CTE-scoped component search and rank the rows against *query*.

    *cte_sql* / *cte_params* define the matching_rows CTE consumed by
    _fetch_component_domain_rows; *field_names* drive the relevance ranking.
    """
    cleaned = query.strip()
    if not cleaned:
        return []

    conn = _connect(state_dir)
    try:
        domain_rows = _fetch_component_domain_rows(
            conn,
            cleaned,
            limit,
            cte_sql=cte_sql,
            cte_params=cte_params,
        )
        return _rank_direct_rows(
            cleaned,
            domain_rows,
            field_names=field_names,
            limit=limit,
            key_fn=_component_result_key,
        )
    finally:
        conn.close()
|
|
1401
|
+
|
|
1402
|
+
|
|
1403
|
+
def search_property(state_dir: Path, query: str, limit: int = 20) -> list[dict[str, Any]]:
    """Find component records whose property synonym or description matches *query*.

    Synonyms match exactly or by prefix; descriptions match by substring.
    """
    trimmed = query.strip()
    cte = """
        WITH matching_rows AS (
            SELECT DISTINCT bdsc_symbol_id
            FROM compprops
            WHERE LOWER(prop_syn) = LOWER(?)
               OR LOWER(prop_syn) LIKE LOWER(?)
               OR LOWER(property_descrip) LIKE LOWER(?)
        )
    """
    params = [trimmed, f"{trimmed}%", f"%{trimmed}%"]
    return _search_component_domain(
        state_dir,
        trimmed,
        limit,
        cte_sql=cte,
        cte_params=params,
        field_names=["property_syns", "property_descriptions", "component_symbol", "gene_symbols"],
    )
|
|
1421
|
+
|
|
1422
|
+
|
|
1423
|
+
def search_property_exact(state_dir: Path, query: str, limit: int = 20) -> list[dict[str, Any]]:
    """Find component records whose property synonym or description equals *query* exactly (case-insensitive)."""
    trimmed = query.strip()
    cte = """
        WITH matching_rows AS (
            SELECT DISTINCT bdsc_symbol_id
            FROM compprops
            WHERE LOWER(prop_syn) = LOWER(?)
               OR LOWER(property_descrip) = LOWER(?)
        )
    """
    return _search_component_domain(
        state_dir,
        trimmed,
        limit,
        cte_sql=cte,
        cte_params=[trimmed, trimmed],
        field_names=["property_syns", "property_descriptions", "component_symbol", "gene_symbols"],
    )
|
|
1440
|
+
|
|
1441
|
+
|
|
1442
|
+
def search_driver_family(state_dir: Path, query: str, limit: int = 20) -> list[dict[str, Any]]:
    """Find component records matching a driver-family query via its alias tokens.

    The query is normalized to a family's alias tuple, then matched against
    component symbols, gene symbols, and property synonyms.
    """
    trimmed = query.strip()
    _, tokens = normalize_driver_family(trimmed)
    clause, params = _driver_family_clause(
        tokens,
        "cc.component_symbol",
        "sg.gene_symbol",
        "cp.prop_syn",
    )
    cte = f"""
        WITH matching_rows AS (
            SELECT DISTINCT sg.bdsc_symbol_id
            FROM stockgenes sg
            JOIN component_comments cc
                ON cc.stknum = sg.stknum
                AND cc.component_symbol = sg.component_symbol
            LEFT JOIN compprops cp
                ON cp.bdsc_symbol_id = sg.bdsc_symbol_id
            WHERE {clause}
        )
    """
    return _search_component_domain(
        state_dir,
        trimmed,
        limit,
        cte_sql=cte,
        cte_params=params,
        field_names=["component_symbol", "property_syns", "gene_symbols"],
    )
|
|
1470
|
+
|
|
1471
|
+
|
|
1472
|
+
def search_relationship(state_dir: Path, query: str, limit: int = 20) -> list[dict[str, Any]]:
    """Find component records by gene-relationship synonym (exact or prefix match)."""
    trimmed = query.strip()
    cte = """
        WITH matching_rows AS (
            SELECT DISTINCT bdsc_symbol_id
            FROM compgenes
            WHERE LOWER(prop_syn) = LOWER(?)
               OR LOWER(prop_syn) LIKE LOWER(?)
        )
    """
    return _search_component_domain(
        state_dir,
        trimmed,
        limit,
        cte_sql=cte,
        cte_params=[trimmed, f"{trimmed}%"],
        field_names=["gene_relationships", "gene_symbols", "component_symbol", "property_syns"],
    )
|
|
1489
|
+
|
|
1490
|
+
|
|
1491
|
+
def resolve_rrid_to_stknum(query: str) -> int | None:
    """Resolve an RRID or bare stock number string to an integer stock number.

    Accepts ``RRID:BDSC_12345``, ``BDSC_12345`` (case-insensitive), or a bare
    digit string; returns None when *query* matches none of those forms.
    """
    # Strip once instead of re-stripping on every branch.
    value = query.strip()
    match = re.fullmatch(r"(?:RRID:)?BDSC_(\d+)", value, flags=re.IGNORECASE)
    if match:
        return int(match.group(1))
    if value.isdigit():
        return int(value)
    return None
|
|
1498
|
+
|
|
1499
|
+
|
|
1500
|
+
def get_stock_by_rrid(state_dir: Path, query: str) -> dict[str, Any] | None:
    """Look up a stock by RRID / stock-number string; None when unresolvable."""
    stknum = resolve_rrid_to_stknum(query)
    return None if stknum is None else get_stock(state_dir, stknum)
|
|
1505
|
+
|
|
1506
|
+
|
|
1507
|
+
def search_fbid(state_dir: Path, query: str, limit: int = 20) -> list[dict[str, Any]]:
    """Search component records by their ``fbid`` column (FB.. identifiers)."""
    return _search_component_table(
        state_dir,
        column="fbid",
        query=query,
        limit=limit,
    )
|
|
1509
|
+
|
|
1510
|
+
|
|
1511
|
+
def search_component(state_dir: Path, query: str, limit: int = 20) -> list[dict[str, Any]]:
    """Search component records by their ``component_symbol`` column."""
    return _search_component_table(
        state_dir, column="component_symbol", query=query, limit=limit
    )
|
|
1518
|
+
|
|
1519
|
+
|
|
1520
|
+
def detect_query_kind(query: str) -> str:
    """Classify *query* into stock / rrid / gene / fbid / component / search.

    Check order matters: bare digits are stock numbers, RRID forms come next,
    then FlyBase identifiers, transgene-construct punctuation, and finally
    free text versus a gene-symbol guess.
    """
    value = query.strip()
    if not value:
        return "search"
    if value.isdigit():
        return "stock"
    # Digit-only queries were handled above, so any remaining RRID resolution
    # is the BDSC_ form (the old `and not value.isdigit()` guard was dead code).
    if resolve_rrid_to_stknum(value) is not None:
        return "rrid"
    if re.fullmatch(r"FBgn\d+", value, flags=re.IGNORECASE):
        return "gene"
    if re.fullmatch(r"FB[a-z]{2}\d+", value, flags=re.IGNORECASE):
        return "fbid"
    # Punctuation typical of transgene constructs / balancer notation.
    if any(token in value for token in ("P{", "}", "[", "]", "attP", "CyO")):
        return "component"
    if _is_free_text_query(value):
        return "search"
    return "gene"
|
|
1537
|
+
|
|
1538
|
+
|
|
1539
|
+
def _prefix_match_clause(expr: str, query: str) -> tuple[str, list[Any]]:
|
|
1540
|
+
return f"(LOWER({expr}) = LOWER(?) OR LOWER({expr}) LIKE LOWER(?))", [query, f"{query}%"]
|
|
1541
|
+
|
|
1542
|
+
|
|
1543
|
+
def _contains_match_clause(expr: str, query: str) -> tuple[str, list[Any]]:
|
|
1544
|
+
return f"LOWER({expr}) LIKE LOWER(?)", [f"%{query}%"]
|
|
1545
|
+
|
|
1546
|
+
|
|
1547
|
+
def _search_text_match_clause(expr: str, query: str) -> tuple[str, list[Any]]:
    """AND-of-LIKE predicate requiring every token of *query* to appear in *expr*."""
    tokens = _query_tokens(query)
    if len(tokens) <= 1:
        # Zero or one token: a plain substring test is equivalent.
        return _contains_match_clause(expr, query)
    predicate = f"LOWER({expr}) LIKE LOWER(?)"
    clause = " AND ".join([predicate] * len(tokens))
    params = [f"%{token}%" for token in tokens]
    return clause, params
|
|
1555
|
+
|
|
1556
|
+
|
|
1557
|
+
def _exact_match_clause(expr: str, query: str) -> tuple[str, list[Any]]:
|
|
1558
|
+
return f"LOWER({expr}) = LOWER(?)", [query]
|
|
1559
|
+
|
|
1560
|
+
|
|
1561
|
+
def _property_match_clause(query: str, *, exact: bool) -> tuple[str, list[Any]]:
    """OR of prop_syn and property_descrip predicates.

    With exact=True both columns must equal the query; otherwise synonyms
    prefix-match and descriptions substring-match.
    """
    if exact:
        syn_clause, syn_params = _exact_match_clause("cp.prop_syn", query)
        desc_clause, desc_params = _exact_match_clause("cp.property_descrip", query)
    else:
        syn_clause, syn_params = _prefix_match_clause("cp.prop_syn", query)
        desc_clause, desc_params = _contains_match_clause("cp.property_descrip", query)
    return f"({syn_clause} OR {desc_clause})", syn_params + desc_params
|
|
1576
|
+
|
|
1577
|
+
|
|
1578
|
+
def _gene_match_clause(fbgn_expr: str, gene_expr: str, query: str) -> tuple[str, list[Any]]:
|
|
1579
|
+
if query.upper().startswith("FBGN"):
|
|
1580
|
+
return f"UPPER({fbgn_expr}) = UPPER(?)", [query]
|
|
1581
|
+
clause, params = _prefix_match_clause(gene_expr, query)
|
|
1582
|
+
return clause, params
|
|
1583
|
+
|
|
1584
|
+
|
|
1585
|
+
def normalize_driver_family(query: str) -> tuple[str, tuple[str, ...]]:
    """Map *query* to a canonical driver family name plus its alias tuple.

    Unknown families fall through to the normalized query itself as the
    single search token.
    """
    needle = query.strip().lower()
    for family_name, alias_tokens in DRIVER_FAMILY_ALIASES.items():
        matched = needle == family_name or needle in alias_tokens
        if matched:
            return family_name, alias_tokens
    return needle, (needle,)
|
|
1591
|
+
|
|
1592
|
+
|
|
1593
|
+
def _driver_family_clause(tokens: tuple[str, ...], *exprs: str) -> tuple[str, list[Any]]:
|
|
1594
|
+
predicates: list[str] = []
|
|
1595
|
+
params: list[Any] = []
|
|
1596
|
+
for expr in exprs:
|
|
1597
|
+
for token in tokens:
|
|
1598
|
+
if token == "lexa":
|
|
1599
|
+
lowered_expr = f"LOWER({expr})"
|
|
1600
|
+
predicates.append(
|
|
1601
|
+
f"(({lowered_expr} GLOB ? OR {lowered_expr} GLOB ?) "
|
|
1602
|
+
f"AND NOT ({lowered_expr} GLOB ? OR {lowered_expr} GLOB ?))"
|
|
1603
|
+
)
|
|
1604
|
+
params.extend(
|
|
1605
|
+
("*[^a-z0-9]lexa*", "lexa*", "*[^a-z0-9]lexaop*", "lexaop*")
|
|
1606
|
+
)
|
|
1607
|
+
continue
|
|
1608
|
+
predicates.append(f"LOWER({expr}) LIKE LOWER(?)")
|
|
1609
|
+
params.append(f"%{token}%")
|
|
1610
|
+
return "(" + " OR ".join(predicates) + ")", params
|
|
1611
|
+
|
|
1612
|
+
|
|
1613
|
+
def _driver_family_criterion(dataset: str, query: str) -> tuple[str, list[Any], str]:
    """Build a ``(clause, params, "driver-family")`` filter for a dataset.

    Each dataset branch matches the family's alias tokens against component,
    gene, and property columns from its own anchor table alias: ``stocks``
    anchors on ``s``, ``components`` on ``cc``, ``genes`` on ``sg``; any other
    dataset falls through to the final ``cc``-anchored form.
    """
    _, tokens = normalize_driver_family(query)

    if dataset == "stocks":
        clause, params = _driver_family_clause(
            tokens,
            "cc.component_symbol",
            "cc.genotype",
            "sg.gene_symbol",
            "cp.prop_syn",
        )
        return (
            "EXISTS ("
            "SELECT 1 FROM component_comments cc "
            "LEFT JOIN stockgenes sg ON sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol "
            "LEFT JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id "
            f"WHERE cc.stknum = s.stknum AND {clause}"
            ")",
            params,
            "driver-family",
        )

    if dataset == "components":
        component_clause, component_params = _driver_family_clause(
            tokens,
            "cc.component_symbol",
        )
        gene_clause, gene_params = _driver_family_clause(tokens, "sg.gene_symbol")
        property_clause, property_params = _driver_family_clause(tokens, "cp.prop_syn")
        return (
            "("
            f"{component_clause} OR EXISTS ("
            "SELECT 1 FROM stockgenes sg "
            "LEFT JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id "
            "WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol "
            f"AND ({gene_clause} OR {property_clause})"
            ")"
            ")",
            component_params + gene_params + property_params,
            "driver-family",
        )

    if dataset == "genes":
        component_clause, component_params = _driver_family_clause(
            tokens,
            "sg.component_symbol",
            "sg.gene_symbol",
        )
        property_clause, property_params = _driver_family_clause(tokens, "cp.prop_syn")
        return (
            "("
            f"{component_clause} OR EXISTS ("
            "SELECT 1 FROM compprops cp "
            "WHERE cp.bdsc_symbol_id = sg.bdsc_symbol_id "
            f"AND {property_clause}"
            ")"
            ")",
            component_params + property_params,
            "driver-family",
        )

    # Fallback for remaining datasets: anchored on component_comments (cc),
    # with gene symbols checked through an aliased stockgenes join (sg2).
    component_clause, component_params = _driver_family_clause(
        tokens,
        "cc.component_symbol",
        "cp.prop_syn",
    )
    gene_clause, gene_params = _driver_family_clause(tokens, "sg2.gene_symbol")
    return (
        "("
        f"{component_clause} OR EXISTS ("
        "SELECT 1 FROM stockgenes sg2 "
        "WHERE sg2.stknum = cc.stknum AND sg2.component_symbol = cc.component_symbol "
        f"AND {gene_clause}"
        ")"
        ")",
        component_params + gene_params,
        "driver-family",
    )
|
|
1691
|
+
|
|
1692
|
+
|
|
1693
|
+
def _single_criterion(
    dataset: str,
    query: str,
    kind: str,
) -> tuple[str, list[Any], str | None]:
    """Translate one (kind, query) filter into a SQL predicate for *dataset*.

    Returns ``(predicate_sql, bind_params, resolved_kind)``.  When ``kind``
    is ``"auto"`` the query text is classified via ``detect_query_kind``.
    The predicate text references the table alias of the target dataset:
    ``s`` (stocks), ``cc`` (component_comments), ``sg`` (stockgenes),
    or the properties join (``cc``/``cp``).
    """
    resolved_kind = detect_query_kind(query) if kind == "auto" else kind
    params: list[Any] = []

    if resolved_kind == "stock":
        # Direct stock-number match; every dataset exposes a stknum column.
        clause = {
            "stocks": "s.stknum = ?",
            "components": "cc.stknum = ?",
            "genes": "sg.stknum = ?",
            "properties": "cc.stknum = ?",
        }[dataset]
        params.append(int(query.strip()))
        return clause, params, resolved_kind

    if resolved_kind == "rrid":
        stknum = resolve_rrid_to_stknum(query)
        if stknum is None:
            # Unresolvable RRID: "0" is an always-false SQL predicate.
            return "0", [], resolved_kind
        clause = {
            "stocks": "s.stknum = ?",
            "components": "cc.stknum = ?",
            "genes": "sg.stknum = ?",
            "properties": "cc.stknum = ?",
        }[dataset]
        params.append(stknum)
        return clause, params, resolved_kind

    if resolved_kind == "gene":
        if dataset == "stocks":
            clause, params = _gene_match_clause("sg.fbgn", "sg.gene_symbol", query)
            return (
                f"EXISTS (SELECT 1 FROM stockgenes sg WHERE sg.stknum = s.stknum AND {clause})",
                params,
                resolved_kind,
            )
        if dataset == "components":
            clause, params = _gene_match_clause("sg.fbgn", "sg.gene_symbol", query)
            return (
                f"EXISTS (SELECT 1 FROM stockgenes sg WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol AND {clause})",
                params,
                resolved_kind,
            )
        if dataset == "genes":
            # genes dataset already selects from stockgenes sg; match in place.
            clause, params = _gene_match_clause("sg.fbgn", "sg.gene_symbol", query)
            return clause, params, resolved_kind
        # properties dataset: use a second stockgenes alias to avoid clashing
        # with the sg alias already joined in the outer query.
        clause, params = _gene_match_clause("sg2.fbgn", "sg2.gene_symbol", query)
        return (
            f"EXISTS (SELECT 1 FROM stockgenes sg2 WHERE sg2.stknum = cc.stknum AND sg2.component_symbol = cc.component_symbol AND {clause})",
            params,
            resolved_kind,
        )

    if resolved_kind == "component":
        clause, params = _prefix_match_clause(
            {"stocks": "sg.component_symbol", "components": "cc.component_symbol", "genes": "sg.component_symbol", "properties": "cc.component_symbol"}[dataset],
            query,
        )
        if dataset == "stocks":
            return (
                f"EXISTS (SELECT 1 FROM stockgenes sg WHERE sg.stknum = s.stknum AND {clause})",
                params,
                resolved_kind,
            )
        return clause, params, resolved_kind

    if resolved_kind == "fbid":
        # NOTE(review): every branch of this expression yields "cc.fbid";
        # the dict lookup is redundant but harmless.
        clause, params = _prefix_match_clause(
            {"components": "cc.fbid", "properties": "cc.fbid"}[dataset]
            if dataset in {"components", "properties"}
            else "cc.fbid",
            query,
        )
        if dataset == "stocks":
            return (
                f"EXISTS (SELECT 1 FROM component_comments cc WHERE cc.stknum = s.stknum AND {clause})",
                params,
                resolved_kind,
            )
        if dataset == "genes":
            return (
                f"EXISTS (SELECT 1 FROM component_comments cc WHERE cc.stknum = sg.stknum AND cc.component_symbol = sg.component_symbol AND {clause})",
                params,
                resolved_kind,
            )
        return clause, params, resolved_kind

    if resolved_kind == "property":
        # Substring-style property match (exact=False); structure mirrors the
        # "property-exact" branch below.
        clause, params = _property_match_clause(query, exact=False)
        if dataset == "stocks":
            return (
                "EXISTS ("
                "SELECT 1 FROM stockgenes sg "
                "JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id "
                f"WHERE sg.stknum = s.stknum AND {clause}"
                ")",
                params,
                resolved_kind,
            )
        if dataset == "components":
            return (
                "EXISTS ("
                "SELECT 1 FROM stockgenes sg "
                "JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id "
                "WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol "
                f"AND {clause}"
                ")",
                params,
                resolved_kind,
            )
        if dataset == "genes":
            return (
                f"EXISTS (SELECT 1 FROM compprops cp WHERE cp.bdsc_symbol_id = sg.bdsc_symbol_id AND {clause})",
                params,
                resolved_kind,
            )
        return clause, params, resolved_kind

    if resolved_kind == "property-exact":
        # Same dataset dispatch as "property" but with exact matching.
        clause, params = _property_match_clause(query, exact=True)
        if dataset == "stocks":
            return (
                "EXISTS ("
                "SELECT 1 FROM stockgenes sg "
                "JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id "
                f"WHERE sg.stknum = s.stknum AND {clause}"
                ")",
                params,
                resolved_kind,
            )
        if dataset == "components":
            return (
                "EXISTS ("
                "SELECT 1 FROM stockgenes sg "
                "JOIN compprops cp ON cp.bdsc_symbol_id = sg.bdsc_symbol_id "
                "WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol "
                f"AND {clause}"
                ")",
                params,
                resolved_kind,
            )
        if dataset == "genes":
            return (
                f"EXISTS (SELECT 1 FROM compprops cp WHERE cp.bdsc_symbol_id = sg.bdsc_symbol_id AND {clause})",
                params,
                resolved_kind,
            )
        return clause, params, resolved_kind

    if resolved_kind == "driver-family":
        # Delegated: driver-family predicates need per-dataset OR logic.
        return _driver_family_criterion(dataset, query)

    if resolved_kind == "relationship":
        # Relationship terms live in compgenes.prop_syn.
        if dataset == "stocks":
            clause, params = _prefix_match_clause("cg.prop_syn", query)
            return (
                "EXISTS ("
                "SELECT 1 FROM stockgenes sg "
                "JOIN compgenes cg ON cg.bdsc_symbol_id = sg.bdsc_symbol_id "
                f"WHERE sg.stknum = s.stknum AND {clause}"
                ")",
                params,
                resolved_kind,
            )
        if dataset == "components":
            clause, params = _prefix_match_clause("cg.prop_syn", query)
            return (
                "EXISTS ("
                "SELECT 1 FROM stockgenes sg "
                "JOIN compgenes cg ON cg.bdsc_symbol_id = sg.bdsc_symbol_id "
                "WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol "
                f"AND {clause}"
                ")",
                params,
                resolved_kind,
            )
        if dataset == "genes":
            clause, params = _prefix_match_clause("cg.prop_syn", query)
            return (
                f"EXISTS (SELECT 1 FROM compgenes cg WHERE cg.bdsc_symbol_id = sg.bdsc_symbol_id AND cg.bdsc_gene_id = sg.bdsc_gene_id AND {clause})",
                params,
                resolved_kind,
            )
        clause, params = _prefix_match_clause("cg.prop_syn", query)
        return (
            "EXISTS ("
            "SELECT 1 FROM stockgenes sg2 "
            "JOIN compgenes cg ON cg.bdsc_symbol_id = sg2.bdsc_symbol_id "
            "WHERE sg2.stknum = cc.stknum AND sg2.component_symbol = cc.component_symbol "
            f"AND {clause}"
            ")",
            params,
            resolved_kind,
        )

    if resolved_kind == "search":
        # Free-text search against the precomputed search_documents table.
        if dataset == "stocks":
            clause, params = _search_text_match_clause("sd.search_text", query)
            return clause, params, resolved_kind
        if dataset == "components":
            clause, params = _search_text_match_clause("sd.search_text", query)
            return (
                "EXISTS (SELECT 1 FROM search_documents sd "
                f"WHERE sd.stknum = cc.stknum AND {clause})",
                params,
                resolved_kind,
            )
        if dataset == "genes":
            clause, params = _search_text_match_clause("sd.search_text", query)
            return (
                "EXISTS (SELECT 1 FROM search_documents sd "
                f"WHERE sd.stknum = sg.stknum AND {clause})",
                params,
                resolved_kind,
            )
        clause, params = _search_text_match_clause("sd.search_text", query)
        return (
            "EXISTS (SELECT 1 FROM search_documents sd "
            f"WHERE sd.stknum = cc.stknum AND {clause})",
            params,
            resolved_kind,
        )

    # NOTE(review): message reports the caller-supplied kind; when kind was
    # "auto" the unhandled *resolved* kind is what actually failed — confirm.
    raise ValueError(f"unsupported export filter kind: {kind}")
|
|
1921
|
+
|
|
1922
|
+
def _normalize_criteria(
    criteria: list[QueryCriterion] | None,
    query: str | None,
    kind: str,
) -> list[QueryCriterion]:
    """Strip whitespace from every criterion, drop blanks, and append the
    optional ad-hoc (query, kind) pair as a trailing criterion."""
    merged: list[QueryCriterion] = []
    for item in criteria or []:
        text = item.query.strip()
        if text:
            merged.append(QueryCriterion(kind=item.kind, query=text))
    extra = (query or "").strip()
    if extra:
        merged.append(QueryCriterion(kind=kind, query=extra))
    return merged
|
|
1935
|
+
|
|
1936
|
+
|
|
1937
|
+
def _compose_where_clause(
    dataset: str,
    criteria: list[QueryCriterion] | None,
    *,
    query: str | None = None,
    kind: str = "auto",
) -> tuple[str, list[Any]]:
    """Build a WHERE clause that ANDs one predicate per normalized criterion.

    Returns ``("", [])`` when no usable criteria remain after normalization.
    """
    normalized = _normalize_criteria(criteria, query, kind)
    if not normalized:
        return "", []

    clauses: list[str] = []
    bound: list[Any] = []
    for criterion in normalized:
        sql, sql_params, _resolved = _single_criterion(dataset, criterion.query, criterion.kind)
        clauses.append(f"({sql})")
        bound.extend(sql_params)
    return "WHERE " + " AND ".join(clauses), bound
|
|
1959
|
+
|
|
1960
|
+
|
|
1961
|
+
def lookup_query(
    state_dir: Path,
    query: str,
    *,
    kind: str = "auto",
    limit: int = 20,
) -> dict[str, Any]:
    """Run a local lookup, dispatching on the (possibly auto-detected) kind.

    Returns a dict with the original query, the requested and resolved
    kinds, the result count, and the result rows.  Raises ValueError for
    an unrecognized kind.
    """
    requested_kind = kind
    resolved_kind = detect_query_kind(query) if kind == "auto" else kind

    if resolved_kind == "stock":
        result = get_stock(state_dir, int(query.strip()))
        results = [result] if result else []
    elif resolved_kind == "rrid":
        result = get_stock_by_rrid(state_dir, query)
        results = [result] if result else []
    elif resolved_kind == "gene":
        results = search_gene(state_dir, query, limit=limit)
        # Auto-detected gene queries with no hits fall back to full-text
        # search so a plausible-looking symbol still returns something.
        if kind == "auto" and not results:
            resolved_kind = "search"
            results = search_local(state_dir, query, limit=limit)
    elif resolved_kind == "fbid":
        results = search_fbid(state_dir, query, limit=limit)
    elif resolved_kind == "component":
        results = search_component(state_dir, query, limit=limit)
    elif resolved_kind == "property":
        results = search_property(state_dir, query, limit=limit)
    elif resolved_kind == "property-exact":
        results = search_property_exact(state_dir, query, limit=limit)
    elif resolved_kind == "driver-family":
        results = search_driver_family(state_dir, query, limit=limit)
    elif resolved_kind == "relationship":
        results = search_relationship(state_dir, query, limit=limit)
    elif resolved_kind == "search":
        results = search_local(state_dir, query, limit=limit)
    else:
        raise ValueError(f"unsupported lookup kind: {kind}")

    return {
        "query": query,
        "requested_kind": requested_kind,
        "kind": resolved_kind,
        "result_count": len(results),
        "results": results,
    }
|
|
2006
|
+
|
|
2007
|
+
|
|
2008
|
+
def get_stock(state_dir: Path, stknum: int) -> dict[str, Any] | None:
    """Fetch one stock with its components and genes from the local DB.

    Returns None when *stknum* is unknown; otherwise a dict with the stock
    row, a synthesized RRID, and nested "components"/"genes" lists.
    """
    conn = _connect(state_dir)
    try:
        stock_row = conn.execute(
            """
            SELECT
                s.stknum,
                s.genotype,
                s.chromosomes,
                s.aka,
                s.date_added,
                s.donor_info,
                s.stock_comments,
                sd.component_symbols,
                sd.gene_symbols,
                sd.fbgns
            FROM stocks s
            LEFT JOIN search_documents sd ON sd.stknum = s.stknum
            WHERE s.stknum = ?
            """,
            (stknum,),
        ).fetchone()
        if stock_row is None:
            return None

        # Per-component rows, enriched with metadata subqueries keyed on the
        # smallest bdsc_symbol_id matching this (stknum, component) pair.
        component_rows = conn.execute(
            f"""
            SELECT
                component_symbol,
                fbid,
                mapstatement,
                comment1,
                comment2,
                comment3,
                {_component_metadata_subqueries(
                    "component_comments.stknum",
                    "component_comments.component_symbol",
                    "(SELECT MIN(sg.bdsc_symbol_id) FROM stockgenes sg WHERE sg.stknum = component_comments.stknum AND sg.component_symbol = component_comments.component_symbol)",
                )}
            FROM component_comments
            WHERE stknum = ?
            ORDER BY component_symbol
            """,
            (stknum,),
        ).fetchall()

        gene_rows = conn.execute(
            """
            SELECT DISTINCT
                component_symbol,
                gene_symbol,
                fbgn
            FROM stockgenes
            WHERE stknum = ?
            ORDER BY component_symbol, gene_symbol, fbgn
            """,
            (stknum,),
        ).fetchall()

        stock = dict(stock_row)
        # RRID is derivable from the stock number, so it is synthesized here
        # rather than stored.
        stock["rrid"] = f"RRID:BDSC_{stknum}"
        stock["components"] = [dict(row) for row in component_rows]
        stock["genes"] = [dict(row) for row in gene_rows]
        return stock
    finally:
        conn.close()
|
|
2074
|
+
|
|
2075
|
+
|
|
2076
|
+
def live_search(query: str, limit: int = 10) -> list[dict[str, Any]]:
    """Query the live BDSC website search endpoints.

    Tries the simple "contains" presearch first; when that returns no
    rows, falls back to the advanced genotype search with the same query.
    Returns at most *limit* result rows (site-defined dicts).

    Fix: both requests now carry an explicit timeout so a stalled server
    cannot hang the CLI indefinitely.
    """
    # Shared AJAX headers for both endpoints.
    headers = {
        "User-Agent": USER_AGENT,
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "X-Requested-With": "XMLHttpRequest",
    }

    simple_payload = parse.urlencode({"presearch": query, "type": "contains"}).encode("utf-8")
    req = request.Request(
        "https://bdsc.indiana.edu/Home/GetSearchResults",
        data=simple_payload,
        headers=headers,
        method="POST",
    )
    with request.urlopen(req, timeout=30) as response:
        data = json.loads(response.read().decode("utf-8"))
    rows = data.get("Data") or []
    if rows:
        return rows[:limit]

    # Fallback: advanced search matching the query against any genotype field.
    advanced_payload = parse.urlencode(
        {
            "selectedGenotypeMatches": "any genotype",
            "selectedGenotypeContains1": "contains",
            "genotype1": query,
            "selectedGenotypeContains2": "contains",
            "genotype2": "",
            "selectedGenotypeContains3": "contains",
            "genotype3": "",
            "selectedCommentContains": "contains",
            "stockComment": "",
            "selectedDonorContains": "contains",
            "donor": "",
            "selectedAffectedChromosomes": "any",
        }
    ).encode("utf-8")
    advanced_req = request.Request(
        "https://bdsc.indiana.edu/Home/GetAdvancedSearchResults",
        data=advanced_payload,
        headers=headers,
        method="POST",
    )
    with request.urlopen(advanced_req, timeout=30) as response:
        advanced_data = json.loads(response.read().decode("utf-8"))
    return (advanced_data.get("Data") or [])[:limit]
|
|
2123
|
+
|
|
2124
|
+
|
|
2125
|
+
def get_status(state_dir: Path) -> dict[str, Any]:
    """Summarize local state: manifest datasets plus index/database info.

    If the manifest has no "index" entry but the database file exists,
    table counts and FTS/trigram availability are computed directly from
    the database (with built_at left as None).
    """
    state_dir = resolve_state_dir(state_dir)
    manifest = load_manifest(state_dir)
    datasets = manifest.get("datasets", {})
    db_path = db_file(state_dir)
    index_info = manifest.get("index")
    if index_info is None and db_path.exists():
        # Manifest predates index metadata; reconstruct the counts live.
        conn = sqlite3.connect(db_path)
        try:
            has_fts = bool(
                conn.execute(
                    "SELECT 1 FROM sqlite_master WHERE type='table' AND name='stock_fts'"
                ).fetchone()
            )
            has_trigram = bool(
                conn.execute(
                    "SELECT 1 FROM sqlite_master WHERE type='table' AND name='stock_trigram'"
                ).fetchone()
            )
            index_info = {
                "db_path": str(db_path),
                "built_at": None,
                "counts": {
                    "stocks": conn.execute("SELECT COUNT(*) FROM stocks").fetchone()[0],
                    "component_comments": conn.execute(
                        "SELECT COUNT(*) FROM component_comments"
                    ).fetchone()[0],
                    "stockgenes": conn.execute("SELECT COUNT(*) FROM stockgenes").fetchone()[0],
                    "compgenes": conn.execute("SELECT COUNT(*) FROM compgenes").fetchone()[0],
                    "compprops": conn.execute("SELECT COUNT(*) FROM compprops").fetchone()[0],
                    "fts_enabled": int(has_fts),
                    "trigram_enabled": int(has_trigram),
                },
            }
        finally:
            conn.close()
    return {
        "state_dir": str(state_dir),
        "db_path": str(db_path),
        "db_exists": db_path.exists(),
        "dataset_count": len(datasets),
        "datasets": datasets,
        "index": index_info,
        "updated_at": manifest.get("updated_at"),
    }
|
|
2170
|
+
|
|
2171
|
+
|
|
2172
|
+
def _dataset_sort_clause(dataset: str) -> str:
|
|
2173
|
+
if dataset == "stocks":
|
|
2174
|
+
return "ORDER BY s.stknum"
|
|
2175
|
+
if dataset == "components":
|
|
2176
|
+
return "ORDER BY cc.stknum, cc.component_symbol"
|
|
2177
|
+
if dataset == "genes":
|
|
2178
|
+
return "ORDER BY sg.stknum, sg.component_symbol, sg.gene_symbol, sg.fbgn"
|
|
2179
|
+
if dataset == "properties":
|
|
2180
|
+
return "ORDER BY cc.stknum, cc.component_symbol, cp.prop_syn, cp.property_id"
|
|
2181
|
+
raise ValueError(f"unsupported export dataset: {dataset}")
|
|
2182
|
+
|
|
2183
|
+
|
|
2184
|
+
def _dataset_select_sql(dataset: str) -> str:
    """Return the base SELECT statement for an export dataset.

    The caller appends its own WHERE/ORDER BY/LIMIT text, so each
    statement here ends without a trailing clause.  Raises ValueError
    for an unknown dataset.
    """
    if dataset == "stocks":
        return """
            SELECT
                s.stknum,
                'RRID:BDSC_' || s.stknum AS rrid,
                s.genotype,
                s.chromosomes,
                s.aka,
                s.date_added,
                s.donor_info,
                s.stock_comments,
                COALESCE(sd.component_symbols, '') AS component_symbols,
                COALESCE(sd.gene_symbols, '') AS gene_symbols,
                COALESCE(sd.fbgns, '') AS fbgns
            FROM stocks s
            LEFT JOIN search_documents sd ON sd.stknum = s.stknum
        """
    if dataset == "components":
        # Metadata columns come from a shared subquery builder, keyed on the
        # minimum bdsc_symbol_id for the (stknum, component_symbol) pair.
        return f"""
            SELECT
                cc.stknum,
                cc.genotype,
                cc.component_symbol,
                cc.fbid,
                cc.mapstatement,
                cc.comment1,
                cc.comment2,
                cc.comment3,
                {_component_metadata_subqueries(
                    "cc.stknum",
                    "cc.component_symbol",
                    "(SELECT MIN(sg.bdsc_symbol_id) FROM stockgenes sg WHERE sg.stknum = cc.stknum AND sg.component_symbol = cc.component_symbol)",
                )}
            FROM component_comments cc
        """
    if dataset == "genes":
        return """
            SELECT DISTINCT
                sg.stknum,
                sg.genotype,
                sg.component_symbol,
                cc.fbid,
                sg.gene_symbol,
                sg.fbgn,
                sg.bdsc_symbol_id,
                sg.bdsc_gene_id,
                COALESCE((
                    SELECT group_concat(prop_syn, ' | ')
                    FROM (
                        SELECT DISTINCT cg.prop_syn AS prop_syn
                        FROM compgenes cg
                        WHERE cg.bdsc_symbol_id = sg.bdsc_symbol_id
                        AND cg.bdsc_gene_id = sg.bdsc_gene_id
                        AND cg.prop_syn != ''
                        ORDER BY cg.prop_syn
                    )
                ), '') AS gene_relationships
            FROM stockgenes sg
            LEFT JOIN component_comments cc
                ON cc.stknum = sg.stknum
                AND cc.component_symbol = sg.component_symbol
        """
    if dataset == "properties":
        return """
            SELECT DISTINCT
                cc.stknum,
                cc.genotype,
                cc.component_symbol,
                cc.fbid,
                cp.property_id,
                cp.prop_syn,
                cp.property_descrip
            FROM component_comments cc
            JOIN stockgenes sg
                ON sg.stknum = cc.stknum
                AND sg.component_symbol = cc.component_symbol
            JOIN compprops cp
                ON cp.bdsc_symbol_id = sg.bdsc_symbol_id
        """
    raise ValueError(f"unsupported export dataset: {dataset}")
|
|
2265
|
+
|
|
2266
|
+
|
|
2267
|
+
def iter_dataset_rows(
    state_dir: Path,
    dataset: str,
    *,
    where_clause: str = "",
    params: tuple[Any, ...] = (),
    limit: int | None = None,
) -> Iterator[dict[str, Any]]:
    """Stream dataset rows as dicts, applying an optional WHERE and LIMIT.

    *where_clause* must be complete SQL (including the "WHERE" keyword)
    whose placeholders line up with *params*.  Rows are fetched in batches
    of 1000 so large exports never materialize in memory; the connection
    is closed when the generator is exhausted or closed.
    """
    if dataset not in EXPORT_DATASETS:
        raise ValueError(f"unsupported export dataset: {dataset}")

    conn = _connect(state_dir)
    try:
        sql = _dataset_select_sql(dataset)
        if where_clause:
            sql += f"\n{where_clause}"
        # Deterministic ordering keeps exports stable across runs.
        sql += f"\n{_dataset_sort_clause(dataset)}"

        if limit is not None:
            sql += "\nLIMIT ?"
            cursor = conn.execute(sql, (*params, limit))
        else:
            cursor = conn.execute(sql, params)

        columns = [description[0] for description in cursor.description]
        try:
            while True:
                rows = cursor.fetchmany(1000)
                if not rows:
                    break
                for row in rows:
                    yield dict(zip(columns, row, strict=False))
        finally:
            cursor.close()
    finally:
        conn.close()
|
|
2303
|
+
|
|
2304
|
+
|
|
2305
|
+
def iter_export_rows(
    state_dir: Path,
    dataset: str,
    *,
    limit: int | None = None,
    criteria: list[QueryCriterion] | None = None,
    query: str | None = None,
    kind: str = "auto",
) -> Iterator[dict[str, Any]]:
    """Stream export rows for *dataset* filtered by the given criteria.

    Thin wrapper: composes a WHERE clause from the criteria (plus the
    optional ad-hoc query/kind pair) and delegates to iter_dataset_rows.
    """
    where_clause, params = _compose_where_clause(dataset, criteria, query=query, kind=kind)
    row_stream = iter_dataset_rows(
        state_dir,
        dataset,
        where_clause=where_clause,
        params=tuple(params),
        limit=limit,
    )
    yield from row_stream
|
|
2327
|
+
|
|
2328
|
+
|
|
2329
|
+
def _report_olfactory_where(dataset: str) -> str:
|
|
2330
|
+
component_clause = (
|
|
2331
|
+
"component_symbol GLOB '*Or[0-9]*' "
|
|
2332
|
+
"OR component_symbol GLOB '*Orco*' "
|
|
2333
|
+
"OR component_symbol GLOB '*Ir[0-9]*' "
|
|
2334
|
+
"OR component_symbol GLOB '*Obp[0-9]*'"
|
|
2335
|
+
)
|
|
2336
|
+
if dataset == "stocks":
|
|
2337
|
+
return (
|
|
2338
|
+
"WHERE EXISTS (SELECT 1 FROM component_comments cc "
|
|
2339
|
+
f"WHERE cc.stknum = s.stknum AND ({component_clause}))"
|
|
2340
|
+
)
|
|
2341
|
+
symbol = REPORT_DATASET_SYMBOLS.get(dataset)
|
|
2342
|
+
if symbol is None:
|
|
2343
|
+
raise ValueError(f"unsupported report dataset: {dataset}")
|
|
2344
|
+
return f"WHERE {component_clause.replace('component_symbol', f'{symbol}.component_symbol')}"
|
|
2345
|
+
|
|
2346
|
+
|
|
2347
|
+
def _report_row_key(dataset: str, row: dict[str, Any]) -> tuple[Any, ...]:
|
|
2348
|
+
if dataset == "stocks":
|
|
2349
|
+
return (row["stknum"],)
|
|
2350
|
+
if dataset == "components":
|
|
2351
|
+
return _component_result_key(row)
|
|
2352
|
+
if dataset == "genes":
|
|
2353
|
+
return _gene_result_key(row)
|
|
2354
|
+
if dataset == "properties":
|
|
2355
|
+
return (row["stknum"], row["component_symbol"], row["property_id"], row["prop_syn"])
|
|
2356
|
+
raise ValueError(f"unsupported report dataset: {dataset}")
|
|
2357
|
+
|
|
2358
|
+
|
|
2359
|
+
def _merge_report_rows(dataset: str, rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Drop duplicate rows, keeping the first occurrence of each dedupe key."""
    seen: set[tuple[Any, ...]] = set()
    unique: list[dict[str, Any]] = []
    for row in rows:
        key = _report_row_key(dataset, row)
        if key not in seen:
            seen.add(key)
            unique.append(row)
    return unique
|
|
2365
|
+
|
|
2366
|
+
|
|
2367
|
+
def iter_report_rows(
    state_dir: Path,
    report_name: str,
    *,
    dataset: str | None = None,
    limit: int | None = None,
) -> Iterator[dict[str, Any]]:
    """Stream rows for a named report, optionally overriding its dataset.

    The "olfactory" report is a direct filtered dataset scan; every other
    report is the deduplicated union of its spec's criteria groups.
    Raises ValueError for an unknown report name.
    """
    if report_name not in REPORT_NAMES:
        raise ValueError(f"unsupported report: {report_name}")
    spec = REPORT_SPECS[report_name]
    resolved_dataset = dataset or spec.default_dataset

    if report_name == "olfactory":
        yield from iter_dataset_rows(
            state_dir,
            resolved_dataset,
            where_clause=_report_olfactory_where(resolved_dataset),
            limit=limit,
        )
        return

    merged_rows: list[dict[str, Any]] = []
    for group in spec.groups:
        rows = list(
            iter_export_rows(
                state_dir,
                resolved_dataset,
                criteria=list(group),
                limit=limit,
            )
        )
        merged_rows.extend(rows)
        # Stop early once enough unique rows exist.  NOTE(review): this
        # re-deduplicates the accumulated rows after every group, which is
        # quadratic in the number of groups — fine for small reports.
        if limit is not None and len(_merge_report_rows(resolved_dataset, merged_rows)) >= limit:
            break

    deduped = _merge_report_rows(resolved_dataset, merged_rows)
    if limit is not None:
        deduped = deduped[:limit]
    for row in deduped:
        yield row
|
|
2407
|
+
|
|
2408
|
+
|
|
2409
|
+
def list_terms(
    state_dir: Path,
    scope: str,
    *,
    query: str | None = None,
    limit: int = 50,
) -> list[dict[str, Any]]:
    """List vocabulary terms with usage counts for a given scope.

    Scopes (validated against TERM_SCOPES): "properties" and
    "property-descriptions" read from compprops; any other valid scope
    reads relationship terms from compgenes.  *query* narrows the result:
    prefix match for term scopes, substring match for descriptions.
    """
    if scope not in TERM_SCOPES:
        raise ValueError(f"unsupported term scope: {scope}")

    query = (query or "").strip()
    conn = _connect(state_dir)
    try:
        if scope == "properties":
            sql = """
                SELECT
                    prop_syn AS term,
                    MIN(property_descrip) AS description,
                    COUNT(*) AS count
                FROM compprops
                WHERE prop_syn != ''
            """
            params: list[Any] = []
            if query:
                # Prefix match, case-insensitive.
                sql += " AND LOWER(prop_syn) LIKE LOWER(?)"
                params.append(f"{query}%")
            sql += """
                GROUP BY prop_syn
                ORDER BY count DESC, term
                LIMIT ?
            """
        elif scope == "property-descriptions":
            sql = """
                SELECT
                    property_descrip AS term,
                    MIN(prop_syn) AS synonym,
                    COUNT(*) AS count
                FROM compprops
                WHERE property_descrip != ''
            """
            params = []
            if query:
                # Substring match for free-form descriptions.
                sql += " AND LOWER(property_descrip) LIKE LOWER(?)"
                params.append(f"%{query}%")
            sql += """
                GROUP BY property_descrip
                ORDER BY count DESC, term
                LIMIT ?
            """
        else:
            # Remaining scope: relationship terms from compgenes.
            sql = """
                SELECT
                    prop_syn AS term,
                    COUNT(*) AS count
                FROM compgenes
                WHERE prop_syn != ''
            """
            params = []
            if query:
                sql += " AND LOWER(prop_syn) LIKE LOWER(?)"
                params.append(f"{query}%")
            sql += """
                GROUP BY prop_syn
                ORDER BY count DESC, term
                LIMIT ?
            """

        rows = conn.execute(sql, (*params, limit)).fetchall()
        return [dict(row) for row in rows]
    finally:
        conn.close()
|
|
2480
|
+
|
|
2481
|
+
|
|
2482
|
+
def format_sync_results(results: list[SyncResult]) -> str:
    """Render one "<name>: <status> <bytes>B <path>" line per sync result."""
    return "\n".join(
        f"{item.name}: {item.status} {item.bytes_downloaded}B {item.path}"
        for item in results
    )
|
|
2489
|
+
|
|
2490
|
+
|
|
2491
|
+
def format_search_results(results: list[dict[str, Any]]) -> str:
    """Render stock search rows as pipe-delimited lines.

    Accepts both local rows (lowercase keys) and live-site rows
    (capitalized keys).  Returns "no results" for an empty list.

    Fix: rows missing both "genotype" and "Genotype" previously put None
    into ``str.join`` and raised TypeError; the genotype is now coerced
    to a string ("" when absent).
    """
    if not results:
        return "no results"
    lines = []
    for row in results:
        stknum = row.get("stknum", row.get("Stknum"))
        genotype = row.get("genotype", row.get("Genotype"))
        bits = [str(stknum), "" if genotype is None else str(genotype)]
        genes = row.get("gene_symbols") or row.get("fbgns") or row.get("SearchText") or ""
        if genes:
            bits.append(f"genes={genes}")
        lines.append(" | ".join(bits))
    return "\n".join(lines)
|
|
2504
|
+
|
|
2505
|
+
|
|
2506
|
+
def format_gene_results(results: list[dict[str, Any]]) -> str:
    """Render gene search rows as "stknum | gene | fbgn | component | genotype"."""
    if not results:
        return "no results"
    rendered = []
    for row in results:
        fields = (
            str(row["stknum"]),
            row["gene_symbol"],
            row["fbgn"],
            row["component_symbol"],
            row["genotype"],
        )
        rendered.append(" | ".join(fields))
    return "\n".join(rendered)
|
|
2523
|
+
|
|
2524
|
+
|
|
2525
|
+
def format_component_results(results: list[dict[str, Any]]) -> str:
    """Render component rows as pipe-delimited lines with optional
    genes=/props=/rels= annotations before the genotype."""
    if not results:
        return "no results"
    output_lines = []
    for row in results:
        parts = [str(row["stknum"]), row["component_symbol"], row["fbid"]]
        annotations = (
            ("genes", row.get("gene_symbols") or row.get("fbgns") or ""),
            ("props", row.get("property_syns") or ""),
            ("rels", row.get("gene_relationships") or ""),
        )
        for label, value in annotations:
            if value:
                parts.append(f"{label}={value}")
        parts.append(row["genotype"])
        output_lines.append(" | ".join(parts))
    return "\n".join(output_lines)
|
|
2546
|
+
|
|
2547
|
+
|
|
2548
|
+
def format_property_results(results: list[dict[str, Any]]) -> str:
    """Render property-search rows, one pipe-delimited line per row.

    Each line lists stknum, component_symbol, fbid and prop_syn, adds the
    property description when present, and ends with the genotype.
    Returns ``"no results"`` for an empty list.
    """
    if not results:
        return "no results"
    rendered = []
    for entry in results:
        parts = [
            str(entry["stknum"]),
            entry["component_symbol"],
            entry["fbid"],
            entry["prop_syn"],
        ]
        description = entry.get("property_descrip")
        if description:
            parts.append(description)
        parts.append(entry["genotype"])
        rendered.append(" | ".join(parts))
    return "\n".join(rendered)
def format_dataset_results(dataset: str, results: list[dict[str, Any]]) -> str:
    """Dispatch *results* to the formatter matching *dataset*.

    Supported datasets: ``stocks``, ``components``, ``genes``,
    ``properties``. Raises ValueError for anything else.
    """
    if dataset == "stocks":
        formatter = format_search_results
    elif dataset == "components":
        formatter = format_component_results
    elif dataset == "genes":
        formatter = format_gene_results
    elif dataset == "properties":
        formatter = format_property_results
    else:
        raise ValueError(f"unsupported dataset formatter: {dataset}")
    return formatter(results)
def format_term_results(results: list[dict[str, Any]]) -> str:
    """Render term-count rows, one pipe-delimited line per term.

    Each line shows the term and its count, with optional description and
    ``synonym=`` fields when present. Returns ``"no results"`` for an
    empty list.
    """
    if not results:
        return "no results"
    rendered = []
    for entry in results:
        parts = [entry["term"], f"count={entry['count']}"]
        description = entry.get("description")
        if description:
            parts.append(description)
        synonym = entry.get("synonym")
        if synonym:
            parts.append(f"synonym={synonym}")
        rendered.append(" | ".join(parts))
    return "\n".join(rendered)
def format_lookup_result(result: dict[str, Any]) -> str:
    """Render a lookup payload: a query/kind header plus a kind-specific body.

    ``stock``/``rrid`` lookups show the first matching stock in full;
    ``gene`` uses the gene formatter; component-like kinds use the
    component formatter; anything else falls back to the stock-search
    formatter.
    """
    kind = result["kind"]
    rows = result["results"]
    header = [f"query: {result['query']}", f"kind: {result['kind']}"]
    component_like = {
        "component",
        "fbid",
        "property",
        "property-exact",
        "driver-family",
        "relationship",
    }
    if kind in ("stock", "rrid"):
        body = format_stock(rows[0] if rows else None)
    elif kind == "gene":
        body = format_gene_results(rows)
    elif kind in component_like:
        body = format_component_results(rows)
    else:
        body = format_search_results(rows)
    header.append(body)
    return "\n".join(header)
def format_stock(stock: dict[str, Any] | None) -> str:
|
|
2615
|
+
if stock is None:
|
|
2616
|
+
return "not found"
|
|
2617
|
+
|
|
2618
|
+
lines = [
|
|
2619
|
+
f"stknum: {stock['stknum']}",
|
|
2620
|
+
f"rrid: {stock['rrid']}",
|
|
2621
|
+
f"genotype: {stock['genotype']}",
|
|
2622
|
+
f"chromosomes: {stock['chromosomes'] or '-'}",
|
|
2623
|
+
f"aka: {stock['aka'] or '-'}",
|
|
2624
|
+
f"date_added: {stock['date_added'] or '-'}",
|
|
2625
|
+
f"donor_info: {stock['donor_info'] or '-'}",
|
|
2626
|
+
f"stock_comments: {stock['stock_comments'] or '-'}",
|
|
2627
|
+
f"component_symbols: {stock['component_symbols'] or '-'}",
|
|
2628
|
+
f"gene_symbols: {stock['gene_symbols'] or '-'}",
|
|
2629
|
+
f"fbgns: {stock['fbgns'] or '-'}",
|
|
2630
|
+
]
|
|
2631
|
+
|
|
2632
|
+
if stock["components"]:
|
|
2633
|
+
lines.append("components:")
|
|
2634
|
+
for row in stock["components"][:20]:
|
|
2635
|
+
detail = "; ".join(
|
|
2636
|
+
part
|
|
2637
|
+
for part in [
|
|
2638
|
+
row["fbid"],
|
|
2639
|
+
row["mapstatement"],
|
|
2640
|
+
row["comment1"],
|
|
2641
|
+
row["comment2"],
|
|
2642
|
+
row["comment3"],
|
|
2643
|
+
]
|
|
2644
|
+
if part
|
|
2645
|
+
)
|
|
2646
|
+
if detail:
|
|
2647
|
+
lines.append(f" - {row['component_symbol']}: {detail}")
|
|
2648
|
+
else:
|
|
2649
|
+
lines.append(f" - {row['component_symbol']}")
|
|
2650
|
+
if row.get("property_syns"):
|
|
2651
|
+
lines.append(f" properties: {row['property_syns']}")
|
|
2652
|
+
if row.get("gene_relationships"):
|
|
2653
|
+
lines.append(f" gene_relationships: {row['gene_relationships']}")
|
|
2654
|
+
|
|
2655
|
+
if stock["genes"]:
|
|
2656
|
+
lines.append("genes:")
|
|
2657
|
+
for row in stock["genes"][:40]:
|
|
2658
|
+
bits = [row["component_symbol"], row["gene_symbol"], row["fbgn"]]
|
|
2659
|
+
lines.append(f" - {' | '.join(bit for bit in bits if bit)}")
|
|
2660
|
+
|
|
2661
|
+
return "\n".join(lines)
|