flybase_cli-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flybase_cli/__init__.py +4 -0
- flybase_cli/__main__.py +5 -0
- flybase_cli/cli.py +667 -0
- flybase_cli/config.py +266 -0
- flybase_cli/core.py +700 -0
- flybase_cli/loaders.py +539 -0
- flybase_cli/postgres.py +106 -0
- flybase_cli/querying.py +162 -0
- flybase_cli/schema.py +671 -0
- flybase_cli/semantics.py +114 -0
- flybase_cli/syncing.py +254 -0
- flybase_cli/version.py +1 -0
- flybase_cli-0.1.2.dist-info/METADATA +244 -0
- flybase_cli-0.1.2.dist-info/RECORD +18 -0
- flybase_cli-0.1.2.dist-info/WHEEL +5 -0
- flybase_cli-0.1.2.dist-info/entry_points.txt +2 -0
- flybase_cli-0.1.2.dist-info/licenses/LICENSE +21 -0
- flybase_cli-0.1.2.dist-info/top_level.txt +1 -0
flybase_cli/core.py
ADDED
@@ -0,0 +1,700 @@
from __future__ import annotations

import json
import sqlite3
import subprocess
import urllib.parse
import urllib.request
from html.parser import HTMLParser
from pathlib import Path

from .config import (
    BASE_RELEASES,
    BATCH_SIZE,
    GENOME_ASSET_PATTERNS,
    GENOME_SECTIONS,
    SEARCH_ID_CANDIDATES,
    SyncPreset,
)
from .loaders import ingest_source, is_ingestable


def site_root_url() -> str:
    parts = urllib.parse.urlsplit(BASE_RELEASES)
    return f"{parts.scheme}://{parts.netloc}/"


class DirectoryIndexParser(HTMLParser):
    def __init__(self) -> None:
        super().__init__()
        self.entries: list[dict[str, str]] = []
        self.current_href: str | None = None
        self.current_text: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if tag != "a":
            return
        href = dict(attrs).get("href")
        if href:
            self.current_href = href
            self.current_text = []

    def handle_data(self, data: str) -> None:
        if self.current_href is not None:
            self.current_text.append(data)

    def handle_endtag(self, tag: str) -> None:
        if tag != "a" or self.current_href is None:
            return
        self.entries.append(
            {
                "href": self.current_href,
                "text": "".join(self.current_text).strip(),
            }
        )
        self.current_href = None
        self.current_text = []


def fetch_via_curl(url: str) -> bytes:
    result = subprocess.run(
        ["curl", "-fsSL", url],
        check=True,
        capture_output=True,
    )
    return result.stdout


def request_bytes(url: str) -> bytes:
    request = urllib.request.Request(url, headers={"User-Agent": "curl/8.7.1"})
    with urllib.request.urlopen(request) as response:
        payload = response.read()
    if getattr(response, "status", 200) == 202 or not payload:
        raise RuntimeError(f"empty or challenged response for {url}")
    return payload


def fetch_text(url: str) -> str:
    return fetch_bytes(url).decode("utf-8", errors="replace")


def fetch_bytes(url: str) -> bytes:
    try:
        return request_bytes(url)
    except Exception:
        return fetch_via_curl(url)


def release_base_url(release: str) -> str:
    normalized = release.strip("/")
    return urllib.parse.urljoin(BASE_RELEASES, f"{normalized}/")


def normalize_crawl_url(url: str) -> str:
    clean = url.split("?", 1)[0]
    if clean.endswith("index.html"):
        clean = clean[: -len("index.html")]
    if not clean.endswith("/"):
        clean = f"{clean}/"
    return clean


def normalize_path(path: str) -> str:
    return path.lstrip("/")


def normalize_bucket_href(href: str, release: str) -> str:
    clean = href.split("?", 1)[0]
    release_prefix = f"/releases/{release.strip('/')}/"
    if clean.startswith(release_prefix):
        clean = clean[len(release_prefix) :]
    return normalize_path(clean)


def resolve_manifest_entry(current: str, href: str, release: str) -> str:
    normalized = normalize_bucket_href(href, release)
    if current and normalized and not href.startswith("/") and not normalized.startswith(current):
        normalized = normalize_path(f"{current}{normalized}")
    return normalized


def scrape_links(url: str) -> list[dict[str, str]]:
    parser = DirectoryIndexParser()
    parser.feed(fetch_text(url))
    return parser.entries


def scrape_index(url: str) -> list[str]:
    results: list[str] = []
    for entry in scrape_links(url):
        href = entry["href"]
        if href.startswith(("?", "#")):
            continue
        clean = href.split("?", 1)[0]
        if clean in ("", "../", "./", "index.html"):
            continue
        results.append(clean)
    return results


def build_manifest(prefix: str, release: str = "current") -> list[dict[str, str]]:
    normalized_prefix = normalize_path(prefix)
    if normalized_prefix and not normalized_prefix.endswith("/"):
        normalized_prefix = f"{normalized_prefix}/"

    base_url = release_base_url(release)
    todo = [normalized_prefix]
    seen: set[str] = set()
    files: list[dict[str, str]] = []

    while todo:
        current = todo.pop()
        if current in seen:
            continue
        seen.add(current)
        page_url = urllib.parse.urljoin(base_url, current)
        for entry in scrape_index(page_url):
            normalized_entry = resolve_manifest_entry(current, entry, release)
            if normalized_entry.endswith("index.html"):
                normalized_entry = normalized_entry[: -len("index.html")]
            if current and not normalized_entry.startswith(current):
                continue
            if current and normalized_entry == current.rstrip("/"):
                continue
            if normalized_entry.endswith("/"):
                todo.append(normalized_entry)
                continue
            files.append(
                {
                    "path": normalized_entry,
                    "url": urllib.parse.urljoin(base_url, normalized_entry),
                }
            )
    return sorted(files, key=lambda item: item["path"])


def path_from_root_url(root_url: str, url: str) -> str:
    root = urllib.parse.urlsplit(normalize_crawl_url(root_url))
    target = urllib.parse.urlsplit(url.split("?", 1)[0])
    if (root.scheme, root.netloc) != (target.scheme, target.netloc):
        return ""
    if not target.path.startswith(root.path):
        return ""
    relative = target.path[len(root.path) :].lstrip("/")
    if url.endswith("/") and relative and not relative.endswith("/"):
        relative = f"{relative}/"
    return relative


def build_manifest_from_url(root_url: str) -> list[dict[str, str]]:
    base_url = normalize_crawl_url(root_url)
    todo = [base_url]
    seen: set[str] = set()
    files: list[dict[str, str]] = []

    while todo:
        current_url = todo.pop()
        if current_url in seen:
            continue
        seen.add(current_url)
        for href in scrape_index(current_url):
            absolute = urllib.parse.urljoin(current_url, href)
            if absolute.endswith(("index.html", "/")):
                normalized = normalize_crawl_url(absolute)
            else:
                normalized = absolute.split("?", 1)[0]
            relative = path_from_root_url(base_url, normalized)
            if not relative:
                continue
            if normalized == base_url or relative in {"", "."}:
                continue
            if normalized.endswith("/"):
                todo.append(normalized)
                continue
            files.append({"path": relative, "url": normalized})

    return sorted(files, key=lambda item: item["path"])


def extract_genomes(links: list[dict[str, str]]) -> list[dict[str, str]]:
    genomes: list[dict[str, str]] = []
    for link in links:
        href = link["href"].split("?", 1)[0]
        if not href.startswith("/genomes/"):
            continue
        parts = [part for part in href.split("/") if part]
        if len(parts) < 3:
            continue
        species = parts[1]
        genome_build = parts[2]
        label = link["text"] or genome_build
        genomes.append(
            {
                "label": label,
                "species": species,
                "genome_build": genome_build,
                "url": urllib.parse.urljoin(site_root_url(), href.lstrip("/")),
            }
        )
    return genomes


def list_genomes(release: str = "current") -> list[dict[str, str]]:
    links = scrape_links(release_base_url(release))
    genomes = extract_genomes(links)
    return sorted(genomes, key=lambda item: (item["species"], item["genome_build"]))


def find_genome(
    *,
    release: str,
    genome: str | None = None,
    species: str | None = None,
) -> dict[str, str]:
    genomes = list_genomes(release)
    normalized_genome = genome.lower() if genome else None
    normalized_species = species.lower() if species else None

    matches: list[dict[str, str]] = []
    for item in genomes:
        label = item["label"].lower()
        build = item["genome_build"].lower()
        species_name = item["species"].lower()
        genome_ok = normalized_genome is None or normalized_genome in {label, build}
        species_ok = normalized_species is None or normalized_species in {species_name, label}
        if genome_ok and species_ok:
            matches.append(item)

    if not matches:
        raise ValueError("no genome matched the requested release/species/build")
    if len(matches) > 1:
        joined = ", ".join(match["label"] for match in matches[:10])
        raise ValueError(f"multiple genomes matched: {joined}")
    return matches[0]


def genome_section_url(genome_url: str, section: str) -> str:
    if section not in GENOME_SECTIONS:
        raise ValueError(f"unsupported genome section: {section}")
    return urllib.parse.urljoin(normalize_crawl_url(genome_url), f"{section}/")


def genome_asset_pattern(asset: str | None) -> list[str]:
    if not asset:
        return []
    pattern = GENOME_ASSET_PATTERNS.get(asset.lower())
    if pattern is None:
        raise ValueError(f"unknown genome asset preset: {asset}")
    return [pattern]


def write_json(path: Path, payload: object) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")


def load_manifest(path: Path) -> list[dict[str, str]]:
    return json.loads(path.read_text(encoding="utf-8"))


def compile_patterns(patterns: list[str] | tuple[str, ...]) -> list:
    import re

    return [re.compile(pattern) for pattern in patterns]


def filter_manifest(
    manifest: list[dict[str, str]],
    include: list[str] | tuple[str, ...],
    exclude: list[str] | tuple[str, ...],
) -> list[dict[str, str]]:
    include_patterns = compile_patterns(include)
    exclude_patterns = compile_patterns(exclude)
    filtered: list[dict[str, str]] = []

    for item in manifest:
        path = item["path"]
        if include_patterns and not any(pattern.search(path) for pattern in include_patterns):
            continue
        if any(pattern.search(path) for pattern in exclude_patterns):
            continue
        filtered.append(item)

    return filtered


def download_file(url: str, dest: Path) -> None:
    dest.parent.mkdir(parents=True, exist_ok=True)
    try:
        payload = request_bytes(url)
        dest.write_bytes(payload)
    except Exception:
        subprocess.run(["curl", "-fsSL", "-o", str(dest), url], check=True)


def download_manifest_entries(
    manifest: list[dict[str, str]],
    root: Path,
    force: bool = False,
) -> list[tuple[Path, bool]]:
    local_paths: list[tuple[Path, bool]] = []
    for item in manifest:
        dest = root / item["path"]
        should_download = force or not dest.exists()
        local_paths.append((dest, should_download))
        if dest.exists() and not force:
            continue
        download_file(item["url"], dest)
    return local_paths


def sync_manifest(
    manifest: list[dict[str, str]],
    *,
    root: Path,
    db_path: Path,
    manifest_path: Path | None = None,
    force: bool = False,
    no_header: bool = False,
) -> dict[str, object]:
    if manifest_path is not None:
        write_json(manifest_path, manifest)
    local_paths = download_manifest_entries(manifest, root, force=force)
    ingested = ingest_files(
        db_path,
        [path for path, _ in local_paths if is_ingestable(path)],
        no_header=no_header,
    )
    return {
        "manifest_path": str(manifest_path) if manifest_path is not None else None,
        "file_count": len(manifest),
        "ingested_tables": ingested,
    }


def table_name_from_path(path: str) -> str:
    import re

    name = Path(path).name
    suffixes = (
        ".tsv.gz",
        ".csv.gz",
        ".fasta.gz",
        ".fa.gz",
        ".fna.gz",
        ".faa.gz",
        ".gff.gz",
        ".gff3.gz",
        ".gtf.gz",
        ".json.gz",
        ".tsv",
        ".csv",
        ".fasta",
        ".fa",
        ".fna",
        ".faa",
        ".gff",
        ".gff3",
        ".gtf",
        ".json",
    )
    for suffix in suffixes:
        if name.endswith(suffix):
            name = name[: -len(suffix)]
            break
    safe = re.sub(r"[^A-Za-z0-9_]+", "_", name).strip("_").lower()
    return f"fb_{safe or 'table'}"


def open_db(path: Path) -> sqlite3.Connection:
    path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(path)
    conn.execute("PRAGMA journal_mode=WAL")
    return conn


def ensure_registry(conn: sqlite3.Connection) -> None:
    existing = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='fb_ingest_registry'"
    ).fetchone()
    if not existing:
        conn.execute(
            """
            CREATE TABLE fb_ingest_registry (
                source_path TEXT NOT NULL,
                table_name TEXT NOT NULL,
                row_count INTEGER NOT NULL,
                PRIMARY KEY (source_path, table_name)
            )
            """
        )
        return

    columns = conn.execute("PRAGMA table_info(fb_ingest_registry)").fetchall()
    source_path_column = next((column for column in columns if column[1] == "source_path"), None)
    table_name_column = next((column for column in columns if column[1] == "table_name"), None)
    if source_path_column and source_path_column[5] == 1 and table_name_column and table_name_column[5] == 0:
        conn.execute("ALTER TABLE fb_ingest_registry RENAME TO fb_ingest_registry_old")
        conn.execute(
            """
            CREATE TABLE fb_ingest_registry (
                source_path TEXT NOT NULL,
                table_name TEXT NOT NULL,
                row_count INTEGER NOT NULL,
                PRIMARY KEY (source_path, table_name)
            )
            """
        )
        conn.execute(
            """
            INSERT INTO fb_ingest_registry (source_path, table_name, row_count)
            SELECT source_path, table_name, row_count
            FROM fb_ingest_registry_old
            """
        )
        conn.execute("DROP TABLE fb_ingest_registry_old")


def upsert_registry(
    conn: sqlite3.Connection,
    source: Path,
    table_name: str,
    row_count: int,
) -> None:
    conn.execute(
        """
        INSERT INTO fb_ingest_registry (source_path, table_name, row_count)
        VALUES (?, ?, ?)
        ON CONFLICT(source_path, table_name)
        DO UPDATE SET row_count = excluded.row_count
        """,
        (str(source), table_name, row_count),
    )


def ingest_files(
    db_path: Path,
    sources: list[Path],
    no_header: bool = False,
) -> list[dict[str, str | int]]:
    conn = open_db(db_path)
    ensure_registry(conn)
    ingested: list[dict[str, str | int]] = []
    try:
        for source in sources:
            table_name = table_name_from_path(source.name)
            for emitted_table_name, row_count in ingest_source(conn, source, table_name, no_header=no_header):
                upsert_registry(conn, source, emitted_table_name, row_count)
                ingested.append(
                    {
                        "source_path": str(source),
                        "table_name": emitted_table_name,
                        "row_count": row_count,
                    }
                )
        conn.commit()
    finally:
        conn.close()
    return ingested


def list_tables(db_path: Path, include_columns: bool = False) -> list[dict[str, object]]:
    conn = open_db(db_path)
    ensure_registry(conn)
    try:
        rows = conn.execute(
            """
            SELECT table_name, row_count, source_path
            FROM fb_ingest_registry
            ORDER BY table_name
            """
        ).fetchall()
        payload: list[dict[str, object]] = []
        for table_name, row_count, source_path in rows:
            item: dict[str, object] = {
                "table_name": table_name,
                "row_count": row_count,
                "source_path": source_path,
            }
            if include_columns:
                item["columns"] = [
                    row[1]
                    for row in conn.execute(f'PRAGMA table_info("{table_name}")').fetchall()
                ]
            payload.append(item)
        return payload
    finally:
        conn.close()


def run_query(db_path: Path, query: str) -> tuple[list[str], list[tuple[object, ...]]] | None:
    conn = open_db(db_path)
    try:
        cursor = conn.execute(query)
        if cursor.description is None:
            conn.commit()
            return None
        columns = [description[0] for description in cursor.description]
        return columns, cursor.fetchall()
    finally:
        conn.close()


def pick_record_id(columns: list[str], row: tuple[object, ...], rowid: int) -> str:
    row_map = dict(zip(columns, row, strict=True))
    for candidate in SEARCH_ID_CANDIDATES:
        value = row_map.get(candidate)
        if value:
            return str(value)
    return str(rowid)


def ensure_search_table(conn: sqlite3.Connection) -> None:
    conn.execute("DROP TABLE IF EXISTS fb_search_fts")
    conn.execute(
        """
        CREATE VIRTUAL TABLE fb_search_fts
        USING fts5(table_name, record_id, text)
        """
    )


def list_registry_table_names(conn: sqlite3.Connection) -> list[str]:
    ensure_registry(conn)
    return [
        row[0]
        for row in conn.execute(
            "SELECT table_name FROM fb_ingest_registry ORDER BY table_name"
        ).fetchall()
    ]


def rebuild_search_index(
    db_path: Path,
    table_names: list[str] | None = None,
) -> list[dict[str, object]]:
    conn = open_db(db_path)
    try:
        ensure_search_table(conn)
        selected_tables = table_names or list_registry_table_names(conn)
        indexed: list[dict[str, object]] = []

        for table_name in selected_tables:
            columns = [
                row[1]
                for row in conn.execute(f'PRAGMA table_info("{table_name}")').fetchall()
            ]
            if not columns:
                continue

            batch: list[tuple[str, str, str]] = []
            row_count = 0
            quoted = ", ".join(f'"{column}"' for column in columns)
            sql = f'SELECT rowid, {quoted} FROM "{table_name}"'
            for result in conn.execute(sql):
                rowid = int(result[0])
                row = tuple("" if value is None else str(value) for value in result[1:])
                text_parts = [
                    f"{column}: {value}"
                    for column, value in zip(columns, row, strict=True)
                    if value
                ]
                if not text_parts:
                    continue
                batch.append(
                    (
                        table_name,
                        pick_record_id(columns, row, rowid),
                        "\n".join(text_parts),
                    )
                )
                if len(batch) >= BATCH_SIZE:
                    conn.executemany(
                        "INSERT INTO fb_search_fts (table_name, record_id, text) VALUES (?, ?, ?)",
                        batch,
                    )
                    row_count += len(batch)
                    batch.clear()

            if batch:
                conn.executemany(
                    "INSERT INTO fb_search_fts (table_name, record_id, text) VALUES (?, ?, ?)",
                    batch,
                )
                row_count += len(batch)

            indexed.append({"table_name": table_name, "row_count": row_count})

        conn.commit()
        return indexed
    finally:
        conn.close()


def search_index(
    db_path: Path,
    query: str,
    limit: int = 20,
    table_name: str | None = None,
) -> list[dict[str, object]]:
    conn = open_db(db_path)
    try:
        sql = """
            SELECT
                table_name,
                record_id,
                bm25(fb_search_fts) AS score,
                snippet(fb_search_fts, 2, '[', ']', '...', 12) AS snippet
            FROM fb_search_fts
            WHERE fb_search_fts MATCH ?
        """
        params: list[object] = [query]
        if table_name:
            sql += " AND table_name = ?"
            params.append(table_name)
        sql += " LIMIT ?"
        params.append(limit)
        return [
            {
                "table_name": row[0],
                "record_id": row[1],
                "score": row[2],
                "snippet": row[3],
            }
            for row in conn.execute(sql, params).fetchall()
        ]
    finally:
        conn.close()


def sync_preset(
    preset: SyncPreset,
    root: Path,
    db_path: Path,
    manifest_path: Path,
    release: str = "current",
    force: bool = False,
) -> dict[str, object]:
    manifest_map: dict[str, dict[str, str]] = {}
    for selection in preset.selections:
        filtered = filter_manifest(
            build_manifest(selection.prefix, release=release),
            selection.includes,
            selection.excludes,
        )
        for item in filtered:
            manifest_map[item["path"]] = item
    manifest = sorted(manifest_map.values(), key=lambda item: item["path"])
    summary = sync_manifest(
        manifest,
        root=root,
        db_path=db_path,
        manifest_path=manifest_path,
        force=force,
    )
    return {
        "preset": preset.name,
        "description": preset.description,
        "release": release,
        **summary,
    }