flybase_cli-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flybase_cli/__init__.py +4 -0
- flybase_cli/__main__.py +5 -0
- flybase_cli/cli.py +667 -0
- flybase_cli/config.py +266 -0
- flybase_cli/core.py +700 -0
- flybase_cli/loaders.py +539 -0
- flybase_cli/postgres.py +106 -0
- flybase_cli/querying.py +162 -0
- flybase_cli/schema.py +671 -0
- flybase_cli/semantics.py +114 -0
- flybase_cli/syncing.py +254 -0
- flybase_cli/version.py +1 -0
- flybase_cli-0.1.2.dist-info/METADATA +244 -0
- flybase_cli-0.1.2.dist-info/RECORD +18 -0
- flybase_cli-0.1.2.dist-info/WHEEL +5 -0
- flybase_cli-0.1.2.dist-info/entry_points.txt +2 -0
- flybase_cli-0.1.2.dist-info/licenses/LICENSE +21 -0
- flybase_cli-0.1.2.dist-info/top_level.txt +1 -0
flybase_cli/loaders.py
ADDED
@@ -0,0 +1,539 @@
from __future__ import annotations

import csv
import gzip
import json
import re
import sqlite3
import tarfile
from contextlib import contextmanager
from io import TextIOWrapper
from pathlib import Path
from typing import Iterator

from .config import (
    BATCH_SIZE,
    DELIMITED_SUFFIXES,
    FASTA_SUFFIXES,
    GFF_SUFFIXES,
    GTF_SUFFIXES,
    JSON_ID_CANDIDATES,
    JSON_MAX_INFERRED_COLUMNS,
    JSON_SUFFIXES,
)


@contextmanager
def open_maybe_gzip(path: Path):
    if path.suffix != ".gz":
        with path.open("r", encoding="utf-8", newline="") as handle:
            yield handle
        return

    try:
        archive = tarfile.open(path, mode="r:gz")
    except tarfile.ReadError:
        with gzip.open(path, "rt", encoding="utf-8", newline="") as handle:
            yield handle
        return

    try:
        member = next((item for item in archive if item.isfile()), None)
        if member is None:
            raise ValueError(f"no regular file found in archive: {path}")
        extracted = archive.extractfile(member)
        if extracted is None:
            raise ValueError(f"unable to extract archive member: {path}")
        wrapper = TextIOWrapper(extracted, encoding="utf-8", newline="")
        try:
            yield wrapper
        finally:
            wrapper.close()
    finally:
        archive.close()

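# Usage sketch (the file name is hypothetical): yields text lines whether the
# source is a plain file, a bare gzip file, or a single-member .tar.gz archive:
#
#     with open_maybe_gzip(Path("dmel-genes.tsv.gz")) as handle:
#         for line in handle:
#             ...
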
def sanitize_columns(columns: list[str]) -> list[str]:
    seen: dict[str, int] = {}
    output: list[str] = []
    for index, column in enumerate(columns, start=1):
        base = re.sub(r"[^A-Za-z0-9_]+", "_", column.strip()).strip("_").lower()
        if not base:
            base = f"col_{index}"
        seen[base] = seen.get(base, 0) + 1
        output.append(base if seen[base] == 1 else f"{base}_{seen[base]}")
    return output

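# Example: sanitize_columns(["Gene ID", "Gene ID", ""])
# returns ["gene_id", "gene_id_2", "col_3"]; duplicates are suffixed and
# empty names fall back to positional col_N.
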
def create_table(conn: sqlite3.Connection, table_name: str, columns: list[str]) -> str:
    conn.execute(f'DROP TABLE IF EXISTS "{table_name}"')
    create_sql = ", ".join(f'"{column}" TEXT' for column in columns)
    conn.execute(f'CREATE TABLE "{table_name}" ({create_sql})')
    quoted_columns = ", ".join(f'"{column}"' for column in columns)
    placeholders = ", ".join("?" for _ in columns)
    return f'INSERT INTO "{table_name}" ({quoted_columns}) VALUES ({placeholders})'

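# Example: create_table(conn, "genes", ["gene_id", "name"]) drops and
# recreates "genes" with two TEXT columns and returns the matching
# 'INSERT INTO "genes" ("gene_id", "name") VALUES (?, ?)' statement.
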
def flush_batch(
    conn: sqlite3.Connection,
    insert_sql: str,
    batch: list[list[str]] | list[tuple[str, ...]],
    row_count: int,
) -> int:
    if batch:
        conn.executemany(insert_sql, batch)
        row_count += len(batch)
    return row_count


def sample_delimiter(path: Path) -> str:
    return "," if ".csv" in path.name.lower() else "\t"


def iter_delimited_rows(source: Path) -> Iterator[tuple[list[str], str]]:
    delimiter = sample_delimiter(source)
    with open_maybe_gzip(source) as handle:
        reader = csv.reader(handle, delimiter=delimiter)
        for row in reader:
            yield row, delimiter


def read_header_and_rows(
    source: Path,
    no_header: bool,
) -> tuple[list[str], list[str] | None, Iterator[tuple[list[str], str]], str]:
    row_iter = iter_delimited_rows(source)
    delimiter = "\t"

    for row, delimiter in row_iter:
        if not row:
            continue
        if row[0].startswith("##") and len(row) == 1:
            continue
        if no_header:
            header = [f"col_{index}" for index in range(1, len(row) + 1)]
            return header, row, row_iter, delimiter
        row[0] = row[0].lstrip("#")
        return row, None, row_iter, delimiter

    raise ValueError(f"empty file: {source}")

def normalize_row(row: list[str], width: int, delimiter: str) -> list[str]:
    if len(row) < width:
        return row + [""] * (width - len(row))
    if len(row) > width:
        return row[: width - 1] + [delimiter.join(row[width - 1 :])]
    return row

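# Example: normalize_row(["a"], 3, "\t") -> ["a", "", ""];
# normalize_row(["a", "b", "c", "d"], 3, "\t") -> ["a", "b", "c\td"]
# (short rows are padded, overflow is folded back into the last column).
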
def ingest_delimited(
    conn: sqlite3.Connection,
    source: Path,
    table_name: str,
    no_header: bool = False,
) -> int:
    raw_header, first_data_row, row_iter, delimiter = read_header_and_rows(source, no_header)
    columns = sanitize_columns(raw_header)
    insert_sql = create_table(conn, table_name, columns)
    batch: list[list[str]] = []
    row_count = 0

    if first_data_row is not None:
        batch.append(normalize_row(first_data_row, len(columns), delimiter))

    for row, _ in row_iter:
        if not row:
            continue
        batch.append(normalize_row(row, len(columns), delimiter))
        if len(batch) >= BATCH_SIZE:
            row_count = flush_batch(conn, insert_sql, batch, row_count)
            batch.clear()

    return flush_batch(conn, insert_sql, batch, row_count)

def split_fasta_header(header: str) -> tuple[str, str]:
    text = header[1:].strip()
    if not text:
        return "", ""
    parts = text.split(None, 1)
    record_id = parts[0]
    description = parts[1] if len(parts) > 1 else ""
    return record_id, description

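# Example (hypothetical header): split_fasta_header(">FBgn0000001 example gene")
# -> ("FBgn0000001", "example gene")
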
def ingest_fasta(conn: sqlite3.Connection, source: Path, table_name: str) -> int:
    columns = ["record_id", "header", "description", "sequence", "sequence_length"]
    insert_sql = create_table(conn, table_name, columns)
    batch: list[tuple[str, ...]] = []
    row_count = 0
    current_header = ""
    current_id = ""
    current_description = ""
    sequence_parts: list[str] = []

    def flush_current() -> None:
        nonlocal row_count, batch, sequence_parts
        if not current_header:
            return
        sequence = "".join(sequence_parts)
        batch.append(
            (
                current_id,
                current_header[1:].strip(),
                current_description,
                sequence,
                str(len(sequence)),
            )
        )
        sequence_parts = []

    with open_maybe_gzip(source) as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                flush_current()
                current_header = line
                current_id, current_description = split_fasta_header(line)
                if len(batch) >= BATCH_SIZE:
                    row_count = flush_batch(conn, insert_sql, batch, row_count)
                    batch.clear()
                continue
            sequence_parts.append(line)

    flush_current()
    return flush_batch(conn, insert_sql, batch, row_count)

def parse_feature_attributes(raw_attributes: str) -> dict[str, str]:
    attributes: dict[str, str] = {}
    for part in raw_attributes.split(";"):
        chunk = part.strip()
        if not chunk:
            continue
        if "=" in chunk:
            key, value = chunk.split("=", 1)
        elif " " in chunk:
            key, value = chunk.split(" ", 1)
            value = value.strip().strip('"')
        else:
            key, value = chunk, ""
        attributes[key.strip()] = value.strip().strip('"')
    return attributes

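# Handles both attribute syntaxes (values are hypothetical):
#   GFF3: 'ID=FBgn0000001;Name=example' -> {"ID": "FBgn0000001", "Name": "example"}
#   GTF:  'gene_id "FBgn0000001"; gene_symbol "example"'
#         -> {"gene_id": "FBgn0000001", "gene_symbol": "example"}
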
def pick_attribute(attributes: dict[str, str], *keys: str) -> str:
    for key in keys:
        value = attributes.get(key)
        if value:
            return value
    return ""


def ingest_feature_file(
    conn: sqlite3.Connection,
    source: Path,
    table_name: str,
) -> int:
    columns = [
        "seqid",
        "source",
        "feature_type",
        "start",
        "end",
        "score",
        "strand",
        "phase",
        "feature_id",
        "parent_id",
        "feature_name",
        "gene_id",
        "transcript_id",
        "attributes_json",
        "attributes_raw",
    ]
    insert_sql = create_table(conn, table_name, columns)
    batch: list[tuple[str, ...]] = []
    row_count = 0

    with open_maybe_gzip(source) as handle:
        for raw_line in handle:
            line = raw_line.rstrip("\n")
            if not line or line.startswith("#"):
                continue
            fields = line.split("\t")
            if len(fields) != 9:
                continue
            attributes = parse_feature_attributes(fields[8])
            batch.append(
                (
                    fields[0],
                    fields[1],
                    fields[2],
                    fields[3],
                    fields[4],
                    fields[5],
                    fields[6],
                    fields[7],
                    pick_attribute(attributes, "ID", "id"),
                    pick_attribute(attributes, "Parent", "parent"),
                    pick_attribute(attributes, "Name", "gene_name", "transcript_name", "name"),
                    pick_attribute(attributes, "gene_id", "geneID", "gene"),
                    pick_attribute(attributes, "transcript_id", "transcriptID", "transcript"),
                    json.dumps(attributes, sort_keys=True),
                    fields[8],
                )
            )
            if len(batch) >= BATCH_SIZE:
                row_count = flush_batch(conn, insert_sql, batch, row_count)
                batch.clear()

    return flush_batch(conn, insert_sql, batch, row_count)

def iter_json_rows(payload: object) -> Iterator[tuple[str, str]]:
    if isinstance(payload, list):
        for index, item in enumerate(payload, start=1):
            yield pick_json_record_id(item, index), json.dumps(item, sort_keys=True)
        return

    if isinstance(payload, dict):
        list_keys = [key for key, value in payload.items() if isinstance(value, list)]
        if len(list_keys) == 1:
            for index, item in enumerate(payload[list_keys[0]], start=1):
                yield pick_json_record_id(item, index), json.dumps(item, sort_keys=True)
            return
        yield "1", json.dumps(payload, sort_keys=True)
        return

    yield "1", json.dumps(payload)


def pick_json_record_id(item: object, fallback_index: int) -> str:
    if isinstance(item, dict):
        for candidate in JSON_ID_CANDIDATES:
            value = item.get(candidate)
            if value:
                return str(value)
    return str(fallback_index)


def json_scalar_to_text(value: object) -> str | None:
    if value is None:
        return ""
    if isinstance(value, (str, int, float, bool)):
        return str(value)
    return None

def flatten_json_record(record: dict[str, object], prefix: str = "") -> dict[str, str]:
    flattened: dict[str, str] = {}
    for key, value in record.items():
        safe_key = re.sub(r"[^A-Za-z0-9_]+", "_", key).strip("_")
        if not safe_key:
            continue
        full_key = f"{prefix}{safe_key}" if not prefix else f"{prefix}_{safe_key}"
        scalar = json_scalar_to_text(value)
        if scalar is not None:
            flattened[full_key] = scalar
            continue
        if isinstance(value, dict):
            for nested_key, nested_value in flatten_json_record(value, full_key).items():
                flattened[nested_key] = nested_value
    return flattened

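# Example: flatten_json_record({"id": 7, "gene": {"symbol": "w", "synonyms": ["white"]}})
# -> {"id": "7", "gene_symbol": "w"}; nested dicts are flattened with "_",
# lists are left for the child-table machinery below.
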
def extract_json_records(payload: object) -> list[dict[str, object]]:
    if isinstance(payload, list):
        return [item for item in payload if isinstance(item, dict)]
    if isinstance(payload, dict):
        list_keys = [key for key, value in payload.items() if isinstance(value, list)]
        if len(list_keys) == 1:
            return [item for item in payload[list_keys[0]] if isinstance(item, dict)]
        return [payload]
    return []


def infer_json_columns(records: list[dict[str, object]]) -> list[str]:
    frequencies: dict[str, int] = {}
    for record in records[:200]:
        for key in flatten_json_record(record):
            frequencies[key] = frequencies.get(key, 0) + 1
    ordered = sorted(frequencies.items(), key=lambda item: (-item[1], item[0]))
    return [key for key, _ in ordered[:JSON_MAX_INFERRED_COLUMNS]]


def sanitize_json_child_name(name: str) -> str:
    return re.sub(r"[^A-Za-z0-9_]+", "_", name).strip("_").lower()


def json_parent_ordinal_columns(depth: int) -> list[str]:
    if depth <= 0:
        return []
    if depth == 1:
        return ["parent_ordinal"]
    return [*(f"ancestor_ordinal_{index}" for index in range(1, depth)), "parent_ordinal"]

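# Example: json_parent_ordinal_columns(1) -> ["parent_ordinal"];
# json_parent_ordinal_columns(3) -> ["ancestor_ordinal_1", "ancestor_ordinal_2", "parent_ordinal"]
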
def json_link_columns(parent_ordinals: list[str]) -> list[str]:
    return ["parent_record_id", *json_parent_ordinal_columns(len(parent_ordinals)), "ordinal"]


def discover_json_list_fields(records: list[dict[str, object]]) -> dict[str, dict[str, object]]:
    discovered: dict[str, dict[str, object]] = {}
    for record in records[:200]:
        for key, value in record.items():
            if not isinstance(value, list) or not value:
                continue
            field = discovered.setdefault(key, {"kind": None, "dict_rows": []})
            first = value[0]
            if all(isinstance(item, dict) for item in value):
                if field["kind"] in (None, "dict"):
                    field["kind"] = "dict"
                    field["dict_rows"].extend(item for item in value if isinstance(item, dict))
                continue
            if all(json_scalar_to_text(item) is not None for item in value):
                if field["kind"] is None:
                    field["kind"] = "scalar"
                continue
            field["kind"] = "mixed"

    filtered: dict[str, dict[str, object]] = {}
    for key, value in discovered.items():
        kind = value["kind"]
        if kind == "scalar":
            filtered[key] = {"kind": "scalar"}
        elif kind == "dict":
            columns = infer_json_columns(value["dict_rows"])
            filtered[key] = {"kind": "dict", "columns": columns}
    return filtered


def ingest_json_child_tables(
    conn: sqlite3.Connection,
    parent_table_name: str,
    parent_rows: list[tuple[str, list[str], dict[str, object]]],
) -> list[tuple[str, int]]:
    if not parent_rows:
        return []

    records = [record for _, _, record in parent_rows]
    list_fields = discover_json_list_fields(records)
    ingested: list[tuple[str, int]] = []

    for field_name, field_info in list_fields.items():
        child_table_name = f"{parent_table_name}_{sanitize_json_child_name(field_name)}"
        kind = field_info["kind"]
        link_columns = json_link_columns(parent_rows[0][1])
        if kind == "scalar":
            insert_sql = create_table(conn, child_table_name, [*link_columns, "value"])
            batch: list[tuple[str, ...]] = []
            for record_id, parent_ordinals, record in parent_rows:
                values = record.get(field_name)
                if not isinstance(values, list):
                    continue
                for ordinal, item in enumerate(values, start=1):
                    scalar = json_scalar_to_text(item)
                    if scalar is None:
                        continue
                    batch.append(tuple([record_id, *parent_ordinals, str(ordinal), scalar]))
            conn.executemany(insert_sql, batch)
            ingested.append((child_table_name, len(batch)))
            continue

        if kind == "dict":
            columns = [*link_columns, *field_info["columns"], "payload_json"]
            insert_sql = create_table(conn, child_table_name, columns)
            batch = []
            child_rows: list[tuple[str, list[str], dict[str, object]]] = []
            for record_id, parent_ordinals, record in parent_rows:
                values = record.get(field_name)
                if not isinstance(values, list):
                    continue
                for ordinal, item in enumerate(values, start=1):
                    if not isinstance(item, dict):
                        continue
                    flattened = flatten_json_record(item)
                    row = [record_id, *parent_ordinals, str(ordinal)]
                    row.extend(flattened.get(column, "") for column in field_info["columns"])
                    row.append(json.dumps(item, sort_keys=True))
                    batch.append(tuple(row))
                    child_rows.append((record_id, [*parent_ordinals, str(ordinal)], item))
            conn.executemany(insert_sql, batch)
            ingested.append((child_table_name, len(batch)))
            ingested.extend(ingest_json_child_tables(conn, child_table_name, child_rows))

    return ingested

def ingest_json(conn: sqlite3.Connection, source: Path, table_name: str) -> list[tuple[str, int]]:
    with open_maybe_gzip(source) as handle:
        payload = json.load(handle)
    records = extract_json_records(payload)
    if not records:
        columns = ["record_id", "payload_json"]
        insert_sql = create_table(conn, table_name, columns)
        batch = list(iter_json_rows(payload))
        conn.executemany(insert_sql, batch)
        return [(table_name, len(batch))]

    inferred_columns = infer_json_columns(records)
    columns = ["record_id", *inferred_columns, "payload_json"]
    insert_sql = create_table(conn, table_name, columns)
    batch: list[tuple[str, ...]] = []
    for index, record in enumerate(records, start=1):
        flattened = flatten_json_record(record)
        row = [pick_json_record_id(record, index)]
        row.extend(flattened.get(column, "") for column in inferred_columns)
        row.append(json.dumps(record, sort_keys=True))
        batch.append(tuple(row))
    conn.executemany(insert_sql, batch)
    ingested = [(table_name, len(batch))]
    ingested.extend(
        ingest_json_child_tables(
            conn,
            table_name,
            [(pick_json_record_id(record, index), [], record) for index, record in enumerate(records, start=1)],
        )
    )
    return ingested

def detect_ingest_format(source: Path) -> str | None:
    name = source.name.lower()
    if any(name.endswith(suffix) for suffix in DELIMITED_SUFFIXES):
        return "delimited"
    if any(name.endswith(suffix) for suffix in FASTA_SUFFIXES):
        return "fasta"
    if any(name.endswith(suffix) for suffix in GFF_SUFFIXES):
        return "gff"
    if any(name.endswith(suffix) for suffix in GTF_SUFFIXES):
        return "gtf"
    if any(name.endswith(suffix) for suffix in JSON_SUFFIXES):
        return "json"
    return None


def ingest_source(
    conn: sqlite3.Connection,
    source: Path,
    table_name: str,
    no_header: bool = False,
) -> list[tuple[str, int]]:
    detected = detect_ingest_format(source)
    if detected == "delimited":
        return [(table_name, ingest_delimited(conn, source, table_name, no_header=no_header))]
    if detected == "fasta":
        return [(table_name, ingest_fasta(conn, source, table_name))]
    if detected in {"gff", "gtf"}:
        return [(table_name, ingest_feature_file(conn, source, table_name))]
    if detected == "json":
        return ingest_json(conn, source, table_name)
    raise ValueError(f"unsupported ingest format: {source}")


def is_ingestable(path: Path) -> bool:
    return detect_ingest_format(path) is not None

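A minimal usage sketch for this module, assuming a hypothetical database path, source file, and table name; ingest_source dispatches on the detected format and returns (table, row_count) pairs, including any JSON child tables:

    import sqlite3
    from pathlib import Path

    from flybase_cli.loaders import ingest_source

    conn = sqlite3.connect("flybase.db")  # hypothetical database path
    with conn:  # commit once the whole ingest has run
        ingested = ingest_source(conn, Path("dmel-genes.tsv.gz"), "genes")
    for table, rows in ingested:
        print(f"{table}: {rows} rows")
    conn.close()
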
flybase_cli/postgres.py
ADDED
@@ -0,0 +1,106 @@
from __future__ import annotations

import shutil
import subprocess
from pathlib import Path

from .config import DEFAULT_POSTGRES_DIR
from .core import download_file, release_base_url


def dump_url_for_release(release: str) -> str:
    return f"{release_base_url(release)}psql/{release}.sql.gz"

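# For a hypothetical release tag "FB2024_05" this appends "psql/FB2024_05.sql.gz"
# to whatever release_base_url("FB2024_05") returns (defined in core.py).
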
def default_dump_path(root: Path, release: str) -> Path:
    return root / f"{release}.sql.gz"


def default_script_path(root: Path, release: str) -> Path:
    return root / f"load-{release}.sh"


def default_db_name(release: str) -> str:
    return f"flybase_{release.lower()}"


def available_postgres_tools() -> dict[str, str | None]:
    return {
        "createdb": shutil.which("createdb"),
        "dropdb": shutil.which("dropdb"),
        "psql": shutil.which("psql"),
    }

def render_pg_load_script(
    *,
    dump_path: Path,
    db_name: str,
    drop_existing: bool,
) -> str:
    lines = ["#!/usr/bin/env bash", "set -euo pipefail", ""]
    if drop_existing:
        lines.append(f"dropdb --if-exists {db_name}")
    lines.append(f"createdb {db_name}")
    lines.append(f"gzip -dc {dump_path} | psql {db_name}")
    lines.append("")
    return "\n".join(lines)

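# With drop_existing=True the rendered script looks like this (database name
# and dump path are hypothetical):
#
#   #!/usr/bin/env bash
#   set -euo pipefail
#
#   dropdb --if-exists flybase_fb2024_05
#   createdb flybase_fb2024_05
#   gzip -dc /data/FB2024_05.sql.gz | psql flybase_fb2024_05
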
def write_pg_load_script(
    *,
    release: str,
    dump_path: Path,
    db_name: str,
    script_path: Path,
    drop_existing: bool,
) -> Path:
    script_path.parent.mkdir(parents=True, exist_ok=True)
    script = render_pg_load_script(
        dump_path=dump_path,
        db_name=db_name,
        drop_existing=drop_existing,
    )
    script_path.write_text(script, encoding="utf-8")
    script_path.chmod(0o755)
    return script_path


def ensure_dump_file(
    *,
    release: str,
    dump_path: Path,
    force: bool = False,
) -> Path:
    if dump_path.exists() and not force:
        return dump_path
    dump_path.parent.mkdir(parents=True, exist_ok=True)
    download_file(dump_url_for_release(release), dump_path)
    return dump_path


def execute_pg_load_script(script_path: Path) -> None:
    subprocess.run([str(script_path)], check=True)


def build_pg_load_plan(
    *,
    release: str,
    root: Path = DEFAULT_POSTGRES_DIR,
    db_name: str | None = None,
    dump_path: Path | None = None,
    script_path: Path | None = None,
    drop_existing: bool = False,
) -> dict[str, object]:
    db = db_name or default_db_name(release)
    dump = dump_path or default_dump_path(root, release)
    script = script_path or default_script_path(root, release)
    return {
        "release": release,
        "db_name": db,
        "dump_url": dump_url_for_release(release),
        "dump_path": str(dump),
        "script_path": str(script),
        "drop_existing": drop_existing,
        "tools": available_postgres_tools(),
    }

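A sketch of how these pieces are meant to compose, assuming a hypothetical release tag and working directory; build_pg_load_plan only computes names and paths, while the other helpers download the dump, write the shell script, and run it:

    from pathlib import Path

    from flybase_cli.postgres import (
        build_pg_load_plan,
        ensure_dump_file,
        execute_pg_load_script,
        write_pg_load_script,
    )

    release = "FB2024_05"           # hypothetical release tag
    root = Path("/tmp/flybase-pg")  # hypothetical working directory

    plan = build_pg_load_plan(release=release, root=root, drop_existing=True)
    dump = ensure_dump_file(release=release, dump_path=Path(str(plan["dump_path"])))
    script = write_pg_load_script(
        release=release,
        dump_path=dump,
        db_name=str(plan["db_name"]),
        script_path=Path(str(plan["script_path"])),
        drop_existing=True,
    )
    execute_pg_load_script(script)  # requires createdb/dropdb/psql on PATH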