flybase-cli 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flybase_cli/__init__.py +4 -0
- flybase_cli/__main__.py +5 -0
- flybase_cli/cli.py +667 -0
- flybase_cli/config.py +266 -0
- flybase_cli/core.py +700 -0
- flybase_cli/loaders.py +539 -0
- flybase_cli/postgres.py +106 -0
- flybase_cli/querying.py +162 -0
- flybase_cli/schema.py +671 -0
- flybase_cli/semantics.py +114 -0
- flybase_cli/syncing.py +254 -0
- flybase_cli/version.py +1 -0
- flybase_cli-0.1.2.dist-info/METADATA +244 -0
- flybase_cli-0.1.2.dist-info/RECORD +18 -0
- flybase_cli-0.1.2.dist-info/WHEEL +5 -0
- flybase_cli-0.1.2.dist-info/entry_points.txt +2 -0
- flybase_cli-0.1.2.dist-info/licenses/LICENSE +21 -0
- flybase_cli-0.1.2.dist-info/top_level.txt +1 -0
flybase_cli/semantics.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# One-line descriptions for column names that are matched exactly by key.
# describe_column_name consults this table first and only then falls back to
# its suffix/substring heuristics.
COLUMN_DESCRIPTION_HINTS: dict[str, str] = {
    # Row identity and nested-JSON lineage columns.
    "record_id": "Primary row identifier for this table.",
    "parent_record_id": "Parent record identifier from a nested JSON source row.",
    "parent_ordinal": "Ordinal position of the direct parent row within a repeated list.",
    "ordinal": "Ordinal position within a repeated list.",
    "value": "Scalar value from a repeated JSON list.",
    "payload_json": "Full raw JSON payload for the source record.",
    # Gene identifiers.
    "fbgn_id": "FlyBase gene identifier.",
    "primary_fbgn": "FlyBase gene identifier alias.",
    "flybase_fbgn": "FlyBase gene identifier alias.",
    # Transcript identifiers.
    "fbtr_id": "FlyBase transcript identifier.",
    "primary_fbtr": "FlyBase transcript identifier alias.",
    "flybase_fbtr": "FlyBase transcript identifier alias.",
    # Protein identifiers.
    "fbpp_id": "FlyBase protein identifier.",
    "primary_fbpp": "FlyBase protein identifier alias.",
    "flybase_fbpp": "FlyBase protein identifier alias.",
    # Annotation / feature columns.
    "annotation_id": "Genome annotation identifier.",
    "feature_id": "Feature identifier from GFF or GTF annotations.",
    "feature_name": "Feature symbol or display name.",
    "gene_symbol": "Gene symbol.",
    "symbol": "Feature symbol.",
    # Genomic coordinates.
    "seqid": "Sequence or chromosome identifier.",
    "start": "Genomic start coordinate.",
    "end": "Genomic end coordinate.",
    "strand": "Genomic strand.",
    # Free text.
    "summary": "Free-text summary field.",
}
|
|
31
|
+
|
|
32
|
+
# Named groups of column names. Each group records the column names it covers
# ("aliases"), a "kind" marker and a display "label". infer_table_tags checks a
# table's columns against the "aliases" tuples of these groups.
RELATIONSHIP_GROUPS: dict[str, dict[str, object]] = {
    "fbgn": dict(
        aliases=("fbgn_id", "primary_fbgn", "flybase_fbgn"),
        kind="id-alias",
        label="FlyBase fbgn",
    ),
    "fbtr": dict(
        aliases=("fbtr_id", "primary_fbtr", "flybase_fbtr", "transcript_id"),
        kind="id-alias",
        label="FlyBase fbtr/transcript",
    ),
    "fbpp": dict(
        aliases=("fbpp_id", "primary_fbpp", "flybase_fbpp"),
        kind="id-alias",
        label="FlyBase fbpp",
    ),
    "publication": dict(
        aliases=("fbrf", "fbrf_id", "reference_id", "publication_id", "pmid"),
        kind="shared-id",
        label="publication",
    ),
    "annotation": dict(
        aliases=("annotation_id", "feature_id"),
        kind="shared-id",
        label="annotation",
    ),
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def describe_column_name(column_name: str) -> str:
    """Return a short description for *column_name*, or ``""`` if none applies.

    An exact (case-sensitive) entry in ``COLUMN_DESCRIPTION_HINTS`` wins.
    Otherwise the lower-cased name is tested against a fixed, ordered list of
    suffix/substring heuristics and the first match is returned.
    """
    if column_name in COLUMN_DESCRIPTION_HINTS:
        return COLUMN_DESCRIPTION_HINTS[column_name]

    name = column_name.lower()
    # Order matters: the first matching rule decides the description.
    heuristics = (
        (name.endswith("_json"), "Raw JSON payload for this nested structure."),
        # Any name ending in "id" counts, which also covers the "_id" suffix.
        (name.endswith("id"), "Identifier column."),
        (
            "summary" in name or "description" in name,
            "Free-text description or summary field.",
        ),
        # A name equal to "symbol" also ends with "symbol".
        (name.endswith("symbol"), "Symbol or short label."),
        (
            name in ("startposition", "endposition"),
            "Genomic coordinate from a nested JSON location record.",
        ),
    )
    return next((text for matched, text in heuristics if matched), "")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def infer_table_tags(table_name: str, source_path: str, columns: list[str]) -> list[str]:
    """Derive a sorted list of semantic tags for a table.

    Tags come from two sources, both compared case-insensitively:

    * column names, matched against the alias tuples in
      ``RELATIONSHIP_GROUPS`` and a few fixed column names;
    * substrings of the table name and source path.

    Args:
        table_name: Name of the table.
        source_path: Path the table was loaded from.
        columns: Column names of the table.

    Returns:
        The distinct tags in ascending order.
    """
    haystack = (table_name + " " + source_path).lower()
    present = {name.lower() for name in columns}

    def has_alias(group: str) -> bool:
        # True when at least one alias of the group is a column of the table.
        return not present.isdisjoint(RELATIONSHIP_GROUPS[group]["aliases"])

    found: list[str] = []

    # Column-driven tags.
    if has_alias("fbgn") or "gene_symbol" in present or "symbol" in present:
        found.append("gene")
    group_to_tag = (
        ("fbtr", "transcript"),
        ("fbpp", "protein"),
        ("publication", "publication"),
        ("annotation", "annotation"),
    )
    found.extend(tag for group, tag in group_to_tag if has_alias(group))
    coordinate_columns = ("seqid", "start", "end", "startposition", "endposition")
    if not present.isdisjoint(coordinate_columns):
        found.append("coordinates")

    # "summary" may come from either the name/path or a column.
    if "summary" in haystack or "summary" in present:
        found.append("summary")

    # Tags driven purely by the table name / source path.
    needles_to_tag = (
        (("expression", "rpkm", "scrna"), "expression"),
        (("ortholog", "paralog"), "orthology"),
        (("interaction",), "interaction"),
        (("publication", "reference"), "reference"),
    )
    for needles, tag in needles_to_tag:
        if any(needle in haystack for needle in needles):
            found.append(tag)

    # Structural tags from the loader's bookkeeping columns.
    if "payload_json" in present:
        found.append("json")
    if "parent_record_id" in present:
        found.append("nested")

    return sorted(set(found))
|
flybase_cli/syncing.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from .config import SyncPreset
|
|
7
|
+
from .core import build_manifest, filter_manifest, sync_manifest, write_json
|
|
8
|
+
from .loaders import is_ingestable
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# (pattern, placeholder) pairs: every match of a pattern in a manifest path is
# replaced by its placeholder when stable_manifest_key normalizes the path.
# Both patterns match case-insensitively.
RELEASE_TOKEN_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = (
    # Release labels such as "FB2026_01".
    (re.compile(r"FB\d{4}_\d{2}", re.IGNORECASE), "FBRELEASE"),
    # File-name suffixes such as "fb_2026_01".
    (re.compile(r"fb_\d{4}_\d{2}", re.IGNORECASE), "fb_release"),
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def stable_manifest_key(path: str) -> str:
    """Return a lower-cased copy of *path* with release tokens replaced.

    Each pattern in ``RELEASE_TOKEN_PATTERNS`` is applied in order and every
    match is swapped for that pattern's placeholder, so paths that differ only
    in their release token map to the same key.
    """
    normalized = path
    for release_pattern, placeholder in RELEASE_TOKEN_PATTERNS:
        normalized = re.sub(release_pattern, placeholder, normalized)
    return normalized.lower()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def prefer_manifest_entry(entries: list[dict[str, str]]) -> dict[str, str]:
    """Return the entry whose ``"path"`` sorts first.

    When several entries share the smallest path, the one that appears first
    in *entries* is returned.

    Args:
        entries: Manifest entries; each must have a ``"path"`` key.

    Returns:
        The selected entry (the same object, not a copy).

    Raises:
        IndexError: If *entries* is empty.
        KeyError: If an entry has no ``"path"`` key.
    """
    if not entries:
        # Keep the exception type callers would have seen from ``sorted(...)[0]``.
        raise IndexError("prefer_manifest_entry() requires at least one entry")
    # min() is a single O(n) pass and returns the first minimal element,
    # matching what a stable sort followed by [0] would select.
    return min(entries, key=lambda item: item["path"])
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def group_manifest_by_stable_key(
    manifest: list[dict[str, str]],
) -> dict[str, list[dict[str, str]]]:
    """Bucket manifest entries by the stable key of their ``"path"``.

    Keys appear in first-seen order and entries keep their manifest order
    within each bucket.
    """
    buckets: dict[str, list[dict[str, str]]] = {}
    for entry in manifest:
        key = stable_manifest_key(entry["path"])
        if key in buckets:
            buckets[key].append(entry)
        else:
            buckets[key] = [entry]
    return buckets
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def merge_manifests(manifests: list[list[dict[str, str]]]) -> list[dict[str, str]]:
    """Combine several manifests into one list with unique paths, sorted by path.

    When the same ``"path"`` occurs more than once, the entry seen last (later
    manifest, or later position within a manifest) is the one kept.
    """
    by_path = {
        entry["path"]: entry
        for manifest in manifests
        for entry in manifest
    }
    # Paths are the dict keys, hence unique: ordering the keys orders the entries.
    return [by_path[path] for path in sorted(by_path)]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def filter_ingestable_manifest(manifest: list[dict[str, str]]) -> list[dict[str, str]]:
    """Keep only the entries whose ``"path"`` passes ``is_ingestable``.

    The relative order of the surviving entries is unchanged.
    """
    kept: list[dict[str, str]] = []
    for entry in manifest:
        if is_ingestable(Path(entry["path"])):
            kept.append(entry)
    return kept
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def preset_manifest(preset: SyncPreset, release: str) -> list[dict[str, str]]:
    """Build the merged manifest for every selection of *preset* in *release*.

    For each selection a manifest is built from the selection's prefix and
    filtered with its include/exclude settings; the per-selection results are
    then combined with ``merge_manifests``.
    """
    per_selection = [
        filter_manifest(
            build_manifest(selection.prefix, release=release),
            selection.includes,
            selection.excludes,
        )
        for selection in preset.selections
    ]
    return merge_manifests(per_selection)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def full_manifest(
    *,
    release: str,
    prefix: str = "precomputed_files/",
    include: list[str] | tuple[str, ...] = (),
    exclude: list[str] | tuple[str, ...] = (),
    ingestable_only: bool = True,
) -> list[dict[str, str]]:
    """Build the manifest for a whole release prefix.

    Args:
        release: Release passed through to ``build_manifest``.
        prefix: Prefix to list; defaults to ``"precomputed_files/"``.
        include: Include filters handed to ``filter_manifest``.
        exclude: Exclude filters handed to ``filter_manifest``.
        ingestable_only: When true, the result is additionally narrowed with
            ``filter_ingestable_manifest``.

    Returns:
        The filtered manifest entries.
    """
    listing = build_manifest(prefix, release=release)
    selected = filter_manifest(listing, include, exclude)
    if not ingestable_only:
        return selected
    return filter_ingestable_manifest(selected)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def diff_manifests(
    previous_manifest: list[dict[str, str]],
    current_manifest: list[dict[str, str]],
) -> dict[str, object]:
    """Classify manifest entries as added, removed, updated or unchanged.

    Both manifests are grouped by stable key. A key present on only one side
    contributes all of its entries to ``added`` or ``removed``. For a key
    present on both sides, one preferred entry per side is compared: equal
    ``"path"`` and ``"url"`` means ``unchanged``, anything else ``updated``.

    Returns:
        A dict holding the four lists plus a ``*_count`` for each of them.
    """
    before = group_manifest_by_stable_key(previous_manifest)
    after = group_manifest_by_stable_key(current_manifest)

    added: list[dict[str, object]] = []
    removed: list[dict[str, object]] = []
    updated: list[dict[str, object]] = []
    unchanged: list[dict[str, object]] = []

    for key in sorted(before.keys() | after.keys()):
        old_entries = before.get(key, [])
        new_entries = after.get(key, [])

        if not old_entries:
            for entry in new_entries:
                added.append({"stable_key": key, "to": entry})
        elif not new_entries:
            for entry in old_entries:
                removed.append({"stable_key": key, "from": entry})
        else:
            # Only one representative per side takes part in the comparison.
            old_entry = prefer_manifest_entry(old_entries)
            new_entry = prefer_manifest_entry(new_entries)
            identical = (
                old_entry["path"] == new_entry["path"]
                and old_entry["url"] == new_entry["url"]
            )
            if identical:
                unchanged.append({"stable_key": key, "item": new_entry})
            else:
                updated.append({"stable_key": key, "from": old_entry, "to": new_entry})

    return {
        "added": added,
        "removed": removed,
        "updated": updated,
        "unchanged": unchanged,
        "added_count": len(added),
        "removed_count": len(removed),
        "updated_count": len(updated),
        "unchanged_count": len(unchanged),
    }
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def build_release_diff(
    *,
    prefix: str,
    from_release: str,
    to_release: str,
    include: list[str] | tuple[str, ...] = (),
    exclude: list[str] | tuple[str, ...] = (),
) -> dict[str, object]:
    """Diff the manifests of one prefix between two releases.

    The manifest for *prefix* is built and filtered identically for
    *from_release* and *to_release*; the result of ``diff_manifests`` is
    returned together with the request parameters and both manifest sizes.
    """

    def load(release: str) -> list[dict[str, str]]:
        # Same prefix and filters for both sides of the comparison.
        return filter_manifest(build_manifest(prefix, release=release), include, exclude)

    previous = load(from_release)
    current = load(to_release)

    report: dict[str, object] = {
        "prefix": prefix,
        "from_release": from_release,
        "to_release": to_release,
        "previous_manifest_count": len(previous),
        "current_manifest_count": len(current),
    }
    report.update(diff_manifests(previous, current))
    return report
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def build_preset_release_diff(
    *,
    preset: SyncPreset,
    from_release: str,
    to_release: str,
) -> dict[str, object]:
    """Diff the manifests of a preset between two releases.

    The preset manifest is built once per release; the result of
    ``diff_manifests`` is returned together with the preset's name,
    description and prefixes, both release names and both manifest sizes.
    """
    previous = preset_manifest(preset, release=from_release)
    current = preset_manifest(preset, release=to_release)

    report: dict[str, object] = {
        "preset": preset.name,
        "description": preset.description,
        "prefixes": list(preset.prefixes),
        "from_release": from_release,
        "to_release": to_release,
        "previous_manifest_count": len(previous),
        "current_manifest_count": len(current),
    }
    report.update(diff_manifests(previous, current))
    return report
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def incremental_manifest(diff: dict[str, object]) -> list[dict[str, str]]:
    """Collect the ``"to"`` entries of a diff's added and updated items.

    Args:
        diff: A mapping with ``"added"`` and ``"updated"`` lists whose items
            each carry a ``"to"`` entry.

    Returns:
        Those entries sorted by ``"path"``; for equal paths, added entries
        come before updated ones.
    """
    targets = [
        change["to"]
        for bucket in ("added", "updated")
        for change in diff[bucket]
    ]
    targets.sort(key=lambda entry: entry["path"])
    return targets
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def sync_incremental_preset(
    *,
    preset: SyncPreset,
    root: Path,
    db_path: Path,
    manifest_path: Path,
    diff_path: Path | None,
    from_release: str,
    to_release: str,
    force: bool = False,
    no_header: bool = False,
) -> dict[str, object]:
    """Sync only the preset files that were added or updated between releases.

    The preset manifests of both releases are diffed, the added/updated
    entries are selected with ``incremental_manifest`` and handed to
    ``sync_manifest``.

    Args:
        preset: Preset whose selections define the manifests.
        root: Forwarded to ``sync_manifest``.
        db_path: Forwarded to ``sync_manifest``.
        manifest_path: Forwarded to ``sync_manifest``.
        diff_path: When not ``None``, the diff is written there with
            ``write_json`` before the sync starts.
        from_release: Release used for the "previous" manifest.
        to_release: Release used for the "current" manifest.
        force: Forwarded to ``sync_manifest``.
        no_header: Forwarded to ``sync_manifest``.

    Returns:
        Request metadata merged with the diff and then with the summary
        returned by ``sync_manifest``; on a key clash the later source wins.
    """
    diff = diff_manifests(
        preset_manifest(preset, release=from_release),
        preset_manifest(preset, release=to_release),
    )
    to_sync = incremental_manifest(diff)

    if diff_path is not None:
        write_json(diff_path, diff)

    summary = sync_manifest(
        to_sync,
        root=root,
        db_path=db_path,
        manifest_path=manifest_path,
        force=force,
        no_header=no_header,
    )

    result: dict[str, object] = {
        "preset": preset.name,
        "description": preset.description,
        "from_release": from_release,
        "to_release": to_release,
        "incremental_file_count": len(to_sync),
        "diff_path": None if diff_path is None else str(diff_path),
    }
    result.update(diff)
    result.update(summary)
    return result
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def sync_full_release(
    *,
    root: Path,
    db_path: Path,
    manifest_path: Path,
    release: str,
    prefix: str = "precomputed_files/",
    include: list[str] | tuple[str, ...] = (),
    exclude: list[str] | tuple[str, ...] = (),
    ingestable_only: bool = True,
    force: bool = False,
    no_header: bool = False,
) -> dict[str, object]:
    """Sync every selected file under a release prefix.

    The manifest comes from ``full_manifest`` (using *release*, *prefix*,
    *include*, *exclude* and *ingestable_only*) and is handed to
    ``sync_manifest`` together with the remaining arguments.

    Returns:
        ``mode``/``release``/``prefix``/``ingestable_only`` metadata merged
        with the summary returned by ``sync_manifest``; summary keys win on a
        clash.
    """
    selected = full_manifest(
        release=release,
        prefix=prefix,
        include=include,
        exclude=exclude,
        ingestable_only=ingestable_only,
    )
    summary = sync_manifest(
        selected,
        root=root,
        db_path=db_path,
        manifest_path=manifest_path,
        force=force,
        no_header=no_header,
    )

    result: dict[str, object] = {
        "mode": "full-sync",
        "release": release,
        "prefix": prefix,
        "ingestable_only": ingestable_only,
    }
    result.update(summary)
    return result
|
flybase_cli/version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.2"
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: flybase-cli
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: FlyBase sync/query helper for agents.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Dynamic: license-file
|
|
10
|
+
|
|
11
|
+
# FlyBase local sync/query
|
|
12
|
+
|
|
13
|
+
Use FlyBase bulk files for agent workloads; the live API serves only as a helper.
|
|
14
|
+
|
|
15
|
+
## Why
|
|
16
|
+
|
|
17
|
+
- `https://api.flybase.org/api/v1.0/` exists.
|
|
18
|
+
- some endpoints return useful JSON now, e.g. `domain/FBgn0001250`, `sequence/id/FBgn0001250`.
|
|
19
|
+
- some plausible endpoints return an empty body today.
|
|
20
|
+
- bulk bucket + release files: better for repeatable agent queries.
|
|
21
|
+
|
|
22
|
+
## Current surfaces checked
|
|
23
|
+
|
|
24
|
+
- release bucket: `https://s3ftp.flybase.org/releases/current/`
|
|
25
|
+
- precomputed files: `https://s3ftp.flybase.org/releases/current/precomputed_files/`
|
|
26
|
+
- Postgres dump: `https://s3ftp.flybase.org/releases/current/psql/FB2026_01.sql.gz`
|
|
27
|
+
- API root: `https://api.flybase.org/api/v1.0/`
|
|
28
|
+
- batch download: `https://flybase.org/batchdownload`
|
|
29
|
+
|
|
30
|
+
## Layout
|
|
31
|
+
|
|
32
|
+
- `src/flybase_cli/`: package code
|
|
33
|
+
- `tests/`: stdlib `unittest`
|
|
34
|
+
- `flybase_cli.py`: thin repo-root shim
|
|
35
|
+
- `pyproject.toml`: package metadata / console entrypoint
|
|
36
|
+
|
|
37
|
+
## CLI
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
python3 flybase_cli.py presets
|
|
41
|
+
|
|
42
|
+
python3 flybase_cli.py sync gene-core
|
|
43
|
+
|
|
44
|
+
python3 flybase_cli.py sync gene-core --release FB2026_01
|
|
45
|
+
|
|
46
|
+
python3 flybase_cli.py sync gene-knowledge --release FB2026_01
|
|
47
|
+
|
|
48
|
+
python3 flybase_cli.py full-sync --release FB2026_01
|
|
49
|
+
|
|
50
|
+
python3 flybase_cli.py full-sync \
|
|
51
|
+
--release FB2026_01 \
|
|
52
|
+
--include 'best_gene_summary|entity_publication'
|
|
53
|
+
|
|
54
|
+
python3 flybase_cli.py sync-incremental \
|
|
55
|
+
gene-knowledge \
|
|
56
|
+
--from-release FB2025_06 \
|
|
57
|
+
--release FB2026_01
|
|
58
|
+
|
|
59
|
+
python3 flybase_cli.py release-diff \
|
|
60
|
+
--preset gene-knowledge \
|
|
61
|
+
--from-release FB2025_06 \
|
|
62
|
+
--to-release FB2026_01
|
|
63
|
+
|
|
64
|
+
python3 flybase_cli.py genomes --release FB2026_01
|
|
65
|
+
|
|
66
|
+
python3 flybase_cli.py sync-genome \
|
|
67
|
+
--release FB2026_01 \
|
|
68
|
+
--genome dmel_r6.67 \
|
|
69
|
+
--section fasta \
|
|
70
|
+
--asset mirna
|
|
71
|
+
|
|
72
|
+
python3 flybase_cli.py genome-presets
|
|
73
|
+
|
|
74
|
+
python3 flybase_cli.py sync-genome \
|
|
75
|
+
--release FB2026_01 \
|
|
76
|
+
--genome dmel_r6.67 \
|
|
77
|
+
--preset mirna-fasta
|
|
78
|
+
|
|
79
|
+
PYTHONPATH=src python3 -m flybase_cli sync gene-expression
|
|
80
|
+
|
|
81
|
+
python3 flybase_cli.py manifest \
|
|
82
|
+
--url https://s3ftp.flybase.org/genomes/Drosophila_melanogaster/dmel_r6.67_FB2026_01/fasta/ \
|
|
83
|
+
--include 'miRNA'
|
|
84
|
+
|
|
85
|
+
python3 flybase_cli.py sync-url \
|
|
86
|
+
--url https://s3ftp.flybase.org/genomes/Drosophila_melanogaster/dmel_r6.67_FB2026_01/fasta/ \
|
|
87
|
+
--include 'miRNA'
|
|
88
|
+
|
|
89
|
+
python3 flybase_cli.py ingest \
|
|
90
|
+
data/flybase/precomputed_files/genes/best_gene_summary_fb_2026_01.tsv.gz \
|
|
91
|
+
data/flybase/precomputed_files/genes/fbgn_fbtr_fbpp_fb_2026_01.tsv.gz \
|
|
92
|
+
data/flybase/precomputed_files/genes/fbgn_annotation_ID_fb_2026_01.tsv.gz
|
|
93
|
+
|
|
94
|
+
python3 flybase_cli.py tables --columns
|
|
95
|
+
|
|
96
|
+
python3 flybase_cli.py describe --sample-values 2
|
|
97
|
+
python3 flybase_cli.py schema-export --sample-values 1
|
|
98
|
+
python3 flybase_cli.py query-plan --sample-values 1 --limit 5
|
|
99
|
+
python3 flybase_cli.py query-run --template-name gene-summary-by-fbgn --param fbgn_id=FBgn0002121
|
|
100
|
+
|
|
101
|
+
python3 flybase_cli.py fts-build
|
|
102
|
+
|
|
103
|
+
python3 flybase_cli.py search 'memory formation'
|
|
104
|
+
|
|
105
|
+
python3 flybase_cli.py pg-load --release FB2026_01
|
|
106
|
+
|
|
107
|
+
python3 flybase_cli.py sql \
|
|
108
|
+
"select * from fb_best_gene_summary_fb_2026_01 limit 5"
|
|
109
|
+
|
|
110
|
+
python3 flybase_cli.py sql \
|
|
111
|
+
"select s.fbgn_id, s.gene_symbol, a.annotation_id, p.flybase_fbtr, p.flybase_fbpp \
|
|
112
|
+
from fb_best_gene_summary_fb_2026_01 s \
|
|
113
|
+
join fb_fbgn_annotation_id_fb_2026_01 a on a.primary_fbgn = s.fbgn_id \
|
|
114
|
+
left join fb_fbgn_fbtr_fbpp_fb_2026_01 p on p.flybase_fbgn = s.fbgn_id \
|
|
115
|
+
limit 5"
|
|
116
|
+
|
|
117
|
+
python3 flybase_cli.py api domain/FBgn0001250
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Sync presets
|
|
121
|
+
|
|
122
|
+
- `gene-core`: summaries + FBgn/FBtr/FBpp + annotation IDs + SO annotations
|
|
123
|
+
- `gene-expression`: curated/high-throughput/scRNA expression slices
|
|
124
|
+
- `references`: publication/link tables
|
|
125
|
+
- `gene-knowledge`: core gene facts + representative publications + orthology tables
|
|
126
|
+
- `orthology`: ortholog, paralog, and disease-association tables
|
|
127
|
+
- `interactions`: gene- and allele-level interaction tables
|
|
128
|
+
|
|
129
|
+
## Full sync
|
|
130
|
+
|
|
131
|
+
- `full-sync` crawls an entire release prefix, default `precomputed_files/`
|
|
132
|
+
- default behavior: download only files the current loaders can ingest into SQLite
|
|
133
|
+
- use `--all-files` if you want non-ingestable release artifacts too
|
|
134
|
+
- use `--include` / `--exclude` to stage a narrower smoke or partial warehouse
|
|
135
|
+
- default manifest path: `data/flybase/manifests/<release>/full-sync.json`
|
|
136
|
+
|
|
137
|
+
## Discovery
|
|
138
|
+
|
|
139
|
+
- `genomes --release FB2026_01` lists genome builds linked from that FlyBase release
|
|
140
|
+
- `sync-url` turns a crawlable FlyBase directory URL into a one-step local sync
|
|
141
|
+
- `sync-genome` resolves a release/build pair into the right genome-section URL automatically
|
|
142
|
+
- `genome-presets` lists reusable genome asset sync recipes
|
|
143
|
+
|
|
144
|
+
## Genome sync
|
|
145
|
+
|
|
146
|
+
- sections: `fasta`, `gff`, `gtf`, `dna`, `chado-xml`
|
|
147
|
+
- asset shortcuts include `mirna`, `transcript`, `translation`, `gene`, `chromosome`, `cds`, `ncrna`, `gff`, `gtf`
|
|
148
|
+
- presets include `mirna-fasta`, `transcript-fasta`, `translation-fasta`, `gene-fasta`, `chromosome-fasta`, `ncrna-fasta`, `gff-all`, `gtf-all`
|
|
149
|
+
- use `--include`/`--exclude` for narrower file selection on top of the asset preset
|
|
150
|
+
|
|
151
|
+
## Ingest formats
|
|
152
|
+
|
|
153
|
+
- delimited: `tsv`, `csv`, gzipped variants
|
|
154
|
+
- sequence: `fasta`, `fa`, `fna`, `faa`, gzipped variants
|
|
155
|
+
- annotation: `gff`, `gff3`, `gtf`, gzipped variants
|
|
156
|
+
- JSON: `json`, `json.gz`
|
|
157
|
+
|
|
158
|
+
## JSON ingest
|
|
159
|
+
|
|
160
|
+
- top-level scalar JSON fields become queryable SQLite columns
|
|
161
|
+
- one nested dict level is flattened, e.g. `gene.symbol` -> `gene_symbol`
|
|
162
|
+
- repeated top-level lists become child tables, eg `symbolSynonyms` -> `<table>_symbolsynonyms`
|
|
163
|
+
- repeated lists nested inside child dict rows become descendant tables, eg `genomeLocations[].exons[]` -> `<table>_genomelocations_exons`
|
|
164
|
+
- full source record remains in `payload_json`
|
|
165
|
+
|
|
166
|
+
Example:
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
python3 flybase_cli.py sql \
|
|
170
|
+
"select record_id, symbol, gene_geneId from fb_ncrna_genes_fb_2026_01 limit 5"
|
|
171
|
+
|
|
172
|
+
python3 flybase_cli.py sql \
|
|
173
|
+
"select parent_record_id, ordinal, value \
|
|
174
|
+
from fb_ncrna_genes_fb_2026_01_symbolsynonyms \
|
|
175
|
+
limit 5"
|
|
176
|
+
|
|
177
|
+
python3 flybase_cli.py sql \
|
|
178
|
+
"select parent_record_id, parent_ordinal, ordinal, startPosition, endPosition \
|
|
179
|
+
from fb_ncrna_genes_fb_2026_01_genomelocations_exons \
|
|
180
|
+
limit 5"
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Search
|
|
184
|
+
|
|
185
|
+
- `fts-build` creates a local SQLite FTS5 index from ingested tables
|
|
186
|
+
- `search` queries that index without calling the live FlyBase API
|
|
187
|
+
- record ids prefer stable FlyBase-like columns such as `fbgn_id`, `primary_fbgn`, `flybase_fbtr`
|
|
188
|
+
|
|
189
|
+
## Metadata
|
|
190
|
+
|
|
191
|
+
- `describe` summarizes ingested tables with row counts, source paths, semantic tags, columns, and representative non-empty values
|
|
192
|
+
- `schema-export` writes the same metadata to a deterministic JSON artifact beside the SQLite DB, eg `FB2026_01.schema.json`
|
|
193
|
+
- `schema-export` also includes inferred `relationships` for nested child tables and common FlyBase ID joins
|
|
194
|
+
- `schema-export` also emits `semantic_summary` for table/entity tag coverage
|
|
195
|
+
- `schema-export` also emits ready-to-run `query_templates`
|
|
196
|
+
- `query-plan` prints starter SQL without the larger schema payload
|
|
197
|
+
- `query-plan` now includes named biological templates such as `gene-summary-by-fbgn`, `transcript-protein-links`, `publications-for-gene`, and coordinate lookups when matching tables exist
|
|
198
|
+
- `query-run` selects one template and executes it with parameter values
|
|
199
|
+
- useful first step before writing ad hoc SQL or building agent query plans
|
|
200
|
+
|
|
201
|
+
Example:
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
python3 flybase_cli.py schema-export \
|
|
205
|
+
--db data/flybase/FB2026_01.sqlite \
|
|
206
|
+
--sample-values 1
|
|
207
|
+
|
|
208
|
+
python3 flybase_cli.py query-plan \
|
|
209
|
+
--db data/flybase/FB2026_01.sqlite \
|
|
210
|
+
--sample-values 1 \
|
|
211
|
+
--limit 5
|
|
212
|
+
|
|
213
|
+
python3 flybase_cli.py query-run \
|
|
214
|
+
--db data/flybase/FB2026_01.sqlite \
|
|
215
|
+
--template-name gene-summary-by-fbgn \
|
|
216
|
+
--param fbgn_id=FBgn0002121
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## Notes
|
|
220
|
+
|
|
221
|
+
- nested JSON child tables keep lineage columns like `parent_record_id`, `parent_ordinal`, `ordinal`.
|
|
222
|
+
- many FlyBase files start with `##` metadata lines; loader skips those.
|
|
223
|
+
- `sync` writes a preset manifest under `data/flybase/manifests/<release>/`.
|
|
224
|
+
- `full-sync` is the broadest offline path for release bulk data without going through the full Postgres dump.
|
|
225
|
+
- `sync --release FB2026_01` defaults to `data/flybase/FB2026_01.sqlite` to avoid cross-release mixing.
|
|
226
|
+
- `sync-incremental` uses stable manifest keys so release-renamed files still land in `updated` instead of noisy add/remove pairs.
|
|
227
|
+
- `release-diff` compares releases either by raw prefix or by curated multi-prefix preset.
|
|
228
|
+
- `manifest --url` lets you crawl non-`releases/` FlyBase directories such as genome FASTA/GFF trees.
|
|
229
|
+
- `sync-url` is the shortest path for genome assets once you know the directory URL.
|
|
230
|
+
- `sync-genome` is the shortest path when you know the FlyBase release + genome build label.
|
|
231
|
+
- `sync-genome --preset ...` is the preferred path for common genome asset pulls.
|
|
232
|
+
- some FlyBase `.gff.gz` assets are tar-wrapped gzip archives; loader handles that transparently.
|
|
233
|
+
- `sql` and `query-run` shape results as record-oriented JSON with summary metadata for agent chaining.
|
|
234
|
+
- `pg-load` stages the full Postgres import script for `releases/<release>/psql/<release>.sql.gz`.
|
|
235
|
+
- `pg-load --execute` runs the staged script when `createdb` and `psql` are installed locally.
|
|
236
|
+
- SQLite keeps setup minimal; switch to DuckDB/Postgres if you want bigger joins/faster scans.
|
|
237
|
+
- if you only need a few IDs, FlyBase Batch Download may be simpler than syncing files.
|
|
238
|
+
- use `--no-header` for files whose first non-comment row is data, not column names.
|
|
239
|
+
|
|
240
|
+
## Tests
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
python3 -m unittest discover -s tests
|
|
244
|
+
```
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
flybase_cli/__init__.py,sha256=lEw53vLD-DrNuj5MFPyP9jpSSwFtyNOEhQ0hM5TPJjc,90
|
|
2
|
+
flybase_cli/__main__.py,sha256=PSQ4rpL0dG6f-qH4N7H-gD9igQkdHzH4yVZDcW8lfZo,80
|
|
3
|
+
flybase_cli/cli.py,sha256=05xJ2O9WQD3fNAlZDzdNxAeAH1ZXdbPJexbo3ArvJHE,25338
|
|
4
|
+
flybase_cli/config.py,sha256=9arWRL_7Hw3mWjKjn_nR7j7j9EGnJKzytyyy0wAtFRI,7620
|
|
5
|
+
flybase_cli/core.py,sha256=ID2p3ZHfEXh2twuEiOFOT7QBuZzJv4ln-w9oTL2DNdo,21681
|
|
6
|
+
flybase_cli/loaders.py,sha256=8ll0lJ3dHMRhRX3oCjE8_XRSP1-wcxzdf_4ckhElVvc,18833
|
|
7
|
+
flybase_cli/postgres.py,sha256=5TVLQj3kZwdCboG_oq1tbKdSAKhhSm4726YJLiDpxlI,2731
|
|
8
|
+
flybase_cli/querying.py,sha256=x9R-R6XitCYkVJpPHtwhdrOYXLP2uTUbkS3iKUcTRK4,4816
|
|
9
|
+
flybase_cli/schema.py,sha256=1-40-ep1cEKoDWw7-9nA_-euIWRR_9c4E9hHcNl9Yf8,24055
|
|
10
|
+
flybase_cli/semantics.py,sha256=SpixB9nPNYBSWRgm6WMcXBkrYSbKxFo-rX4hw2NRBiQ,4649
|
|
11
|
+
flybase_cli/syncing.py,sha256=Qa7lGtKd-O6jrnNS3OPcbHHYHRr8CvHm7G4P32lVZbQ,7783
|
|
12
|
+
flybase_cli/version.py,sha256=YvuYzWnKtqBb-IqG8HAu-nhIYAsgj9Vmc_b9o7vO-js,22
|
|
13
|
+
flybase_cli-0.1.2.dist-info/licenses/LICENSE,sha256=pOm_AKDa_IHzuMyjTQlSC46C0WZiYTDhQXkmlkSk6cI,1080
|
|
14
|
+
flybase_cli-0.1.2.dist-info/METADATA,sha256=6TnyXJI8WMpCezR1ngYrE0L3AhaOJ1ezlHhTvO6Lso4,9577
|
|
15
|
+
flybase_cli-0.1.2.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
16
|
+
flybase_cli-0.1.2.dist-info/entry_points.txt,sha256=UL7hpwro7lPQv5gu3MNehvdDcLfy2zoPqBM81rQK6Tg,53
|
|
17
|
+
flybase_cli-0.1.2.dist-info/top_level.txt,sha256=QbB-Gk-A6obyDd9fkFs2Njv6zB9k7OWTBLtKW-xsP7Q,12
|
|
18
|
+
flybase_cli-0.1.2.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Gustavo Madeira Santana
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
flybase_cli
|