flybase-cli 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flybase_cli/__init__.py +4 -0
- flybase_cli/__main__.py +5 -0
- flybase_cli/cli.py +667 -0
- flybase_cli/config.py +266 -0
- flybase_cli/core.py +700 -0
- flybase_cli/loaders.py +539 -0
- flybase_cli/postgres.py +106 -0
- flybase_cli/querying.py +162 -0
- flybase_cli/schema.py +671 -0
- flybase_cli/semantics.py +114 -0
- flybase_cli/syncing.py +254 -0
- flybase_cli/version.py +1 -0
- flybase_cli-0.1.2.dist-info/METADATA +244 -0
- flybase_cli-0.1.2.dist-info/RECORD +18 -0
- flybase_cli-0.1.2.dist-info/WHEEL +5 -0
- flybase_cli-0.1.2.dist-info/entry_points.txt +2 -0
- flybase_cli-0.1.2.dist-info/licenses/LICENSE +21 -0
- flybase_cli-0.1.2.dist-info/top_level.txt +1 -0
flybase_cli/config.py
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# --- Base URLs and the default release label -------------------------------
BASE_RELEASES = "https://s3ftp.flybase.org/releases/"
BASE_API = "https://api.flybase.org/api/v1.0/"
DEFAULT_RELEASE = "current"

# --- Default filesystem locations (all derived from DEFAULT_ROOT) ----------
# DEFAULT_ROOT is a relative path, so these resolve against the process's
# current working directory.
DEFAULT_ROOT = Path("data/flybase")
DEFAULT_DB = DEFAULT_ROOT.joinpath("flybase.sqlite")
DEFAULT_MANIFEST = DEFAULT_ROOT.joinpath("manifest.json")
DEFAULT_POSTGRES_DIR = DEFAULT_ROOT.joinpath("postgres")

BATCH_SIZE = 1000

# --- File suffix groups, each listing plain and gzip-compressed variants ---
DELIMITED_SUFFIXES = (".tsv", ".csv", ".tsv.gz", ".csv.gz")
FASTA_SUFFIXES = (
    ".fasta", ".fa", ".fna", ".faa",
    ".fasta.gz", ".fa.gz", ".fna.gz", ".faa.gz",
)
GFF_SUFFIXES = (".gff", ".gff3", ".gff.gz", ".gff3.gz")
GTF_SUFFIXES = (".gtf", ".gtf.gz")
JSON_SUFFIXES = (".json", ".json.gz")

# Concatenation of every suffix group above, in declaration order.
INGEST_SUFFIXES = (
    *DELIMITED_SUFFIXES,
    *FASTA_SUFFIXES,
    *GFF_SUFFIXES,
    *GTF_SUFFIXES,
    *JSON_SUFFIXES,
)

# --- Candidate identifier names --------------------------------------------
# NOTE(review): presumably tried in order as column / key names when looking
# for an identifier; confirm against the code that consumes these tuples.
SEARCH_ID_CANDIDATES = (
    "fbgn_id", "primary_fbgn", "flybase_fbgn", "gene_primary_id",
    "annotation_id", "gene_symbol", "flybase_fbtr", "flybase_fbpp",
)
JSON_ID_CANDIDATES = (
    "primaryId", "primary_id", "id", "fbid",
    "fbgn_id", "gene_symbol", "symbol", "name",
)
JSON_MAX_INFERRED_COLUMNS = 24

# --- Genome sections and per-asset filename patterns -----------------------
GENOME_SECTIONS = ("fasta", "gff", "gtf", "dna", "chado-xml")

# Asset name -> regular-expression source string.
# NOTE(review): the "gff" pattern does not match ".gff3" names even though
# GFF_SUFFIXES lists them; confirm that is intended.
GENOME_ASSET_PATTERNS = {
    "mirna": r"miRNA",
    "transcript": r"transcript",
    "translation": r"translation",
    "gene": r"all-gene-",
    "gene-extended": r"gene_extended2000",
    "chromosome": r"chromosome",
    "cds": r"CDS",
    "ncrna": r"ncRNA",
    "gff": r"\.gff(\.gz)?$",
    "gtf": r"\.gtf(\.gz)?$",
}
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass(frozen=True)
class ManifestSelection:
    """Immutable record pairing a path prefix with include/exclude patterns.

    Instances are frozen, so they are hashable and cannot be mutated after
    construction.
    """

    # Path prefix this selection applies to; the presets in this module use
    # values such as "precomputed_files/genes/".
    prefix: str
    # Patterns to include. NOTE(review): presumably regular-expression
    # fragments (the presets write them as raw strings); confirm against the
    # code that performs the matching.
    includes: tuple[str, ...]
    # Patterns to exclude; defaults to no exclusions.
    excludes: tuple[str, ...] = ()
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclass(frozen=True)
class SyncPreset:
    """Named, immutable bundle of manifest selections.

    The ``prefixes``, ``includes`` and ``excludes`` properties flatten the
    per-selection data into plain tuples, preserving selection order.
    """

    name: str
    description: str
    selections: tuple[ManifestSelection, ...]

    @property
    def prefixes(self) -> tuple[str, ...]:
        """Return the ``prefix`` of every selection, in order."""
        gathered: list[str] = []
        for entry in self.selections:
            gathered.append(entry.prefix)
        return tuple(gathered)

    @property
    def includes(self) -> tuple[str, ...]:
        """Return every include pattern, concatenated across selections."""
        gathered: list[str] = []
        for entry in self.selections:
            gathered.extend(entry.includes)
        return tuple(gathered)

    @property
    def excludes(self) -> tuple[str, ...]:
        """Return every exclude pattern, concatenated across selections."""
        gathered: list[str] = []
        for entry in self.selections:
            gathered.extend(entry.excludes)
        return tuple(gathered)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@dataclass(frozen=True)
class GenomeSyncPreset:
    """Immutable description of one genome-file sync preset.

    Only ``name``, ``description`` and ``section`` are required; the other
    fields default to "unset" values.
    """

    name: str
    description: str
    # The presets in this module use "fasta", "gff" or "gtf" here.
    # NOTE(review): presumably expected to be one of GENOME_SECTIONS; nothing
    # in this class enforces that.
    section: str
    # Optional asset name. NOTE(review): the presets in this module use keys
    # of GENOME_ASSET_PATTERNS; confirm how consumers resolve it.
    asset: str | None = None
    # Extra include / exclude patterns; both default to empty.
    includes: tuple[str, ...] = ()
    excludes: tuple[str, ...] = ()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# Registry of precomputed-file sync presets, keyed by preset name.
# The key is taken from each preset's own ``name`` so the two cannot diverge;
# iteration order follows the order of the tuple below.
SYNC_PRESETS: dict[str, SyncPreset] = {
    preset.name: preset
    for preset in (
        SyncPreset(
            name="gene-core",
            description="Gene summaries plus core identifier/link tables.",
            selections=(
                ManifestSelection(
                    prefix="precomputed_files/genes/",
                    includes=(
                        r"best_gene_summary",
                        r"fbgn_fbtr_fbpp_fb_",
                        r"fbgn_annotation_ID",
                        r"dmel_gene_sequence_ontology_annotations",
                    ),
                ),
            ),
        ),
        SyncPreset(
            name="gene-expression",
            description="Expression-oriented gene reports.",
            selections=(
                ManifestSelection(
                    prefix="precomputed_files/genes/",
                    includes=(
                        r"curated_expression",
                        r"high-throughput_gene_expression",
                        r"gene_rpkm_report",
                        r"FlyCellAtlas_slimmed_gene_expression",
                        r"scRNA",
                    ),
                ),
            ),
        ),
        SyncPreset(
            name="references",
            description="Publication and cross-reference tables.",
            selections=(
                ManifestSelection(
                    prefix="precomputed_files/references/",
                    includes=(
                        r"fbrf_pmid_pmcid_doi",
                        r"entity_publication",
                        r"representative_publications",
                    ),
                ),
            ),
        ),
        SyncPreset(
            name="gene-knowledge",
            description="Core gene facts plus representative publications and orthology tables.",
            selections=(
                ManifestSelection(
                    prefix="precomputed_files/genes/",
                    includes=(
                        r"best_gene_summary",
                        r"fbgn_fbtr_fbpp_fb_",
                        r"fbgn_annotation_ID",
                        r"dmel_gene_sequence_ontology_annotations",
                    ),
                ),
                ManifestSelection(
                    prefix="precomputed_files/references/",
                    includes=(r"entity_publication", r"representative_publications"),
                ),
                ManifestSelection(
                    prefix="precomputed_files/orthologs/",
                    includes=(r"orthologs", r"paralogs", r"disease"),
                ),
            ),
        ),
        SyncPreset(
            name="orthology",
            description="Ortholog, paralog, and disease-association support tables.",
            selections=(
                ManifestSelection(
                    prefix="precomputed_files/orthologs/",
                    includes=(r"orthologs", r"paralogs", r"disease"),
                ),
            ),
        ),
        SyncPreset(
            name="interactions",
            description="Gene- and allele-level interaction tables.",
            selections=(
                ManifestSelection(
                    prefix="precomputed_files/genes/",
                    includes=(r"gene_genetic_interactions",),
                ),
                ManifestSelection(
                    prefix="precomputed_files/alleles/",
                    includes=(r"allele_genetic_interactions",),
                ),
            ),
        ),
    )
}
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# Registry of genome-file sync presets, keyed by preset name.
# The key is taken from each preset's own ``name`` so the two cannot diverge;
# iteration order follows the order of the tuple below.
GENOME_SYNC_PRESETS: dict[str, GenomeSyncPreset] = {
    preset.name: preset
    for preset in (
        GenomeSyncPreset(
            name="mirna-fasta",
            description="miRNA FASTA sequences for a genome build.",
            section="fasta",
            asset="mirna",
        ),
        GenomeSyncPreset(
            name="transcript-fasta",
            description="Transcript FASTA sequences for a genome build.",
            section="fasta",
            asset="transcript",
        ),
        GenomeSyncPreset(
            name="translation-fasta",
            description="Protein translation FASTA sequences for a genome build.",
            section="fasta",
            asset="translation",
        ),
        GenomeSyncPreset(
            name="gene-fasta",
            description="Gene FASTA sequences for a genome build.",
            section="fasta",
            asset="gene",
        ),
        GenomeSyncPreset(
            name="chromosome-fasta",
            description="Chromosome FASTA sequences for a genome build.",
            section="fasta",
            asset="chromosome",
        ),
        GenomeSyncPreset(
            name="ncrna-fasta",
            description="ncRNA FASTA sequences for a genome build.",
            section="fasta",
            asset="ncrna",
        ),
        GenomeSyncPreset(
            name="gff-all",
            description="Primary GFF annotation file for a genome build.",
            section="gff",
            asset="gff",
        ),
        GenomeSyncPreset(
            name="gtf-all",
            description="Primary GTF annotation file for a genome build.",
            section="gtf",
            asset="gtf",
        ),
    )
}
|