flybase-cli 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
flybase_cli/config.py ADDED
@@ -0,0 +1,266 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+
6
+
7
# Remote endpoints. Both URLs end with "/"; presumably relative paths are
# appended to them directly -- confirm against the callers.
BASE_RELEASES = "https://s3ftp.flybase.org/releases/"
BASE_API = "https://api.flybase.org/api/v1.0/"
DEFAULT_RELEASE = "current"

# Default on-disk locations. These are relative paths, so they resolve
# against the process's current working directory when used.
DEFAULT_ROOT = Path("data/flybase")
DEFAULT_DB = DEFAULT_ROOT / "flybase.sqlite"
DEFAULT_MANIFEST = DEFAULT_ROOT / "manifest.json"
DEFAULT_POSTGRES_DIR = DEFAULT_ROOT / "postgres"

# NOTE(review): presumably the number of rows written per batch -- confirm
# at the usage site.
BATCH_SIZE = 1000

# File-name suffixes grouped by format. Each group is a tuple, so it can be
# passed straight to ``str.endswith``.
DELIMITED_SUFFIXES = (".tsv", ".csv", ".tsv.gz", ".csv.gz")
FASTA_SUFFIXES = (
    ".fasta",
    ".fa",
    ".fna",
    ".faa",
    ".fasta.gz",
    ".fa.gz",
    ".fna.gz",
    ".faa.gz",
)
GFF_SUFFIXES = (".gff", ".gff3", ".gff.gz", ".gff3.gz")
GTF_SUFFIXES = (".gtf", ".gtf.gz")
JSON_SUFFIXES = (".json", ".json.gz")
# Concatenation of every suffix group above, in the order listed.
INGEST_SUFFIXES = (
    DELIMITED_SUFFIXES
    + FASTA_SUFFIXES
    + GFF_SUFFIXES
    + GTF_SUFFIXES
    + JSON_SUFFIXES
)

# Candidate identifier names. NOTE(review): the tuple order presumably
# expresses lookup priority -- confirm against the code that consumes them.
SEARCH_ID_CANDIDATES = (
    "fbgn_id",
    "primary_fbgn",
    "flybase_fbgn",
    "gene_primary_id",
    "annotation_id",
    "gene_symbol",
    "flybase_fbtr",
    "flybase_fbpp",
)
JSON_ID_CANDIDATES = (
    "primaryId",
    "primary_id",
    "id",
    "fbid",
    "fbgn_id",
    "gene_symbol",
    "symbol",
    "name",
)
JSON_MAX_INFERRED_COLUMNS = 24

GENOME_SECTIONS = ("fasta", "gff", "gtf", "dna", "chado-xml")
# Asset name -> regular-expression source string. Only "gff" and "gtf" are
# anchored to the end of the string; the other patterns match anywhere.
# NOTE(review): the "gff" pattern does not match ".gff3" names even though
# GFF_SUFFIXES lists them -- confirm that this is intended.
GENOME_ASSET_PATTERNS = {
    "mirna": r"miRNA",
    "transcript": r"transcript",
    "translation": r"translation",
    "gene": r"all-gene-",
    "gene-extended": r"gene_extended2000",
    "chromosome": r"chromosome",
    "cds": r"CDS",
    "ncrna": r"ncRNA",
    "gff": r"\.gff(\.gz)?$",
    "gtf": r"\.gtf(\.gz)?$",
}
64
+
65
+
66
@dataclass(frozen=True)
class ManifestSelection:
    """Immutable pairing of a path prefix with include/exclude patterns.

    Attributes:
        prefix: Path prefix string, e.g. ``"precomputed_files/genes/"``.
        includes: Pattern strings to include. NOTE(review): presumably
            regular expressions matched against file names -- confirm
            against the consumer.
        excludes: Pattern strings to exclude; defaults to an empty tuple.
    """

    prefix: str
    includes: tuple[str, ...]
    excludes: tuple[str, ...] = ()
71
+
72
+
73
@dataclass(frozen=True)
class SyncPreset:
    """Named, immutable bundle of manifest selections.

    Attributes:
        name: Preset identifier.
        description: Human-readable summary of the preset.
        selections: The selections that make up the preset.
    """

    name: str
    description: str
    selections: tuple[ManifestSelection, ...]

    @property
    def prefixes(self) -> tuple[str, ...]:
        """Return the prefix of every selection, in selection order."""
        return tuple(selection.prefix for selection in self.selections)

    @property
    def includes(self) -> tuple[str, ...]:
        """Return all include patterns, flattened in selection order."""
        return tuple(
            pattern
            for selection in self.selections
            for pattern in selection.includes
        )

    @property
    def excludes(self) -> tuple[str, ...]:
        """Return all exclude patterns, flattened in selection order."""
        return tuple(
            pattern
            for selection in self.selections
            for pattern in selection.excludes
        )
98
+
99
+
100
@dataclass(frozen=True)
class GenomeSyncPreset:
    """Named, immutable description of a genome download preset.

    Attributes:
        name: Preset identifier.
        description: Human-readable summary of the preset.
        section: Section name, e.g. ``"fasta"``. NOTE(review): presumably
            one of ``GENOME_SECTIONS`` -- not validated here.
        asset: Optional asset name; ``None`` by default. NOTE(review):
            presumably a key of ``GENOME_ASSET_PATTERNS`` -- not validated
            here.
        includes: Extra include patterns; defaults to an empty tuple.
        excludes: Extra exclude patterns; defaults to an empty tuple.
    """

    name: str
    description: str
    section: str
    asset: str | None = None
    includes: tuple[str, ...] = ()
    excludes: tuple[str, ...] = ()
108
+
109
+
110
# Selections used by more than one preset. They are defined once so that the
# presets sharing them cannot drift apart; ManifestSelection is frozen, so
# sharing a single instance is safe.
_GENE_CORE_SELECTION = ManifestSelection(
    prefix="precomputed_files/genes/",
    includes=(
        r"best_gene_summary",
        r"fbgn_fbtr_fbpp_fb_",
        r"fbgn_annotation_ID",
        r"dmel_gene_sequence_ontology_annotations",
    ),
)
_ORTHOLOGY_SELECTION = ManifestSelection(
    prefix="precomputed_files/orthologs/",
    includes=(
        r"orthologs",
        r"paralogs",
        r"disease",
    ),
)

# Preset name -> preset. Each key repeats the preset's own ``name`` field;
# keep the two in sync when adding an entry.
SYNC_PRESETS: dict[str, SyncPreset] = {
    "gene-core": SyncPreset(
        name="gene-core",
        description="Gene summaries plus core identifier/link tables.",
        selections=(_GENE_CORE_SELECTION,),
    ),
    "gene-expression": SyncPreset(
        name="gene-expression",
        description="Expression-oriented gene reports.",
        selections=(
            ManifestSelection(
                prefix="precomputed_files/genes/",
                includes=(
                    r"curated_expression",
                    r"high-throughput_gene_expression",
                    r"gene_rpkm_report",
                    r"FlyCellAtlas_slimmed_gene_expression",
                    r"scRNA",
                ),
            ),
        ),
    ),
    "references": SyncPreset(
        name="references",
        description="Publication and cross-reference tables.",
        selections=(
            ManifestSelection(
                prefix="precomputed_files/references/",
                includes=(
                    r"fbrf_pmid_pmcid_doi",
                    r"entity_publication",
                    r"representative_publications",
                ),
            ),
        ),
    ),
    "gene-knowledge": SyncPreset(
        name="gene-knowledge",
        description="Core gene facts plus representative publications and orthology tables.",
        selections=(
            _GENE_CORE_SELECTION,
            # Deliberately narrower than the "references" preset: the
            # fbrf_pmid_pmcid_doi table is not included here.
            ManifestSelection(
                prefix="precomputed_files/references/",
                includes=(
                    r"entity_publication",
                    r"representative_publications",
                ),
            ),
            _ORTHOLOGY_SELECTION,
        ),
    ),
    "orthology": SyncPreset(
        name="orthology",
        description="Ortholog, paralog, and disease-association support tables.",
        selections=(_ORTHOLOGY_SELECTION,),
    ),
    "interactions": SyncPreset(
        name="interactions",
        description="Gene- and allele-level interaction tables.",
        selections=(
            ManifestSelection(
                prefix="precomputed_files/genes/",
                includes=(r"gene_genetic_interactions",),
            ),
            ManifestSelection(
                prefix="precomputed_files/alleles/",
                includes=(r"allele_genetic_interactions",),
            ),
        ),
    ),
}
215
+
216
+
217
# Preset name -> genome preset. Each key repeats the preset's own ``name``
# field; keep the two in sync when adding an entry. Every ``asset`` used
# below is a key of GENOME_ASSET_PATTERNS and every ``section`` is a member
# of GENOME_SECTIONS. The "cds" and "gene-extended" assets have no preset.
GENOME_SYNC_PRESETS: dict[str, GenomeSyncPreset] = {
    "mirna-fasta": GenomeSyncPreset(
        name="mirna-fasta",
        description="miRNA FASTA sequences for a genome build.",
        section="fasta",
        asset="mirna",
    ),
    "transcript-fasta": GenomeSyncPreset(
        name="transcript-fasta",
        description="Transcript FASTA sequences for a genome build.",
        section="fasta",
        asset="transcript",
    ),
    "translation-fasta": GenomeSyncPreset(
        name="translation-fasta",
        description="Protein translation FASTA sequences for a genome build.",
        section="fasta",
        asset="translation",
    ),
    "gene-fasta": GenomeSyncPreset(
        name="gene-fasta",
        description="Gene FASTA sequences for a genome build.",
        section="fasta",
        asset="gene",
    ),
    "chromosome-fasta": GenomeSyncPreset(
        name="chromosome-fasta",
        description="Chromosome FASTA sequences for a genome build.",
        section="fasta",
        asset="chromosome",
    ),
    "ncrna-fasta": GenomeSyncPreset(
        name="ncrna-fasta",
        description="ncRNA FASTA sequences for a genome build.",
        section="fasta",
        asset="ncrna",
    ),
    "gff-all": GenomeSyncPreset(
        name="gff-all",
        description="Primary GFF annotation file for a genome build.",
        section="gff",
        asset="gff",
    ),
    "gtf-all": GenomeSyncPreset(
        name="gtf-all",
        description="Primary GTF annotation file for a genome build.",
        section="gtf",
        asset="gtf",
    ),
}