metatrawl 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metatrawl-0.1.0/PKG-INFO +220 -0
- metatrawl-0.1.0/README.md +207 -0
- metatrawl-0.1.0/pyproject.toml +27 -0
- metatrawl-0.1.0/setup.cfg +4 -0
- metatrawl-0.1.0/src/metatrawl/__init__.py +9 -0
- metatrawl-0.1.0/src/metatrawl/cache.py +204 -0
- metatrawl-0.1.0/src/metatrawl/cli.py +462 -0
- metatrawl-0.1.0/src/metatrawl/db.py +685 -0
- metatrawl-0.1.0/src/metatrawl/healthcheck.py +103 -0
- metatrawl-0.1.0/src/metatrawl/logging.py +29 -0
- metatrawl-0.1.0/src/metatrawl/workflows.py +397 -0
- metatrawl-0.1.0/src/metatrawl.egg-info/PKG-INFO +220 -0
- metatrawl-0.1.0/src/metatrawl.egg-info/SOURCES.txt +17 -0
- metatrawl-0.1.0/src/metatrawl.egg-info/dependency_links.txt +1 -0
- metatrawl-0.1.0/src/metatrawl.egg-info/entry_points.txt +2 -0
- metatrawl-0.1.0/src/metatrawl.egg-info/requires.txt +7 -0
- metatrawl-0.1.0/src/metatrawl.egg-info/top_level.txt +1 -0
- metatrawl-0.1.0/tests/test_healthcheck.py +81 -0
- metatrawl-0.1.0/tests/test_registry_cli.py +546 -0
metatrawl-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: metatrawl
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Mutable registry and workflow bookkeeping for large ZipStrain/SRA profiling projects.
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: click>=8.1
|
|
8
|
+
Requires-Dist: duckdb>=1.1
|
|
9
|
+
Requires-Dist: polars>=1.0
|
|
10
|
+
Requires-Dist: rich>=13.0
|
|
11
|
+
Provides-Extra: test
|
|
12
|
+
Requires-Dist: pytest>=8.0; extra == "test"
|
|
13
|
+
|
|
14
|
+
# MetaTrawl
|
|
15
|
+
|
|
16
|
+
MetaTrawl is a mutable DuckDB project store for SRA-scale ZipStrain projects.
|
|
17
|
+
It tracks run IDs, imports completed ZipStrain/Sylph outputs into real database
|
|
18
|
+
tables, coordinates shared genome cache preparation, and builds ZipStrain matrix
|
|
19
|
+
stores from selected samples.
|
|
20
|
+
|
|
21
|
+
The core idea is simple: many SRA workers can run in parallel, but one cache
|
|
22
|
+
owner prepares genome and Prodigal outputs for shared accessions. Workers create
|
|
23
|
+
per-sample concatenated references in scratch space, profile the sample, import
|
|
24
|
+
the final tables into DuckDB, and then delete scratch files.
|
|
25
|
+
|
|
26
|
+
## Install For Development
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install -e ".[test]"
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
MetaTrawl checks for the external tools used by the full workflow:
|
|
33
|
+
`zipstrain`, `sylph`, `samtools`, `bowtie2`, `prefetch`, `fasterq-dump`,
|
|
34
|
+
`datasets`, and `prodigal`.
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
metatrawl test
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Use the strict checker before long jobs. It exits non-zero if anything required
|
|
41
|
+
is missing:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
metatrawl check
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Database Workflow
|
|
48
|
+
|
|
49
|
+
Initialize a project database:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
metatrawl init --db metatrawl.duckdb
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Add SRA run IDs:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
metatrawl runs add --db metatrawl.duckdb SRR000001 SRR000002
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Export only runs that are not yet complete:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
metatrawl profiles remaining \
|
|
65
|
+
--db metatrawl.duckdb \
|
|
66
|
+
--output-file remaining_runs.csv
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
The CSV contains one column:
|
|
70
|
+
|
|
71
|
+
```csv
|
|
72
|
+
run_id
|
|
73
|
+
SRR000001
|
|
74
|
+
SRR000002
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Run the high-level sync. This gets remaining runs from DuckDB, runs the SRA
|
|
78
|
+
profiling lifecycle, finds completed profile outputs, imports them into DuckDB,
|
|
79
|
+
deletes the imported per-sample files, and logs each step:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
metatrawl sync \
|
|
83
|
+
--db metatrawl.duckdb \
|
|
84
|
+
--cache-dir cache \
|
|
85
|
+
--scratch-dir scratch \
|
|
86
|
+
--output-dir outputs \
|
|
87
|
+
--threads 16
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
`sync` expects per-run outputs in `--output-dir` using these conventional names:
|
|
91
|
+
|
|
92
|
+
```text
|
|
93
|
+
SRR000001.profile.parquet
|
|
94
|
+
SRR000001.genome_stats.parquet
|
|
95
|
+
SRR000001.gene_stats.parquet # optional
|
|
96
|
+
SRR000001.sylph.csv # csv, tsv, or parquet
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
After a successful import, these per-sample outputs are removed because DuckDB is
|
|
100
|
+
the durable project store. The durable cache is left intact. Use
|
|
101
|
+
`--keep-profile-outputs` only when debugging a failed or suspicious run.
|
|
102
|
+
|
|
103
|
+
After profiling, import completed outputs into DuckDB tables:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
metatrawl profiles import \
|
|
107
|
+
--db metatrawl.duckdb \
|
|
108
|
+
--run-id SRR000001 \
|
|
109
|
+
--profile-file outputs/SRR000001.profile.parquet \
|
|
110
|
+
--genome-stats-file outputs/SRR000001.genome_stats.parquet \
|
|
111
|
+
--gene-stats-file outputs/SRR000001.gene_stats.parquet \
|
|
112
|
+
--sylph-abundance-file outputs/SRR000001.sylph.csv
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Or import many samples from a manifest:
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
metatrawl profiles add \
|
|
119
|
+
--db metatrawl.duckdb \
|
|
120
|
+
--manifest completed_profiles.csv
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Manifest columns:
|
|
124
|
+
|
|
125
|
+
```csv
|
|
126
|
+
run_id,profile_file,genome_stats_file,gene_stats_file,sylph_abundance_file
|
|
127
|
+
SRR000001,/path/profile.parquet,/path/genome_stats.parquet,/path/gene_stats.parquet,/path/sylph.csv
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
`gene_stats_file` is optional. A run is complete after profile positions, genome
|
|
131
|
+
stats, and Sylph abundance have been imported.
|
|
132
|
+
|
|
133
|
+
## Cache Workflow
|
|
134
|
+
|
|
135
|
+
Prepare one sample reference from accessions using a shared cache:
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
metatrawl cache prepare \
|
|
139
|
+
--cache-dir cache \
|
|
140
|
+
--accessions accessions.csv \
|
|
141
|
+
--output-dir scratch/SRR000001/reference
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
For parallel workers, start a local cache server:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
metatrawl cache serve \
|
|
148
|
+
--cache-dir cache \
|
|
149
|
+
--host 127.0.0.1 \
|
|
150
|
+
--port 8765
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
The cache keeps only durable per-accession files:
|
|
154
|
+
|
|
155
|
+
```text
|
|
156
|
+
cache/genomes/GCF_xxx.fna
|
|
157
|
+
cache/genes/GCF_xxx.genes.fna
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Per-sample concatenated references are scratch outputs and should be deleted
|
|
161
|
+
after import.
|
|
162
|
+
|
|
163
|
+
## SRA Worker Lifecycle
|
|
164
|
+
|
|
165
|
+
`profile-sra` wires the worker lifecycle around remaining runs and scratch
|
|
166
|
+
cleanup:
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
metatrawl profile-sra \
|
|
170
|
+
--db metatrawl.duckdb \
|
|
171
|
+
--remaining-csv remaining_runs.csv \
|
|
172
|
+
--cache-dir cache \
|
|
173
|
+
--scratch-dir scratch \
|
|
174
|
+
--threads 8
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
Long-running steps emit compact cluster-friendly logs:
|
|
178
|
+
|
|
179
|
+
```text
|
|
180
|
+
METATRAWL sample=SRR123 step=sylph status=done genomes=12 elapsed=4.2s
|
|
181
|
+
METATRAWL sample=SRR123 step=cache status=done accessions=10 elapsed=28.9s
|
|
182
|
+
METATRAWL sample=SRR123 step=cleanup status=done removed=scratch/SRR123
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## Matrix Workflow
|
|
186
|
+
|
|
187
|
+
Build a ZipStrain matrix from complete DuckDB samples. Thresholds are applied
|
|
188
|
+
before temporary profile parquets are exported:
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
metatrawl matrix build \
|
|
192
|
+
--db metatrawl.duckdb \
|
|
193
|
+
--genome GCF_000269965.1_ASM26996v1_genomic.fna \
|
|
194
|
+
--bed-file reference/genomes.bed \
|
|
195
|
+
--stb-file reference/genomes.stb \
|
|
196
|
+
--output-file matrices/binfantis.h5 \
|
|
197
|
+
--min-coverage 1 \
|
|
198
|
+
--min-breadth 0.2 \
|
|
199
|
+
--min-ber 0.77 \
|
|
200
|
+
--min-sylph-abundance 0.001 \
|
|
201
|
+
--sparse
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
Append newly imported complete samples to a registered matrix:
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
metatrawl matrix append \
|
|
208
|
+
--db metatrawl.duckdb \
|
|
209
|
+
--matrix-id binfantis
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
Compare a registered matrix:
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
metatrawl matrix compare \
|
|
216
|
+
--db metatrawl.duckdb \
|
|
217
|
+
--matrix-id binfantis \
|
|
218
|
+
--output-file compares/binfantis.duckdb \
|
|
219
|
+
--calculate all
|
|
220
|
+
```
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
# MetaTrawl
|
|
2
|
+
|
|
3
|
+
MetaTrawl is a mutable DuckDB project store for SRA-scale ZipStrain projects.
|
|
4
|
+
It tracks run IDs, imports completed ZipStrain/Sylph outputs into real database
|
|
5
|
+
tables, coordinates shared genome cache preparation, and builds ZipStrain matrix
|
|
6
|
+
stores from selected samples.
|
|
7
|
+
|
|
8
|
+
The core idea is simple: many SRA workers can run in parallel, but one cache
|
|
9
|
+
owner prepares genome and Prodigal outputs for shared accessions. Workers create
|
|
10
|
+
per-sample concatenated references in scratch space, profile the sample, import
|
|
11
|
+
the final tables into DuckDB, and then delete scratch files.
|
|
12
|
+
|
|
13
|
+
## Install For Development
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install -e ".[test]"
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
MetaTrawl checks for the external tools used by the full workflow:
|
|
20
|
+
`zipstrain`, `sylph`, `samtools`, `bowtie2`, `prefetch`, `fasterq-dump`,
|
|
21
|
+
`datasets`, and `prodigal`.
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
metatrawl test
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Use the strict checker before long jobs. It exits non-zero if anything required
|
|
28
|
+
is missing:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
metatrawl check
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Database Workflow
|
|
35
|
+
|
|
36
|
+
Initialize a project database:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
metatrawl init --db metatrawl.duckdb
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Add SRA run IDs:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
metatrawl runs add --db metatrawl.duckdb SRR000001 SRR000002
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Export only runs that are not yet complete:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
metatrawl profiles remaining \
|
|
52
|
+
--db metatrawl.duckdb \
|
|
53
|
+
--output-file remaining_runs.csv
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
The CSV contains one column:
|
|
57
|
+
|
|
58
|
+
```csv
|
|
59
|
+
run_id
|
|
60
|
+
SRR000001
|
|
61
|
+
SRR000002
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Run the high-level sync. This gets remaining runs from DuckDB, runs the SRA
|
|
65
|
+
profiling lifecycle, finds completed profile outputs, imports them into DuckDB,
|
|
66
|
+
deletes the imported per-sample files, and logs each step:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
metatrawl sync \
|
|
70
|
+
--db metatrawl.duckdb \
|
|
71
|
+
--cache-dir cache \
|
|
72
|
+
--scratch-dir scratch \
|
|
73
|
+
--output-dir outputs \
|
|
74
|
+
--threads 16
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
`sync` expects per-run outputs in `--output-dir` using these conventional names:
|
|
78
|
+
|
|
79
|
+
```text
|
|
80
|
+
SRR000001.profile.parquet
|
|
81
|
+
SRR000001.genome_stats.parquet
|
|
82
|
+
SRR000001.gene_stats.parquet # optional
|
|
83
|
+
SRR000001.sylph.csv # csv, tsv, or parquet
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
After a successful import, these per-sample outputs are removed because DuckDB is
|
|
87
|
+
the durable project store. The durable cache is left intact. Use
|
|
88
|
+
`--keep-profile-outputs` only when debugging a failed or suspicious run.
|
|
89
|
+
|
|
90
|
+
After profiling, import completed outputs into DuckDB tables:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
metatrawl profiles import \
|
|
94
|
+
--db metatrawl.duckdb \
|
|
95
|
+
--run-id SRR000001 \
|
|
96
|
+
--profile-file outputs/SRR000001.profile.parquet \
|
|
97
|
+
--genome-stats-file outputs/SRR000001.genome_stats.parquet \
|
|
98
|
+
--gene-stats-file outputs/SRR000001.gene_stats.parquet \
|
|
99
|
+
--sylph-abundance-file outputs/SRR000001.sylph.csv
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Or import many samples from a manifest:
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
metatrawl profiles add \
|
|
106
|
+
--db metatrawl.duckdb \
|
|
107
|
+
--manifest completed_profiles.csv
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Manifest columns:
|
|
111
|
+
|
|
112
|
+
```csv
|
|
113
|
+
run_id,profile_file,genome_stats_file,gene_stats_file,sylph_abundance_file
|
|
114
|
+
SRR000001,/path/profile.parquet,/path/genome_stats.parquet,/path/gene_stats.parquet,/path/sylph.csv
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
`gene_stats_file` is optional. A run is complete after profile positions, genome
|
|
118
|
+
stats, and Sylph abundance have been imported.
|
|
119
|
+
|
|
120
|
+
## Cache Workflow
|
|
121
|
+
|
|
122
|
+
Prepare one sample reference from accessions using a shared cache:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
metatrawl cache prepare \
|
|
126
|
+
--cache-dir cache \
|
|
127
|
+
--accessions accessions.csv \
|
|
128
|
+
--output-dir scratch/SRR000001/reference
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
For parallel workers, start a local cache server:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
metatrawl cache serve \
|
|
135
|
+
--cache-dir cache \
|
|
136
|
+
--host 127.0.0.1 \
|
|
137
|
+
--port 8765
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
The cache keeps only durable per-accession files:
|
|
141
|
+
|
|
142
|
+
```text
|
|
143
|
+
cache/genomes/GCF_xxx.fna
|
|
144
|
+
cache/genes/GCF_xxx.genes.fna
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
Per-sample concatenated references are scratch outputs and should be deleted
|
|
148
|
+
after import.
|
|
149
|
+
|
|
150
|
+
## SRA Worker Lifecycle
|
|
151
|
+
|
|
152
|
+
`profile-sra` wires the worker lifecycle around remaining runs and scratch
|
|
153
|
+
cleanup:
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
metatrawl profile-sra \
|
|
157
|
+
--db metatrawl.duckdb \
|
|
158
|
+
--remaining-csv remaining_runs.csv \
|
|
159
|
+
--cache-dir cache \
|
|
160
|
+
--scratch-dir scratch \
|
|
161
|
+
--threads 8
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
Long-running steps emit compact cluster-friendly logs:
|
|
165
|
+
|
|
166
|
+
```text
|
|
167
|
+
METATRAWL sample=SRR123 step=sylph status=done genomes=12 elapsed=4.2s
|
|
168
|
+
METATRAWL sample=SRR123 step=cache status=done accessions=10 elapsed=28.9s
|
|
169
|
+
METATRAWL sample=SRR123 step=cleanup status=done removed=scratch/SRR123
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Matrix Workflow
|
|
173
|
+
|
|
174
|
+
Build a ZipStrain matrix from complete DuckDB samples. Thresholds are applied
|
|
175
|
+
before temporary profile parquets are exported:
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
metatrawl matrix build \
|
|
179
|
+
--db metatrawl.duckdb \
|
|
180
|
+
--genome GCF_000269965.1_ASM26996v1_genomic.fna \
|
|
181
|
+
--bed-file reference/genomes.bed \
|
|
182
|
+
--stb-file reference/genomes.stb \
|
|
183
|
+
--output-file matrices/binfantis.h5 \
|
|
184
|
+
--min-coverage 1 \
|
|
185
|
+
--min-breadth 0.2 \
|
|
186
|
+
--min-ber 0.77 \
|
|
187
|
+
--min-sylph-abundance 0.001 \
|
|
188
|
+
--sparse
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
Append newly imported complete samples to a registered matrix:
|
|
192
|
+
|
|
193
|
+
```bash
|
|
194
|
+
metatrawl matrix append \
|
|
195
|
+
--db metatrawl.duckdb \
|
|
196
|
+
--matrix-id binfantis
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Compare a registered matrix:
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
metatrawl matrix compare \
|
|
203
|
+
--db metatrawl.duckdb \
|
|
204
|
+
--matrix-id binfantis \
|
|
205
|
+
--output-file compares/binfantis.duckdb \
|
|
206
|
+
--calculate all
|
|
207
|
+
```
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "metatrawl"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Mutable registry and workflow bookkeeping for large ZipStrain/SRA profiling projects."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"click>=8.1",
|
|
9
|
+
"duckdb>=1.1",
|
|
10
|
+
"polars>=1.0",
|
|
11
|
+
"rich>=13.0",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[project.optional-dependencies]
|
|
15
|
+
test = [
|
|
16
|
+
"pytest>=8.0",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.scripts]
|
|
20
|
+
metatrawl = "metatrawl.cli:cli"
|
|
21
|
+
|
|
22
|
+
[build-system]
|
|
23
|
+
requires = ["setuptools>=68"]
|
|
24
|
+
build-backend = "setuptools.build_meta"
|
|
25
|
+
|
|
26
|
+
[tool.setuptools.packages.find]
|
|
27
|
+
where = ["src"]
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""Genome cache preparation for concurrent MetaTrawl workers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from concurrent.futures import Future, ThreadPoolExecutor
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
import shutil
|
|
9
|
+
import subprocess
|
|
10
|
+
import threading
|
|
11
|
+
from typing import Callable
|
|
12
|
+
|
|
13
|
+
import polars as pl
|
|
14
|
+
|
|
15
|
+
from metatrawl.logging import WorkflowLogger
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Signature of the genome download hook: called as (accession, destination FASTA path).
Downloader = Callable[[str, Path], None]
# Signature of the gene-calling hook: called as (genome FASTA path, destination gene FASTA path).
ProdigalRunner = Callable[[Path, Path], None]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True)
class PreparedReference:
    """Immutable pair of per-sample FASTA paths handed back by the cache manager."""

    # Concatenated genome sequences for the sample.
    reference_fasta: Path
    # Concatenated gene sequences for the sample.
    gene_fasta: Path
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class GenomeCache:
    """Single-process authoritative writer for genome/prodigal cache files.

    Cached files live under ``<cache_dir>/genomes`` and ``<cache_dir>/genes``.
    Uncached accessions are prepared on a four-thread pool, and concurrent
    requests for the same accession share one in-flight future so each file is
    only built once. Call :meth:`close` (or use the instance as a context
    manager) to release the worker threads when the cache is no longer needed.
    """

    def __init__(
        self,
        cache_dir: Path,
        *,
        downloader: Downloader | None = None,
        prodigal_runner: ProdigalRunner | None = None,
        logger: WorkflowLogger | None = None,
    ) -> None:
        """Create the cache directories and the worker pool.

        Args:
            cache_dir: Root directory of the durable cache; created if missing.
            downloader: Hook called as ``(accession, tmp_path)`` to fetch a genome
                FASTA. Defaults to ``download_genome_with_datasets``.
            prodigal_runner: Hook called as ``(genome_fasta, tmp_path)`` to write
                the gene FASTA. Defaults to ``run_prodigal_gene_fasta``.
            logger: Object whose ``emit(**fields)`` receives progress events.
                Defaults to a new ``WorkflowLogger``.
        """
        self.cache_dir = Path(cache_dir)
        self.genomes_dir = self.cache_dir / "genomes"
        self.genes_dir = self.cache_dir / "genes"
        self.genomes_dir.mkdir(parents=True, exist_ok=True)
        self.genes_dir.mkdir(parents=True, exist_ok=True)
        self.downloader = downloader or download_genome_with_datasets
        self.prodigal_runner = prodigal_runner or run_prodigal_gene_fasta
        self.logger = logger or WorkflowLogger()
        self._executor = ThreadPoolExecutor(max_workers=4)
        # Guards _in_flight so two callers never submit the same accession twice.
        self._lock = threading.Lock()
        self._in_flight: dict[str, Future[tuple[Path, Path]]] = {}

    def close(self) -> None:
        """Shut down the worker pool after pending preparations finish.

        Safe to call more than once. After closing, requests for accessions
        that are not already cached fail with ``RuntimeError`` from the pool.
        """
        self._executor.shutdown(wait=True)

    def __enter__(self) -> "GenomeCache":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def prepare_accession(self, accession: str) -> tuple[Path, Path]:
        """Ensure one accession has cached genome and gene FASTA files.

        Args:
            accession: Accession identifier; surrounding whitespace is ignored.

        Returns:
            ``(genome_fasta, gene_fasta)`` paths inside the cache.

        Raises:
            ValueError: If the accession is empty after stripping.
            Exception: Whatever the download or gene-calling hook raised.
        """
        accession = accession.strip()
        if not accession:
            raise ValueError("Empty accession requested from genome cache.")
        genome_fasta = self.genome_fasta_path(accession)
        gene_fasta = self.gene_fasta_path(accession)
        # Fast path: both files are already published, no locking needed.
        if genome_fasta.exists() and gene_fasta.exists():
            self.logger.emit(accession=accession, step="cache", status="cached")
            return genome_fasta, gene_fasta
        with self._lock:
            future = self._in_flight.get(accession)
            if future is None:
                self.logger.emit(accession=accession, step="cache", status="queued")
                future = self._executor.submit(self._prepare_accession_uncached, accession)
                self._in_flight[accession] = future
        try:
            return future.result()
        finally:
            # Forget finished work so a failed accession can be retried later.
            if future.done():
                with self._lock:
                    self._in_flight.pop(accession, None)

    def prepare_reference(self, *, accessions: list[str], output_dir: Path, sample: str | None = None) -> PreparedReference:
        """Build per-sample concatenated genome and gene FASTA files.

        Args:
            accessions: Accessions to include; blanks and duplicates are dropped.
            output_dir: Directory that receives ``reference.fna`` and ``genes.fna``;
                created if missing.
            sample: Optional sample identifier attached to the log events.

        Returns:
            Paths of the two concatenated FASTA files.

        Raises:
            ValueError: If no usable accession remains after cleaning.
        """
        clean_accessions = _dedupe(accessions)
        if not clean_accessions:
            raise ValueError("Cannot prepare a reference with no accessions.")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        self.logger.emit(sample=sample, step="cache", status="start", accessions=len(clean_accessions))
        prepared = [self.prepare_accession(accession) for accession in clean_accessions]
        reference_fasta = output_dir / "reference.fna"
        gene_fasta = output_dir / "genes.fna"
        _concatenate_fastas([pair[0] for pair in prepared], reference_fasta)
        _concatenate_fastas([pair[1] for pair in prepared], gene_fasta)
        self.logger.emit(sample=sample, step="cache", status="done", accessions=len(clean_accessions))
        return PreparedReference(reference_fasta=reference_fasta, gene_fasta=gene_fasta)

    def genome_fasta_path(self, accession: str) -> Path:
        """Return the cache path of the genome FASTA for ``accession``."""
        return self.genomes_dir / f"{_safe_name(accession)}.fna"

    def gene_fasta_path(self, accession: str) -> Path:
        """Return the cache path of the gene FASTA for ``accession``."""
        return self.genes_dir / f"{_safe_name(accession)}.genes.fna"

    def _prepare_accession_uncached(self, accession: str) -> tuple[Path, Path]:
        """Download and/or gene-call whichever cache file is still missing."""
        genome_fasta = self.genome_fasta_path(accession)
        gene_fasta = self.gene_fasta_path(accession)
        if not genome_fasta.exists():
            self.logger.emit(accession=accession, step="download", status="start")
            self._build_then_publish(genome_fasta, lambda tmp: self.downloader(accession, tmp))
            self.logger.emit(accession=accession, step="download", status="done", file=genome_fasta)
        if not gene_fasta.exists():
            self.logger.emit(accession=accession, step="prodigal", status="start")
            self._build_then_publish(gene_fasta, lambda tmp: self.prodigal_runner(genome_fasta, tmp))
            self.logger.emit(accession=accession, step="prodigal", status="done", file=gene_fasta)
        return genome_fasta, gene_fasta

    @staticmethod
    def _build_then_publish(final: Path, build: Callable[[Path], None]) -> None:
        """Run ``build`` against a temp path, then move the result onto ``final``.

        A partial temp file is removed when the build or the publish fails, so
        failed attempts do not accumulate in the durable cache directory.
        """
        tmp = final.with_suffix(".tmp.fna")
        try:
            build(tmp)
            _atomic_publish(tmp, final)
        except BaseException:
            tmp.unlink(missing_ok=True)
            raise
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def prepare_cache_reference(
    *,
    cache_dir: Path,
    accessions: list[str],
    output_dir: Path,
    logger: WorkflowLogger | None = None,
) -> PreparedReference:
    """Build a per-sample reference with a freshly constructed default cache.

    A new ``GenomeCache`` rooted at ``cache_dir`` (using the default download
    and gene-calling hooks) prepares ``accessions`` and writes the concatenated
    FASTA files into ``output_dir``.
    """
    cache = GenomeCache(cache_dir, logger=logger)
    return cache.prepare_reference(accessions=accessions, output_dir=output_dir)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def read_accessions_file(accessions_file: Path) -> list[str]:
    """Read accessions from a one-column CSV/TSV/text file.

    ``.csv`` files are parsed with polars: the ``accession`` column is used when
    present, otherwise the first column. Any other file is read as plain text
    and the first whitespace-delimited token of each non-blank line is taken.

    Args:
        accessions_file: Path to the accession list.

    Returns:
        Accessions in file order (duplicates are not removed here).
    """
    path = Path(accessions_file)
    if path.suffix.lower() == ".csv":
        df = pl.read_csv(path)
        column = "accession" if "accession" in df.columns else df.columns[0]
        # Null cells would otherwise turn into the bogus accession "None".
        return [str(value) for value in df[column].to_list() if value is not None]
    return [line.split()[0] for line in path.read_text().splitlines() if line.strip()]
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def download_genome_with_datasets(accession: str, output_fasta: Path) -> None:
    """Fetch one genome FASTA through the NCBI ``datasets`` command-line tool.

    The downloaded archive is unpacked into a scratch directory next to
    ``output_fasta``; the first ``*.fna`` file (in sorted order) is copied to
    ``output_fasta``. The archive and scratch directory are removed afterwards,
    whether or not the download succeeded.

    Raises:
        subprocess.CalledProcessError: If ``datasets`` exits non-zero.
        RuntimeError: If the unpacked archive contains no ``*.fna`` file.
    """
    parent = output_fasta.parent
    parent.mkdir(parents=True, exist_ok=True)
    archive = output_fasta.with_suffix(".zip")
    tmp_dir = parent / f"{output_fasta.stem}.datasets"
    command = ["datasets", "download", "genome", "accession", accession, "--filename", str(archive)]
    try:
        subprocess.run(command, check=True, capture_output=True, text=True)
        shutil.unpack_archive(str(archive), str(tmp_dir))
        candidates = sorted(tmp_dir.rglob("*.fna"))
        if len(candidates) == 0:
            raise RuntimeError(f"datasets did not produce a FASTA for accession {accession}")
        shutil.copy2(candidates[0], output_fasta)
    finally:
        # Always clear the scratch artifacts, even after a failed download.
        if archive.exists():
            archive.unlink()
        if tmp_dir.exists():
            shutil.rmtree(tmp_dir)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def run_prodigal_gene_fasta(genome_fasta: Path, output_gene_fasta: Path) -> None:
    """Run Prodigal and keep only nucleotide gene FASTA output.

    Prodigal is invoked with ``-d`` pointing at ``output_gene_fasta`` and ``-a``
    pointing at a sibling ``.faa`` file. The ``.faa`` file is not wanted and is
    removed whether Prodigal succeeds or fails, so failed runs leave no stray
    protein file behind.

    Args:
        genome_fasta: Input genome FASTA.
        output_gene_fasta: Destination for the nucleotide gene FASTA; its parent
            directory is created if missing.

    Raises:
        subprocess.CalledProcessError: If Prodigal exits non-zero.
    """
    output_gene_fasta.parent.mkdir(parents=True, exist_ok=True)
    proteins = output_gene_fasta.with_suffix(".faa")
    genes = output_gene_fasta
    try:
        subprocess.run(
            ["prodigal", "-i", str(genome_fasta), "-d", str(genes), "-a", str(proteins), "-p", "meta", "-q"],
            check=True,
            capture_output=True,
            text=True,
        )
    finally:
        # The protein translation is a by-product; drop it on every exit path.
        proteins.unlink(missing_ok=True)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _concatenate_fastas(inputs: list[Path], output: Path) -> None:
    """Concatenate FASTA files into ``output`` without introducing blank lines.

    Each input is streamed into a temp file beside ``output``; a newline is
    appended only when an input does not already end with one, and empty inputs
    contribute nothing. The temp file is then moved onto ``output``. If anything
    fails, the temp file is removed and the error is re-raised.

    Args:
        inputs: FASTA files to join, in order.
        output: Destination file; its parent directory must already exist.
    """
    tmp = output.with_suffix(output.suffix + ".tmp")
    try:
        with tmp.open("w") as out:
            for fasta in inputs:
                last_char = ""
                with Path(fasta).open() as src:
                    # Stream in chunks so large genomes are never held in memory whole.
                    while chunk := src.read(1 << 20):
                        out.write(chunk)
                        last_char = chunk[-1]
                if last_char and last_char != "\n":
                    # Keep the next record's header from gluing onto this sequence.
                    out.write("\n")
        _atomic_publish(tmp, output)
    except BaseException:
        tmp.unlink(missing_ok=True)
        raise


def _atomic_publish(tmp: Path, final: Path) -> None:
    """Move ``tmp`` onto ``final``, creating ``final``'s parent directory first."""
    final.parent.mkdir(parents=True, exist_ok=True)
    tmp.replace(final)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _safe_name(value: str) -> str:
    """Map ``value`` to a filename-friendly string.

    Alphanumeric characters and ``.``, ``_``, ``-`` are kept; every other
    character is replaced by ``_``.
    """
    allowed_punctuation = "._-"
    pieces = []
    for char in value:
        keep = char.isalnum() or char in allowed_punctuation
        pieces.append(char if keep else "_")
    return "".join(pieces)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _dedupe(values: list[str]) -> list[str]:
    """Strip each value, drop blanks, and keep the first occurrence of each one.

    The order of first appearance is preserved.
    """
    stripped = (value.strip() for value in values)
    # dict.fromkeys keeps insertion order, so it de-duplicates without reordering.
    return list(dict.fromkeys(item for item in stripped if item))
|