fetchm 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fetchm-0.1.9/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Tasnimul-Arabi-Anik
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,9 @@
1
+ include README.md
2
+ include LICENSE
3
+ include requirements.txt
4
+ recursive-include bin *
5
+ recursive-include fetchm *.py
6
+ prune figures
7
+ exclude test.tsv
8
+ global-exclude __pycache__
9
+ global-exclude *.py[cod]
fetchm-0.1.9/PKG-INFO ADDED
@@ -0,0 +1,231 @@
1
+ Metadata-Version: 2.4
2
+ Name: fetchm
3
+ Version: 0.1.9
4
+ Summary: A Python tool for fetching bacterial genome metadata and sequences.
5
+ Home-page: https://github.com/Tasnimul-Arabi-Anik/fetchM
6
+ Author: Tasnimul Arabi Anik
7
+ Author-email: arabianik987@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: pandas>=2.0
15
+ Requires-Dist: requests>=2.31
16
+ Requires-Dist: xmltodict>=0.13
17
+ Requires-Dist: matplotlib>=3.7
18
+ Requires-Dist: seaborn>=0.13
19
+ Requires-Dist: scipy>=1.11
20
+ Requires-Dist: tqdm>=4.66
21
+ Requires-Dist: plotly>=5.20
22
+ Requires-Dist: kaleido>=0.2.1
23
+ Dynamic: author
24
+ Dynamic: author-email
25
+ Dynamic: classifier
26
+ Dynamic: description
27
+ Dynamic: description-content-type
28
+ Dynamic: home-page
29
+ Dynamic: license-file
30
+ Dynamic: requires-dist
31
+ Dynamic: requires-python
32
+ Dynamic: summary
33
+
34
+ # fetchm: Metadata Fetching and Analysis Tool
35
+
36
+ ## Overview
37
+ fetchm is a Python-based tool for fetching and analyzing genomic metadata from NCBI BioSample records. The `ncbi_dataset.tsv` file downloaded from the NCBI genome database does not include metadata fields such as 'Collection Date', 'Host', 'Geographic Location', and 'Isolation Source'. fetchm takes that file as input, fetches the missing metadata for each BioSample ID from NCBI, retrieves additional annotations, filters the data based on quality thresholds, and generates visualizations to help interpret the results. You can also download the filtered sequences.
38
+
39
+ ## Features
40
+ - Fetch metadata from NCBI BioSample API.
41
+ - Filter genomes based on CheckM completeness and ANI check status.
42
+ - Generate metadata summaries and annotation statistics.
43
+ - Create various visualizations for geographic distribution, collection dates, gene counts, continent, and subcontinent.
44
+ - Download genome sequences (optional).
45
+ - Download sequences after filtering by host species, year, country, continent, and subcontinent.
46
+
47
+ ## Installation
48
+ ### Install in a New Conda Environment
49
+ ```bash
50
+ conda create -n fetchm python=3.9
51
+ conda activate fetchm
52
+ pip install fetchm
53
+ ```
54
+
55
+ ## Usage
56
+ fetchm has three main modes:
57
+
58
+ 1. Generate metadata summaries and `ncbi_clean.csv` from an NCBI dataset TSV:
59
+ ```bash
60
+ fetchm metadata --input ncbi_dataset.tsv --outdir results/
61
+ ```
62
+
63
+ 2. Run the full workflow: metadata generation plus sequence download:
64
+ ```bash
65
+ fetchm run --input ncbi_dataset.tsv --outdir results/
66
+ ```
67
+
68
+ 3. Download sequences later from an existing `ncbi_clean.csv`:
69
+ ```bash
70
+ fetchm seq --input results/<organism>/metadata_output/ncbi_clean.csv --outdir results/<organism>/sequence
71
+ ```
72
+
73
+ Common examples:
74
+
75
+ Download all metadata records regardless of ANI status:
76
+ ```bash
77
+ fetchm metadata --input ncbi_dataset.tsv --outdir results/ --ani all
78
+ ```
79
+
80
+ Run the full pipeline with a CheckM threshold:
81
+ ```bash
82
+ fetchm run --input ncbi_dataset.tsv --outdir results/ --checkm 95
83
+ ```
84
+
85
+ Download only sequences from human isolates collected between 2018 and 2024:
86
+ ```bash
87
+ fetchm seq \
88
+ --input results/<organism>/metadata_output/ncbi_clean.csv \
89
+ --outdir results/<organism>/sequence \
90
+ --host "Homo sapiens" \
91
+ --year 2018-2024
92
+ ```
93
+
94
+ Download only sequences from a specific country or continent:
95
+ ```bash
96
+ fetchm seq --input ncbi_clean.csv --outdir sequence_output --country Bangladesh
97
+ fetchm seq --input ncbi_clean.csv --outdir sequence_output --cont Asia
98
+ ```
99
+
100
+ Check download completeness without downloading anything:
101
+ ```bash
102
+ fetchm seq --input ncbi_clean.csv --outdir sequence_output --check-only
103
+ ```
104
+
105
+ Important notes:
106
+
107
+ - `fetchm run` already includes sequence downloading. You do not need to add `--seq` when using `fetchm run`.
108
+ - `--seq` is only relevant for the legacy `fetchM` command, where it controls whether sequence downloading happens after metadata generation.
109
+ - `fetchm seq` supports metadata-based sequence filters: `--host`, `--year`, `--country`, `--cont`, and `--subcont`.
110
+ - Metadata filtering options for `fetchm metadata` and `fetchm run` include `--ani`, `--checkm`, and `--sleep`.
111
+ - Sequence retry behavior can be adjusted with `--retries` and `--retry-delay`.
112
+
113
+ Legacy compatibility commands:
114
+ ```bash
115
+ fetchM --input ncbi_dataset.tsv --outdir results/
116
+ fetchM --input ncbi_dataset.tsv --outdir results/ --seq
117
+ fetchM-seq --input ncbi_clean.csv --outdir sequence_output
118
+ ```
119
+
120
+ ### Test With `test.tsv`
121
+
122
+ Run a quick metadata-only smoke test:
123
+ ```bash
124
+ fetchm metadata --input test.tsv --outdir test_output
125
+ ```
126
+
127
+ Run the full pipeline, including sequence download:
128
+ ```bash
129
+ fetchm run --input test.tsv --outdir test_output
130
+ ```
131
+
132
+ Check downloaded sequence completeness from the generated `ncbi_clean.csv`:
133
+ ```bash
134
+ fetchm seq \
135
+ --input test_output/Staphylococcus_haemolyticus/metadata_output/ncbi_clean.csv \
136
+ --outdir test_output/Staphylococcus_haemolyticus/sequence \
137
+ --check-only
138
+ ```
139
+
140
+ ## Input
141
+ Download ncbi_dataset.tsv of your target organism(s) from the [NCBI genome database](https://www.ncbi.nlm.nih.gov/datasets/genome/).
142
+ - **ncbi_dataset.tsv**
143
+
144
+ # Required Columns for `ncbi_dataset.tsv` in fetchm
145
+
146
+ Before running `fetchm`, ensure that your `ncbi_dataset.tsv` file includes the following columns. These columns are necessary for metadata enrichment, quality filtering, and downstream analysis.
147
+
148
+ ---
149
+
150
+ ## 🧬 Required Columns
151
+
152
+ | Column Name | Description |
153
+ |--------------------------------------------|-------------|
154
+ | `Assembly Accession` | Unique identifier for the assembly |
155
+ | `Assembly Name` | Name of the genome assembly |
156
+ | `Organism Name` | Scientific name of the organism |
157
+ | `ANI Check status` | Status of Average Nucleotide Identity (ANI) check |
158
+ | `Annotation Name` | Annotation version or label used |
159
+ | `Assembly Stats Total Sequence Length` | Total length (in base pairs) of all sequences in the assembly |
160
+ | `Assembly BioProject Accession` | Accession ID for the related BioProject |
161
+ | `Assembly BioSample Accession` | Accession ID for the related BioSample |
162
+ | `Annotation Count Gene Total` | Total number of genes annotated |
163
+ | `Annotation Count Gene Protein-coding` | Number of protein-coding genes |
164
+ | `Annotation Count Gene Pseudogene` | Number of pseudogenes |
165
+ | `CheckM completeness` | Completeness score from CheckM (in %) |
166
+ | `CheckM contamination` | Contamination score from CheckM (in %) |
167
+
168
+ ---
169
+
170
+ ## ✅ Tips
171
+
172
+ - The file must be **tab-separated** (`.tsv` format).
173
+ - Do not change the column headers.
174
+ ---
175
+
176
+ ## Output
177
+ fetchm creates a subdirectory inside the directory given by `--outdir` (for example `results/`), named after the organism in the input file. Inside this subdirectory, the following folders are created:
178
+ - **Metadata summaries** in `metadata_output/`
179
+ - `annotation_summary.csv`
180
+ - `assembly_summary.csv`
181
+ - `metadata_summary.csv`
182
+ - `ncbi_clean.csv`
183
+ - `ncbi_filtered.csv`
184
+ - `ncbi_dataset_updated.tsv`
185
+ - **Figures** in `figures/`
186
+ - `Annotation Count Gene Protein-coding_distribution.tiff`
187
+ - `Annotation Count Gene Pseudogene_distribution.tiff`
188
+ - `Annotation Count Gene Total_distribution.tiff`
189
+ - `Assembly Stats Total Sequence Length_distribution.tiff`
190
+ - `Collection Date_bar_plots.tiff`
191
+ - `Continent_bar_plots.tiff`
192
+ - `Geographic Location_bar_plots.tiff`
193
+ - `Geographic Location_map.jpg`
194
+ - `Host_bar_plots.tiff`
195
+ - `scatter_plot_gene_protein_coding_vs_collection_date.tiff`
196
+ - `scatter_plot_gene_total_vs_collection_date.tiff`
197
+ - `scatter_plot_total_sequence_length_vs_collection_date.tiff`
198
+ - `Subcontinent_bar_plots.tiff`
199
+ - **Sequences** in `sequence/` (contains the downloaded genome sequences when sequence download is requested, i.e. with `fetchm run`, `fetchm seq`, or the legacy `fetchM --seq`).
200
+
201
+
202
+ ## Visualizations
203
+ ### Annotation Distributions
204
+ ![Annotation Count Gene Protein-coding](figures/Annotation%20Count%20Gene%20Protein-coding_distribution.png)
205
+ ![Annotation Count Gene Pseudogene](figures/Annotation%20Count%20Gene%20Pseudogene_distribution.png)
206
+ ![Annotation Count Gene Total](figures/Annotation%20Count%20Gene%20Total_distribution.png)
207
+
208
+ ### Assembly Statistics
209
+ ![Assembly Sequence Length](figures/Assembly%20Stats%20Total%20Sequence%20Length_distribution.png)
210
+
211
+ ### Metadata Summaries
212
+ ![Collection Date Distribution](figures/Collection%20Date_bar_plots.png)
213
+ ![Geographic Location Map](figures/Geographic%20Location_map.jpg)
214
+ ![Geographic Location Distribution](figures/Geographic%20Location_bar_plots.png)
215
+ ![Host Distribution](figures/Host_bar_plots.png)
216
+ ![Continent Distribution](figures/Continent_bar_plots.png)
217
+ ![Subcontinent Distribution](figures/Subcontinent_bar_plots.png)
218
+
219
+ ### Scatter Plots
220
+ ![Gene Protein Coding vs Collection Date](figures/scatter_plot_gene_protein_coding_vs_collection_date.png)
221
+ ![Gene Total vs Collection Date](figures/scatter_plot_gene_total_vs_collection_date.png)
222
+ ![Sequence Length vs Collection Date](figures/scatter_plot_Sequence_Length_vs_collection_date.png)
223
+
224
+ ## License
225
+ This project is licensed under the MIT License.
226
+
227
+ ## Author
228
+ Developed by Tasnimul Arabi Anik.
229
+
230
+ ## Contributions
231
+ Contributions and improvements are welcome! Feel free to submit a pull request or report issues.
fetchm-0.1.9/README.md ADDED
@@ -0,0 +1,198 @@
1
+ # fetchm: Metadata Fetching and Analysis Tool
2
+
3
+ ## Overview
4
+ fetchm is a Python-based tool for fetching and analyzing genomic metadata from NCBI BioSample records. The `ncbi_dataset.tsv` file downloaded from the NCBI genome database does not include metadata fields such as 'Collection Date', 'Host', 'Geographic Location', and 'Isolation Source'. fetchm takes that file as input, fetches the missing metadata for each BioSample ID from NCBI, retrieves additional annotations, filters the data based on quality thresholds, and generates visualizations to help interpret the results. You can also download the filtered sequences.
5
+
6
+ ## Features
7
+ - Fetch metadata from NCBI BioSample API.
8
+ - Filter genomes based on CheckM completeness and ANI check status.
9
+ - Generate metadata summaries and annotation statistics.
10
+ - Create various visualizations for geographic distribution, collection dates, gene counts, continent, and subcontinent.
11
+ - Download genome sequences (optional).
12
+ - Download sequences after filtering by host species, year, country, continent, and subcontinent.
13
+
14
+ ## Installation
15
+ ### Install in a New Conda Environment
16
+ ```bash
17
+ conda create -n fetchm python=3.9
18
+ conda activate fetchm
19
+ pip install fetchm
20
+ ```
21
+
22
+ ## Usage
23
+ fetchm has three main modes:
24
+
25
+ 1. Generate metadata summaries and `ncbi_clean.csv` from an NCBI dataset TSV:
26
+ ```bash
27
+ fetchm metadata --input ncbi_dataset.tsv --outdir results/
28
+ ```
29
+
30
+ 2. Run the full workflow: metadata generation plus sequence download:
31
+ ```bash
32
+ fetchm run --input ncbi_dataset.tsv --outdir results/
33
+ ```
34
+
35
+ 3. Download sequences later from an existing `ncbi_clean.csv`:
36
+ ```bash
37
+ fetchm seq --input results/<organism>/metadata_output/ncbi_clean.csv --outdir results/<organism>/sequence
38
+ ```
39
+
40
+ Common examples:
41
+
42
+ Download all metadata records regardless of ANI status:
43
+ ```bash
44
+ fetchm metadata --input ncbi_dataset.tsv --outdir results/ --ani all
45
+ ```
46
+
47
+ Run the full pipeline with a CheckM threshold:
48
+ ```bash
49
+ fetchm run --input ncbi_dataset.tsv --outdir results/ --checkm 95
50
+ ```
51
+
52
+ Download only sequences from human isolates collected between 2018 and 2024:
53
+ ```bash
54
+ fetchm seq \
55
+ --input results/<organism>/metadata_output/ncbi_clean.csv \
56
+ --outdir results/<organism>/sequence \
57
+ --host "Homo sapiens" \
58
+ --year 2018-2024
59
+ ```
60
+
61
+ Download only sequences from a specific country or continent:
62
+ ```bash
63
+ fetchm seq --input ncbi_clean.csv --outdir sequence_output --country Bangladesh
64
+ fetchm seq --input ncbi_clean.csv --outdir sequence_output --cont Asia
65
+ ```
66
+
67
+ Check download completeness without downloading anything:
68
+ ```bash
69
+ fetchm seq --input ncbi_clean.csv --outdir sequence_output --check-only
70
+ ```
71
+
72
+ Important notes:
73
+
74
+ - `fetchm run` already includes sequence downloading. You do not need to add `--seq` when using `fetchm run`.
75
+ - `--seq` is only relevant for the legacy `fetchM` command, where it controls whether sequence downloading happens after metadata generation.
76
+ - `fetchm seq` supports metadata-based sequence filters: `--host`, `--year`, `--country`, `--cont`, and `--subcont`.
77
+ - Metadata filtering options for `fetchm metadata` and `fetchm run` include `--ani`, `--checkm`, and `--sleep`.
78
+ - Sequence retry behavior can be adjusted with `--retries` and `--retry-delay`.
79
+
80
+ Legacy compatibility commands:
81
+ ```bash
82
+ fetchM --input ncbi_dataset.tsv --outdir results/
83
+ fetchM --input ncbi_dataset.tsv --outdir results/ --seq
84
+ fetchM-seq --input ncbi_clean.csv --outdir sequence_output
85
+ ```
86
+
87
+ ### Test With `test.tsv`
88
+
89
+ Run a quick metadata-only smoke test:
90
+ ```bash
91
+ fetchm metadata --input test.tsv --outdir test_output
92
+ ```
93
+
94
+ Run the full pipeline, including sequence download:
95
+ ```bash
96
+ fetchm run --input test.tsv --outdir test_output
97
+ ```
98
+
99
+ Check downloaded sequence completeness from the generated `ncbi_clean.csv`:
100
+ ```bash
101
+ fetchm seq \
102
+ --input test_output/Staphylococcus_haemolyticus/metadata_output/ncbi_clean.csv \
103
+ --outdir test_output/Staphylococcus_haemolyticus/sequence \
104
+ --check-only
105
+ ```
106
+
107
+ ## Input
108
+ Download ncbi_dataset.tsv of your target organism(s) from the [NCBI genome database](https://www.ncbi.nlm.nih.gov/datasets/genome/).
109
+ - **ncbi_dataset.tsv**
110
+
111
+ # Required Columns for `ncbi_dataset.tsv` in fetchm
112
+
113
+ Before running `fetchm`, ensure that your `ncbi_dataset.tsv` file includes the following columns. These columns are necessary for metadata enrichment, quality filtering, and downstream analysis.
114
+
115
+ ---
116
+
117
+ ## 🧬 Required Columns
118
+
119
+ | Column Name | Description |
120
+ |--------------------------------------------|-------------|
121
+ | `Assembly Accession` | Unique identifier for the assembly |
122
+ | `Assembly Name` | Name of the genome assembly |
123
+ | `Organism Name` | Scientific name of the organism |
124
+ | `ANI Check status` | Status of Average Nucleotide Identity (ANI) check |
125
+ | `Annotation Name` | Annotation version or label used |
126
+ | `Assembly Stats Total Sequence Length` | Total length (in base pairs) of all sequences in the assembly |
127
+ | `Assembly BioProject Accession` | Accession ID for the related BioProject |
128
+ | `Assembly BioSample Accession` | Accession ID for the related BioSample |
129
+ | `Annotation Count Gene Total` | Total number of genes annotated |
130
+ | `Annotation Count Gene Protein-coding` | Number of protein-coding genes |
131
+ | `Annotation Count Gene Pseudogene` | Number of pseudogenes |
132
+ | `CheckM completeness` | Completeness score from CheckM (in %) |
133
+ | `CheckM contamination` | Contamination score from CheckM (in %) |
134
+
135
+ ---
136
+
137
+ ## ✅ Tips
138
+
139
+ - The file must be **tab-separated** (`.tsv` format).
140
+ - Do not change the column headers.
141
+ ---
142
+
143
+ ## Output
144
+ fetchm creates a subdirectory inside the directory given by `--outdir` (for example `results/`), named after the organism in the input file. Inside this subdirectory, the following folders are created:
145
+ - **Metadata summaries** in `metadata_output/`
146
+ - `annotation_summary.csv`
147
+ - `assembly_summary.csv`
148
+ - `metadata_summary.csv`
149
+ - `ncbi_clean.csv`
150
+ - `ncbi_filtered.csv`
151
+ - `ncbi_dataset_updated.tsv`
152
+ - **Figures** in `figures/`
153
+ - `Annotation Count Gene Protein-coding_distribution.tiff`
154
+ - `Annotation Count Gene Pseudogene_distribution.tiff`
155
+ - `Annotation Count Gene Total_distribution.tiff`
156
+ - `Assembly Stats Total Sequence Length_distribution.tiff`
157
+ - `Collection Date_bar_plots.tiff`
158
+ - `Continent_bar_plots.tiff`
159
+ - `Geographic Location_bar_plots.tiff`
160
+ - `Geographic Location_map.jpg`
161
+ - `Host_bar_plots.tiff`
162
+ - `scatter_plot_gene_protein_coding_vs_collection_date.tiff`
163
+ - `scatter_plot_gene_total_vs_collection_date.tiff`
164
+ - `scatter_plot_total_sequence_length_vs_collection_date.tiff`
165
+ - `Subcontinent_bar_plots.tiff`
166
+ - **Sequences** in `sequence/` (contains the downloaded genome sequences when sequence download is requested, i.e. with `fetchm run`, `fetchm seq`, or the legacy `fetchM --seq`).
167
+
168
+
169
+ ## Visualizations
170
+ ### Annotation Distributions
171
+ ![Annotation Count Gene Protein-coding](figures/Annotation%20Count%20Gene%20Protein-coding_distribution.png)
172
+ ![Annotation Count Gene Pseudogene](figures/Annotation%20Count%20Gene%20Pseudogene_distribution.png)
173
+ ![Annotation Count Gene Total](figures/Annotation%20Count%20Gene%20Total_distribution.png)
174
+
175
+ ### Assembly Statistics
176
+ ![Assembly Sequence Length](figures/Assembly%20Stats%20Total%20Sequence%20Length_distribution.png)
177
+
178
+ ### Metadata Summaries
179
+ ![Collection Date Distribution](figures/Collection%20Date_bar_plots.png)
180
+ ![Geographic Location Map](figures/Geographic%20Location_map.jpg)
181
+ ![Geographic Location Distribution](figures/Geographic%20Location_bar_plots.png)
182
+ ![Host Distribution](figures/Host_bar_plots.png)
183
+ ![Continent Distribution](figures/Continent_bar_plots.png)
184
+ ![Subcontinent Distribution](figures/Subcontinent_bar_plots.png)
185
+
186
+ ### Scatter Plots
187
+ ![Gene Protein Coding vs Collection Date](figures/scatter_plot_gene_protein_coding_vs_collection_date.png)
188
+ ![Gene Total vs Collection Date](figures/scatter_plot_gene_total_vs_collection_date.png)
189
+ ![Sequence Length vs Collection Date](figures/scatter_plot_Sequence_Length_vs_collection_date.png)
190
+
191
+ ## License
192
+ This project is licensed under the MIT License.
193
+
194
+ ## Author
195
+ Developed by Tasnimul Arabi Anik.
196
+
197
+ ## Contributions
198
+ Contributions and improvements are welcome! Feel free to submit a pull request or report issues.
@@ -0,0 +1,12 @@
1
#!/usr/bin/env python3
"""Launcher script that runs ``fetchm.metadata.main``."""

import sys
from pathlib import Path

# Put the parent of the directory containing this file at the front of the
# import path before importing fetchm (presumably the checkout root that
# holds the fetchm package -- confirm against the repository layout).
_launcher_root = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(_launcher_root))

from fetchm.metadata import main


if __name__ == "__main__":
    main()
@@ -0,0 +1,12 @@
1
#!/usr/bin/env python3
"""Launcher script that runs ``fetchm.sequence.main``."""

import sys
from pathlib import Path

# Put the parent of the directory containing this file at the front of the
# import path before importing fetchm (presumably the checkout root that
# holds the fetchm package -- confirm against the repository layout).
_launcher_root = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(_launcher_root))

from fetchm.sequence import main


if __name__ == "__main__":
    main()
@@ -0,0 +1,12 @@
1
#!/usr/bin/env python3
"""Launcher script that runs ``fetchm.cli.main``."""

import sys
from pathlib import Path

# Put the parent of the directory containing this file at the front of the
# import path before importing fetchm (presumably the checkout root that
# holds the fetchm package -- confirm against the repository layout).
_launcher_root = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(_launcher_root))

from fetchm.cli import main


if __name__ == "__main__":
    main()
@@ -0,0 +1 @@
1
+ """fetchm package."""
@@ -0,0 +1,41 @@
1
+ import argparse
2
+
3
+ from fetchm.metadata import build_metadata_parser, run_metadata_pipeline
4
+ from fetchm.sequence import build_sequence_parser, run_sequence_downloads
5
+
6
+
7
def build_parser() -> argparse.ArgumentParser:
    """Assemble the top-level ``fetchm`` parser and its subcommands.

    Three subcommands are registered, and one of them is required:

    * ``metadata`` -- inherits the options of ``build_metadata_parser`` and
      dispatches to ``run_metadata_pipeline``.
    * ``run`` -- same options and handler as ``metadata``, but additionally
      presets ``seq=True`` on the parsed namespace.
    * ``seq`` -- inherits the options of ``build_sequence_parser`` and
      dispatches to ``run_sequence_downloads``.

    Returns:
        The configured parser. After parsing, the chosen subcommand name is
        stored under ``command`` and its handler under ``func``.
    """
    root = argparse.ArgumentParser(
        prog="fetchm",
        description="Unified metadata and sequence download CLI for fetchm.",
    )
    commands = root.add_subparsers(dest="command", required=True)

    # One row per subcommand: name, parent-parser factory, help text, and the
    # namespace defaults to preset (always the handler under "func").
    command_table = (
        (
            "metadata",
            build_metadata_parser,
            "Fetch metadata and generate summaries from an NCBI dataset TSV.",
            {"func": run_metadata_pipeline},
        ),
        (
            "run",
            build_metadata_parser,
            "Run metadata generation and sequence download in one command.",
            {"func": run_metadata_pipeline, "seq": True},
        ),
        (
            "seq",
            build_sequence_parser,
            "Download genome FASTA files from ncbi_clean.csv.",
            {"func": run_sequence_downloads},
        ),
    )

    for command_name, make_parent, help_text, preset in command_table:
        # A fresh parent is built per subcommand; add_help=False leaves the
        # -h/--help option to the subparser itself.
        subcommand = commands.add_parser(
            command_name,
            parents=[make_parent(add_help=False)],
            help=help_text,
        )
        subcommand.set_defaults(**preset)

    return root
36
+
37
+
38
def main() -> None:
    """Run the ``fetchm`` command-line interface.

    Parses the process command line with the parser from ``build_parser`` and
    passes the resulting namespace to the handler that the selected
    subcommand stored under ``func``.
    """
    namespace = build_parser().parse_args()
    handler = namespace.func
    handler(namespace)