fetchm2 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fetchm2-0.1.2/src/fetchm2.egg-info → fetchm2-0.1.4}/PKG-INFO +2 -2
- {fetchm2-0.1.2 → fetchm2-0.1.4}/README.md +1 -1
- {fetchm2-0.1.2 → fetchm2-0.1.4}/docs/VALIDATION_REPORT.md +47 -1
- {fetchm2-0.1.2 → fetchm2-0.1.4}/pyproject.toml +1 -1
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/__init__.py +1 -1
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/sequence.py +16 -11
- {fetchm2-0.1.2 → fetchm2-0.1.4/src/fetchm2.egg-info}/PKG-INFO +2 -2
- {fetchm2-0.1.2 → fetchm2-0.1.4}/tests/test_cli.py +21 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/LICENSE +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/MANIFEST.in +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/docs/METADATA_ANALYSIS.md +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/docs/RELEASE_CHECKLIST.md +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/docs/SEQUENCE_DOWNLOAD.md +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/docs/STANDARDIZATION.md +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/environment.yml +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/examples/offline_metadata.tsv +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/examples/test_ncbi_dataset.tsv +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/setup.cfg +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/analysis.py +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/audit.py +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/cli.py +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/__init__.py +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/approved_broad_categories.csv +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/collection_date_reviewed_rules.csv +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/controlled_categories.csv +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/country_mapping.json +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/geography_reviewed_rules.csv +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/host_negative_rules.csv +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/host_synonyms.csv +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/metadata.py +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/standardization.py +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/utils.py +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2.egg-info/SOURCES.txt +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2.egg-info/dependency_links.txt +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2.egg-info/entry_points.txt +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2.egg-info/requires.txt +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2.egg-info/top_level.txt +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/test.tsv +0 -0
- {fetchm2-0.1.2 → fetchm2-0.1.4}/tests/test_standardization.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fetchm2
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Standalone comprehensive genome metadata standardization and sequence download toolkit.
|
|
5
5
|
Author-email: Tasnimul Arabi Anik <arabianik987@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -68,7 +68,7 @@ fetchm2 run --input ncbi_dataset.tsv --outdir results --download
|
|
|
68
68
|
```bash
|
|
69
69
|
python -m venv fetchm2-env
|
|
70
70
|
source fetchm2-env/bin/activate
|
|
71
|
-
pip install fetchm2
|
|
71
|
+
pip install fetchm2==0.1.4
|
|
72
72
|
```
|
|
73
73
|
|
|
74
74
|
Verify:
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# FetchM2 Validation Report
|
|
2
2
|
|
|
3
3
|
Validation date: 2026-05-05
|
|
4
|
-
Current validation target: `fetchm2 0.1.2`
|
|
4
|
+
Current validation target: `fetchm2 0.1.4`
|
|
5
5
|
|
|
6
6
|
## Source Baselines
|
|
7
7
|
|
|
@@ -234,3 +234,49 @@ Known scope notes for `0.1.2`:
|
|
|
234
234
|
- host lineage is bundled for common hosts and optionally enriched with `taxonkit` when installed
|
|
235
235
|
- embeddings/BGE are intentionally not used in production standardization
|
|
236
236
|
- large-scale sequence download was not run during this validation to avoid unnecessary NCBI load
|
|
237
|
+
|
|
238
|
+
## Additional 0.1.3 Documentation Validation
|
|
239
|
+
|
|
240
|
+
The 0.1.3 patch updates the README installation command to use the pinned current PyPI release:
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
pip install fetchm2==0.1.3
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
No runtime behavior changed from 0.1.2.
|
|
247
|
+
|
|
248
|
+
## Additional 0.1.4 Sequence Download Validation
|
|
249
|
+
|
|
250
|
+
The 0.1.4 patch fixes a sequence-download cache issue found during remote-user-style testing.
|
|
251
|
+
|
|
252
|
+
Remote-user-style `0.1.3` validation:
|
|
253
|
+
|
|
254
|
+
```text
|
|
255
|
+
Fresh PyPI install: passed
|
|
256
|
+
fetchm2 --version: fetchm2 0.1.3
|
|
257
|
+
Live metadata run: production gate PASS
|
|
258
|
+
Sequence selection: selected 2 genomes
|
|
259
|
+
Sequence download: failed 2 / 2
|
|
260
|
+
Failure reason: SQLite sequence cache connection was created on the main thread and used inside download worker threads
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
0.1.4 fix:
|
|
264
|
+
|
|
265
|
+
```text
|
|
266
|
+
DirectoryCache now opens SQLite with check_same_thread=False
|
|
267
|
+
DirectoryCache serializes cache reads/writes with a lock
|
|
268
|
+
Added threaded DirectoryCache regression test
|
|
269
|
+
pytest: 12 passed
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
Patched local real-download validation:
|
|
273
|
+
|
|
274
|
+
```text
|
|
275
|
+
Command: fetchm2 seq --input fetchm2_clean.csv --outdir /tmp/fetchm2_fixed_real_seq_download --max-genomes 2 --download-workers 1 --retries 2 --retry-delay 1
|
|
276
|
+
Sequences selected: 2
|
|
277
|
+
Sequences downloaded: 2
|
|
278
|
+
Sequences failed: 0
|
|
279
|
+
Downloaded files:
|
|
280
|
+
- GCA_006094395.1_ASM609439v1_genomic.fna
|
|
281
|
+
- GCF_006094395.1_ASM609439v1_genomic.fna
|
|
282
|
+
```
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "fetchm2"
|
|
7
|
-
version = "0.1.2"
|
|
7
|
+
version = "0.1.4"
|
|
8
8
|
description = "Standalone comprehensive genome metadata standardization and sequence download toolkit."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -4,6 +4,7 @@ import gzip
|
|
|
4
4
|
import re
|
|
5
5
|
import shutil
|
|
6
6
|
import sqlite3
|
|
7
|
+
import threading
|
|
7
8
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
8
9
|
from pathlib import Path
|
|
9
10
|
from typing import Any
|
|
@@ -32,28 +33,32 @@ def build_parent_url(accession: str) -> str:
|
|
|
32
33
|
|
|
33
34
|
class DirectoryCache:
|
|
34
35
|
def __init__(self, path: Path) -> None:
|
|
35
|
-
self.conn = sqlite3.connect(path)
|
|
36
|
+
self.conn = sqlite3.connect(path, check_same_thread=False)
|
|
37
|
+
self.lock = threading.Lock()
|
|
36
38
|
self.conn.execute(
|
|
37
39
|
"CREATE TABLE IF NOT EXISTS assembly_directory_cache (accession TEXT PRIMARY KEY, assembly_name TEXT, directory TEXT)"
|
|
38
40
|
)
|
|
39
41
|
self.conn.commit()
|
|
40
42
|
|
|
41
43
|
def get(self, accession: str, name: str) -> str | None:
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
44
|
+
with self.lock:
|
|
45
|
+
row = self.conn.execute(
|
|
46
|
+
"SELECT directory FROM assembly_directory_cache WHERE accession = ? AND assembly_name = ?",
|
|
47
|
+
(accession, normalize_assembly_name(name)),
|
|
48
|
+
).fetchone()
|
|
46
49
|
return None if row is None else str(row[0])
|
|
47
50
|
|
|
48
51
|
def set(self, accession: str, name: str, directory: str) -> None:
|
|
49
|
-
self.conn.execute(
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
52
|
+
with self.lock:
|
|
53
|
+
self.conn.execute(
|
|
54
|
+
"INSERT OR REPLACE INTO assembly_directory_cache (accession, assembly_name, directory) VALUES (?, ?, ?)",
|
|
55
|
+
(accession, normalize_assembly_name(name), directory),
|
|
56
|
+
)
|
|
57
|
+
self.conn.commit()
|
|
54
58
|
|
|
55
59
|
def close(self) -> None:
|
|
56
|
-
self.conn.close()
|
|
60
|
+
with self.lock:
|
|
61
|
+
self.conn.close()
|
|
57
62
|
|
|
58
63
|
|
|
59
64
|
def resolve_assembly_directory(accession: str, name: str, cache: DirectoryCache) -> str:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fetchm2
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Standalone comprehensive genome metadata standardization and sequence download toolkit.
|
|
5
5
|
Author-email: Tasnimul Arabi Anik <arabianik987@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -68,7 +68,7 @@ fetchm2 run --input ncbi_dataset.tsv --outdir results --download
|
|
|
68
68
|
```bash
|
|
69
69
|
python -m venv fetchm2-env
|
|
70
70
|
source fetchm2-env/bin/activate
|
|
71
|
-
pip install fetchm2
|
|
71
|
+
pip install fetchm2==0.1.4
|
|
72
72
|
```
|
|
73
73
|
|
|
74
74
|
Verify:
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
5
6
|
import pandas as pd
|
|
6
7
|
|
|
7
8
|
from fetchm2.cli import build_parser, main
|
|
8
9
|
from fetchm2.metadata import MetadataCache, RequestRateLimiter, fetch_biosample_metadata
|
|
10
|
+
from fetchm2.sequence import DirectoryCache
|
|
9
11
|
|
|
10
12
|
|
|
11
13
|
def test_metadata_cli_offline(tmp_path: Path, monkeypatch) -> None:
|
|
@@ -79,6 +81,25 @@ def test_sequence_check_only_cli(tmp_path: Path, monkeypatch) -> None:
|
|
|
79
81
|
assert (seq_out / "sequence_download_summary.csv").exists()
|
|
80
82
|
|
|
81
83
|
|
|
84
|
+
def test_sequence_directory_cache_is_thread_safe(tmp_path: Path) -> None:
|
|
85
|
+
cache = DirectoryCache(tmp_path / "sequence_cache.sqlite3")
|
|
86
|
+
|
|
87
|
+
def write_and_read(index: int) -> str | None:
|
|
88
|
+
accession = f"GCA_000000{index:03d}.1"
|
|
89
|
+
name = f"Assembly {index}"
|
|
90
|
+
directory = f"{accession}_Assembly_{index}"
|
|
91
|
+
cache.set(accession, name, directory)
|
|
92
|
+
return cache.get(accession, name)
|
|
93
|
+
|
|
94
|
+
try:
|
|
95
|
+
with ThreadPoolExecutor(max_workers=4) as executor:
|
|
96
|
+
results = list(executor.map(write_and_read, range(12)))
|
|
97
|
+
finally:
|
|
98
|
+
cache.close()
|
|
99
|
+
|
|
100
|
+
assert results == [f"GCA_000000{index:03d}.1_Assembly_{index}" for index in range(12)]
|
|
101
|
+
|
|
102
|
+
|
|
82
103
|
def test_analyze_cli_generates_figures(tmp_path: Path, monkeypatch) -> None:
|
|
83
104
|
input_path = Path(__file__).resolve().parents[1] / "examples" / "offline_metadata.tsv"
|
|
84
105
|
meta_out = tmp_path / "meta"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|