fetchm2 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {fetchm2-0.1.2/src/fetchm2.egg-info → fetchm2-0.1.4}/PKG-INFO +2 -2
  2. {fetchm2-0.1.2 → fetchm2-0.1.4}/README.md +1 -1
  3. {fetchm2-0.1.2 → fetchm2-0.1.4}/docs/VALIDATION_REPORT.md +47 -1
  4. {fetchm2-0.1.2 → fetchm2-0.1.4}/pyproject.toml +1 -1
  5. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/__init__.py +1 -1
  6. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/sequence.py +16 -11
  7. {fetchm2-0.1.2 → fetchm2-0.1.4/src/fetchm2.egg-info}/PKG-INFO +2 -2
  8. {fetchm2-0.1.2 → fetchm2-0.1.4}/tests/test_cli.py +21 -0
  9. {fetchm2-0.1.2 → fetchm2-0.1.4}/LICENSE +0 -0
  10. {fetchm2-0.1.2 → fetchm2-0.1.4}/MANIFEST.in +0 -0
  11. {fetchm2-0.1.2 → fetchm2-0.1.4}/docs/METADATA_ANALYSIS.md +0 -0
  12. {fetchm2-0.1.2 → fetchm2-0.1.4}/docs/RELEASE_CHECKLIST.md +0 -0
  13. {fetchm2-0.1.2 → fetchm2-0.1.4}/docs/SEQUENCE_DOWNLOAD.md +0 -0
  14. {fetchm2-0.1.2 → fetchm2-0.1.4}/docs/STANDARDIZATION.md +0 -0
  15. {fetchm2-0.1.2 → fetchm2-0.1.4}/environment.yml +0 -0
  16. {fetchm2-0.1.2 → fetchm2-0.1.4}/examples/offline_metadata.tsv +0 -0
  17. {fetchm2-0.1.2 → fetchm2-0.1.4}/examples/test_ncbi_dataset.tsv +0 -0
  18. {fetchm2-0.1.2 → fetchm2-0.1.4}/setup.cfg +0 -0
  19. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/analysis.py +0 -0
  20. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/audit.py +0 -0
  21. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/cli.py +0 -0
  22. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/__init__.py +0 -0
  23. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/approved_broad_categories.csv +0 -0
  24. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/collection_date_reviewed_rules.csv +0 -0
  25. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/controlled_categories.csv +0 -0
  26. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/country_mapping.json +0 -0
  27. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/geography_reviewed_rules.csv +0 -0
  28. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/host_negative_rules.csv +0 -0
  29. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/data/host_synonyms.csv +0 -0
  30. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/metadata.py +0 -0
  31. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/standardization.py +0 -0
  32. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2/utils.py +0 -0
  33. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2.egg-info/SOURCES.txt +0 -0
  34. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2.egg-info/dependency_links.txt +0 -0
  35. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2.egg-info/entry_points.txt +0 -0
  36. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2.egg-info/requires.txt +0 -0
  37. {fetchm2-0.1.2 → fetchm2-0.1.4}/src/fetchm2.egg-info/top_level.txt +0 -0
  38. {fetchm2-0.1.2 → fetchm2-0.1.4}/test.tsv +0 -0
  39. {fetchm2-0.1.2 → fetchm2-0.1.4}/tests/test_standardization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fetchm2
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Standalone comprehensive genome metadata standardization and sequence download toolkit.
5
5
  Author-email: Tasnimul Arabi Anik <arabianik987@gmail.com>
6
6
  License-Expression: MIT
@@ -68,7 +68,7 @@ fetchm2 run --input ncbi_dataset.tsv --outdir results --download
68
68
  ```bash
69
69
  python -m venv fetchm2-env
70
70
  source fetchm2-env/bin/activate
71
- pip install fetchm2
71
+ pip install fetchm2==0.1.4
72
72
  ```
73
73
 
74
74
  Verify:
@@ -32,7 +32,7 @@ fetchm2 run --input ncbi_dataset.tsv --outdir results --download
32
32
  ```bash
33
33
  python -m venv fetchm2-env
34
34
  source fetchm2-env/bin/activate
35
- pip install fetchm2
35
+ pip install fetchm2==0.1.4
36
36
  ```
37
37
 
38
38
  Verify:
@@ -1,7 +1,7 @@
1
1
  # FetchM2 Validation Report
2
2
 
3
3
  Validation date: 2026-05-05
4
- Current validation target: `fetchm2 0.1.2`
4
+ Current validation target: `fetchm2 0.1.4`
5
5
 
6
6
  ## Source Baselines
7
7
 
@@ -234,3 +234,49 @@ Known scope notes for `0.1.2`:
234
234
  - host lineage is bundled for common hosts and optionally enriched with `taxonkit` when installed
235
235
  - embeddings/BGE are intentionally not used in production standardization
236
236
  - large-scale sequence download was not run during this validation to avoid unnecessary NCBI load
237
+
238
+ ## Additional 0.1.3 Documentation Validation
239
+
240
+ The 0.1.3 patch updates the README installation command to use the pinned current PyPI release:
241
+
242
+ ```bash
243
+ pip install fetchm2==0.1.3
244
+ ```
245
+
246
+ No runtime behavior changed from 0.1.2.
247
+
248
+ ## Additional 0.1.4 Sequence Download Validation
249
+
250
+ The 0.1.4 patch fixes a sequence-download cache issue found during remote-user-style testing.
251
+
252
+ Remote-user-style `0.1.3` validation:
253
+
254
+ ```text
255
+ Fresh PyPI install: passed
256
+ fetchm2 --version: fetchm2 0.1.3
257
+ Live metadata run: production gate PASS
258
+ Sequence selection: selected 2 genomes
259
+ Sequence download: failed 2 / 2
260
+ Failure reason: SQLite sequence cache connection was created on the main thread and used inside download worker threads
261
+ ```
262
+
263
+ 0.1.4 fix:
264
+
265
+ ```text
266
+ DirectoryCache now opens SQLite with check_same_thread=False
267
+ DirectoryCache serializes cache reads/writes with a lock
268
+ Added threaded DirectoryCache regression test
269
+ pytest: 12 passed
270
+ ```
271
+
272
+ Patched local real-download validation:
273
+
274
+ ```text
275
+ Command: fetchm2 seq --input fetchm2_clean.csv --outdir /tmp/fetchm2_fixed_real_seq_download --max-genomes 2 --download-workers 1 --retries 2 --retry-delay 1
276
+ Sequences selected: 2
277
+ Sequences downloaded: 2
278
+ Sequences failed: 0
279
+ Downloaded files:
280
+ - GCA_006094395.1_ASM609439v1_genomic.fna
281
+ - GCF_006094395.1_ASM609439v1_genomic.fna
282
+ ```
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "fetchm2"
7
- version = "0.1.2"
7
+ version = "0.1.4"
8
8
  description = "Standalone comprehensive genome metadata standardization and sequence download toolkit."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -2,4 +2,4 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- __version__ = "0.1.2"
5
+ __version__ = "0.1.4"
@@ -4,6 +4,7 @@ import gzip
4
4
  import re
5
5
  import shutil
6
6
  import sqlite3
7
+ import threading
7
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
8
9
  from pathlib import Path
9
10
  from typing import Any
@@ -32,28 +33,32 @@ def build_parent_url(accession: str) -> str:
32
33
 
33
34
  class DirectoryCache:
34
35
  def __init__(self, path: Path) -> None:
35
- self.conn = sqlite3.connect(path)
36
+ self.conn = sqlite3.connect(path, check_same_thread=False)
37
+ self.lock = threading.Lock()
36
38
  self.conn.execute(
37
39
  "CREATE TABLE IF NOT EXISTS assembly_directory_cache (accession TEXT PRIMARY KEY, assembly_name TEXT, directory TEXT)"
38
40
  )
39
41
  self.conn.commit()
40
42
 
41
43
  def get(self, accession: str, name: str) -> str | None:
42
- row = self.conn.execute(
43
- "SELECT directory FROM assembly_directory_cache WHERE accession = ? AND assembly_name = ?",
44
- (accession, normalize_assembly_name(name)),
45
- ).fetchone()
44
+ with self.lock:
45
+ row = self.conn.execute(
46
+ "SELECT directory FROM assembly_directory_cache WHERE accession = ? AND assembly_name = ?",
47
+ (accession, normalize_assembly_name(name)),
48
+ ).fetchone()
46
49
  return None if row is None else str(row[0])
47
50
 
48
51
  def set(self, accession: str, name: str, directory: str) -> None:
49
- self.conn.execute(
50
- "INSERT OR REPLACE INTO assembly_directory_cache (accession, assembly_name, directory) VALUES (?, ?, ?)",
51
- (accession, normalize_assembly_name(name), directory),
52
- )
53
- self.conn.commit()
52
+ with self.lock:
53
+ self.conn.execute(
54
+ "INSERT OR REPLACE INTO assembly_directory_cache (accession, assembly_name, directory) VALUES (?, ?, ?)",
55
+ (accession, normalize_assembly_name(name), directory),
56
+ )
57
+ self.conn.commit()
54
58
 
55
59
  def close(self) -> None:
56
- self.conn.close()
60
+ with self.lock:
61
+ self.conn.close()
57
62
 
58
63
 
59
64
  def resolve_assembly_directory(accession: str, name: str, cache: DirectoryCache) -> str:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fetchm2
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Standalone comprehensive genome metadata standardization and sequence download toolkit.
5
5
  Author-email: Tasnimul Arabi Anik <arabianik987@gmail.com>
6
6
  License-Expression: MIT
@@ -68,7 +68,7 @@ fetchm2 run --input ncbi_dataset.tsv --outdir results --download
68
68
  ```bash
69
69
  python -m venv fetchm2-env
70
70
  source fetchm2-env/bin/activate
71
- pip install fetchm2
71
+ pip install fetchm2==0.1.4
72
72
  ```
73
73
 
74
74
  Verify:
@@ -1,11 +1,13 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from concurrent.futures import ThreadPoolExecutor
3
4
  from pathlib import Path
4
5
 
5
6
  import pandas as pd
6
7
 
7
8
  from fetchm2.cli import build_parser, main
8
9
  from fetchm2.metadata import MetadataCache, RequestRateLimiter, fetch_biosample_metadata
10
+ from fetchm2.sequence import DirectoryCache
9
11
 
10
12
 
11
13
  def test_metadata_cli_offline(tmp_path: Path, monkeypatch) -> None:
@@ -79,6 +81,25 @@ def test_sequence_check_only_cli(tmp_path: Path, monkeypatch) -> None:
79
81
  assert (seq_out / "sequence_download_summary.csv").exists()
80
82
 
81
83
 
84
+ def test_sequence_directory_cache_is_thread_safe(tmp_path: Path) -> None:
85
+ cache = DirectoryCache(tmp_path / "sequence_cache.sqlite3")
86
+
87
+ def write_and_read(index: int) -> str | None:
88
+ accession = f"GCA_000000{index:03d}.1"
89
+ name = f"Assembly {index}"
90
+ directory = f"{accession}_Assembly_{index}"
91
+ cache.set(accession, name, directory)
92
+ return cache.get(accession, name)
93
+
94
+ try:
95
+ with ThreadPoolExecutor(max_workers=4) as executor:
96
+ results = list(executor.map(write_and_read, range(12)))
97
+ finally:
98
+ cache.close()
99
+
100
+ assert results == [f"GCA_000000{index:03d}.1_Assembly_{index}" for index in range(12)]
101
+
102
+
82
103
  def test_analyze_cli_generates_figures(tmp_path: Path, monkeypatch) -> None:
83
104
  input_path = Path(__file__).resolve().parents[1] / "examples" / "offline_metadata.tsv"
84
105
  meta_out = tmp_path / "meta"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes