omix 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. omix-0.1.0/LICENSE +21 -0
  2. omix-0.1.0/PKG-INFO +138 -0
  3. omix-0.1.0/README.md +107 -0
  4. omix-0.1.0/omix/__init__.py +13 -0
  5. omix-0.1.0/omix/cli.py +531 -0
  6. omix-0.1.0/omix/config.py +150 -0
  7. omix-0.1.0/omix/logging_utils.py +59 -0
  8. omix-0.1.0/omix/metadata/__init__.py +16 -0
  9. omix-0.1.0/omix/metadata/base.py +64 -0
  10. omix-0.1.0/omix/metadata/constants.py +159 -0
  11. omix-0.1.0/omix/metadata/ena/__init__.py +16 -0
  12. omix-0.1.0/omix/metadata/ena/cache.py +464 -0
  13. omix-0.1.0/omix/metadata/ena/enrichment_pipeline.py +217 -0
  14. omix-0.1.0/omix/metadata/ena/fetcher.py +374 -0
  15. omix-0.1.0/omix/metadata/ena/metadata.py +333 -0
  16. omix-0.1.0/omix/metadata/ena/sample_parser.py +784 -0
  17. omix-0.1.0/omix/metadata/ena/sra_fallback.py +20 -0
  18. omix-0.1.0/omix/metadata/enrichment.py +420 -0
  19. omix-0.1.0/omix/metadata/file_workflow.py +1138 -0
  20. omix-0.1.0/omix/metadata/manager.py +590 -0
  21. omix-0.1.0/omix/publications/__init__.py +16 -0
  22. omix-0.1.0/omix/publications/apis/__init__.py +24 -0
  23. omix-0.1.0/omix/publications/apis/arxiv.py +82 -0
  24. omix-0.1.0/omix/publications/apis/base.py +170 -0
  25. omix-0.1.0/omix/publications/apis/basesearch.py +76 -0
  26. omix-0.1.0/omix/publications/apis/bioarxiv.py +77 -0
  27. omix-0.1.0/omix/publications/apis/core.py +66 -0
  28. omix-0.1.0/omix/publications/apis/crossref.py +72 -0
  29. omix-0.1.0/omix/publications/apis/datacite.py +71 -0
  30. omix-0.1.0/omix/publications/apis/doaj.py +73 -0
  31. omix-0.1.0/omix/publications/apis/europe_pmc.py +66 -0
  32. omix-0.1.0/omix/publications/apis/lens.py +86 -0
  33. omix-0.1.0/omix/publications/apis/mendeley.py +77 -0
  34. omix-0.1.0/omix/publications/apis/ncbi.py +227 -0
  35. omix-0.1.0/omix/publications/apis/plos.py +68 -0
  36. omix-0.1.0/omix/publications/apis/semantic_scholar.py +78 -0
  37. omix-0.1.0/omix/publications/apis/springer_nature.py +72 -0
  38. omix-0.1.0/omix/publications/apis/unpaywall.py +90 -0
  39. omix-0.1.0/omix/publications/apis/zenodo.py +93 -0
  40. omix-0.1.0/omix/publications/base.py +89 -0
  41. omix-0.1.0/omix/publications/cache.py +209 -0
  42. omix-0.1.0/omix/publications/exceptions.py +8 -0
  43. omix-0.1.0/omix/publications/extractors/__init__.py +11 -0
  44. omix-0.1.0/omix/publications/extractors/cleaning.py +317 -0
  45. omix-0.1.0/omix/publications/extractors/llm.py +145 -0
  46. omix-0.1.0/omix/publications/extractors/omics/_16s.py +212 -0
  47. omix-0.1.0/omix/publications/extractors/omics/__init__.py +9 -0
  48. omix-0.1.0/omix/publications/extractors/omics/base.py +87 -0
  49. omix-0.1.0/omix/publications/extractors/pdf.py +119 -0
  50. omix-0.1.0/omix/publications/extractors/webpage.py +84 -0
  51. omix-0.1.0/omix/publications/fetcher.py +796 -0
  52. omix-0.1.0/omix/validators/__init__.py +0 -0
  53. omix-0.1.0/omix/validators/primer_db.py +173 -0
  54. omix-0.1.0/omix/validators/probebase_builder.py +218 -0
  55. omix-0.1.0/omix.egg-info/PKG-INFO +138 -0
  56. omix-0.1.0/omix.egg-info/SOURCES.txt +60 -0
  57. omix-0.1.0/omix.egg-info/dependency_links.txt +1 -0
  58. omix-0.1.0/omix.egg-info/entry_points.txt +2 -0
  59. omix-0.1.0/omix.egg-info/requires.txt +22 -0
  60. omix-0.1.0/omix.egg-info/top_level.txt +1 -0
  61. omix-0.1.0/pyproject.toml +40 -0
  62. omix-0.1.0/setup.cfg +4 -0
omix-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Heather MacGregor
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
omix-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,138 @@
1
+ Metadata-Version: 2.4
2
+ Name: omix
3
+ Version: 0.1.0
4
+ Summary: A modular Python package for fetching, enriching, and analyzing omics metadata and publications.
5
+ Author-email: Heather MacGregor <user@example.com>
6
+ License: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: requests>=2.31
11
+ Requires-Dist: beautifulsoup4>=4.12
12
+ Requires-Dist: pandas>=2.0
13
+ Requires-Dist: pypdf2>=3.0
14
+ Requires-Dist: urllib3>=2.0
15
+ Requires-Dist: pyyaml>=6.0
16
+ Requires-Dist: rich>=13.0
17
+ Requires-Dist: click>=8.0
18
+ Requires-Dist: aiohttp>=3.8
19
+ Requires-Dist: biopython>=1.81
20
+ Requires-Dist: geopy>=2.0
21
+ Requires-Dist: rapidfuzz>=3.0
22
+ Requires-Dist: tqdm>=4.0
23
+ Provides-Extra: llm
24
+ Requires-Dist: openai>=1.0; extra == "llm"
25
+ Requires-Dist: transformers>=4.30; extra == "llm"
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest; extra == "dev"
28
+ Requires-Dist: pytest-asyncio; extra == "dev"
29
+ Requires-Dist: httpx; extra == "dev"
30
+ Dynamic: license-file
31
+
32
+ # omix
33
+
34
+ A Python package that:
35
+
36
+ - Fetches comprehensive metadata from public databases (ENA, and soon others).
37
+ - Enriches coordinates, dates, host/environment categories, and experimental protocols.
38
+ - Searches across multiple publication sources (Crossref, Europe PMC, NCBI, Semantic Scholar, etc.).
39
+ - Extracts methodology from the full text of publications using LLMs (optional).
40
+ - Validates findings against reference databases (e.g., primer databases for 16S).
41
+ - Works for **any omics** via plugins.
42
+
43
+ ## Installation
44
+
45
+ ```bash
46
+ pip install omix
47
+ # with LLM support:
48
+ pip install omix[llm]
49
+ ```
50
+
51
+ ## Quick Start
52
+
53
+ ### Command Line
54
+
55
+ ```bash
56
+ # Enrich a metadata file with ENA data
57
+ omix fetch-metadata samples.tsv --email you@example.com
58
+
59
+ # Fetch publications for one or more accessions
60
+ omix fetch-publications PRJNA864623 --omics 16S --api-key $LLM_KEY
61
+
62
+ # Run the full metadata cleaning and enrichment pipeline
63
+ omix run-pipeline metadata.csv -o enriched.csv
64
+
65
+ # NEW: Unified pipeline (metadata + publications + validation + integration)
66
+ omix enrich-with-publications samples.csv -o enriched_complete.csv --config config.yaml
67
+ ```
68
+
69
+ ### Unified Metadata + Publications Pipeline
70
+
71
+ The `enrich-with-publications` command provides an end-to-end workflow:
72
+
73
+ 1. **Metadata Enrichment**: Fetches comprehensive data from ENA (sequences, samples, runs)
74
+ 2. **Publication Discovery**: Searches across 12+ publication APIs (Crossref, Europe PMC, NCBI, Semantic Scholar, arXiv, bioRxiv, CORE, DataCite, DOAJ, PLOS, Unpaywall, Zenodo)
75
+ 3. **Publication Validation**: Filters to only include publications with direct accession mentions
76
+ 4. **Integration**: Merges publication counts and DOIs into the enriched metadata
77
+
78
+ Output includes all ENA metadata fields plus:
79
+ - `publication_count`: Number of validated publications per study
80
+ - `publication_dois`: Semicolon-separated list of publication DOIs
81
+
82
+ ```bash
83
+ # Basic usage
84
+ omix enrich-with-publications input.csv -o output.csv
85
+
86
+ # With debug config for faster testing
87
+ omix enrich-with-publications input.csv -o output.csv --config config.debug.yaml
88
+
89
+ # Skip validation (keep all publications found)
90
+ omix enrich-with-publications input.csv -o output.csv --no-validate
91
+
92
+ # With LLM-based methodology extraction
93
+ omix enrich-with-publications input.csv -o output.csv --api-key $LLM_KEY
94
+ ```
95
+
96
+ ### Python API
97
+
98
+ ```python
99
+ from omix import Config
100
+ from omix.metadata.file_workflow import enrich_metadata_from_path
101
+ import asyncio
102
+
103
+ config = Config(email="you@example.com")
104
+ df = asyncio.run(enrich_metadata_from_path("samples.csv", config=config))
105
+ print(df.head())
106
+ ```
107
+
108
+ ## Configuration
109
+
110
+ `omix` can be configured via a YAML file:
111
+
112
+ ```yaml
113
+ credentials:
114
+   email: "your.email@example.com"
115
+   ena_email: "ena@example.com"
116
+   llm_api_key: "sk-..."
117
+   ncbi_api_key: "..."
118
+
119
+ apis:
120
+   sequence:
121
+     ena:
122
+       enabled: true
123
+       max_concurrent: 5
124
+       batch_size: 100
125
+       cache_ttl_days: 30
126
+       fetch_phases: true
127
+
128
+ metadata:
129
+   sample_id_column: "#sampleid"
130
+   exclude_host: false
131
+
132
+ paths:
133
+   cache_dir: ".cache"
134
+   logs_dir: "logs"
135
+   primer_db: null
136
+ ```
137
+
138
+ Pass it with `--config my_config.yaml` or set environment variables like `OMIX_EMAIL`.
omix-0.1.0/README.md ADDED
@@ -0,0 +1,107 @@
1
+ # omix
2
+
3
+ A Python package that:
4
+
5
+ - Fetches comprehensive metadata from public databases (ENA, and soon others).
6
+ - Enriches coordinates, dates, host/environment categories, and experimental protocols.
7
+ - Searches across multiple publication sources (Crossref, Europe PMC, NCBI, Semantic Scholar, etc.).
8
+ - Extracts methodology from the full text of publications using LLMs (optional).
9
+ - Validates findings against reference databases (e.g., primer databases for 16S).
10
+ - Works for **any omics** via plugins.
11
+
12
+ ## Installation
13
+
14
+ ```bash
15
+ pip install omix
16
+ # with LLM support:
17
+ pip install omix[llm]
18
+ ```
19
+
20
+ ## Quick Start
21
+
22
+ ### Command Line
23
+
24
+ ```bash
25
+ # Enrich a metadata file with ENA data
26
+ omix fetch-metadata samples.tsv --email you@example.com
27
+
28
+ # Fetch publications for one or more accessions
29
+ omix fetch-publications PRJNA864623 --omics 16S --api-key $LLM_KEY
30
+
31
+ # Run the full metadata cleaning and enrichment pipeline
32
+ omix run-pipeline metadata.csv -o enriched.csv
33
+
34
+ # NEW: Unified pipeline (metadata + publications + validation + integration)
35
+ omix enrich-with-publications samples.csv -o enriched_complete.csv --config config.yaml
36
+ ```
37
+
38
+ ### Unified Metadata + Publications Pipeline
39
+
40
+ The `enrich-with-publications` command provides an end-to-end workflow:
41
+
42
+ 1. **Metadata Enrichment**: Fetches comprehensive data from ENA (sequences, samples, runs)
43
+ 2. **Publication Discovery**: Searches across 12+ publication APIs (Crossref, Europe PMC, NCBI, Semantic Scholar, arXiv, bioRxiv, CORE, DataCite, DOAJ, PLOS, Unpaywall, Zenodo)
44
+ 3. **Publication Validation**: Filters to only include publications with direct accession mentions
45
+ 4. **Integration**: Merges publication counts and DOIs into the enriched metadata
46
+
47
+ Output includes all ENA metadata fields plus:
48
+ - `publication_count`: Number of validated publications per study
49
+ - `publication_dois`: Semicolon-separated list of publication DOIs
50
+
51
+ ```bash
52
+ # Basic usage
53
+ omix enrich-with-publications input.csv -o output.csv
54
+
55
+ # With debug config for faster testing
56
+ omix enrich-with-publications input.csv -o output.csv --config config.debug.yaml
57
+
58
+ # Skip validation (keep all publications found)
59
+ omix enrich-with-publications input.csv -o output.csv --no-validate
60
+
61
+ # With LLM-based methodology extraction
62
+ omix enrich-with-publications input.csv -o output.csv --api-key $LLM_KEY
63
+ ```
64
+
65
+ ### Python API
66
+
67
+ ```python
68
+ from omix import Config
69
+ from omix.metadata.file_workflow import enrich_metadata_from_path
70
+ import asyncio
71
+
72
+ config = Config(email="you@example.com")
73
+ df = asyncio.run(enrich_metadata_from_path("samples.csv", config=config))
74
+ print(df.head())
75
+ ```
76
+
77
+ ## Configuration
78
+
79
+ `omix` can be configured via a YAML file:
80
+
81
+ ```yaml
82
+ credentials:
83
+   email: "your.email@example.com"
84
+   ena_email: "ena@example.com"
85
+   llm_api_key: "sk-..."
86
+   ncbi_api_key: "..."
87
+
88
+ apis:
89
+   sequence:
90
+     ena:
91
+       enabled: true
92
+       max_concurrent: 5
93
+       batch_size: 100
94
+       cache_ttl_days: 30
95
+       fetch_phases: true
96
+
97
+ metadata:
98
+   sample_id_column: "#sampleid"
99
+   exclude_host: false
100
+
101
+ paths:
102
+   cache_dir: ".cache"
103
+   logs_dir: "logs"
104
+   primer_db: null
105
+ ```
106
+
107
+ Pass it with `--config my_config.yaml` or set environment variables like `OMIX_EMAIL`.
omix-0.1.0/omix/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ """
2
+ omix: A modular Python package for fetching, enriching, and analyzing
3
+ omics metadata and publications.
4
+
5
+ Usage:
6
+ from omix import Config
7
+ config = Config(email="you@example.com")
8
+ """
9
+
10
+ __version__ = "0.1.0"
11
+
12
+ from .config import Config, load_config
13
+ from .logging_utils import setup_logging, get_logger