omix 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omix/__init__.py +13 -0
- omix/cli.py +531 -0
- omix/config.py +150 -0
- omix/logging_utils.py +59 -0
- omix/metadata/__init__.py +16 -0
- omix/metadata/base.py +64 -0
- omix/metadata/constants.py +159 -0
- omix/metadata/ena/__init__.py +16 -0
- omix/metadata/ena/cache.py +464 -0
- omix/metadata/ena/enrichment_pipeline.py +217 -0
- omix/metadata/ena/fetcher.py +374 -0
- omix/metadata/ena/metadata.py +333 -0
- omix/metadata/ena/sample_parser.py +784 -0
- omix/metadata/ena/sra_fallback.py +20 -0
- omix/metadata/enrichment.py +420 -0
- omix/metadata/file_workflow.py +1138 -0
- omix/metadata/manager.py +590 -0
- omix/publications/__init__.py +16 -0
- omix/publications/apis/__init__.py +24 -0
- omix/publications/apis/arxiv.py +82 -0
- omix/publications/apis/base.py +170 -0
- omix/publications/apis/basesearch.py +76 -0
- omix/publications/apis/bioarxiv.py +77 -0
- omix/publications/apis/core.py +66 -0
- omix/publications/apis/crossref.py +72 -0
- omix/publications/apis/datacite.py +71 -0
- omix/publications/apis/doaj.py +73 -0
- omix/publications/apis/europe_pmc.py +66 -0
- omix/publications/apis/lens.py +86 -0
- omix/publications/apis/mendeley.py +77 -0
- omix/publications/apis/ncbi.py +227 -0
- omix/publications/apis/plos.py +68 -0
- omix/publications/apis/semantic_scholar.py +78 -0
- omix/publications/apis/springer_nature.py +72 -0
- omix/publications/apis/unpaywall.py +90 -0
- omix/publications/apis/zenodo.py +93 -0
- omix/publications/base.py +89 -0
- omix/publications/cache.py +209 -0
- omix/publications/exceptions.py +8 -0
- omix/publications/extractors/__init__.py +11 -0
- omix/publications/extractors/cleaning.py +317 -0
- omix/publications/extractors/llm.py +145 -0
- omix/publications/extractors/omics/_16s.py +212 -0
- omix/publications/extractors/omics/__init__.py +9 -0
- omix/publications/extractors/omics/base.py +87 -0
- omix/publications/extractors/pdf.py +119 -0
- omix/publications/extractors/webpage.py +84 -0
- omix/publications/fetcher.py +796 -0
- omix/validators/__init__.py +0 -0
- omix/validators/primer_db.py +173 -0
- omix/validators/probebase_builder.py +218 -0
- omix-0.1.0.dist-info/METADATA +138 -0
- omix-0.1.0.dist-info/RECORD +57 -0
- omix-0.1.0.dist-info/WHEEL +5 -0
- omix-0.1.0.dist-info/entry_points.txt +2 -0
- omix-0.1.0.dist-info/licenses/LICENSE +21 -0
- omix-0.1.0.dist-info/top_level.txt +1 -0
omix/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""
|
|
2
|
+
omix: A modular Python package for fetching, enriching, and analyzing
|
|
3
|
+
omics metadata and publications.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
from omix import Config
|
|
7
|
+
config = Config(email="you@example.com")
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
__version__ = "0.1.0"
|
|
11
|
+
|
|
12
|
+
from .config import Config, load_config
|
|
13
|
+
from .logging_utils import setup_logging, get_logger
|
omix/cli.py
ADDED
|
@@ -0,0 +1,531 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command‑line interface for omix.
|
|
3
|
+
|
|
4
|
+
Provides subcommands:
|
|
5
|
+
- build-primer-db : download and build a probeBase primer database.
|
|
6
|
+
- fetch-metadata : enrich a metadata file with ENA data.
|
|
7
|
+
- fetch-publications : search and analyse publications linked to accessions.
|
|
8
|
+
- run-pipeline : run the full MetadataManager pipeline on a file.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import asyncio
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import List, Optional
|
|
14
|
+
|
|
15
|
+
import click
|
|
16
|
+
|
|
17
|
+
from omix import __version__
|
|
18
|
+
from omix.config import Config, load_config
|
|
19
|
+
from omix.logging_utils import setup_logging
|
|
20
|
+
from omix.metadata.file_workflow import enrich_metadata_from_path
|
|
21
|
+
from omix.metadata.manager import MetadataManager
|
|
22
|
+
from omix.publications.apis.ncbi import PMIDSource
|
|
23
|
+
from omix.publications.fetcher import PublicationFetcher
|
|
24
|
+
from omix.publications.apis import (
|
|
25
|
+
CrossrefAPI,
|
|
26
|
+
EuropePMCAPI,
|
|
27
|
+
NCBIAPI,
|
|
28
|
+
SemanticScholarAPI,
|
|
29
|
+
ArxivAPI,
|
|
30
|
+
BioarxivAPI,
|
|
31
|
+
CoreAPI,
|
|
32
|
+
DataciteAPI,
|
|
33
|
+
DOAJAPI,
|
|
34
|
+
PLOSAPI,
|
|
35
|
+
UnpaywallAPI,
|
|
36
|
+
ZenodoAPI,
|
|
37
|
+
)
|
|
38
|
+
from omix.publications.extractors.omics import SixteenSExtractor
|
|
39
|
+
from omix.publications.cache import PublicationCache
|
|
40
|
+
from omix.validators.primer_db import ProbeBaseDatabase
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# --------------------------------------------------------------------------- #
|
|
44
|
+
# Common options
|
|
45
|
+
# --------------------------------------------------------------------------- #
|
|
46
|
+
|
|
47
|
+
def _config_option(f):
    """Attach the shared --config / --email / --cache-dir options to a command.

    Applied as a decorator; returns the command function with the three
    options registered in the same order every subcommand uses.
    """
    shared_options = (
        click.option(
            "--config", "-c",
            type=click.Path(exists=True, path_type=Path),
            help="Path to YAML configuration file.",
        ),
        click.option(
            "--email", "-e",
            default=None,
            help="Email address for polite API requests (overrides config).",
        ),
        click.option(
            "--cache-dir",
            type=click.Path(path_type=Path),
            help="Override the cache directory.",
        ),
    )
    # Apply in declaration order so click's parameter ordering is unchanged.
    for add_option in shared_options:
        f = add_option(f)
    return f
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _build_config(config_path: Optional[Path], email: Optional[str], cache_dir: Optional[Path]) -> Config:
    """Assemble a Config from an optional YAML file plus CLI overrides.

    CLI-supplied values (email, cache_dir) take precedence over anything
    loaded from the configuration file.
    """
    cfg = load_config(config_path) if config_path else Config()

    if email:
        cfg.credentials.email = email
    if cache_dir:
        # Keep the convenience alias Config.cache_dir in sync with paths.
        cfg.paths.cache_dir = cache_dir
        cfg.cache_dir = cache_dir
    return cfg
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# --------------------------------------------------------------------------- #
|
|
83
|
+
# Main entry point
|
|
84
|
+
# --------------------------------------------------------------------------- #
|
|
85
|
+
|
|
86
|
+
# Root click group: subcommands register themselves via @main.command().
@click.group()
@click.version_option(version=__version__)
def main():
    """omix – a modular toolkit for omics metadata & publication analysis."""
    pass
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# --------------------------------------------------------------------------- #
|
|
94
|
+
# build-primer-db
|
|
95
|
+
# --------------------------------------------------------------------------- #
|
|
96
|
+
|
|
97
|
+
@main.command()
@click.option("--csv", type=click.Path(path_type=Path), default=Path("data/probe_data.csv"),
              help="Path to downloaded probeBase CSV (will be downloaded if missing).")
@click.option("--db", type=click.Path(path_type=Path), default=Path("data/primer_data.db"),
              help="Output SQLite database path.")
@_config_option
def build_primer_db(
    csv: Path,
    db: Path,
    config: Optional[Path],
    email: Optional[str],
    cache_dir: Optional[Path],
):
    """Download the probeBase primer list and build a searchable SQLite database."""
    cfg = _build_config(config, email, cache_dir)
    setup_logging(cfg.logs_dir)

    # Local import keeps the builder module off the CLI's import path until used.
    from omix.validators.probebase_builder import import_and_save_database
    # import_and_save_database returns a truthy flag on success.
    success = import_and_save_database(csv, db)
    if not success:
        click.echo("❌ Primer database build failed.", err=True)
        raise click.Abort()
    click.echo(f"✅ Primer database built at {db}")
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# --------------------------------------------------------------------------- #
|
|
123
|
+
# fetch-metadata
|
|
124
|
+
# --------------------------------------------------------------------------- #
|
|
125
|
+
|
|
126
|
+
@main.command()
@click.argument("input_path", type=click.Path(exists=True, path_type=Path))
@click.option("--output", "-o", type=click.Path(path_type=Path), help="Output file path.")
@click.option("--sample-id-col", default=None, help="Override sample ID column name.")
@click.option("--no-ena", is_flag=True, help="Disable ENA enrichment.")
@click.option("--no-geocode", is_flag=True, help="Disable reverse geocoding.")
# NOTE(review): is_flag=True with default=True means preserve_rows is always
# True — passing --preserve-rows is a no-op and there is no way to turn it
# off from the CLI. Consider "--preserve-rows/--no-preserve-rows".
@click.option("--preserve-rows", is_flag=True, default=True, help="Keep rows without coordinates.")
@click.option("--omics-type", default=None, help="Filter by omics type (amplicon, metagenomics, transcriptomics, etc.).")
@click.option("--amplicon-gene", default=None, help="Filter by amplicon gene (16S, 18S, ITS, COI, etc.).")
@click.option("--primer-set", default=None, help="Filter by primer set name.")
@click.option("--subfragment", default=None, help="Filter by subfragment/variable region.")
@click.option("--report", "-r", type=click.Path(path_type=Path), help="Save composition report as JSON to this path.")
@_config_option
def fetch_metadata(
    input_path: Path,
    output: Optional[Path],
    sample_id_col: Optional[str],
    no_ena: bool,
    no_geocode: bool,
    preserve_rows: bool,
    omics_type: Optional[str],
    amplicon_gene: Optional[str],
    primer_set: Optional[str],
    subfragment: Optional[str],
    report: Optional[Path],
    config: Optional[Path],
    email: Optional[str],
    cache_dir: Optional[Path],
):
    """Enrich a metadata file with ENA data and generate an omics profile report."""
    cfg = _build_config(config, email, cache_dir)
    # Geocoding toggle lives on the config object, not on the workflow call.
    if no_geocode:
        cfg.metadata.enable_geocoding = False
    setup_logging(cfg.logs_dir)

    click.echo(f"📁 Loading {input_path} ...")
    # The enrichment workflow is async; drive it to completion here.
    result = asyncio.run(
        enrich_metadata_from_path(
            input_path=input_path,
            output_path=output,
            config=cfg,
            sample_id_column=sample_id_col,
            enable_ena_lookup=not no_ena,
            preserve_all_rows=preserve_rows,
            omics_type=omics_type,
            amplicon_gene=amplicon_gene,
            primer_set=primer_set,
            subfragment=subfragment,
            report_output=report,
        )
    )
    click.echo(f"✅ Done. Output has {len(result)} rows and {len(result.columns)} columns.")
    if report:
        click.echo(f"📊 Composition report saved to {report}")
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# --------------------------------------------------------------------------- #
|
|
183
|
+
# fetch-publications
|
|
184
|
+
# --------------------------------------------------------------------------- #
|
|
185
|
+
|
|
186
|
+
@main.command()
@click.argument("accessions", nargs=-1)
@click.option("--omics", default="16S", help="Omics type (currently only '16S').")
@click.option("--pmid", type=str, default=None, help="PubMed ID for direct lookup.")
@click.option("--api-key", envvar="OMIX_LLM_API_KEY", help="LLM API key for methodology extraction.")
@click.option("--no-llm", is_flag=True, help="Skip LLM extraction (regex only).")
@click.option("--builtin", is_flag=True, help="Use the built‑in primer database (no probeBase needed).")
@click.option("--primer-db", type=click.Path(exists=True, path_type=Path),
              help="Path to a probeBase SQLite primer database.")
@click.option("--max-rounds", type=int, default=3, help="Maximum citation chasing rounds.")
@click.option("--output", "-o", type=click.Path(path_type=Path), help="Save results as JSON.")
@_config_option
def fetch_publications(
    accessions: List[str],
    omics: str,
    pmid: Optional[str],
    api_key: Optional[str],
    no_llm: bool,
    builtin: bool,
    primer_db: Optional[Path],
    max_rounds: int,
    output: Optional[Path],
    config: Optional[Path],
    email: Optional[str],
    cache_dir: Optional[Path],
):
    """Search and analyse publications for one or more accessions."""
    cfg = _build_config(config, email, cache_dir)
    setup_logging(cfg.logs_dir)

    if not accessions:
        click.echo("❌ You must provide at least one accession.", err=True)
        raise click.Abort()

    # ---- Build publication sources (free and reliable ones) ----
    # FEATURE 3: Pass retry config from publication config
    retry_config = {
        'max_retries': cfg.publication.max_retries,
        'base_delay': cfg.publication.base_delay_seconds,
        'max_delay': cfg.publication.max_delay_seconds,
    }
    # Every source gets the same contact email and retry settings; NCBI also
    # takes its optional API key as a second positional argument.
    sources = [
        CrossrefAPI(cfg.credentials.email, **retry_config),
        EuropePMCAPI(cfg.credentials.email, **retry_config),
        NCBIAPI(cfg.credentials.email, cfg.credentials.ncbi_api_key, **retry_config),
        SemanticScholarAPI(cfg.credentials.email, **retry_config),
        ArxivAPI(cfg.credentials.email, **retry_config),
        BioarxivAPI(cfg.credentials.email, **retry_config),
        CoreAPI(cfg.credentials.email, **retry_config),
        DataciteAPI(cfg.credentials.email, **retry_config),
        DOAJAPI(cfg.credentials.email, **retry_config),
        PLOSAPI(cfg.credentials.email, **retry_config),
        UnpaywallAPI(cfg.credentials.email, **retry_config),
        ZenodoAPI(cfg.credentials.email, **retry_config),
    ]

    # ---- Build omics extractor ----
    # CLI --api-key (or OMIX_LLM_API_KEY env) wins over the config file.
    llm_key = api_key or cfg.credentials.llm_api_key

    # Primer database: prefer builtin, then external file, else None
    primer_database = None
    if builtin:
        primer_database = ProbeBaseDatabase(use_builtin=True)
    elif primer_db:
        primer_database = ProbeBaseDatabase(db_path=primer_db)

    # --pmid short-circuits discovery: the full source list is replaced by a
    # single direct-lookup source.
    if pmid:
        sources = [PMIDSource(cfg.credentials.email, pmid, **retry_config)]

    if omics.lower() == "16s":
        # With --no-llm the extractor runs regex-only (empty API key).
        if no_llm:
            extractor = SixteenSExtractor(api_key="", primer_db=primer_database)
        else:
            extractor = SixteenSExtractor(api_key=llm_key or "", primer_db=primer_database)
    else:
        click.echo(f"❌ Unknown omics type: {omics}", err=True)
        raise click.Abort()

    # ---- Run fetcher ----
    cache = PublicationCache(cfg.paths.cache_dir / "publications.db")
    fetcher = PublicationFetcher(cfg, sources, extractor, cache)
    # NOTE(review): overrides a class-level attribute on the instance —
    # presumably the fetcher's citation-chasing round limit; confirm in
    # PublicationFetcher.
    fetcher.MAX_PUBLICATION_ROUNDS = max_rounds

    click.echo(f"🔍 Searching publications for {len(accessions)} accessions...")
    results = fetcher.fetch_and_analyze_sync(accessions)

    # ---- Output ----
    if output:
        import json
        output.parent.mkdir(parents=True, exist_ok=True)
        # default=str makes non-JSON-native values (paths, dates) serializable.
        with open(output, "w") as f:
            json.dump(results, f, indent=2, default=str)
        click.echo(f"📄 Results written to {output}")
    else:
        # Console summary: at most five titles per accession.
        for acc, pubs in results.items():
            click.echo(f"\n📌 {acc} – {len(pubs)} publications")
            for pub in pubs[:5]:
                title = (pub.get('publication_title') or 'N/A')[:80]
                status = pub.get('status', '?')
                click.echo(f" [{status}] {title}")
            if len(pubs) > 5:
                click.echo(f" … and {len(pubs) - 5} more.")
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
# --------------------------------------------------------------------------- #
|
|
291
|
+
# run-pipeline
|
|
292
|
+
# --------------------------------------------------------------------------- #
|
|
293
|
+
|
|
294
|
+
@main.command()
@click.argument("input_path", type=click.Path(exists=True, path_type=Path))
@click.option("--output", "-o", type=click.Path(path_type=Path), help="Output file path.")
@click.option("--sample-id-col", default=None, help="Override sample ID column name.")
@click.option("--no-geocode", is_flag=True, help="Disable reverse geocoding.")
@_config_option
def run_pipeline(
    input_path: Path,
    output: Optional[Path],
    sample_id_col: Optional[str],
    no_geocode: bool,
    config: Optional[Path],
    email: Optional[str],
    cache_dir: Optional[Path],
):
    """Run the full MetadataManager pipeline on a file (cleaning + enrichment)."""
    cfg = _build_config(config, email, cache_dir)
    if no_geocode:
        cfg.metadata.enable_geocoding = False
    setup_logging(cfg.logs_dir)

    # NOTE(review): pandas is imported but not referenced directly below;
    # possibly kept to fail fast when pandas is missing — confirm.
    import pandas as pd
    from omix.metadata.file_workflow import _load_table, _save_table, _ensure_sample_id_column

    df = _load_table(input_path)
    # Ensure a sample-ID column exists: an explicit CLI override always wins,
    # otherwise fall back to the configured column name when it is absent.
    if sample_id_col:
        df = _ensure_sample_id_column(df, sample_id_col)
    elif cfg.metadata.sample_id_column not in df.columns:
        df = _ensure_sample_id_column(df, cfg.metadata.sample_id_column)

    manager = MetadataManager(df, cfg, sample_id_column=sample_id_col)
    # The manager pipeline is async; run it to completion here.
    enriched = asyncio.run(manager.run_pipeline())

    if output:
        _save_table(enriched, output)
        click.echo(f"✅ Enriched metadata written to {output}")
    else:
        # No output file requested: print a summary and a preview instead.
        click.echo(f"✅ Pipeline complete – {len(enriched)} rows, {len(enriched.columns)} columns.")
        click.echo(enriched.head())
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
# --------------------------------------------------------------------------- #
|
|
336
|
+
# enrich-with-publications
|
|
337
|
+
# --------------------------------------------------------------------------- #
|
|
338
|
+
|
|
339
|
+
@main.command()
@click.argument("input_path", type=click.Path(exists=True, path_type=Path))
@click.option("--output", "-o", type=click.Path(path_type=Path), help="Output file path.")
@click.option("--no-validate", is_flag=True, help="Skip publication validation (keep all publications).")
@click.option("--api-key", envvar="OMIX_LLM_API_KEY", help="LLM API key for methodology extraction.")
@click.option("--no-llm", is_flag=True, help="Skip LLM extraction (regex only).")
@click.option("--builtin", is_flag=True, help="Use the built‑in primer database.")
@click.option("--primer-db", type=click.Path(exists=True, path_type=Path),
              help="Path to a probeBase SQLite primer database.")
@click.option("--max-rounds", type=int, default=3, help="Maximum citation chasing rounds.")
@_config_option
def enrich_with_publications(
    input_path: Path,
    output: Optional[Path],
    no_validate: bool,
    api_key: Optional[str],
    no_llm: bool,
    builtin: bool,
    primer_db: Optional[Path],
    max_rounds: int,
    config: Optional[Path],
    email: Optional[str],
    cache_dir: Optional[Path],
):
    """
    Enrich metadata with ENA data AND publication information in one pipeline.

    This unified command:
    1. Fetches and enriches metadata from ENA
    2. Discovers publications from 12+ sources
    3. Validates publications for accession relevance
    4. Integrates publication counts and DOIs into metadata

    Output includes: all ENA metadata fields + publication_count + publication_dois
    """
    cfg = _build_config(config, email, cache_dir)
    setup_logging(cfg.logs_dir)

    # Import the unified pipeline
    from omix.metadata.file_workflow import _load_table, _save_table
    # NOTE(review): pandas and _load_table are imported but not referenced in
    # this function body — possibly leftovers; confirm before removing.
    import pandas as pd

    async def async_enrich():
        """Run async enrichment pipeline."""
        # Heavy imports are deferred into the coroutine so the command is
        # cheap to register at CLI startup.
        from omix.metadata.file_workflow import enrich_metadata_from_path
        from omix.publications.fetcher import PublicationFetcher
        from omix.publications.extractors.omics import SixteenSExtractor
        from omix.publications.cache import PublicationCache
        from omix.publications.apis import (
            CrossrefAPI, EuropePMCAPI, NCBIAPI, SemanticScholarAPI,
            ArxivAPI, BioarxivAPI, CoreAPI, DataciteAPI, DOAJAPI, PLOSAPI,
            UnpaywallAPI, ZenodoAPI,
        )

        # ---- Phase 1: Metadata ----
        click.echo("📊 Phase 1: Metadata enrichment...")
        metadata = await enrich_metadata_from_path(
            input_path=input_path,
            output_path=None,
            config=cfg,
            enable_ena_lookup=True,
            preserve_all_rows=True,
        )
        click.echo(f" ✓ {len(metadata)} rows × {len(metadata.columns)} columns")

        # Extract study accessions
        # Without a study_accession column there is nothing to search on;
        # save what we have (if requested) and stop early.
        if 'study_accession' not in metadata.columns:
            click.echo("❌ study_accession column missing; cannot fetch publications", err=True)
            if output:
                _save_table(metadata, output)
            return metadata

        study_accessions = sorted(set(metadata['study_accession'].dropna().unique()))

        if not study_accessions:
            click.echo("⚠️ No study accessions found; skipping publication fetch")
            if output:
                _save_table(metadata, output)
            return metadata

        # ---- Phase 2: Publications ----
        click.echo(f"📚 Phase 2: Publication discovery ({len(study_accessions)} studies)...")

        # Same retry/backoff settings applied to every publication source.
        retry_config = {
            'max_retries': cfg.publication.max_retries,
            'base_delay': cfg.publication.base_delay_seconds,
            'max_delay': cfg.publication.max_delay_seconds,
        }
        sources = [
            CrossrefAPI(cfg.credentials.email, **retry_config),
            EuropePMCAPI(cfg.credentials.email, **retry_config),
            NCBIAPI(cfg.credentials.email, cfg.credentials.ncbi_api_key, **retry_config),
            SemanticScholarAPI(cfg.credentials.email, **retry_config),
            ArxivAPI(cfg.credentials.email, **retry_config),
            BioarxivAPI(cfg.credentials.email, **retry_config),
            CoreAPI(cfg.credentials.email, **retry_config),
            DataciteAPI(cfg.credentials.email, **retry_config),
            DOAJAPI(cfg.credentials.email, **retry_config),
            PLOSAPI(cfg.credentials.email, **retry_config),
            UnpaywallAPI(cfg.credentials.email, **retry_config),
            ZenodoAPI(cfg.credentials.email, **retry_config),
        ]

        # CLI --api-key (or env) beats the config file; empty string disables
        # LLM extraction when --no-llm is set.
        llm_key = api_key or cfg.credentials.llm_api_key
        primer_database = None
        if builtin:
            primer_database = ProbeBaseDatabase(use_builtin=True)
        elif primer_db:
            primer_database = ProbeBaseDatabase(db_path=primer_db)

        extractor = SixteenSExtractor(api_key=llm_key if not no_llm else "", primer_db=primer_database)
        cache = PublicationCache(cfg.paths.cache_dir / "publications.db")
        fetcher = PublicationFetcher(cfg, sources, extractor, cache)
        fetcher.MAX_PUBLICATION_ROUNDS = max_rounds

        publications = fetcher.fetch_and_analyze_sync(study_accessions)
        total_pubs = sum(len(p) for p in publications.values())
        click.echo(f" ✓ {total_pubs} publications found")

        # ---- Phase 3: Validation ----
        if not no_validate:
            click.echo("🔍 Phase 3: Publication validation...")

            # Inline validation
            # A publication is kept only when its extraction completed AND it
            # is tied to the accession either via a matched query (ignoring
            # 'DATA:'-prefixed queries) or by an accession mention in its text.
            filtered = {}
            for study_accession, pubs in publications.items():
                filtered[study_accession] = []
                for pub in pubs:
                    if pub.get('status') != '✓ Extraction complete.':
                        continue
                    matched_queries = pub.get('matched_queries', [])
                    accession_in_text = pub.get('accession_mentions_in_text', 0) > 0
                    direct_match = any(
                        q == study_accession or (study_accession in q and not q.startswith('DATA:'))
                        for q in matched_queries
                    )
                    if direct_match or accession_in_text:
                        filtered[study_accession].append(pub)

            publications = filtered
            valid_count = sum(len(p) for p in publications.values())
            click.echo(f" ✓ {valid_count} publications with direct accession matches")

        # ---- Phase 4: Integration ----
        click.echo("🔗 Phase 4: Integration...")

        # Create the target columns if the enrichment step did not.
        if 'publication_count' not in metadata.columns:
            metadata['publication_count'] = None
        if 'publication_dois' not in metadata.columns:
            metadata['publication_dois'] = None

        filled_rows = 0
        studies_with_dois = 0

        for study_accession, pubs in publications.items():
            # Collect unique DOIs while preserving first-seen order.
            dois = []
            seen = set()
            for pub in pubs:
                if not isinstance(pub, dict):
                    continue
                doi = pub.get('doi')
                if doi and doi not in seen:
                    dois.append(doi)
                    seen.add(doi)

            # Broadcast this study's counts/DOIs onto all of its sample rows.
            mask = metadata['study_accession'] == study_accession
            filled_rows += int(mask.sum())
            metadata.loc[mask, 'publication_count'] = len(dois)
            metadata.loc[mask, 'publication_dois'] = '; '.join(dois) if dois else ''

            if dois:
                studies_with_dois += 1

        click.echo(f" ✓ {filled_rows} rows updated, {studies_with_dois} studies with DOIs")

        return metadata

    try:
        enriched_metadata = asyncio.run(async_enrich())

        if output:
            output.parent.mkdir(parents=True, exist_ok=True)
            _save_table(enriched_metadata, output)
            click.echo(f"\n✅ Output saved to {output}")

        click.echo(f"✅ Complete! {len(enriched_metadata)} rows × {len(enriched_metadata.columns)} columns")
    except Exception as e:
        # Top-level CLI boundary: surface the error and abort with a
        # non-zero exit status.
        click.echo(f"\n❌ Error: {e}", err=True)
        raise click.Abort()
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
# Allow invoking the CLI directly (python -m omix.cli) in addition to the
# installed console-script entry point.
if __name__ == "__main__":
    main()
|
omix/config.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unified configuration for omix, loadable from YAML files and environment variables.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, Optional
|
|
8
|
+
|
|
9
|
+
import yaml
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Credentials:
    """All API credentials, resolved from an explicit config dict first and
    from environment variables second."""

    # API-key attributes whose environment fallback is OMIX_<NAME uppercased>
    # and whose default is None.
    _API_KEYS = (
        "ncbi_api_key",
        "llm_api_key",
        "dimensions_api_key",
        "ieee_api_key",
        "mendeley_api_key",
        "springer_api_key",
        "lens_api_key",
    )

    def __init__(self, cred_dict: Optional[Dict[str, Any]] = None):
        source = cred_dict or {}
        # Contact email: config value, then OMIX_EMAIL, then empty string.
        self.email = source.get("email") or os.getenv("OMIX_EMAIL", "")
        # ENA-specific email falls back to the general contact email.
        self.ena_email = source.get("ena_email") or os.getenv("OMIX_ENA_EMAIL", self.email)
        # Remaining keys default to None when neither config nor env has them.
        for attr in self._API_KEYS:
            setattr(self, attr, source.get(attr) or os.getenv("OMIX_" + attr.upper()))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class MetadataConfig:
    """Settings controlling metadata normalization and enrichment."""

    def __init__(self, cfg: Optional[Dict[str, Any]] = None):
        options = cfg or {}
        # Columns removed outright during cleaning.
        self.columns_to_drop = options.get("columns_to_drop", [])
        # Columns coerced to numeric values.
        self.force_numeric_columns = options.get(
            "force_numeric_columns", ["lat", "lon", "depth", "altitude"]
        )
        # Arbitrary rename/value mappings applied during normalization.
        self.mappings = options.get("mappings", {})
        # Name of the sample-identifier column.
        self.sample_id_column = options.get("sample_id_column", "#sampleid")
        self.suffixes_to_collapse = options.get("suffixes_to_collapse", [])
        self.exclude_host = options.get("exclude_host", False)
        self.enable_geocoding = options.get("enable_geocoding", True)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class CacheWriteConfig:
    """Batched write configuration for the SQLite cache."""

    def __init__(self, cfg: Dict[str, Any]):
        # Number of entries buffered before a flush.
        self.batch_size = cfg.get("batch_size", 100)
        # Maximum seconds between flushes regardless of batch fill.
        self.flush_interval_seconds = cfg.get("flush_interval_seconds", 5.0)


class ENAApiConfig:
    """Options specific to the ENA API client."""

    def __init__(self, cfg: Dict[str, Any]):
        lookup = cfg.get
        self.enabled = lookup("enabled", True)
        self.max_concurrent = lookup("max_concurrent", 5)
        self.batch_size = lookup("batch_size", 100)
        self.cache_ttl_days = lookup("cache_ttl_days", 30)
        self.fetch_phases = lookup("fetch_phases", True)
        self.phase2_async = lookup("phase2_async", True)
        # Nested batched-write settings for the local cache.
        self.cache_write = CacheWriteConfig(lookup("cache_write", {}))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class ApisConfig:
    """Switches controlling which external APIs are active."""

    def __init__(self, cfg: Optional[Dict[str, Any]] = None):
        options = cfg or {}
        self.enabled = options.get("enabled", True)
        # Only the ENA sequence API is configurable at the moment; it lives
        # under the nested "sequence.ena" key.
        sequence_section = options.get("sequence", {})
        self.ena = ENAApiConfig(sequence_section.get("ena", {}))
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class PublicationConfig:
    """Publication search and analysis settings."""

    def __init__(self, cfg: Optional[Dict[str, Any]] = None):
        options = cfg or {}
        self.max_concurrent_apis = options.get("max_concurrent_apis", 5)
        self.rounds = options.get("rounds", 3)
        self.max_pdf_pages = options.get("max_pdf_pages", 10)
        # Largest downloadable document, in bytes.
        self.max_file_size = options.get("max_file_size", 15_000_000)

        # FEATURE 3: retry/backoff configuration shared by all API sources.
        retry = options.get("retry", {})
        self.max_retries = retry.get("max_retries", 5)
        self.base_delay_seconds = retry.get("base_delay_seconds", 1.0)
        self.max_delay_seconds = retry.get("max_delay_seconds", 32.0)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class PathsConfig:
    """Project and dependency paths."""

    def __init__(self, cfg: Optional[Dict[str, Any]] = None):
        options = cfg or {}
        self.project = Path(options.get("project", "."))
        # Cache and log directories default to subpaths of the project root.
        self.cache_dir = Path(options.get("cache_dir", self.project / ".cache"))
        self.logs_dir = Path(options.get("logs_dir", self.project / "logs"))
        # Optional external primer database; stored exactly as configured.
        self.primer_db = options.get("primer_db")
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class OmicsType:
    """Lightweight holder naming a supported omics type.

    Note: despite the docstring in earlier revisions, this is a plain class,
    not an ``enum.Enum``; no validation is performed on *name*.
    """

    # Known type names (``_16S`` is underscore-prefixed because Python
    # identifiers cannot start with a digit).
    _16S = "16S"
    METAGENOMICS = "metagenomics"

    def __init__(self, name: str):
        # Stored verbatim; callers compare against the class constants.
        self.name = name
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class Config:
    """Top-level configuration object, aggregating all sub-configs.

    Args:
        config_path: Optional YAML file to load settings from.
        **overrides: Keyword overrides. Credential keys (email, API keys,
            ...) are routed into the ``credentials`` section; every other
            key is applied at the top level of the raw config mapping.

    Raises:
        TypeError: If the YAML file's top level is not a mapping.
    """

    # Override keys that belong in the ``credentials`` sub-section.
    _CREDENTIAL_KEYS = (
        "email", "ena_email", "llm_api_key", "ncbi_api_key",
        "dimensions_api_key", "ieee_api_key", "mendeley_api_key",
        "springer_api_key", "lens_api_key",
    )

    def __init__(self, config_path: Optional[Path] = None, **overrides):
        self._raw: Dict[str, Any] = {}
        if config_path:
            with open(config_path, "r", encoding="utf-8") as f:
                loaded = yaml.safe_load(f) or {}
            # Fail fast with a clear message instead of an obscure
            # AttributeError later if the YAML top level is e.g. a list.
            if not isinstance(loaded, dict):
                raise TypeError(
                    f"Config file {config_path} must contain a YAML mapping, "
                    f"got {type(loaded).__name__}"
                )
            self._raw = loaded

        # Move known credential overrides into the correct sub-dict BEFORE
        # building sub-configs so Credentials() sees them.
        cred_overrides = {
            key: overrides.pop(key)
            for key in list(overrides)
            if key in self._CREDENTIAL_KEYS
        }

        self._raw.update(overrides)  # remaining top-level overrides
        # Bug fix: ``credentials: null`` in YAML made setdefault() return
        # None and crash on .update(); coerce any non-dict value to {}.
        credentials_raw = self._raw.get("credentials")
        if not isinstance(credentials_raw, dict):
            credentials_raw = {}
        credentials_raw.update(cred_overrides)  # apply credential overrides
        self._raw["credentials"] = credentials_raw

        # Build sub-configs.
        self.credentials = Credentials(self._raw.get("credentials"))
        self.paths = PathsConfig(self._raw.get("paths"))
        self.metadata = MetadataConfig(self._raw.get("metadata"))
        self.apis = ApisConfig(self._raw.get("apis"))
        self.publication = PublicationConfig(self._raw.get("publication"))
        self.omics_type = OmicsType(self._raw.get("omics_type", "16S"))
        # Convenience aliases used throughout the CLI.
        self.cache_dir = self.paths.cache_dir
        self.logs_dir = self.paths.logs_dir

    def save(self, path: Path) -> None:
        """Write the current raw configuration to a YAML file at *path*."""
        with open(path, "w", encoding="utf-8") as f:
            yaml.dump(self._raw, f)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def load_config(config_path: Path) -> Config:
    """Read *config_path* (YAML) and build a :class:`Config` from it."""
    return Config(config_path=config_path)
|