omix 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. omix/__init__.py +13 -0
  2. omix/cli.py +531 -0
  3. omix/config.py +150 -0
  4. omix/logging_utils.py +59 -0
  5. omix/metadata/__init__.py +16 -0
  6. omix/metadata/base.py +64 -0
  7. omix/metadata/constants.py +159 -0
  8. omix/metadata/ena/__init__.py +16 -0
  9. omix/metadata/ena/cache.py +464 -0
  10. omix/metadata/ena/enrichment_pipeline.py +217 -0
  11. omix/metadata/ena/fetcher.py +374 -0
  12. omix/metadata/ena/metadata.py +333 -0
  13. omix/metadata/ena/sample_parser.py +784 -0
  14. omix/metadata/ena/sra_fallback.py +20 -0
  15. omix/metadata/enrichment.py +420 -0
  16. omix/metadata/file_workflow.py +1138 -0
  17. omix/metadata/manager.py +590 -0
  18. omix/publications/__init__.py +16 -0
  19. omix/publications/apis/__init__.py +24 -0
  20. omix/publications/apis/arxiv.py +82 -0
  21. omix/publications/apis/base.py +170 -0
  22. omix/publications/apis/basesearch.py +76 -0
  23. omix/publications/apis/bioarxiv.py +77 -0
  24. omix/publications/apis/core.py +66 -0
  25. omix/publications/apis/crossref.py +72 -0
  26. omix/publications/apis/datacite.py +71 -0
  27. omix/publications/apis/doaj.py +73 -0
  28. omix/publications/apis/europe_pmc.py +66 -0
  29. omix/publications/apis/lens.py +86 -0
  30. omix/publications/apis/mendeley.py +77 -0
  31. omix/publications/apis/ncbi.py +227 -0
  32. omix/publications/apis/plos.py +68 -0
  33. omix/publications/apis/semantic_scholar.py +78 -0
  34. omix/publications/apis/springer_nature.py +72 -0
  35. omix/publications/apis/unpaywall.py +90 -0
  36. omix/publications/apis/zenodo.py +93 -0
  37. omix/publications/base.py +89 -0
  38. omix/publications/cache.py +209 -0
  39. omix/publications/exceptions.py +8 -0
  40. omix/publications/extractors/__init__.py +11 -0
  41. omix/publications/extractors/cleaning.py +317 -0
  42. omix/publications/extractors/llm.py +145 -0
  43. omix/publications/extractors/omics/_16s.py +212 -0
  44. omix/publications/extractors/omics/__init__.py +9 -0
  45. omix/publications/extractors/omics/base.py +87 -0
  46. omix/publications/extractors/pdf.py +119 -0
  47. omix/publications/extractors/webpage.py +84 -0
  48. omix/publications/fetcher.py +796 -0
  49. omix/validators/__init__.py +0 -0
  50. omix/validators/primer_db.py +173 -0
  51. omix/validators/probebase_builder.py +218 -0
  52. omix-0.1.0.dist-info/METADATA +138 -0
  53. omix-0.1.0.dist-info/RECORD +57 -0
  54. omix-0.1.0.dist-info/WHEEL +5 -0
  55. omix-0.1.0.dist-info/entry_points.txt +2 -0
  56. omix-0.1.0.dist-info/licenses/LICENSE +21 -0
  57. omix-0.1.0.dist-info/top_level.txt +1 -0
omix/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ """
2
+ omix: A modular Python package for fetching, enriching, and analyzing
3
+ omics metadata and publications.
4
+
5
+ Usage:
6
+ from omix import Config
7
+ config = Config(email="you@example.com")
8
+ """
9
+
10
+ __version__ = "0.1.0"
11
+
12
+ from .config import Config, load_config
13
+ from .logging_utils import setup_logging, get_logger
omix/cli.py ADDED
@@ -0,0 +1,531 @@
1
+ """
2
+ Command‑line interface for omix.
3
+
4
+ Provides subcommands:
5
+ - build-primer-db : download and build a probeBase primer database.
6
+ - fetch-metadata : enrich a metadata file with ENA data.
7
+ - fetch-publications : search and analyse publications linked to accessions.
8
+ - run-pipeline : run the full MetadataManager pipeline on a file.
9
+ """
10
+
11
+ import asyncio
12
+ from pathlib import Path
13
+ from typing import List, Optional
14
+
15
+ import click
16
+
17
+ from omix import __version__
18
+ from omix.config import Config, load_config
19
+ from omix.logging_utils import setup_logging
20
+ from omix.metadata.file_workflow import enrich_metadata_from_path
21
+ from omix.metadata.manager import MetadataManager
22
+ from omix.publications.apis.ncbi import PMIDSource
23
+ from omix.publications.fetcher import PublicationFetcher
24
+ from omix.publications.apis import (
25
+ CrossrefAPI,
26
+ EuropePMCAPI,
27
+ NCBIAPI,
28
+ SemanticScholarAPI,
29
+ ArxivAPI,
30
+ BioarxivAPI,
31
+ CoreAPI,
32
+ DataciteAPI,
33
+ DOAJAPI,
34
+ PLOSAPI,
35
+ UnpaywallAPI,
36
+ ZenodoAPI,
37
+ )
38
+ from omix.publications.extractors.omics import SixteenSExtractor
39
+ from omix.publications.cache import PublicationCache
40
+ from omix.validators.primer_db import ProbeBaseDatabase
41
+
42
+
43
+ # --------------------------------------------------------------------------- #
44
+ # Common options
45
+ # --------------------------------------------------------------------------- #
46
+
47
def _config_option(f):
    """Decorator that adds --config / --email / --cache-dir options to a command."""
    # Apply the shared options in the same order the original hand-written
    # chain did: --config first, then --email, then --cache-dir.
    shared_options = (
        click.option(
            "--config", "-c",
            type=click.Path(exists=True, path_type=Path),
            help="Path to YAML configuration file.",
        ),
        click.option(
            "--email", "-e",
            default=None,
            help="Email address for polite API requests (overrides config).",
        ),
        click.option(
            "--cache-dir",
            type=click.Path(path_type=Path),
            help="Override the cache directory.",
        ),
    )
    for add_option in shared_options:
        f = add_option(f)
    return f
65
+
66
+
67
def _build_config(config_path: Optional[Path], email: Optional[str], cache_dir: Optional[Path]) -> Config:
    """Build a Config object from CLI arguments and/or a YAML file."""
    cfg = load_config(config_path) if config_path else Config()

    # CLI flags win over whatever the YAML/environment provided.
    if email:
        cfg.credentials.email = email
    if cache_dir:
        # Config mirrors paths.cache_dir on a top-level attribute;
        # keep both copies in sync.
        cfg.paths.cache_dir = cache_dir
        cfg.cache_dir = cache_dir
    return cfg
80
+
81
+
82
+ # --------------------------------------------------------------------------- #
83
+ # Main entry point
84
+ # --------------------------------------------------------------------------- #
85
+
86
@click.group()
@click.version_option(version=__version__)
def main():
    """omix – a modular toolkit for omics metadata & publication analysis."""
    # Group body is intentionally empty: subcommands register themselves via
    # the @main.command() decorators below.
    pass
91
+
92
+
93
+ # --------------------------------------------------------------------------- #
94
+ # build-primer-db
95
+ # --------------------------------------------------------------------------- #
96
+
97
@main.command()
@click.option("--csv", type=click.Path(path_type=Path), default=Path("data/probe_data.csv"),
              help="Path to downloaded probeBase CSV (will be downloaded if missing).")
@click.option("--db", type=click.Path(path_type=Path), default=Path("data/primer_data.db"),
              help="Output SQLite database path.")
@_config_option
def build_primer_db(
    csv: Path,
    db: Path,
    config: Optional[Path],
    email: Optional[str],
    cache_dir: Optional[Path],
):
    """Download the probeBase primer list and build a searchable SQLite database."""
    cfg = _build_config(config, email, cache_dir)
    setup_logging(cfg.logs_dir)

    # Imported lazily so the CLI starts quickly when this command is unused.
    from omix.validators.probebase_builder import import_and_save_database

    if not import_and_save_database(csv, db):
        click.echo("❌ Primer database build failed.", err=True)
        raise click.Abort()
    click.echo(f"✅ Primer database built at {db}")
120
+
121
+
122
+ # --------------------------------------------------------------------------- #
123
+ # fetch-metadata
124
+ # --------------------------------------------------------------------------- #
125
+
126
@main.command()
@click.argument("input_path", type=click.Path(exists=True, path_type=Path))
@click.option("--output", "-o", type=click.Path(path_type=Path), help="Output file path.")
@click.option("--sample-id-col", default=None, help="Override sample ID column name.")
@click.option("--no-ena", is_flag=True, help="Disable ENA enrichment.")
@click.option("--no-geocode", is_flag=True, help="Disable reverse geocoding.")
# BUGFIX: the original `is_flag=True, default=True` made this flag impossible
# to turn off (passing it sets True, omitting it defaults to True).  The
# on/off pair below keeps `--preserve-rows` working and adds
# `--no-preserve-rows` to actually disable it.
@click.option("--preserve-rows/--no-preserve-rows", default=True, help="Keep rows without coordinates.")
@click.option("--omics-type", default=None, help="Filter by omics type (amplicon, metagenomics, transcriptomics, etc.).")
@click.option("--amplicon-gene", default=None, help="Filter by amplicon gene (16S, 18S, ITS, COI, etc.).")
@click.option("--primer-set", default=None, help="Filter by primer set name.")
@click.option("--subfragment", default=None, help="Filter by subfragment/variable region.")
@click.option("--report", "-r", type=click.Path(path_type=Path), help="Save composition report as JSON to this path.")
@_config_option
def fetch_metadata(
    input_path: Path,
    output: Optional[Path],
    sample_id_col: Optional[str],
    no_ena: bool,
    no_geocode: bool,
    preserve_rows: bool,
    omics_type: Optional[str],
    amplicon_gene: Optional[str],
    primer_set: Optional[str],
    subfragment: Optional[str],
    report: Optional[Path],
    config: Optional[Path],
    email: Optional[str],
    cache_dir: Optional[Path],
):
    """Enrich a metadata file with ENA data and generate an omics profile report."""
    # Resolve config first so CLI flags override YAML/environment values.
    cfg = _build_config(config, email, cache_dir)
    if no_geocode:
        cfg.metadata.enable_geocoding = False
    setup_logging(cfg.logs_dir)

    click.echo(f"📁 Loading {input_path} ...")
    # The enrichment workflow is async; drive it to completion here.
    result = asyncio.run(
        enrich_metadata_from_path(
            input_path=input_path,
            output_path=output,
            config=cfg,
            sample_id_column=sample_id_col,
            enable_ena_lookup=not no_ena,
            preserve_all_rows=preserve_rows,
            omics_type=omics_type,
            amplicon_gene=amplicon_gene,
            primer_set=primer_set,
            subfragment=subfragment,
            report_output=report,
        )
    )
    click.echo(f"✅ Done. Output has {len(result)} rows and {len(result.columns)} columns.")
    if report:
        click.echo(f"📊 Composition report saved to {report}")
180
+
181
+
182
+ # --------------------------------------------------------------------------- #
183
+ # fetch-publications
184
+ # --------------------------------------------------------------------------- #
185
+
186
@main.command()
@click.argument("accessions", nargs=-1)
@click.option("--omics", default="16S", help="Omics type (currently only '16S').")
@click.option("--pmid", type=str, default=None, help="PubMed ID for direct lookup.")
@click.option("--api-key", envvar="OMIX_LLM_API_KEY", help="LLM API key for methodology extraction.")
@click.option("--no-llm", is_flag=True, help="Skip LLM extraction (regex only).")
@click.option("--builtin", is_flag=True, help="Use the built‑in primer database (no probeBase needed).")
@click.option("--primer-db", type=click.Path(exists=True, path_type=Path),
              help="Path to a probeBase SQLite primer database.")
@click.option("--max-rounds", type=int, default=3, help="Maximum citation chasing rounds.")
@click.option("--output", "-o", type=click.Path(path_type=Path), help="Save results as JSON.")
@_config_option
def fetch_publications(
    accessions: List[str],
    omics: str,
    pmid: Optional[str],
    api_key: Optional[str],
    no_llm: bool,
    builtin: bool,
    primer_db: Optional[Path],
    max_rounds: int,
    output: Optional[Path],
    config: Optional[Path],
    email: Optional[str],
    cache_dir: Optional[Path],
):
    """Search and analyse publications for one or more accessions."""
    # Resolve config (YAML + CLI overrides) and initialise logging first so
    # everything that follows logs to the configured directory.
    cfg = _build_config(config, email, cache_dir)
    setup_logging(cfg.logs_dir)

    # Accessions are required even when --pmid is supplied, because the
    # fetcher call below is always driven by the accession list.
    # NOTE(review): confirm whether --pmid alone was meant to be sufficient.
    if not accessions:
        click.echo("❌ You must provide at least one accession.", err=True)
        raise click.Abort()

    # ---- Build publication sources (free and reliable ones) ----
    # FEATURE 3: Pass retry config from publication config
    retry_config = {
        'max_retries': cfg.publication.max_retries,
        'base_delay': cfg.publication.base_delay_seconds,
        'max_delay': cfg.publication.max_delay_seconds,
    }
    # Every source shares the same polite-email and retry settings; NCBI
    # additionally receives its API key.
    sources = [
        CrossrefAPI(cfg.credentials.email, **retry_config),
        EuropePMCAPI(cfg.credentials.email, **retry_config),
        NCBIAPI(cfg.credentials.email, cfg.credentials.ncbi_api_key, **retry_config),
        SemanticScholarAPI(cfg.credentials.email, **retry_config),
        ArxivAPI(cfg.credentials.email, **retry_config),
        BioarxivAPI(cfg.credentials.email, **retry_config),
        CoreAPI(cfg.credentials.email, **retry_config),
        DataciteAPI(cfg.credentials.email, **retry_config),
        DOAJAPI(cfg.credentials.email, **retry_config),
        PLOSAPI(cfg.credentials.email, **retry_config),
        UnpaywallAPI(cfg.credentials.email, **retry_config),
        ZenodoAPI(cfg.credentials.email, **retry_config),
    ]

    # ---- Build omics extractor ----
    llm_key = api_key or cfg.credentials.llm_api_key

    # Primer database: prefer builtin, then external file, else None
    primer_database = None
    if builtin:
        primer_database = ProbeBaseDatabase(use_builtin=True)
    elif primer_db:
        primer_database = ProbeBaseDatabase(db_path=primer_db)

    # A direct PMID lookup replaces all generic sources with a single
    # PMID-driven source.
    if pmid:
        sources = [PMIDSource(cfg.credentials.email, pmid, **retry_config)]

    if omics.lower() == "16s":
        # Passing api_key="" is how the extractor is told to skip the LLM
        # pass (matches the --no-llm help text: regex only).
        if no_llm:
            extractor = SixteenSExtractor(api_key="", primer_db=primer_database)
        else:
            extractor = SixteenSExtractor(api_key=llm_key or "", primer_db=primer_database)
    else:
        click.echo(f"❌ Unknown omics type: {omics}", err=True)
        raise click.Abort()

    # ---- Run fetcher ----
    cache = PublicationCache(cfg.paths.cache_dir / "publications.db")
    fetcher = PublicationFetcher(cfg, sources, extractor, cache)
    # Cap the citation-chasing depth as requested on the command line.
    fetcher.MAX_PUBLICATION_ROUNDS = max_rounds

    click.echo(f"🔍 Searching publications for {len(accessions)} accessions...")
    results = fetcher.fetch_and_analyze_sync(accessions)

    # ---- Output ----
    if output:
        import json
        output.parent.mkdir(parents=True, exist_ok=True)
        with open(output, "w") as f:
            # default=str makes non-JSON-native values (paths, dates, ...)
            # serialisable by stringifying them.
            json.dump(results, f, indent=2, default=str)
        click.echo(f"📄 Results written to {output}")
    else:
        # Human-readable summary: first five publications per accession.
        for acc, pubs in results.items():
            click.echo(f"\n📌 {acc} – {len(pubs)} publications")
            for pub in pubs[:5]:
                title = (pub.get('publication_title') or 'N/A')[:80]
                status = pub.get('status', '?')
                click.echo(f" [{status}] {title}")
            if len(pubs) > 5:
                click.echo(f" … and {len(pubs) - 5} more.")
288
+
289
+
290
+ # --------------------------------------------------------------------------- #
291
+ # run-pipeline
292
+ # --------------------------------------------------------------------------- #
293
+
294
@main.command()
@click.argument("input_path", type=click.Path(exists=True, path_type=Path))
@click.option("--output", "-o", type=click.Path(path_type=Path), help="Output file path.")
@click.option("--sample-id-col", default=None, help="Override sample ID column name.")
@click.option("--no-geocode", is_flag=True, help="Disable reverse geocoding.")
@_config_option
def run_pipeline(
    input_path: Path,
    output: Optional[Path],
    sample_id_col: Optional[str],
    no_geocode: bool,
    config: Optional[Path],
    email: Optional[str],
    cache_dir: Optional[Path],
):
    """Run the full MetadataManager pipeline on a file (cleaning + enrichment)."""
    cfg = _build_config(config, email, cache_dir)
    if no_geocode:
        cfg.metadata.enable_geocoding = False
    setup_logging(cfg.logs_dir)

    # Lazy import keeps CLI startup fast; the unused `import pandas as pd`
    # that used to sit here has been removed.
    from omix.metadata.file_workflow import _load_table, _save_table, _ensure_sample_id_column

    df = _load_table(input_path)
    # Guarantee a sample-ID column exists: prefer the explicit CLI override,
    # otherwise fall back to the configured column name if it is missing.
    if sample_id_col:
        df = _ensure_sample_id_column(df, sample_id_col)
    elif cfg.metadata.sample_id_column not in df.columns:
        df = _ensure_sample_id_column(df, cfg.metadata.sample_id_column)

    # NOTE(review): sample_id_column may be None here when no CLI override
    # was given — presumably MetadataManager falls back to its config; verify.
    manager = MetadataManager(df, cfg, sample_id_column=sample_id_col)
    enriched = asyncio.run(manager.run_pipeline())

    if output:
        _save_table(enriched, output)
        click.echo(f"✅ Enriched metadata written to {output}")
    else:
        click.echo(f"✅ Pipeline complete – {len(enriched)} rows, {len(enriched.columns)} columns.")
        click.echo(enriched.head())
333
+
334
+
335
+ # --------------------------------------------------------------------------- #
336
+ # enrich-with-publications
337
+ # --------------------------------------------------------------------------- #
338
+
339
@main.command()
@click.argument("input_path", type=click.Path(exists=True, path_type=Path))
@click.option("--output", "-o", type=click.Path(path_type=Path), help="Output file path.")
@click.option("--no-validate", is_flag=True, help="Skip publication validation (keep all publications).")
@click.option("--api-key", envvar="OMIX_LLM_API_KEY", help="LLM API key for methodology extraction.")
@click.option("--no-llm", is_flag=True, help="Skip LLM extraction (regex only).")
@click.option("--builtin", is_flag=True, help="Use the built‑in primer database.")
@click.option("--primer-db", type=click.Path(exists=True, path_type=Path),
              help="Path to a probeBase SQLite primer database.")
@click.option("--max-rounds", type=int, default=3, help="Maximum citation chasing rounds.")
@_config_option
def enrich_with_publications(
    input_path: Path,
    output: Optional[Path],
    no_validate: bool,
    api_key: Optional[str],
    no_llm: bool,
    builtin: bool,
    primer_db: Optional[Path],
    max_rounds: int,
    config: Optional[Path],
    email: Optional[str],
    cache_dir: Optional[Path],
):
    """
    Enrich metadata with ENA data AND publication information in one pipeline.

    This unified command:
    1. Fetches and enriches metadata from ENA
    2. Discovers publications from 12+ sources
    3. Validates publications for accession relevance
    4. Integrates publication counts and DOIs into metadata

    Output includes: all ENA metadata fields + publication_count + publication_dois
    """
    cfg = _build_config(config, email, cache_dir)
    setup_logging(cfg.logs_dir)

    # Only _save_table is needed here (loading is handled inside
    # enrich_metadata_from_path).  The previously unused `_load_table` and
    # `import pandas as pd` have been removed.
    from omix.metadata.file_workflow import _save_table

    async def async_enrich():
        """Run async enrichment pipeline."""
        # Local imports mirror the module-level ones; kept so this coroutine
        # stays self-contained.
        from omix.metadata.file_workflow import enrich_metadata_from_path
        from omix.publications.fetcher import PublicationFetcher
        from omix.publications.extractors.omics import SixteenSExtractor
        from omix.publications.cache import PublicationCache
        from omix.publications.apis import (
            CrossrefAPI, EuropePMCAPI, NCBIAPI, SemanticScholarAPI,
            ArxivAPI, BioarxivAPI, CoreAPI, DataciteAPI, DOAJAPI, PLOSAPI,
            UnpaywallAPI, ZenodoAPI,
        )

        # ---- Phase 1: Metadata ----
        click.echo("📊 Phase 1: Metadata enrichment...")
        metadata = await enrich_metadata_from_path(
            input_path=input_path,
            output_path=None,
            config=cfg,
            enable_ena_lookup=True,
            preserve_all_rows=True,
        )
        click.echo(f" ✓ {len(metadata)} rows × {len(metadata.columns)} columns")

        # Extract study accessions; without them publications cannot be
        # discovered, so save what we have and bail out early.
        if 'study_accession' not in metadata.columns:
            click.echo("❌ study_accession column missing; cannot fetch publications", err=True)
            if output:
                _save_table(metadata, output)
            return metadata

        study_accessions = sorted(set(metadata['study_accession'].dropna().unique()))

        if not study_accessions:
            click.echo("⚠️ No study accessions found; skipping publication fetch")
            if output:
                _save_table(metadata, output)
            return metadata

        # ---- Phase 2: Publications ----
        click.echo(f"📚 Phase 2: Publication discovery ({len(study_accessions)} studies)...")

        # Shared retry/backoff settings for every publication source.
        retry_config = {
            'max_retries': cfg.publication.max_retries,
            'base_delay': cfg.publication.base_delay_seconds,
            'max_delay': cfg.publication.max_delay_seconds,
        }
        sources = [
            CrossrefAPI(cfg.credentials.email, **retry_config),
            EuropePMCAPI(cfg.credentials.email, **retry_config),
            NCBIAPI(cfg.credentials.email, cfg.credentials.ncbi_api_key, **retry_config),
            SemanticScholarAPI(cfg.credentials.email, **retry_config),
            ArxivAPI(cfg.credentials.email, **retry_config),
            BioarxivAPI(cfg.credentials.email, **retry_config),
            CoreAPI(cfg.credentials.email, **retry_config),
            DataciteAPI(cfg.credentials.email, **retry_config),
            DOAJAPI(cfg.credentials.email, **retry_config),
            PLOSAPI(cfg.credentials.email, **retry_config),
            UnpaywallAPI(cfg.credentials.email, **retry_config),
            ZenodoAPI(cfg.credentials.email, **retry_config),
        ]

        # Primer database: prefer builtin, then external file, else None.
        llm_key = api_key or cfg.credentials.llm_api_key
        primer_database = None
        if builtin:
            primer_database = ProbeBaseDatabase(use_builtin=True)
        elif primer_db:
            primer_database = ProbeBaseDatabase(db_path=primer_db)

        # An empty api_key disables the LLM extraction pass (regex only).
        extractor = SixteenSExtractor(api_key=llm_key if not no_llm else "", primer_db=primer_database)
        cache = PublicationCache(cfg.paths.cache_dir / "publications.db")
        fetcher = PublicationFetcher(cfg, sources, extractor, cache)
        fetcher.MAX_PUBLICATION_ROUNDS = max_rounds

        publications = fetcher.fetch_and_analyze_sync(study_accessions)
        total_pubs = sum(len(p) for p in publications.values())
        click.echo(f" ✓ {total_pubs} publications found")

        # ---- Phase 3: Validation ----
        if not no_validate:
            click.echo("🔍 Phase 3: Publication validation...")

            # Inline validation: keep only fully-extracted publications that
            # either matched the accession directly in a query or mention the
            # accession in their full text.
            filtered = {}
            for study_accession, pubs in publications.items():
                filtered[study_accession] = []
                for pub in pubs:
                    if pub.get('status') != '✓ Extraction complete.':
                        continue
                    matched_queries = pub.get('matched_queries', [])
                    accession_in_text = pub.get('accession_mentions_in_text', 0) > 0
                    direct_match = any(
                        q == study_accession or (study_accession in q and not q.startswith('DATA:'))
                        for q in matched_queries
                    )
                    if direct_match or accession_in_text:
                        filtered[study_accession].append(pub)

            publications = filtered
            valid_count = sum(len(p) for p in publications.values())
            click.echo(f" ✓ {valid_count} publications with direct accession matches")

        # ---- Phase 4: Integration ----
        click.echo("🔗 Phase 4: Integration...")

        if 'publication_count' not in metadata.columns:
            metadata['publication_count'] = None
        if 'publication_dois' not in metadata.columns:
            metadata['publication_dois'] = None

        filled_rows = 0
        studies_with_dois = 0

        for study_accession, pubs in publications.items():
            # Collect unique DOIs, preserving first-seen order.
            dois = []
            seen = set()
            for pub in pubs:
                if not isinstance(pub, dict):
                    continue
                doi = pub.get('doi')
                if doi and doi not in seen:
                    dois.append(doi)
                    seen.add(doi)

            # Broadcast the per-study publication info onto every sample row
            # belonging to that study.
            mask = metadata['study_accession'] == study_accession
            filled_rows += int(mask.sum())
            metadata.loc[mask, 'publication_count'] = len(dois)
            metadata.loc[mask, 'publication_dois'] = '; '.join(dois) if dois else ''

            if dois:
                studies_with_dois += 1

        click.echo(f" ✓ {filled_rows} rows updated, {studies_with_dois} studies with DOIs")

        return metadata

    try:
        enriched_metadata = asyncio.run(async_enrich())

        # NOTE(review): on the early-return paths above the table may already
        # have been saved; this writes it a second time (harmless, redundant).
        if output:
            output.parent.mkdir(parents=True, exist_ok=True)
            _save_table(enriched_metadata, output)
            click.echo(f"\n✅ Output saved to {output}")

        click.echo(f"✅ Complete! {len(enriched_metadata)} rows × {len(enriched_metadata.columns)} columns")
    except Exception as e:
        # CLI boundary: surface the error message and abort with a non-zero
        # exit code instead of a raw traceback.
        click.echo(f"\n❌ Error: {e}", err=True)
        raise click.Abort()
528
+
529
+
530
+ if __name__ == "__main__":
531
+ main()
omix/config.py ADDED
@@ -0,0 +1,150 @@
1
+ """
2
+ Unified configuration for omix, loadable from YAML files and environment variables.
3
+ """
4
+
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Optional
8
+
9
+ import yaml
10
+
11
+
12
class Credentials:
    """Holds all API credentials, loaded from config or environment."""

    def __init__(self, cred_dict: Optional[Dict[str, Any]] = None):
        cred = cred_dict or {}
        # Email has a non-trivial fallback chain, so it is set explicitly:
        # config value -> OMIX_EMAIL -> "" (and ena_email falls back to email).
        self.email = cred.get("email") or os.getenv("OMIX_EMAIL", "")
        self.ena_email = cred.get("ena_email") or os.getenv("OMIX_ENA_EMAIL", self.email)
        # Every API key follows the same pattern: explicit config value first,
        # then the matching OMIX_* environment variable (None when unset).
        for attr, env_var in (
            ("ncbi_api_key", "OMIX_NCBI_API_KEY"),
            ("llm_api_key", "OMIX_LLM_API_KEY"),
            ("dimensions_api_key", "OMIX_DIMENSIONS_API_KEY"),
            ("ieee_api_key", "OMIX_IEEE_API_KEY"),
            ("mendeley_api_key", "OMIX_MENDELEY_API_KEY"),
            ("springer_api_key", "OMIX_SPRINGER_API_KEY"),
            ("lens_api_key", "OMIX_LENS_API_KEY"),
        ):
            setattr(self, attr, cred.get(attr) or os.getenv(env_var))
26
+
27
+
28
class MetadataConfig:
    """Metadata normalization and enrichment settings."""

    def __init__(self, cfg: Optional[Dict[str, Any]] = None):
        settings = cfg or {}
        # Column handling.
        self.columns_to_drop = settings.get("columns_to_drop", [])
        self.force_numeric_columns = settings.get(
            "force_numeric_columns", ["lat", "lon", "depth", "altitude"]
        )
        self.mappings = settings.get("mappings", {})
        self.sample_id_column = settings.get("sample_id_column", "#sampleid")
        self.suffixes_to_collapse = settings.get("suffixes_to_collapse", [])
        # Enrichment toggles.
        self.exclude_host = settings.get("exclude_host", False)
        self.enable_geocoding = settings.get("enable_geocoding", True)
40
+
41
+
42
class ENAApiConfig:
    """ENA API specific options.

    ``cfg`` is the ``apis.sequence.ena`` section of the YAML config; it is now
    optional (defaulting to all-default values), consistent with the other
    sub-config classes in this module.
    """

    def __init__(self, cfg: Optional[Dict[str, Any]] = None):
        cfg = cfg or {}
        self.enabled = cfg.get("enabled", True)
        self.max_concurrent = cfg.get("max_concurrent", 5)
        self.batch_size = cfg.get("batch_size", 100)
        self.cache_ttl_days = cfg.get("cache_ttl_days", 30)
        self.fetch_phases = cfg.get("fetch_phases", True)
        self.phase2_async = cfg.get("phase2_async", True)
        # Nested batched-write settings for the SQLite cache.
        self.cache_write = CacheWriteConfig(cfg.get("cache_write", {}))
53
+
54
+
55
class CacheWriteConfig:
    """Batched write configuration for SQLite cache.

    ``cfg`` is now optional (defaults apply when omitted), consistent with
    the other sub-config classes in this module.
    """

    def __init__(self, cfg: Optional[Dict[str, Any]] = None):
        cfg = cfg or {}
        # Entries accumulated per batched write; consumed by the ENA cache
        # layer (not visible here).
        self.batch_size = cfg.get("batch_size", 100)
        # Maximum seconds a pending batch may wait before being flushed.
        self.flush_interval_seconds = cfg.get("flush_interval_seconds", 5.0)
61
+
62
+
63
class ApisConfig:
    """Toggle which external APIs are enabled."""

    def __init__(self, cfg: Optional[Dict[str, Any]] = None):
        data = cfg or {}
        self.enabled = data.get("enabled", True)
        # Only the ENA sequence API is configurable for now; its options live
        # under the nested "sequence" -> "ena" section of the YAML file.
        self.ena = ENAApiConfig(data.get("sequence", {}).get("ena", {}))
71
+
72
+
73
class PublicationConfig:
    """Publication search and analysis settings."""

    def __init__(self, cfg: Optional[Dict[str, Any]] = None):
        options = cfg or {}
        self.max_concurrent_apis = options.get("max_concurrent_apis", 5)
        self.rounds = options.get("rounds", 3)
        self.max_pdf_pages = options.get("max_pdf_pages", 10)
        self.max_file_size = options.get("max_file_size", 15_000_000)

        # FEATURE 3: Retry/backoff configuration per API source
        retries = options.get("retry", {})
        self.max_retries = retries.get("max_retries", 5)
        self.base_delay_seconds = retries.get("base_delay_seconds", 1.0)
        self.max_delay_seconds = retries.get("max_delay_seconds", 32.0)
88
+
89
+
90
class PathsConfig:
    """Project and dependency paths."""

    def __init__(self, cfg: Optional[Dict[str, Any]] = None):
        paths = cfg or {}
        self.project = Path(paths.get("project", "."))
        # Cache and log directories default to subdirectories of the project
        # root; explicit values from the config win.
        self.cache_dir = Path(paths.get("cache_dir", self.project / ".cache"))
        self.logs_dir = Path(paths.get("logs_dir", self.project / "logs"))
        self.primer_db = paths.get("primer_db")  # optional, may remain None
99
+
100
+
101
class OmicsType:
    """Enumeration of supported omics types."""

    # Class-level constants naming the recognised omics types.
    _16S = "16S"
    METAGENOMICS = "metagenomics"

    def __init__(self, name: str):
        # Stored verbatim; no validation against the constants above is
        # performed here.
        self.name = name
109
+
110
+
111
class Config:
    """Top‑level configuration object, aggregating all sub‑configs."""

    def __init__(self, config_path: Optional[Path] = None, **overrides):
        # Raw merged configuration dict; also what save() serialises.
        self._raw: Dict[str, Any] = {}
        if config_path:
            with open(config_path, "r") as f:
                # An empty YAML file parses to None; normalise to {}.
                self._raw = yaml.safe_load(f) or {}

        # Move known overrides into the correct sub-dicts BEFORE building sub‑configs
        cred_overrides = {}
        for key in ("email", "ena_email", "llm_api_key", "ncbi_api_key",
                    "dimensions_api_key", "ieee_api_key", "mendeley_api_key",
                    "springer_api_key", "lens_api_key"):
            if key in overrides:
                cred_overrides[key] = overrides.pop(key)

        self._raw.update(overrides)  # remaining top-level overrides
        # BUGFIX: a YAML file containing `credentials: null` used to make
        # setdefault() return None, crashing on .update(); coalesce to {}.
        credentials_raw = self._raw.get("credentials") or {}
        credentials_raw.update(cred_overrides)  # apply credential overrides
        self._raw["credentials"] = credentials_raw

        # Build sub‑configs from their respective sections.
        self.credentials = Credentials(self._raw.get("credentials"))
        self.paths = PathsConfig(self._raw.get("paths"))
        self.metadata = MetadataConfig(self._raw.get("metadata"))
        self.apis = ApisConfig(self._raw.get("apis"))
        self.publication = PublicationConfig(self._raw.get("publication"))
        self.omics_type = OmicsType(self._raw.get("omics_type", "16S"))
        # Convenience aliases; CLI code mutates both copies when overriding
        # the cache directory (see _build_config in cli.py).
        self.cache_dir = self.paths.cache_dir
        self.logs_dir = self.paths.logs_dir

    def save(self, path: Path) -> None:
        """Write current configuration to a YAML file."""
        with open(path, "w") as f:
            yaml.dump(self._raw, f)
146
+
147
+
148
def load_config(config_path: Path) -> Config:
    """Load configuration from a YAML file and return a Config instance."""
    # Thin convenience wrapper around the Config constructor.
    return Config(config_path=config_path)