omicsmeta 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. omicsmeta-0.1.0/.github/workflows/ci.yml +58 -0
  2. omicsmeta-0.1.0/.github/workflows/pages.yml +63 -0
  3. omicsmeta-0.1.0/.github/workflows/publish.yml +58 -0
  4. omicsmeta-0.1.0/.gitignore +18 -0
  5. omicsmeta-0.1.0/CHANGELOG.md +28 -0
  6. omicsmeta-0.1.0/CODE_OF_CONDUCT.md +31 -0
  7. omicsmeta-0.1.0/CONTRIBUTING.md +55 -0
  8. omicsmeta-0.1.0/LICENSE +22 -0
  9. omicsmeta-0.1.0/PKG-INFO +167 -0
  10. omicsmeta-0.1.0/README.md +131 -0
  11. omicsmeta-0.1.0/benchmarks/README.md +25 -0
  12. omicsmeta-0.1.0/benchmarks/known_answer_suite.tsv +9 -0
  13. omicsmeta-0.1.0/benchmarks/truth/GSE154243_expected_harmonized.tsv +9 -0
  14. omicsmeta-0.1.0/benchmarks/truth/ambiguous_phenotype_expected_harmonized.tsv +5 -0
  15. omicsmeta-0.1.0/benchmarks/truth/basic_expected_harmonized.tsv +11 -0
  16. omicsmeta-0.1.0/benchmarks/truth/biosample_expected_harmonized.tsv +11 -0
  17. omicsmeta-0.1.0/benchmarks/truth/immune_pbmc_expected_harmonized.tsv +7 -0
  18. omicsmeta-0.1.0/benchmarks/truth/mouse_liver_expected_harmonized.tsv +7 -0
  19. omicsmeta-0.1.0/benchmarks/truth/sra_expected_harmonized.tsv +9 -0
  20. omicsmeta-0.1.0/benchmarks/truth/treatment_perturbation_expected_harmonized.tsv +11 -0
  21. omicsmeta-0.1.0/docs/api.md +109 -0
  22. omicsmeta-0.1.0/docs/design.md +88 -0
  23. omicsmeta-0.1.0/docs/index.md +49 -0
  24. omicsmeta-0.1.0/docs/quickstart.md +169 -0
  25. omicsmeta-0.1.0/docs/release_readiness.md +58 -0
  26. omicsmeta-0.1.0/docs/stylesheets/extra.css +131 -0
  27. omicsmeta-0.1.0/docs/tutorials/basic_fixture.md +67 -0
  28. omicsmeta-0.1.0/examples/basic/README.md +25 -0
  29. omicsmeta-0.1.0/examples/basic/expected_harmonized.tsv +11 -0
  30. omicsmeta-0.1.0/examples/basic/expected_samples.tsv +3 -0
  31. omicsmeta-0.1.0/examples/basic/expected_unmapped_summary.tsv +3 -0
  32. omicsmeta-0.1.0/examples/basic/metadata.tsv +3 -0
  33. omicsmeta-0.1.0/galaxy-omicsmeta/.shed.yml +13 -0
  34. omicsmeta-0.1.0/galaxy-omicsmeta/macros.xml +8 -0
  35. omicsmeta-0.1.0/galaxy-omicsmeta/omicsmeta_harmonize.py +53 -0
  36. omicsmeta-0.1.0/galaxy-omicsmeta/omicsmeta_harmonize.xml +72 -0
  37. omicsmeta-0.1.0/galaxy-omicsmeta/test-data/raw_metadata.tsv +3 -0
  38. omicsmeta-0.1.0/mkdocs.yml +34 -0
  39. omicsmeta-0.1.0/paper/paper.bib +41 -0
  40. omicsmeta-0.1.0/paper/paper.md +113 -0
  41. omicsmeta-0.1.0/pyproject.toml +58 -0
  42. omicsmeta-0.1.0/scripts/benchmark_mapping.py +49 -0
  43. omicsmeta-0.1.0/src/omicsmeta/__init__.py +7 -0
  44. omicsmeta-0.1.0/src/omicsmeta/_version.py +2 -0
  45. omicsmeta-0.1.0/src/omicsmeta/benchmark.py +199 -0
  46. omicsmeta-0.1.0/src/omicsmeta/cli.py +252 -0
  47. omicsmeta-0.1.0/src/omicsmeta/core/__init__.py +26 -0
  48. omicsmeta-0.1.0/src/omicsmeta/core/detector.py +205 -0
  49. omicsmeta-0.1.0/src/omicsmeta/core/fetcher.py +85 -0
  50. omicsmeta-0.1.0/src/omicsmeta/core/harmonizer.py +340 -0
  51. omicsmeta-0.1.0/src/omicsmeta/core/mapper.py +302 -0
  52. omicsmeta-0.1.0/src/omicsmeta/core/normalizer.py +114 -0
  53. omicsmeta-0.1.0/src/omicsmeta/core/types.py +56 -0
  54. omicsmeta-0.1.0/src/omicsmeta/core/validator.py +149 -0
  55. omicsmeta-0.1.0/src/omicsmeta/io/__init__.py +24 -0
  56. omicsmeta-0.1.0/src/omicsmeta/io/readers.py +216 -0
  57. omicsmeta-0.1.0/src/omicsmeta/io/writers.py +102 -0
  58. omicsmeta-0.1.0/src/omicsmeta/ontologies/__init__.py +35 -0
  59. omicsmeta-0.1.0/src/omicsmeta/ontologies/cache.py +83 -0
  60. omicsmeta-0.1.0/src/omicsmeta/ontologies/loader.py +79 -0
  61. omicsmeta-0.1.0/src/omicsmeta/ontologies/resources.py +229 -0
  62. omicsmeta-0.1.0/tests/fixtures/GSE154243_soft_snippet.txt +22 -0
  63. omicsmeta-0.1.0/tests/fixtures/ambiguous_phenotype_soft_snippet.txt +15 -0
  64. omicsmeta-0.1.0/tests/fixtures/biosample_edge_cases.xml +19 -0
  65. omicsmeta-0.1.0/tests/fixtures/biosample_snippet.xml +27 -0
  66. omicsmeta-0.1.0/tests/fixtures/custom_doid.obo +7 -0
  67. omicsmeta-0.1.0/tests/fixtures/immune_pbmc_soft_snippet.txt +17 -0
  68. omicsmeta-0.1.0/tests/fixtures/mouse_liver_soft_snippet.txt +19 -0
  69. omicsmeta-0.1.0/tests/fixtures/sra_edge_cases.xml +29 -0
  70. omicsmeta-0.1.0/tests/fixtures/sra_snippet.xml +47 -0
  71. omicsmeta-0.1.0/tests/fixtures/tiny.obo +7 -0
  72. omicsmeta-0.1.0/tests/fixtures/treatment_perturbation_soft_snippet.txt +19 -0
  73. omicsmeta-0.1.0/tests/test_additional_fixtures.py +47 -0
  74. omicsmeta-0.1.0/tests/test_benchmark.py +37 -0
  75. omicsmeta-0.1.0/tests/test_cli.py +207 -0
  76. omicsmeta-0.1.0/tests/test_detector.py +50 -0
  77. omicsmeta-0.1.0/tests/test_examples.py +130 -0
  78. omicsmeta-0.1.0/tests/test_fetcher.py +63 -0
  79. omicsmeta-0.1.0/tests/test_galaxy_wrapper.py +43 -0
  80. omicsmeta-0.1.0/tests/test_harmonizer.py +74 -0
  81. omicsmeta-0.1.0/tests/test_mapper.py +114 -0
  82. omicsmeta-0.1.0/tests/test_normalizer.py +21 -0
  83. omicsmeta-0.1.0/tests/test_ontology_loader.py +24 -0
  84. omicsmeta-0.1.0/tests/test_ontology_resources.py +113 -0
  85. omicsmeta-0.1.0/tests/test_readers.py +64 -0
  86. omicsmeta-0.1.0/tests/test_real_geo_integration.py +49 -0
  87. omicsmeta-0.1.0/tests/test_validator.py +66 -0
@@ -0,0 +1,58 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: ["main"]
6
+ pull_request:
7
+ branches: ["main"]
8
+ workflow_dispatch:
9
+
10
+ jobs:
11
+ test:
12
+ name: Python ${{ matrix.python-version }}
13
+ runs-on: ubuntu-latest
14
+ strategy:
15
+ fail-fast: false
16
+ matrix:
17
+ python-version: ["3.10", "3.11", "3.12"]
18
+
19
+ steps:
20
+ - name: Check out repository
21
+ uses: actions/checkout@v4
22
+
23
+ - name: Set up Python
24
+ uses: actions/setup-python@v5
25
+ with:
26
+ python-version: ${{ matrix.python-version }}
27
+ cache: pip
28
+
29
+ - name: Install package and test dependencies
30
+ run: python -m pip install -e ".[dev]"
31
+
32
+ - name: Run tests
33
+ run: python -m pytest --cov=omicsmeta --cov-report=term-missing --cov-fail-under=70
34
+
35
+ package:
36
+ name: Package artifacts
37
+ runs-on: ubuntu-latest
38
+
39
+ steps:
40
+ - name: Check out repository
41
+ uses: actions/checkout@v4
42
+
43
+ - name: Set up Python
44
+ uses: actions/setup-python@v5
45
+ with:
46
+ python-version: "3.12"
47
+ cache: pip
48
+
49
+ - name: Build and check distributions
50
+ run: |
51
+ python -m pip install build twine
52
+ python -m build
53
+ python -m twine check dist/*.tar.gz dist/*.whl
54
+
55
+ - name: Install wheel and smoke-test CLI
56
+ run: |
57
+ python -m pip install dist/omicsmeta-*.whl
58
+ omicsmeta --help
@@ -0,0 +1,63 @@
1
+ name: Docs
2
+
3
+ on:
4
+ push:
5
+ branches: ["main"]
6
+ paths:
7
+ - ".github/workflows/pages.yml"
8
+ - "docs/**"
9
+ - "mkdocs.yml"
10
+ - "pyproject.toml"
11
+ - "README.md"
12
+ workflow_dispatch:
13
+
14
+ permissions:
15
+ contents: read
16
+ pages: write
17
+ id-token: write
18
+
19
+ concurrency:
20
+ group: pages
21
+ cancel-in-progress: false
22
+
23
+ jobs:
24
+ build:
25
+ name: Build MkDocs site
26
+ runs-on: ubuntu-latest
27
+
28
+ steps:
29
+ - name: Check out repository
30
+ uses: actions/checkout@v4
31
+
32
+ - name: Set up Python
33
+ uses: actions/setup-python@v5
34
+ with:
35
+ python-version: "3.12"
36
+ cache: pip
37
+
38
+ - name: Install documentation dependencies
39
+ run: python -m pip install -e ".[docs]"
40
+
41
+ - name: Configure GitHub Pages
42
+ uses: actions/configure-pages@v5
43
+
44
+ - name: Build documentation
45
+ run: mkdocs build --strict --site-dir site
46
+
47
+ - name: Upload Pages artifact
48
+ uses: actions/upload-pages-artifact@v3
49
+ with:
50
+ path: site
51
+
52
+ deploy:
53
+ name: Deploy GitHub Pages
54
+ runs-on: ubuntu-latest
55
+ needs: build
56
+ environment:
57
+ name: github-pages
58
+ url: ${{ steps.deployment.outputs.page_url }}
59
+
60
+ steps:
61
+ - name: Deploy to GitHub Pages
62
+ id: deployment
63
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,58 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ contents: read
9
+
10
+ jobs:
11
+ build:
12
+ name: Build distributions
13
+ runs-on: ubuntu-latest
14
+
15
+ steps:
16
+ - name: Check out repository
17
+ uses: actions/checkout@v4
18
+
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: "3.12"
23
+ cache: pip
24
+
25
+ - name: Install build tools
26
+ run: python -m pip install build twine
27
+
28
+ - name: Build source and wheel distributions
29
+ run: python -m build
30
+
31
+ - name: Validate distributions
32
+ run: python -m twine check dist/*.tar.gz dist/*.whl
33
+
34
+ - name: Upload distribution artifact
35
+ uses: actions/upload-artifact@v4
36
+ with:
37
+ name: python-package-distributions
38
+ path: dist/
39
+
40
+ publish:
41
+ name: Publish distributions to PyPI
42
+ needs: build
43
+ runs-on: ubuntu-latest
44
+ environment:
45
+ name: pypi
46
+ url: https://pypi.org/project/omicsmeta/
47
+ permissions:
48
+ id-token: write
49
+
50
+ steps:
51
+ - name: Download distribution artifact
52
+ uses: actions/download-artifact@v4
53
+ with:
54
+ name: python-package-distributions
55
+ path: dist/
56
+
57
+ - name: Publish package distributions to PyPI
58
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,18 @@
1
+ .DS_Store
2
+ .env
3
+ .env.*
4
+ !.env.example
5
+ .pytest_cache/
6
+ .ruff_cache/
7
+ .tmp/
8
+ .coverage
9
+ htmlcov/
10
+ site/
11
+ dist/
12
+ build/
13
+ *.egg-info/
14
+ __pycache__/
15
+ *.py[cod]
16
+ .venv/
17
+ venv/
18
+ to-do.md
@@ -0,0 +1,28 @@
1
+ # Changelog
2
+
3
+ All notable changes to `omicsmeta` will be documented in this file.
4
+
5
+ The project is currently pre-alpha.
6
+
7
+ ## 0.1.0 - 2026-06-13
8
+
9
+ - Added a Python package scaffold with CLI entry point.
10
+ - Added normalization, field detection, mapping, validation, and harmonization
11
+ modules.
12
+ - Added tabular, minimal GEO SOFT, BioSample XML, and SRA XML readers.
13
+ - Added detailed harmonized, unmapped, unmapped-summary, sample-wide, and HTML
14
+ QC outputs.
15
+ - Added managed ontology resource download and SQLite indexing commands.
16
+ - Added conservative handling for ambiguous metadata fields.
17
+ - Added transparent cell-line inference for common cell lines.
18
+ - Added batch harmonization for multiple files and GEO accessions.
19
+ - Added known-answer benchmark helper and CLI script.
20
+ - Added a multi-fixture known-answer benchmark suite.
21
+ - Added integration fixtures based on GEO-style metadata snippets.
22
+ - Added public documentation, contribution metadata, and a JOSS paper skeleton.
23
+ - Added a Galaxy wrapper scaffold with local wrapper smoke test data.
24
+ - Added package artifact build validation and a release-readiness checklist.
25
+ - Added SRA/BioSample XML edge-case coverage for namespaced documents,
26
+ repeated attributes, and accession fallbacks.
27
+ - Added GitHub Pages documentation publishing and a Material for MkDocs site
28
+ theme.
@@ -0,0 +1,31 @@
1
+ # Code of Conduct
2
+
3
+ `omicsmeta` aims to be a respectful, practical, and technically rigorous open
4
+ source project.
5
+
6
+ ## Expected Behavior
7
+
8
+ - Treat contributors and users with respect.
9
+ - Discuss technical disagreements with evidence and clear reasoning.
10
+ - Keep feedback specific to the code, documentation, data, or scientific claim.
11
+ - Assume public project spaces are read by people who do not know the private
12
+ context behind a discussion.
13
+
14
+ ## Unacceptable Behavior
15
+
16
+ - Harassment, discrimination, intimidation, or personal attacks.
17
+ - Publishing private information without explicit permission.
18
+ - Repeated disruption of project discussions after maintainers ask for a change
19
+ in behavior.
20
+ - Knowingly submitting fabricated data, misleading benchmark results, or
21
+ unsupported scientific claims.
22
+
23
+ ## Enforcement
24
+
25
+ Maintainers may remove comments, close issues, reject contributions, or restrict
26
+ participation when behavior conflicts with this code of conduct. Enforcement
27
+ decisions should be documented privately among maintainers with enough context
28
+ to review them later.
29
+
30
+ Report conduct concerns to the current project maintainers through the issue
31
+ tracker or another maintainer-provided contact channel.
@@ -0,0 +1,55 @@
1
+ # Contributing
2
+
3
+ Thank you for considering a contribution to `omicsmeta`. The project is
4
+ pre-alpha, so the highest-value contributions are focused tests, small bug
5
+ fixes, documentation improvements, and real metadata examples with expected
6
+ outputs.
7
+
8
+ ## Development Setup
9
+
10
+ ```bash
11
+ python -m pip install -e ".[dev]"
12
+ python -m pytest
13
+ ```
14
+
15
+ Run the coverage gate used during development:
16
+
17
+ ```bash
18
+ python -m pytest --cov=omicsmeta --cov-report=term-missing --cov-fail-under=70
19
+ ```
20
+
21
+ ## Contribution Guidelines
22
+
23
+ - Keep changes focused. A pull request should usually address one feature,
24
+ bug, fixture, or documentation topic.
25
+ - Add tests for new behavior, especially field detection, ontology mapping,
26
+ validation, CLI output, and real metadata fixtures.
27
+ - Preserve provenance in outputs. New transformations should keep enough source
28
+ context for a curator to audit the result.
29
+ - Prefer conservative defaults. Uncertain mappings should be reported for
30
+ review instead of silently accepted.
31
+ - Public documentation should assume readers do not know any prior discussion
32
+ about the project.
33
+
34
+ ## Reporting Issues
35
+
36
+ Useful issue reports include:
37
+
38
+ - the exact command or Python snippet that failed;
39
+ - a small input file or fixture;
40
+ - expected and actual outputs;
41
+ - the installed `omicsmeta` version or commit hash;
42
+ - whether the issue involves local OBO files, managed ontology resources, or
43
+ the optional `text2term` backend.
44
+
45
+ ## Release Readiness
46
+
47
+ Before any release intended for external users, maintainers should verify that
48
+ the test suite passes, documentation examples run from a clean checkout, and the
49
+ paper and citation metadata do not overstate the validation status of the
50
+ software.
51
+
52
+ See the public [release readiness checklist](docs/release_readiness.md) for the
53
+ artifact build, documentation, benchmark, and manual review checks. The
54
+ checklist validates readiness only; publishing requires explicit maintainer
55
+ approval.
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 omicsmeta contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
@@ -0,0 +1,167 @@
1
+ Metadata-Version: 2.4
2
+ Name: omicsmeta
3
+ Version: 0.1.0
4
+ Summary: Automated harmonization of public omics metadata
5
+ Project-URL: Homepage, https://github.com/qchiujunhao/omicsmeta
6
+ Project-URL: Documentation, https://qchiujunhao.github.io/omicsmeta/
7
+ Project-URL: Issues, https://github.com/qchiujunhao/omicsmeta/issues
8
+ Author: omicsmeta contributors
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: BioSample,GEO,SRA,bioinformatics,metadata,ontology
12
+ Classifier: Development Status :: 2 - Pre-Alpha
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
20
+ Requires-Python: >=3.10
21
+ Requires-Dist: pandas>=2.0
22
+ Requires-Dist: requests>=2.31
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest-cov>=5; extra == 'dev'
25
+ Requires-Dist: pytest>=8; extra == 'dev'
26
+ Provides-Extra: docs
27
+ Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
28
+ Requires-Dist: mkdocs<2,>=1.6; extra == 'docs'
29
+ Provides-Extra: fuzzy
30
+ Requires-Dist: rapidfuzz>=3.0; extra == 'fuzzy'
31
+ Provides-Extra: ontology
32
+ Requires-Dist: pronto>=2.5; extra == 'ontology'
33
+ Provides-Extra: text2term
34
+ Requires-Dist: text2term; extra == 'text2term'
35
+ Description-Content-Type: text/markdown
36
+
37
+ # omicsmeta
38
+
39
+ `omicsmeta` is an early-stage Python package for harmonizing public omics
40
+ metadata from GEO, SRA, BioSample, and tabular exports.
41
+
42
+ The central design choice is to make ontology mapping pluggable. Generic
43
+ mapping tools such as `text2term` are useful, but public omics metadata also
44
+ needs domain-specific preprocessing, field-type detection, confidence-aware
45
+ routing, and cross-field validation. `omicsmeta` is intended to provide that
46
+ pipeline.
47
+
48
+ Current implementation status:
49
+
50
+ - string normalization and biomedical abbreviation expansion
51
+ - heuristic metadata field detection
52
+ - lightweight built-in ontology mapper for common terms
53
+ - optional `text2term` mapper adapter
54
+ - simple OBO loader and SQLite ontology cache
55
+ - tabular, minimal GEO SOFT, BioSample XML, and SRA XML readers
56
+ - harmonization orchestrator and CLI
57
+ - real GEO SOFT snippet test coverage
58
+ - conservative field routing for ambiguous metadata columns
59
+ - transparent cell-line inference for missing species/tissue/disease fields
60
+ - deduplicated unmapped-term summaries for manual curation
61
+ - sample-wide output tables
62
+ - batch harmonization
63
+ - known-answer benchmark helper and CLI script
64
+
65
+ Example:
66
+
67
+ ```bash
68
+ omicsmeta harmonize metadata.tsv \
69
+ --output harmonized.tsv \
70
+ --unmapped unmapped.tsv \
71
+ --unmapped-summary-output unmapped_summary.tsv \
72
+ --sample-output samples.tsv \
73
+ --report qc_report.html
74
+ ```
75
+
76
+ Direct GEO fetching is also available:
77
+
78
+ ```bash
79
+ omicsmeta harmonize \
80
+ --geo-accession GSE123456 \
81
+ --output harmonized.tsv \
82
+ --unmapped unmapped.tsv \
83
+ --unmapped-summary-output unmapped_summary.tsv \
84
+ --sample-output samples.tsv \
85
+ --report qc_report.html
86
+ ```
87
+
88
+ Custom local OBO files can be added to the built-in mapper:
89
+
90
+ ```bash
91
+ omicsmeta harmonize metadata.tsv \
92
+ --ontology-obo disease_slim.obo \
93
+ --output harmonized.tsv \
94
+ --unmapped unmapped.tsv \
95
+ --unmapped-summary-output unmapped_summary.tsv \
96
+ --sample-output samples.tsv \
97
+ --report qc_report.html
98
+ ```
99
+
100
+ Managed ontology resources can be cached locally:
101
+
102
+ ```bash
103
+ omicsmeta ontologies list
104
+ omicsmeta ontologies download doid uberon cl
105
+ omicsmeta ontologies index --resource doid --resource uberon
106
+ omicsmeta harmonize metadata.tsv \
107
+ --ontology-resource doid \
108
+ --ontology-resource uberon \
109
+ --output harmonized.tsv \
110
+ --unmapped unmapped.tsv \
111
+ --unmapped-summary-output unmapped_summary.tsv \
112
+ --sample-output samples.tsv \
113
+ --report qc_report.html
114
+ ```
115
+
116
+ Multiple files can be harmonized in one run:
117
+
118
+ ```bash
119
+ omicsmeta batch \
120
+ --input metadata_a.tsv \
121
+ --input metadata_b.tsv \
122
+ --output harmonized.tsv \
123
+ --unmapped unmapped.tsv \
124
+ --unmapped-summary-output unmapped_summary.tsv \
125
+ --sample-output samples.tsv \
126
+ --report qc_report.html
127
+ ```
128
+
129
+ Known-answer fixtures can be benchmarked:
130
+
131
+ ```bash
132
+ python scripts/benchmark_mapping.py \
133
+ --input examples/basic/metadata.tsv \
134
+ --truth examples/basic/expected_harmonized.tsv
135
+ ```
136
+
137
+ Run the bundled multi-fixture benchmark suite:
138
+
139
+ ```bash
140
+ python scripts/benchmark_mapping.py \
141
+ --manifest benchmarks/known_answer_suite.tsv \
142
+ --output-json benchmark_suite.json
143
+ ```
144
+
145
+ ## Documentation
146
+
147
+ - [Documentation site](https://qchiujunhao.github.io/omicsmeta/)
148
+ - [Quickstart](docs/quickstart.md)
149
+ - [API reference](docs/api.md)
150
+ - [Design notes](docs/design.md)
151
+ - [Release readiness checklist](docs/release_readiness.md)
152
+ - [Basic fixture tutorial](docs/tutorials/basic_fixture.md)
153
+ - [Galaxy wrapper scaffold](galaxy-omicsmeta/omicsmeta_harmonize.xml)
154
+
155
+ ## Maturity
156
+
157
+ This repository is pre-alpha and not yet JOSS-ready. The implemented code is
158
+ tested, but the project still needs publication-scale curated benchmarks,
159
+ release packaging, external user feedback, and Galaxy Tool Shed validation
160
+ before submission.
161
+
162
+ Run tests locally with:
163
+
164
+ ```bash
165
+ python -m pip install -e ".[dev]"
166
+ python -m pytest
167
+ ```
@@ -0,0 +1,131 @@
1
+ # omicsmeta
2
+
3
+ `omicsmeta` is an early-stage Python package for harmonizing public omics
4
+ metadata from GEO, SRA, BioSample, and tabular exports.
5
+
6
+ The central design choice is to make ontology mapping pluggable. Generic
7
+ mapping tools such as `text2term` are useful, but public omics metadata also
8
+ needs domain-specific preprocessing, field-type detection, confidence-aware
9
+ routing, and cross-field validation. `omicsmeta` is intended to provide that
10
+ pipeline.
11
+
12
+ Current implementation status:
13
+
14
+ - string normalization and biomedical abbreviation expansion
15
+ - heuristic metadata field detection
16
+ - lightweight built-in ontology mapper for common terms
17
+ - optional `text2term` mapper adapter
18
+ - simple OBO loader and SQLite ontology cache
19
+ - tabular, minimal GEO SOFT, BioSample XML, and SRA XML readers
20
+ - harmonization orchestrator and CLI
21
+ - real GEO SOFT snippet test coverage
22
+ - conservative field routing for ambiguous metadata columns
23
+ - transparent cell-line inference for missing species/tissue/disease fields
24
+ - deduplicated unmapped-term summaries for manual curation
25
+ - sample-wide output tables
26
+ - batch harmonization
27
+ - known-answer benchmark helper and CLI script
28
+
29
+ Example:
30
+
31
+ ```bash
32
+ omicsmeta harmonize metadata.tsv \
33
+ --output harmonized.tsv \
34
+ --unmapped unmapped.tsv \
35
+ --unmapped-summary-output unmapped_summary.tsv \
36
+ --sample-output samples.tsv \
37
+ --report qc_report.html
38
+ ```
39
+
40
+ Direct GEO fetching is also available:
41
+
42
+ ```bash
43
+ omicsmeta harmonize \
44
+ --geo-accession GSE123456 \
45
+ --output harmonized.tsv \
46
+ --unmapped unmapped.tsv \
47
+ --unmapped-summary-output unmapped_summary.tsv \
48
+ --sample-output samples.tsv \
49
+ --report qc_report.html
50
+ ```
51
+
52
+ Custom local OBO files can be added to the built-in mapper:
53
+
54
+ ```bash
55
+ omicsmeta harmonize metadata.tsv \
56
+ --ontology-obo disease_slim.obo \
57
+ --output harmonized.tsv \
58
+ --unmapped unmapped.tsv \
59
+ --unmapped-summary-output unmapped_summary.tsv \
60
+ --sample-output samples.tsv \
61
+ --report qc_report.html
62
+ ```
63
+
64
+ Managed ontology resources can be cached locally:
65
+
66
+ ```bash
67
+ omicsmeta ontologies list
68
+ omicsmeta ontologies download doid uberon cl
69
+ omicsmeta ontologies index --resource doid --resource uberon
70
+ omicsmeta harmonize metadata.tsv \
71
+ --ontology-resource doid \
72
+ --ontology-resource uberon \
73
+ --output harmonized.tsv \
74
+ --unmapped unmapped.tsv \
75
+ --unmapped-summary-output unmapped_summary.tsv \
76
+ --sample-output samples.tsv \
77
+ --report qc_report.html
78
+ ```
79
+
80
+ Multiple files can be harmonized in one run:
81
+
82
+ ```bash
83
+ omicsmeta batch \
84
+ --input metadata_a.tsv \
85
+ --input metadata_b.tsv \
86
+ --output harmonized.tsv \
87
+ --unmapped unmapped.tsv \
88
+ --unmapped-summary-output unmapped_summary.tsv \
89
+ --sample-output samples.tsv \
90
+ --report qc_report.html
91
+ ```
92
+
93
+ Known-answer fixtures can be benchmarked:
94
+
95
+ ```bash
96
+ python scripts/benchmark_mapping.py \
97
+ --input examples/basic/metadata.tsv \
98
+ --truth examples/basic/expected_harmonized.tsv
99
+ ```
100
+
101
+ Run the bundled multi-fixture benchmark suite:
102
+
103
+ ```bash
104
+ python scripts/benchmark_mapping.py \
105
+ --manifest benchmarks/known_answer_suite.tsv \
106
+ --output-json benchmark_suite.json
107
+ ```
108
+
109
+ ## Documentation
110
+
111
+ - [Documentation site](https://qchiujunhao.github.io/omicsmeta/)
112
+ - [Quickstart](docs/quickstart.md)
113
+ - [API reference](docs/api.md)
114
+ - [Design notes](docs/design.md)
115
+ - [Release readiness checklist](docs/release_readiness.md)
116
+ - [Basic fixture tutorial](docs/tutorials/basic_fixture.md)
117
+ - [Galaxy wrapper scaffold](galaxy-omicsmeta/omicsmeta_harmonize.xml)
118
+
119
+ ## Maturity
120
+
121
+ This repository is pre-alpha and not yet JOSS-ready. The implemented code is
122
+ tested, but the project still needs publication-scale curated benchmarks,
123
+ release packaging, external user feedback, and Galaxy Tool Shed validation
124
+ before submission.
125
+
126
+ Run tests locally with:
127
+
128
+ ```bash
129
+ python -m pip install -e ".[dev]"
130
+ python -m pytest
131
+ ```
@@ -0,0 +1,25 @@
1
+ # Benchmark Fixtures
2
+
3
+ This directory contains a small known-answer benchmark suite for regression
4
+ testing and development. It is not a publication-scale validation dataset.
5
+
6
+ `known_answer_suite.tsv` is a manifest with one benchmark case per row:
7
+
8
+ - `name`: stable case identifier.
9
+ - `input_path`: metadata input path, relative to this directory unless absolute.
10
+ - `input_type`: `tabular`, `geo_soft`, `biosample_xml`, or `sra_xml`.
11
+ - `truth_path`: expected mapping table, relative to this directory unless
12
+ absolute.
13
+ - `description`: short human-readable case description.
14
+
15
+ Truth tables are stored in `truth/`. Each truth row is scored by
16
+ `sample_id`, `field_type`, and `ontology_id`; extra columns such as `label` are
17
+ for reader context.
18
+
19
+ Run the suite from the repository root:
20
+
21
+ ```bash
22
+ python scripts/benchmark_mapping.py \
23
+ --manifest benchmarks/known_answer_suite.tsv \
24
+ --output-json benchmark_suite.json
25
+ ```
@@ -0,0 +1,9 @@
1
+ name input_path input_type truth_path description
2
+ basic_tabular ../examples/basic/metadata.tsv tabular truth/basic_expected_harmonized.tsv Small tabular fixture with disease, tissue, species, sex, and cell-line fields.
3
+ geo_lung_a549 ../tests/fixtures/GSE154243_soft_snippet.txt geo_soft truth/GSE154243_expected_harmonized.tsv GEO-style A549 lung adenocarcinoma metadata with a typo in source text.
4
+ immune_pbmc ../tests/fixtures/immune_pbmc_soft_snippet.txt geo_soft truth/immune_pbmc_expected_harmonized.tsv GEO-style immune blood metadata with sex and species fields.
5
+ mouse_liver ../tests/fixtures/mouse_liver_soft_snippet.txt geo_soft truth/mouse_liver_expected_harmonized.tsv GEO-style mouse liver metadata with organism, tissue, and sex fields.
6
+ treatment_perturbation ../tests/fixtures/treatment_perturbation_soft_snippet.txt geo_soft truth/treatment_perturbation_expected_harmonized.tsv GEO-style treatment metadata where perturbation terms remain review items.
7
+ ambiguous_phenotype ../tests/fixtures/ambiguous_phenotype_soft_snippet.txt geo_soft truth/ambiguous_phenotype_expected_harmonized.tsv GEO-style ambiguous phenotype metadata that should not force phenotype terms into disease.
8
+ biosample_xml ../tests/fixtures/biosample_snippet.xml biosample_xml truth/biosample_expected_harmonized.tsv BioSample XML fixture with attributes for disease, tissue, sex, species, and cell line.
9
+ sra_xml ../tests/fixtures/sra_snippet.xml sra_xml truth/sra_expected_harmonized.tsv SRA XML fixture with SAMPLE_ATTRIBUTE disease, tissue, species, and cell-line fields.
@@ -0,0 +1,9 @@
1
+ sample_id field_type ontology_id label
2
+ GSM4667502 cell_line CVCL:0023 A549
3
+ GSM4667502 species NCBITaxon:9606 Homo sapiens
4
+ GSM4667502 tissue UBERON:0002048 lung
5
+ GSM4667502 disease DOID:299 adenocarcinoma
6
+ GSM4667503 species NCBITaxon:9606 Homo sapiens
7
+ GSM4667503 tissue UBERON:0002048 lung
8
+ GSM4667503 cell_line CVCL:0023 A549
9
+ GSM4667503 disease DOID:299 adenocarcinoma
@@ -0,0 +1,5 @@
1
+ sample_id field_type ontology_id label
2
+ GSMAMB1 species NCBITaxon:9606 Homo sapiens
3
+ GSMAMB1 tissue UBERON:0000310 breast
4
+ GSMAMB2 species NCBITaxon:9606 Homo sapiens
5
+ GSMAMB2 tissue UBERON:0000310 breast