omicsmeta 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omicsmeta-0.1.0/.github/workflows/ci.yml +58 -0
- omicsmeta-0.1.0/.github/workflows/pages.yml +63 -0
- omicsmeta-0.1.0/.github/workflows/publish.yml +58 -0
- omicsmeta-0.1.0/.gitignore +18 -0
- omicsmeta-0.1.0/CHANGELOG.md +28 -0
- omicsmeta-0.1.0/CODE_OF_CONDUCT.md +31 -0
- omicsmeta-0.1.0/CONTRIBUTING.md +55 -0
- omicsmeta-0.1.0/LICENSE +22 -0
- omicsmeta-0.1.0/PKG-INFO +167 -0
- omicsmeta-0.1.0/README.md +131 -0
- omicsmeta-0.1.0/benchmarks/README.md +25 -0
- omicsmeta-0.1.0/benchmarks/known_answer_suite.tsv +9 -0
- omicsmeta-0.1.0/benchmarks/truth/GSE154243_expected_harmonized.tsv +9 -0
- omicsmeta-0.1.0/benchmarks/truth/ambiguous_phenotype_expected_harmonized.tsv +5 -0
- omicsmeta-0.1.0/benchmarks/truth/basic_expected_harmonized.tsv +11 -0
- omicsmeta-0.1.0/benchmarks/truth/biosample_expected_harmonized.tsv +11 -0
- omicsmeta-0.1.0/benchmarks/truth/immune_pbmc_expected_harmonized.tsv +7 -0
- omicsmeta-0.1.0/benchmarks/truth/mouse_liver_expected_harmonized.tsv +7 -0
- omicsmeta-0.1.0/benchmarks/truth/sra_expected_harmonized.tsv +9 -0
- omicsmeta-0.1.0/benchmarks/truth/treatment_perturbation_expected_harmonized.tsv +11 -0
- omicsmeta-0.1.0/docs/api.md +109 -0
- omicsmeta-0.1.0/docs/design.md +88 -0
- omicsmeta-0.1.0/docs/index.md +49 -0
- omicsmeta-0.1.0/docs/quickstart.md +169 -0
- omicsmeta-0.1.0/docs/release_readiness.md +58 -0
- omicsmeta-0.1.0/docs/stylesheets/extra.css +131 -0
- omicsmeta-0.1.0/docs/tutorials/basic_fixture.md +67 -0
- omicsmeta-0.1.0/examples/basic/README.md +25 -0
- omicsmeta-0.1.0/examples/basic/expected_harmonized.tsv +11 -0
- omicsmeta-0.1.0/examples/basic/expected_samples.tsv +3 -0
- omicsmeta-0.1.0/examples/basic/expected_unmapped_summary.tsv +3 -0
- omicsmeta-0.1.0/examples/basic/metadata.tsv +3 -0
- omicsmeta-0.1.0/galaxy-omicsmeta/.shed.yml +13 -0
- omicsmeta-0.1.0/galaxy-omicsmeta/macros.xml +8 -0
- omicsmeta-0.1.0/galaxy-omicsmeta/omicsmeta_harmonize.py +53 -0
- omicsmeta-0.1.0/galaxy-omicsmeta/omicsmeta_harmonize.xml +72 -0
- omicsmeta-0.1.0/galaxy-omicsmeta/test-data/raw_metadata.tsv +3 -0
- omicsmeta-0.1.0/mkdocs.yml +34 -0
- omicsmeta-0.1.0/paper/paper.bib +41 -0
- omicsmeta-0.1.0/paper/paper.md +113 -0
- omicsmeta-0.1.0/pyproject.toml +58 -0
- omicsmeta-0.1.0/scripts/benchmark_mapping.py +49 -0
- omicsmeta-0.1.0/src/omicsmeta/__init__.py +7 -0
- omicsmeta-0.1.0/src/omicsmeta/_version.py +2 -0
- omicsmeta-0.1.0/src/omicsmeta/benchmark.py +199 -0
- omicsmeta-0.1.0/src/omicsmeta/cli.py +252 -0
- omicsmeta-0.1.0/src/omicsmeta/core/__init__.py +26 -0
- omicsmeta-0.1.0/src/omicsmeta/core/detector.py +205 -0
- omicsmeta-0.1.0/src/omicsmeta/core/fetcher.py +85 -0
- omicsmeta-0.1.0/src/omicsmeta/core/harmonizer.py +340 -0
- omicsmeta-0.1.0/src/omicsmeta/core/mapper.py +302 -0
- omicsmeta-0.1.0/src/omicsmeta/core/normalizer.py +114 -0
- omicsmeta-0.1.0/src/omicsmeta/core/types.py +56 -0
- omicsmeta-0.1.0/src/omicsmeta/core/validator.py +149 -0
- omicsmeta-0.1.0/src/omicsmeta/io/__init__.py +24 -0
- omicsmeta-0.1.0/src/omicsmeta/io/readers.py +216 -0
- omicsmeta-0.1.0/src/omicsmeta/io/writers.py +102 -0
- omicsmeta-0.1.0/src/omicsmeta/ontologies/__init__.py +35 -0
- omicsmeta-0.1.0/src/omicsmeta/ontologies/cache.py +83 -0
- omicsmeta-0.1.0/src/omicsmeta/ontologies/loader.py +79 -0
- omicsmeta-0.1.0/src/omicsmeta/ontologies/resources.py +229 -0
- omicsmeta-0.1.0/tests/fixtures/GSE154243_soft_snippet.txt +22 -0
- omicsmeta-0.1.0/tests/fixtures/ambiguous_phenotype_soft_snippet.txt +15 -0
- omicsmeta-0.1.0/tests/fixtures/biosample_edge_cases.xml +19 -0
- omicsmeta-0.1.0/tests/fixtures/biosample_snippet.xml +27 -0
- omicsmeta-0.1.0/tests/fixtures/custom_doid.obo +7 -0
- omicsmeta-0.1.0/tests/fixtures/immune_pbmc_soft_snippet.txt +17 -0
- omicsmeta-0.1.0/tests/fixtures/mouse_liver_soft_snippet.txt +19 -0
- omicsmeta-0.1.0/tests/fixtures/sra_edge_cases.xml +29 -0
- omicsmeta-0.1.0/tests/fixtures/sra_snippet.xml +47 -0
- omicsmeta-0.1.0/tests/fixtures/tiny.obo +7 -0
- omicsmeta-0.1.0/tests/fixtures/treatment_perturbation_soft_snippet.txt +19 -0
- omicsmeta-0.1.0/tests/test_additional_fixtures.py +47 -0
- omicsmeta-0.1.0/tests/test_benchmark.py +37 -0
- omicsmeta-0.1.0/tests/test_cli.py +207 -0
- omicsmeta-0.1.0/tests/test_detector.py +50 -0
- omicsmeta-0.1.0/tests/test_examples.py +130 -0
- omicsmeta-0.1.0/tests/test_fetcher.py +63 -0
- omicsmeta-0.1.0/tests/test_galaxy_wrapper.py +43 -0
- omicsmeta-0.1.0/tests/test_harmonizer.py +74 -0
- omicsmeta-0.1.0/tests/test_mapper.py +114 -0
- omicsmeta-0.1.0/tests/test_normalizer.py +21 -0
- omicsmeta-0.1.0/tests/test_ontology_loader.py +24 -0
- omicsmeta-0.1.0/tests/test_ontology_resources.py +113 -0
- omicsmeta-0.1.0/tests/test_readers.py +64 -0
- omicsmeta-0.1.0/tests/test_real_geo_integration.py +49 -0
- omicsmeta-0.1.0/tests/test_validator.py +66 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: ["main"]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: ["main"]
|
|
8
|
+
workflow_dispatch:
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
test:
|
|
12
|
+
name: Python ${{ matrix.python-version }}
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
strategy:
|
|
15
|
+
fail-fast: false
|
|
16
|
+
matrix:
|
|
17
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
18
|
+
|
|
19
|
+
steps:
|
|
20
|
+
- name: Check out repository
|
|
21
|
+
uses: actions/checkout@v4
|
|
22
|
+
|
|
23
|
+
- name: Set up Python
|
|
24
|
+
uses: actions/setup-python@v5
|
|
25
|
+
with:
|
|
26
|
+
python-version: ${{ matrix.python-version }}
|
|
27
|
+
cache: pip
|
|
28
|
+
|
|
29
|
+
- name: Install package and test dependencies
|
|
30
|
+
run: python -m pip install -e ".[dev]"
|
|
31
|
+
|
|
32
|
+
- name: Run tests
|
|
33
|
+
run: python -m pytest --cov=omicsmeta --cov-report=term-missing --cov-fail-under=70
|
|
34
|
+
|
|
35
|
+
package:
|
|
36
|
+
name: Package artifacts
|
|
37
|
+
runs-on: ubuntu-latest
|
|
38
|
+
|
|
39
|
+
steps:
|
|
40
|
+
- name: Check out repository
|
|
41
|
+
uses: actions/checkout@v4
|
|
42
|
+
|
|
43
|
+
- name: Set up Python
|
|
44
|
+
uses: actions/setup-python@v5
|
|
45
|
+
with:
|
|
46
|
+
python-version: "3.12"
|
|
47
|
+
cache: pip
|
|
48
|
+
|
|
49
|
+
- name: Build and check distributions
|
|
50
|
+
run: |
|
|
51
|
+
python -m pip install build twine
|
|
52
|
+
python -m build
|
|
53
|
+
python -m twine check dist/*.tar.gz dist/*.whl
|
|
54
|
+
|
|
55
|
+
- name: Install wheel and smoke-test CLI
|
|
56
|
+
run: |
|
|
57
|
+
python -m pip install dist/omicsmeta-*.whl
|
|
58
|
+
omicsmeta --help
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
name: Docs
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: ["main"]
|
|
6
|
+
paths:
|
|
7
|
+
- ".github/workflows/pages.yml"
|
|
8
|
+
- "docs/**"
|
|
9
|
+
- "mkdocs.yml"
|
|
10
|
+
- "pyproject.toml"
|
|
11
|
+
- "README.md"
|
|
12
|
+
workflow_dispatch:
|
|
13
|
+
|
|
14
|
+
permissions:
|
|
15
|
+
contents: read
|
|
16
|
+
pages: write
|
|
17
|
+
id-token: write
|
|
18
|
+
|
|
19
|
+
concurrency:
|
|
20
|
+
group: pages
|
|
21
|
+
cancel-in-progress: false
|
|
22
|
+
|
|
23
|
+
jobs:
|
|
24
|
+
build:
|
|
25
|
+
name: Build MkDocs site
|
|
26
|
+
runs-on: ubuntu-latest
|
|
27
|
+
|
|
28
|
+
steps:
|
|
29
|
+
- name: Check out repository
|
|
30
|
+
uses: actions/checkout@v4
|
|
31
|
+
|
|
32
|
+
- name: Set up Python
|
|
33
|
+
uses: actions/setup-python@v5
|
|
34
|
+
with:
|
|
35
|
+
python-version: "3.12"
|
|
36
|
+
cache: pip
|
|
37
|
+
|
|
38
|
+
- name: Install documentation dependencies
|
|
39
|
+
run: python -m pip install -e ".[docs]"
|
|
40
|
+
|
|
41
|
+
- name: Configure GitHub Pages
|
|
42
|
+
uses: actions/configure-pages@v5
|
|
43
|
+
|
|
44
|
+
- name: Build documentation
|
|
45
|
+
run: mkdocs build --strict --site-dir site
|
|
46
|
+
|
|
47
|
+
- name: Upload Pages artifact
|
|
48
|
+
uses: actions/upload-pages-artifact@v3
|
|
49
|
+
with:
|
|
50
|
+
path: site
|
|
51
|
+
|
|
52
|
+
deploy:
|
|
53
|
+
name: Deploy GitHub Pages
|
|
54
|
+
runs-on: ubuntu-latest
|
|
55
|
+
needs: build
|
|
56
|
+
environment:
|
|
57
|
+
name: github-pages
|
|
58
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
59
|
+
|
|
60
|
+
steps:
|
|
61
|
+
- name: Deploy to GitHub Pages
|
|
62
|
+
id: deployment
|
|
63
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published, released]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: read
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
build:
|
|
12
|
+
name: Build distributions
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- name: Check out repository
|
|
17
|
+
uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: "3.12"
|
|
23
|
+
cache: pip
|
|
24
|
+
|
|
25
|
+
- name: Install build tools
|
|
26
|
+
run: python -m pip install build twine
|
|
27
|
+
|
|
28
|
+
- name: Build source and wheel distributions
|
|
29
|
+
run: python -m build
|
|
30
|
+
|
|
31
|
+
- name: Validate distributions
|
|
32
|
+
run: python -m twine check dist/*.tar.gz dist/*.whl
|
|
33
|
+
|
|
34
|
+
- name: Upload distribution artifact
|
|
35
|
+
uses: actions/upload-artifact@v4
|
|
36
|
+
with:
|
|
37
|
+
name: python-package-distributions
|
|
38
|
+
path: dist/
|
|
39
|
+
|
|
40
|
+
publish:
|
|
41
|
+
name: Publish distributions to PyPI
|
|
42
|
+
needs: build
|
|
43
|
+
runs-on: ubuntu-latest
|
|
44
|
+
environment:
|
|
45
|
+
name: pypi
|
|
46
|
+
url: https://pypi.org/project/omicsmeta/
|
|
47
|
+
permissions:
|
|
48
|
+
id-token: write
|
|
49
|
+
|
|
50
|
+
steps:
|
|
51
|
+
- name: Download distribution artifact
|
|
52
|
+
uses: actions/download-artifact@v4
|
|
53
|
+
with:
|
|
54
|
+
name: python-package-distributions
|
|
55
|
+
path: dist/
|
|
56
|
+
|
|
57
|
+
- name: Publish package distributions to PyPI
|
|
58
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to `omicsmeta` will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The project is currently pre-alpha.
|
|
6
|
+
|
|
7
|
+
## 0.1.0 - 2026-06-13
|
|
8
|
+
|
|
9
|
+
- Added a Python package scaffold with CLI entry point.
|
|
10
|
+
- Added normalization, field detection, mapping, validation, and harmonization
|
|
11
|
+
modules.
|
|
12
|
+
- Added tabular, minimal GEO SOFT, BioSample XML, and SRA XML readers.
|
|
13
|
+
- Added detailed harmonized, unmapped, unmapped-summary, sample-wide, and HTML
|
|
14
|
+
QC outputs.
|
|
15
|
+
- Added managed ontology resource download and SQLite indexing commands.
|
|
16
|
+
- Added conservative handling for ambiguous metadata fields.
|
|
17
|
+
- Added transparent cell-line inference for common cell lines.
|
|
18
|
+
- Added batch harmonization for multiple files and GEO accessions.
|
|
19
|
+
- Added known-answer benchmark helper and CLI script.
|
|
20
|
+
- Added a multi-fixture known-answer benchmark suite.
|
|
21
|
+
- Added integration fixtures based on GEO-style metadata snippets.
|
|
22
|
+
- Added public documentation, contribution metadata, and a JOSS paper skeleton.
|
|
23
|
+
- Added a Galaxy wrapper scaffold with local wrapper smoke test data.
|
|
24
|
+
- Added package artifact build validation and a release-readiness checklist.
|
|
25
|
+
- Added SRA/BioSample XML edge-case coverage for namespaced documents,
|
|
26
|
+
repeated attributes, and accession fallbacks.
|
|
27
|
+
- Added GitHub Pages documentation publishing and a Material for MkDocs site
|
|
28
|
+
theme.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Code of Conduct
|
|
2
|
+
|
|
3
|
+
`omicsmeta` aims to be a respectful, practical, and technically rigorous open
|
|
4
|
+
source project.
|
|
5
|
+
|
|
6
|
+
## Expected Behavior
|
|
7
|
+
|
|
8
|
+
- Treat contributors and users with respect.
|
|
9
|
+
- Discuss technical disagreements with evidence and clear reasoning.
|
|
10
|
+
- Keep feedback specific to the code, documentation, data, or scientific claim.
|
|
11
|
+
- Assume public project spaces are read by people who do not know the private
|
|
12
|
+
context behind a discussion.
|
|
13
|
+
|
|
14
|
+
## Unacceptable Behavior
|
|
15
|
+
|
|
16
|
+
- Harassment, discrimination, intimidation, or personal attacks.
|
|
17
|
+
- Publishing private information without explicit permission.
|
|
18
|
+
- Repeated disruption of project discussions after maintainers ask for a change
|
|
19
|
+
in behavior.
|
|
20
|
+
- Knowingly submitting fabricated data, misleading benchmark results, or
|
|
21
|
+
unsupported scientific claims.
|
|
22
|
+
|
|
23
|
+
## Enforcement
|
|
24
|
+
|
|
25
|
+
Maintainers may remove comments, close issues, reject contributions, or restrict
|
|
26
|
+
participation when behavior conflicts with this code of conduct. Enforcement
|
|
27
|
+
decisions should be documented privately among maintainers with enough context
|
|
28
|
+
to review them later.
|
|
29
|
+
|
|
30
|
+
Report conduct concerns to the current project maintainers through the issue
|
|
31
|
+
tracker or another maintainer-provided contact channel.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
Thank you for considering a contribution to `omicsmeta`. The project is
|
|
4
|
+
pre-alpha, so the highest-value contributions are focused tests, small bug
|
|
5
|
+
fixes, documentation improvements, and real metadata examples with expected
|
|
6
|
+
outputs.
|
|
7
|
+
|
|
8
|
+
## Development Setup
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
python -m pip install -e ".[dev]"
|
|
12
|
+
python -m pytest
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Run the coverage gate used during development:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
python -m pytest --cov=omicsmeta --cov-report=term-missing --cov-fail-under=70
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Contribution Guidelines
|
|
22
|
+
|
|
23
|
+
- Keep changes focused. A pull request should usually address one feature,
|
|
24
|
+
bug, fixture, or documentation topic.
|
|
25
|
+
- Add tests for new behavior, especially field detection, ontology mapping,
|
|
26
|
+
validation, CLI output, and real metadata fixtures.
|
|
27
|
+
- Preserve provenance in outputs. New transformations should keep enough source
|
|
28
|
+
context for a curator to audit the result.
|
|
29
|
+
- Prefer conservative defaults. Uncertain mappings should be reported for
|
|
30
|
+
review instead of silently accepted.
|
|
31
|
+
- Public documentation should assume readers do not know any prior discussion
|
|
32
|
+
about the project.
|
|
33
|
+
|
|
34
|
+
## Reporting Issues
|
|
35
|
+
|
|
36
|
+
Useful issue reports include:
|
|
37
|
+
|
|
38
|
+
- the exact command or Python snippet that failed;
|
|
39
|
+
- a small input file or fixture;
|
|
40
|
+
- expected and actual outputs;
|
|
41
|
+
- the installed `omicsmeta` version or commit hash;
|
|
42
|
+
- whether the issue involves local OBO files, managed ontology resources, or
|
|
43
|
+
the optional `text2term` backend.
|
|
44
|
+
|
|
45
|
+
## Release Readiness
|
|
46
|
+
|
|
47
|
+
Before any release intended for external users, maintainers should verify that
|
|
48
|
+
the test suite passes, documentation examples run from a clean checkout, and the
|
|
49
|
+
paper and citation metadata do not overstate the validation status of the
|
|
50
|
+
software.
|
|
51
|
+
|
|
52
|
+
See the public [release readiness checklist](docs/release_readiness.md) for the
|
|
53
|
+
artifact build, documentation, benchmark, and manual review checks. The
|
|
54
|
+
checklist validates readiness only; publishing requires explicit maintainer
|
|
55
|
+
approval.
|
omicsmeta-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 omicsmeta contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
omicsmeta-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: omicsmeta
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Automated harmonization of public omics metadata
|
|
5
|
+
Project-URL: Homepage, https://github.com/qchiujunhao/omicsmeta
|
|
6
|
+
Project-URL: Documentation, https://qchiujunhao.github.io/omicsmeta/
|
|
7
|
+
Project-URL: Issues, https://github.com/qchiujunhao/omicsmeta/issues
|
|
8
|
+
Author: omicsmeta contributors
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: BioSample,GEO,SRA,bioinformatics,metadata,ontology
|
|
12
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: pandas>=2.0
|
|
22
|
+
Requires-Dist: requests>=2.31
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest-cov>=5; extra == 'dev'
|
|
25
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
26
|
+
Provides-Extra: docs
|
|
27
|
+
Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
|
|
28
|
+
Requires-Dist: mkdocs<2,>=1.6; extra == 'docs'
|
|
29
|
+
Provides-Extra: fuzzy
|
|
30
|
+
Requires-Dist: rapidfuzz>=3.0; extra == 'fuzzy'
|
|
31
|
+
Provides-Extra: ontology
|
|
32
|
+
Requires-Dist: pronto>=2.5; extra == 'ontology'
|
|
33
|
+
Provides-Extra: text2term
|
|
34
|
+
Requires-Dist: text2term; extra == 'text2term'
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# omicsmeta
|
|
38
|
+
|
|
39
|
+
`omicsmeta` is an early-stage Python package for harmonizing public omics
|
|
40
|
+
metadata from GEO, SRA, BioSample, and tabular exports.
|
|
41
|
+
|
|
42
|
+
The central design choice is to make ontology mapping pluggable. Generic
|
|
43
|
+
mapping tools such as `text2term` are useful, but public omics metadata also
|
|
44
|
+
needs domain-specific preprocessing, field-type detection, confidence-aware
|
|
45
|
+
routing, and cross-field validation. `omicsmeta` is intended to provide that
|
|
46
|
+
pipeline.
|
|
47
|
+
|
|
48
|
+
Current implementation status:
|
|
49
|
+
|
|
50
|
+
- string normalization and biomedical abbreviation expansion
|
|
51
|
+
- heuristic metadata field detection
|
|
52
|
+
- lightweight built-in ontology mapper for common terms
|
|
53
|
+
- optional `text2term` mapper adapter
|
|
54
|
+
- simple OBO loader and SQLite ontology cache
|
|
55
|
+
- tabular, minimal GEO SOFT, BioSample XML, and SRA XML readers
|
|
56
|
+
- harmonization orchestrator and CLI
|
|
57
|
+
- real GEO SOFT snippet test coverage
|
|
58
|
+
- conservative field routing for ambiguous metadata columns
|
|
59
|
+
- transparent cell-line inference for missing species/tissue/disease fields
|
|
60
|
+
- deduplicated unmapped-term summaries for manual curation
|
|
61
|
+
- sample-wide output tables
|
|
62
|
+
- batch harmonization
|
|
63
|
+
- known-answer benchmark helper and CLI script
|
|
64
|
+
|
|
65
|
+
Example:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
omicsmeta harmonize metadata.tsv \
|
|
69
|
+
--output harmonized.tsv \
|
|
70
|
+
--unmapped unmapped.tsv \
|
|
71
|
+
--unmapped-summary-output unmapped_summary.tsv \
|
|
72
|
+
--sample-output samples.tsv \
|
|
73
|
+
--report qc_report.html
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Direct GEO fetching is also available:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
omicsmeta harmonize \
|
|
80
|
+
--geo-accession GSE123456 \
|
|
81
|
+
--output harmonized.tsv \
|
|
82
|
+
--unmapped unmapped.tsv \
|
|
83
|
+
--unmapped-summary-output unmapped_summary.tsv \
|
|
84
|
+
--sample-output samples.tsv \
|
|
85
|
+
--report qc_report.html
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Custom local OBO files can be added to the built-in mapper:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
omicsmeta harmonize metadata.tsv \
|
|
92
|
+
--ontology-obo disease_slim.obo \
|
|
93
|
+
--output harmonized.tsv \
|
|
94
|
+
--unmapped unmapped.tsv \
|
|
95
|
+
--unmapped-summary-output unmapped_summary.tsv \
|
|
96
|
+
--sample-output samples.tsv \
|
|
97
|
+
--report qc_report.html
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Managed ontology resources can be cached locally:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
omicsmeta ontologies list
|
|
104
|
+
omicsmeta ontologies download doid uberon cl
|
|
105
|
+
omicsmeta ontologies index --resource doid --resource uberon
|
|
106
|
+
omicsmeta harmonize metadata.tsv \
|
|
107
|
+
--ontology-resource doid \
|
|
108
|
+
--ontology-resource uberon \
|
|
109
|
+
--output harmonized.tsv \
|
|
110
|
+
--unmapped unmapped.tsv \
|
|
111
|
+
--unmapped-summary-output unmapped_summary.tsv \
|
|
112
|
+
--sample-output samples.tsv \
|
|
113
|
+
--report qc_report.html
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Multiple files can be harmonized in one run:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
omicsmeta batch \
|
|
120
|
+
--input metadata_a.tsv \
|
|
121
|
+
--input metadata_b.tsv \
|
|
122
|
+
--output harmonized.tsv \
|
|
123
|
+
--unmapped unmapped.tsv \
|
|
124
|
+
--unmapped-summary-output unmapped_summary.tsv \
|
|
125
|
+
--sample-output samples.tsv \
|
|
126
|
+
--report qc_report.html
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
Known-answer fixtures can be benchmarked:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
python scripts/benchmark_mapping.py \
|
|
133
|
+
--input examples/basic/metadata.tsv \
|
|
134
|
+
--truth examples/basic/expected_harmonized.tsv
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Run the bundled multi-fixture benchmark suite:
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
python scripts/benchmark_mapping.py \
|
|
141
|
+
--manifest benchmarks/known_answer_suite.tsv \
|
|
142
|
+
--output-json benchmark_suite.json
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Documentation
|
|
146
|
+
|
|
147
|
+
- [Documentation site](https://qchiujunhao.github.io/omicsmeta/)
|
|
148
|
+
- [Quickstart](docs/quickstart.md)
|
|
149
|
+
- [API reference](docs/api.md)
|
|
150
|
+
- [Design notes](docs/design.md)
|
|
151
|
+
- [Release readiness checklist](docs/release_readiness.md)
|
|
152
|
+
- [Basic fixture tutorial](docs/tutorials/basic_fixture.md)
|
|
153
|
+
- [Galaxy wrapper scaffold](galaxy-omicsmeta/omicsmeta_harmonize.xml)
|
|
154
|
+
|
|
155
|
+
## Maturity
|
|
156
|
+
|
|
157
|
+
This repository is pre-alpha and not yet JOSS-ready. The implemented code is
|
|
158
|
+
tested, but the project still needs publication-scale curated benchmarks,
|
|
159
|
+
release packaging, external user feedback, and Galaxy Tool Shed validation
|
|
160
|
+
before submission.
|
|
161
|
+
|
|
162
|
+
Run tests locally with:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
python -m pip install -e ".[dev]"
|
|
166
|
+
python -m pytest
|
|
167
|
+
```
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# omicsmeta
|
|
2
|
+
|
|
3
|
+
`omicsmeta` is an early-stage Python package for harmonizing public omics
|
|
4
|
+
metadata from GEO, SRA, BioSample, and tabular exports.
|
|
5
|
+
|
|
6
|
+
The central design choice is to make ontology mapping pluggable. Generic
|
|
7
|
+
mapping tools such as `text2term` are useful, but public omics metadata also
|
|
8
|
+
needs domain-specific preprocessing, field-type detection, confidence-aware
|
|
9
|
+
routing, and cross-field validation. `omicsmeta` is intended to provide that
|
|
10
|
+
pipeline.
|
|
11
|
+
|
|
12
|
+
Current implementation status:
|
|
13
|
+
|
|
14
|
+
- string normalization and biomedical abbreviation expansion
|
|
15
|
+
- heuristic metadata field detection
|
|
16
|
+
- lightweight built-in ontology mapper for common terms
|
|
17
|
+
- optional `text2term` mapper adapter
|
|
18
|
+
- simple OBO loader and SQLite ontology cache
|
|
19
|
+
- tabular, minimal GEO SOFT, BioSample XML, and SRA XML readers
|
|
20
|
+
- harmonization orchestrator and CLI
|
|
21
|
+
- real GEO SOFT snippet test coverage
|
|
22
|
+
- conservative field routing for ambiguous metadata columns
|
|
23
|
+
- transparent cell-line inference for missing species/tissue/disease fields
|
|
24
|
+
- deduplicated unmapped-term summaries for manual curation
|
|
25
|
+
- sample-wide output tables
|
|
26
|
+
- batch harmonization
|
|
27
|
+
- known-answer benchmark helper and CLI script
|
|
28
|
+
|
|
29
|
+
Example:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
omicsmeta harmonize metadata.tsv \
|
|
33
|
+
--output harmonized.tsv \
|
|
34
|
+
--unmapped unmapped.tsv \
|
|
35
|
+
--unmapped-summary-output unmapped_summary.tsv \
|
|
36
|
+
--sample-output samples.tsv \
|
|
37
|
+
--report qc_report.html
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Direct GEO fetching is also available:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
omicsmeta harmonize \
|
|
44
|
+
--geo-accession GSE123456 \
|
|
45
|
+
--output harmonized.tsv \
|
|
46
|
+
--unmapped unmapped.tsv \
|
|
47
|
+
--unmapped-summary-output unmapped_summary.tsv \
|
|
48
|
+
--sample-output samples.tsv \
|
|
49
|
+
--report qc_report.html
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Custom local OBO files can be added to the built-in mapper:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
omicsmeta harmonize metadata.tsv \
|
|
56
|
+
--ontology-obo disease_slim.obo \
|
|
57
|
+
--output harmonized.tsv \
|
|
58
|
+
--unmapped unmapped.tsv \
|
|
59
|
+
--unmapped-summary-output unmapped_summary.tsv \
|
|
60
|
+
--sample-output samples.tsv \
|
|
61
|
+
--report qc_report.html
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Managed ontology resources can be cached locally:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
omicsmeta ontologies list
|
|
68
|
+
omicsmeta ontologies download doid uberon cl
|
|
69
|
+
omicsmeta ontologies index --resource doid --resource uberon
|
|
70
|
+
omicsmeta harmonize metadata.tsv \
|
|
71
|
+
--ontology-resource doid \
|
|
72
|
+
--ontology-resource uberon \
|
|
73
|
+
--output harmonized.tsv \
|
|
74
|
+
--unmapped unmapped.tsv \
|
|
75
|
+
--unmapped-summary-output unmapped_summary.tsv \
|
|
76
|
+
--sample-output samples.tsv \
|
|
77
|
+
--report qc_report.html
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Multiple files can be harmonized in one run:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
omicsmeta batch \
|
|
84
|
+
--input metadata_a.tsv \
|
|
85
|
+
--input metadata_b.tsv \
|
|
86
|
+
--output harmonized.tsv \
|
|
87
|
+
--unmapped unmapped.tsv \
|
|
88
|
+
--unmapped-summary-output unmapped_summary.tsv \
|
|
89
|
+
--sample-output samples.tsv \
|
|
90
|
+
--report qc_report.html
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Known-answer fixtures can be benchmarked:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
python scripts/benchmark_mapping.py \
|
|
97
|
+
--input examples/basic/metadata.tsv \
|
|
98
|
+
--truth examples/basic/expected_harmonized.tsv
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Run the bundled multi-fixture benchmark suite:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
python scripts/benchmark_mapping.py \
|
|
105
|
+
--manifest benchmarks/known_answer_suite.tsv \
|
|
106
|
+
--output-json benchmark_suite.json
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Documentation
|
|
110
|
+
|
|
111
|
+
- [Documentation site](https://qchiujunhao.github.io/omicsmeta/)
|
|
112
|
+
- [Quickstart](docs/quickstart.md)
|
|
113
|
+
- [API reference](docs/api.md)
|
|
114
|
+
- [Design notes](docs/design.md)
|
|
115
|
+
- [Release readiness checklist](docs/release_readiness.md)
|
|
116
|
+
- [Basic fixture tutorial](docs/tutorials/basic_fixture.md)
|
|
117
|
+
- [Galaxy wrapper scaffold](galaxy-omicsmeta/omicsmeta_harmonize.xml)
|
|
118
|
+
|
|
119
|
+
## Maturity
|
|
120
|
+
|
|
121
|
+
This repository is pre-alpha and not yet JOSS-ready. The implemented code is
|
|
122
|
+
tested, but the project still needs publication-scale curated benchmarks,
|
|
123
|
+
release packaging, external user feedback, and Galaxy Tool Shed validation
|
|
124
|
+
before submission.
|
|
125
|
+
|
|
126
|
+
Run tests locally with:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
python -m pip install -e ".[dev]"
|
|
130
|
+
python -m pytest
|
|
131
|
+
```
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Benchmark Fixtures
|
|
2
|
+
|
|
3
|
+
This directory contains a small known-answer benchmark suite for regression
|
|
4
|
+
testing and development. It is not a publication-scale validation dataset.
|
|
5
|
+
|
|
6
|
+
`known_answer_suite.tsv` is a manifest with one benchmark case per row:
|
|
7
|
+
|
|
8
|
+
- `name`: stable case identifier.
|
|
9
|
+
- `input_path`: metadata input path, relative to this directory unless absolute.
|
|
10
|
+
- `input_type`: `tabular`, `geo_soft`, `biosample_xml`, or `sra_xml`.
|
|
11
|
+
- `truth_path`: expected mapping table, relative to this directory unless
|
|
12
|
+
absolute.
|
|
13
|
+
- `description`: short human-readable case description.
|
|
14
|
+
|
|
15
|
+
Truth tables are stored in `truth/`. Each truth row is scored by
|
|
16
|
+
`sample_id`, `field_type`, and `ontology_id`; extra columns such as `label` are
|
|
17
|
+
for reader context.
|
|
18
|
+
|
|
19
|
+
Run the suite from the repository root:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
python scripts/benchmark_mapping.py \
|
|
23
|
+
--manifest benchmarks/known_answer_suite.tsv \
|
|
24
|
+
--output-json benchmark_suite.json
|
|
25
|
+
```
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
name input_path input_type truth_path description
|
|
2
|
+
basic_tabular ../examples/basic/metadata.tsv tabular truth/basic_expected_harmonized.tsv Small tabular fixture with disease, tissue, species, sex, and cell-line fields.
|
|
3
|
+
geo_lung_a549 ../tests/fixtures/GSE154243_soft_snippet.txt geo_soft truth/GSE154243_expected_harmonized.tsv GEO-style A549 lung adenocarcinoma metadata with a typo in source text.
|
|
4
|
+
immune_pbmc ../tests/fixtures/immune_pbmc_soft_snippet.txt geo_soft truth/immune_pbmc_expected_harmonized.tsv GEO-style immune blood metadata with sex and species fields.
|
|
5
|
+
mouse_liver ../tests/fixtures/mouse_liver_soft_snippet.txt geo_soft truth/mouse_liver_expected_harmonized.tsv GEO-style mouse liver metadata with organism, tissue, and sex fields.
|
|
6
|
+
treatment_perturbation ../tests/fixtures/treatment_perturbation_soft_snippet.txt geo_soft truth/treatment_perturbation_expected_harmonized.tsv GEO-style treatment metadata where perturbation terms remain review items.
|
|
7
|
+
ambiguous_phenotype ../tests/fixtures/ambiguous_phenotype_soft_snippet.txt geo_soft truth/ambiguous_phenotype_expected_harmonized.tsv GEO-style ambiguous phenotype metadata that should not force phenotype terms into disease.
|
|
8
|
+
biosample_xml ../tests/fixtures/biosample_snippet.xml biosample_xml truth/biosample_expected_harmonized.tsv BioSample XML fixture with attributes for disease, tissue, sex, species, and cell line.
|
|
9
|
+
sra_xml ../tests/fixtures/sra_snippet.xml sra_xml truth/sra_expected_harmonized.tsv SRA XML fixture with SAMPLE_ATTRIBUTE disease, tissue, species, and cell-line fields.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
sample_id field_type ontology_id label
|
|
2
|
+
GSM4667502 cell_line CVCL:0023 A549
|
|
3
|
+
GSM4667502 species NCBITaxon:9606 Homo sapiens
|
|
4
|
+
GSM4667502 tissue UBERON:0002048 lung
|
|
5
|
+
GSM4667502 disease DOID:299 adenocarcinoma
|
|
6
|
+
GSM4667503 species NCBITaxon:9606 Homo sapiens
|
|
7
|
+
GSM4667503 tissue UBERON:0002048 lung
|
|
8
|
+
GSM4667503 cell_line CVCL:0023 A549
|
|
9
|
+
GSM4667503 disease DOID:299 adenocarcinoma
|