koza 0.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- koza-0.0.0/.github/dependabot.yaml +10 -0
- koza-0.0.0/.github/workflows/documentation.yaml +51 -0
- koza-0.0.0/.github/workflows/publish.yaml +40 -0
- koza-0.0.0/.github/workflows/test.yaml +42 -0
- koza-0.0.0/.gitignore +141 -0
- koza-0.0.0/CITATION.cff +21 -0
- koza-0.0.0/CONTRIBUTING.md +44 -0
- koza-0.0.0/LICENSE +29 -0
- koza-0.0.0/Makefile +66 -0
- koza-0.0.0/PKG-INFO +202 -0
- koza-0.0.0/README.md +173 -0
- koza-0.0.0/docs/CNAME +1 -0
- koza-0.0.0/docs/Ingests/index.md +12 -0
- koza-0.0.0/docs/Ingests/koza_config.md +416 -0
- koza-0.0.0/docs/Ingests/mapping.md +62 -0
- koza-0.0.0/docs/Ingests/testing.md +87 -0
- koza-0.0.0/docs/Ingests/transform.md +53 -0
- koza-0.0.0/docs/Usage/CLI.md +65 -0
- koza-0.0.0/docs/Usage/Module.md +1 -0
- koza-0.0.0/docs/cli-reference.md +564 -0
- koza-0.0.0/docs/graph-operations/explanation/architecture.md +243 -0
- koza-0.0.0/docs/graph-operations/explanation/biolink-compliance.md +330 -0
- koza-0.0.0/docs/graph-operations/explanation/data-integrity.md +280 -0
- koza-0.0.0/docs/graph-operations/explanation/index.md +71 -0
- koza-0.0.0/docs/graph-operations/explanation/schema-handling.md +229 -0
- koza-0.0.0/docs/graph-operations/how-to/clean-graph.md +304 -0
- koza-0.0.0/docs/graph-operations/how-to/export-formats.md +373 -0
- koza-0.0.0/docs/graph-operations/how-to/generate-reports.md +434 -0
- koza-0.0.0/docs/graph-operations/how-to/incremental-updates.md +296 -0
- koza-0.0.0/docs/graph-operations/how-to/index.md +52 -0
- koza-0.0.0/docs/graph-operations/how-to/join-files.md +238 -0
- koza-0.0.0/docs/graph-operations/how-to/normalize-ids.md +302 -0
- koza-0.0.0/docs/graph-operations/how-to/split-graph.md +235 -0
- koza-0.0.0/docs/graph-operations/index.md +67 -0
- koza-0.0.0/docs/graph-operations/reference/api.md +168 -0
- koza-0.0.0/docs/graph-operations/reference/cli.md +797 -0
- koza-0.0.0/docs/graph-operations/reference/configuration.md +791 -0
- koza-0.0.0/docs/graph-operations/reference/index.md +80 -0
- koza-0.0.0/docs/graph-operations/tutorials/first-graph.md +471 -0
- koza-0.0.0/docs/graph-operations/tutorials/index.md +43 -0
- koza-0.0.0/docs/graph-operations/tutorials/merge-pipeline.md +480 -0
- koza-0.0.0/docs/graph-operations.md +34 -0
- koza-0.0.0/docs/img/docs-comming-soon.jpg +0 -0
- koza-0.0.0/docs/img/favicon.ico +0 -0
- koza-0.0.0/docs/img/pupa.png +0 -0
- koza-0.0.0/docs/index.md +87 -0
- koza-0.0.0/examples/data/additional-entrez-2-string.tsv +11 -0
- koza-0.0.0/examples/data/entrez-2-string.tsv +11 -0
- koza-0.0.0/examples/data/string.tsv +5 -0
- koza-0.0.0/examples/data/string2.tsv +15 -0
- koza-0.0.0/examples/maps/custom-entrez-2-string.py +11 -0
- koza-0.0.0/examples/maps/custom-entrez-2-string.yaml +27 -0
- koza-0.0.0/examples/maps/entrez-2-string.yaml +27 -0
- koza-0.0.0/examples/maps/genepage-2-gene.yaml +26 -0
- koza-0.0.0/examples/minimal.py +6 -0
- koza-0.0.0/examples/standards/gpi.yaml +10 -0
- koza-0.0.0/examples/standards/oban.yaml +13 -0
- koza-0.0.0/examples/standards/string.yaml +10 -0
- koza-0.0.0/examples/string/metadata.yaml +4 -0
- koza-0.0.0/examples/string/protein-links-detailed.py +25 -0
- koza-0.0.0/examples/string/protein-links-detailed.yaml +33 -0
- koza-0.0.0/examples/string-declarative/declarative-protein-links-detailed.py +25 -0
- koza-0.0.0/examples/string-declarative/declarative-protein-links-detailed.yaml +50 -0
- koza-0.0.0/examples/string-declarative/metadata.yaml +4 -0
- koza-0.0.0/examples/string-file-archive/metadata.yaml +4 -0
- koza-0.0.0/examples/string-file-archive/protein-links-file-archive.py +24 -0
- koza-0.0.0/examples/string-file-archive/protein-links-file-archive.yaml +31 -0
- koza-0.0.0/examples/string-w-custom-map/custom-map-protein-links-detailed.py +27 -0
- koza-0.0.0/examples/string-w-custom-map/custom-map-protein-links-detailed.yaml +48 -0
- koza-0.0.0/examples/string-w-custom-map/metadata.yaml +4 -0
- koza-0.0.0/examples/string-w-map/map-protein-links-detailed.py +27 -0
- koza-0.0.0/examples/string-w-map/map-protein-links-detailed.yaml +47 -0
- koza-0.0.0/examples/string-w-map/metadata.yaml +4 -0
- koza-0.0.0/examples/string-w-state/metadata.yaml +4 -0
- koza-0.0.0/examples/string-w-state/protein-links-detailed.py +35 -0
- koza-0.0.0/examples/string-w-state/protein-links-detailed.yaml +33 -0
- koza-0.0.0/examples/string-yield/metadata.yaml +4 -0
- koza-0.0.0/examples/string-yield/protein-links-yield.py +26 -0
- koza-0.0.0/examples/string-yield/protein-links-yield.yaml +30 -0
- koza-0.0.0/examples/translation_table.yaml +819 -0
- koza-0.0.0/mkdocs.yaml +130 -0
- koza-0.0.0/pyproject.toml +82 -0
- koza-0.0.0/src/koza/__init__.py +18 -0
- koza-0.0.0/src/koza/converter/__init__.py +0 -0
- koza-0.0.0/src/koza/converter/kgx_converter.py +54 -0
- koza-0.0.0/src/koza/decorators.py +169 -0
- koza-0.0.0/src/koza/graph_operations/__init__.py +51 -0
- koza-0.0.0/src/koza/graph_operations/append.py +523 -0
- koza-0.0.0/src/koza/graph_operations/deduplicate.py +346 -0
- koza-0.0.0/src/koza/graph_operations/join.py +260 -0
- koza-0.0.0/src/koza/graph_operations/merge.py +466 -0
- koza-0.0.0/src/koza/graph_operations/normalize.py +559 -0
- koza-0.0.0/src/koza/graph_operations/prune.py +399 -0
- koza-0.0.0/src/koza/graph_operations/report.py +1604 -0
- koza-0.0.0/src/koza/graph_operations/schema.py +143 -0
- koza-0.0.0/src/koza/graph_operations/schema_utils.py +179 -0
- koza-0.0.0/src/koza/graph_operations/split.py +351 -0
- koza-0.0.0/src/koza/graph_operations/utils.py +609 -0
- koza-0.0.0/src/koza/io/__init__.py +8 -0
- koza-0.0.0/src/koza/io/reader/__init__.py +3 -0
- koza-0.0.0/src/koza/io/reader/csv_reader.py +240 -0
- koza-0.0.0/src/koza/io/reader/json_reader.py +60 -0
- koza-0.0.0/src/koza/io/reader/jsonl_reader.py +43 -0
- koza-0.0.0/src/koza/io/utils.py +336 -0
- koza-0.0.0/src/koza/io/writer/__init__.py +0 -0
- koza-0.0.0/src/koza/io/writer/jsonl_writer.py +74 -0
- koza-0.0.0/src/koza/io/writer/passthrough_writer.py +26 -0
- koza-0.0.0/src/koza/io/writer/tsv_writer.py +136 -0
- koza-0.0.0/src/koza/io/writer/writer.py +31 -0
- koza-0.0.0/src/koza/io/yaml_loader.py +66 -0
- koza-0.0.0/src/koza/main.py +1295 -0
- koza-0.0.0/src/koza/model/__init__.py +0 -0
- koza-0.0.0/src/koza/model/config/__init__.py +0 -0
- koza-0.0.0/src/koza/model/config/pydantic_config.py +12 -0
- koza-0.0.0/src/koza/model/config/source_config.py +146 -0
- koza-0.0.0/src/koza/model/config/sssom_config.py +171 -0
- koza-0.0.0/src/koza/model/curie_cleaner.py +11 -0
- koza-0.0.0/src/koza/model/filters.py +68 -0
- koza-0.0.0/src/koza/model/formats.py +24 -0
- koza-0.0.0/src/koza/model/graph_operations.py +747 -0
- koza-0.0.0/src/koza/model/graphs.py +16 -0
- koza-0.0.0/src/koza/model/koza.py +66 -0
- koza-0.0.0/src/koza/model/reader.py +139 -0
- koza-0.0.0/src/koza/model/source.py +180 -0
- koza-0.0.0/src/koza/model/transform.py +61 -0
- koza-0.0.0/src/koza/model/writer.py +17 -0
- koza-0.0.0/src/koza/runner.py +408 -0
- koza-0.0.0/src/koza/tools/split_file.py +73 -0
- koza-0.0.0/src/koza/transform.py +99 -0
- koza-0.0.0/src/koza/utils/__init__ +0 -0
- koza-0.0.0/src/koza/utils/exceptions.py +18 -0
- koza-0.0.0/src/koza/utils/log_utils.py +26 -0
- koza-0.0.0/src/koza/utils/row_filter.py +83 -0
- koza-0.0.0/tests/conftest.py +9 -0
- koza-0.0.0/tests/integration/test_examples.py +46 -0
- koza-0.0.0/tests/integration/test_multi_file_pipeline.py +321 -0
- koza-0.0.0/tests/integration/test_row_limit.py +58 -0
- koza-0.0.0/tests/integration/test_validator.py +57 -0
- koza-0.0.0/tests/resources/module_cache_test_source_a/data.tsv +3 -0
- koza-0.0.0/tests/resources/module_cache_test_source_a/gene.py +13 -0
- koza-0.0.0/tests/resources/module_cache_test_source_a/gene.yaml +16 -0
- koza-0.0.0/tests/resources/module_cache_test_source_b/data.tsv +3 -0
- koza-0.0.0/tests/resources/module_cache_test_source_b/gene.py +13 -0
- koza-0.0.0/tests/resources/module_cache_test_source_b/gene.yaml +16 -0
- koza-0.0.0/tests/resources/multifile.yaml +35 -0
- koza-0.0.0/tests/resources/source-files/ZFIN_PHENOTYPE_0.jsonl.gz +0 -0
- koza-0.0.0/tests/resources/source-files/ddpheno.json.gz +0 -0
- koza-0.0.0/tests/resources/source-files/string-split.tar.gz +0 -0
- koza-0.0.0/tests/resources/source-files/string-split.zip +0 -0
- koza-0.0.0/tests/resources/source-files/string.tar.gz +0 -0
- koza-0.0.0/tests/resources/source-files/test_BGI_ZFIN.json.gz +0 -0
- koza-0.0.0/tests/resources/source-files/tsv-with-footer.tsv +9 -0
- koza-0.0.0/tests/resources/sssom/testmapping.sssom.tsv +9 -0
- koza-0.0.0/tests/resources/sssom/testmapping2.sssom.tsv +10 -0
- koza-0.0.0/tests/resources/string.yaml +45 -0
- koza-0.0.0/tests/resources/translation_table.yaml +819 -0
- koza-0.0.0/tests/test_append.py +492 -0
- koza-0.0.0/tests/test_deduplicate.py +756 -0
- koza-0.0.0/tests/test_export.py +340 -0
- koza-0.0.0/tests/test_join.py +754 -0
- koza-0.0.0/tests/test_merge.py +720 -0
- koza-0.0.0/tests/test_normalize.py +384 -0
- koza-0.0.0/tests/test_prune.py +503 -0
- koza-0.0.0/tests/test_schema.py +373 -0
- koza-0.0.0/tests/test_split.py +442 -0
- koza-0.0.0/tests/test_utils.py +377 -0
- koza-0.0.0/tests/unit/resources/primary-source.yaml +37 -0
- koza-0.0.0/tests/unit/test_cli.py +33 -0
- koza-0.0.0/tests/unit/test_config.py +152 -0
- koza-0.0.0/tests/unit/test_csvreader.py +208 -0
- koza-0.0.0/tests/unit/test_filter.py +150 -0
- koza-0.0.0/tests/unit/test_io_utils.py +208 -0
- koza-0.0.0/tests/unit/test_jsonl_writer_lazy_handles.py +49 -0
- koza-0.0.0/tests/unit/test_jsonlreader.py +42 -0
- koza-0.0.0/tests/unit/test_jsonreader.py +46 -0
- koza-0.0.0/tests/unit/test_kgx_converter.py +98 -0
- koza-0.0.0/tests/unit/test_module_caching.py +54 -0
- koza-0.0.0/tests/unit/test_multifile.py +33 -0
- koza-0.0.0/tests/unit/test_runner.py +255 -0
- koza-0.0.0/tests/unit/test_sssom_mapping.py +55 -0
- koza-0.0.0/tests/unit/test_transform_command.py +85 -0
- koza-0.0.0/tests/unit/test_tsvwriter_node_and_edge.py +91 -0
- koza-0.0.0/tests/unit/test_tsvwriter_node_only.py +30 -0
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
name: Build and Deploy Docs to GitHub Pages
|
|
2
|
+
on:
|
|
3
|
+
workflow_dispatch:
|
|
4
|
+
pull_request:
|
|
5
|
+
push:
|
|
6
|
+
branches:
|
|
7
|
+
- main
|
|
8
|
+
|
|
9
|
+
env:
|
|
10
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
11
|
+
UV_VERSION: "0.7.x"
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
build-docs:
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
steps:
|
|
17
|
+
- name: Checkout
|
|
18
|
+
uses: actions/checkout@main
|
|
19
|
+
with:
|
|
20
|
+
fetch-depth: 0 # otherwise, you will fail to push refs to dest repo
|
|
21
|
+
|
|
22
|
+
- name: Set up Python 3
|
|
23
|
+
uses: actions/setup-python@main
|
|
24
|
+
with:
|
|
25
|
+
python-version: "3.11"
|
|
26
|
+
|
|
27
|
+
- name: Install uv
|
|
28
|
+
uses: astral-sh/setup-uv@v7
|
|
29
|
+
with:
|
|
30
|
+
enable-cache: true
|
|
31
|
+
version: ${{ env.UV_VERSION }}
|
|
32
|
+
|
|
33
|
+
- name: Install library
|
|
34
|
+
run: |
|
|
35
|
+
make install
|
|
36
|
+
|
|
37
|
+
- name: Verify lockfile is up-to-date
|
|
38
|
+
run: |
|
|
39
|
+
uv lock --check
|
|
40
|
+
|
|
41
|
+
- name: Build Documentation
|
|
42
|
+
run: make docs
|
|
43
|
+
|
|
44
|
+
- name: Deploy to gh-pages
|
|
45
|
+
uses: JamesIves/github-pages-deploy-action@v4
|
|
46
|
+
if: github.ref == 'refs/heads/main'
|
|
47
|
+
with:
|
|
48
|
+
folder: site
|
|
49
|
+
target-folder: docs
|
|
50
|
+
clean: true
|
|
51
|
+
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
name: publish on pypi
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
pull_request:
|
|
6
|
+
release:
|
|
7
|
+
types: [published]
|
|
8
|
+
|
|
9
|
+
env:
|
|
10
|
+
UV_VERSION: "0.7.x"
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
publish:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
steps:
|
|
16
|
+
- name: Checkout sources
|
|
17
|
+
uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Install uv
|
|
20
|
+
uses: astral-sh/setup-uv@v7
|
|
21
|
+
with:
|
|
22
|
+
enable-cache: true
|
|
23
|
+
version: ${{ env.UV_VERSION }}
|
|
24
|
+
|
|
25
|
+
- name: Install
|
|
26
|
+
run: make install
|
|
27
|
+
|
|
28
|
+
- name: Verify lockfile is up-to-date
|
|
29
|
+
run: |
|
|
30
|
+
uv lock --check
|
|
31
|
+
|
|
32
|
+
- name: Build
|
|
33
|
+
run: make build
|
|
34
|
+
|
|
35
|
+
- name: Publish to PyPi
|
|
36
|
+
if: github.event_name == 'release' || github.event_name == 'workflow_dispatch'
|
|
37
|
+
env:
|
|
38
|
+
UV_PUBLISH_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
|
|
39
|
+
run: |
|
|
40
|
+
uv publish --token $UV_PUBLISH_TOKEN
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
name: Test Koza
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
pull_request:
|
|
6
|
+
push:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
env:
|
|
10
|
+
UV_VERSION: "0.7.x"
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
test-python3-ubuntu-latest:
|
|
14
|
+
name: test py${{ matrix.python-version }} on linux
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
strategy:
|
|
17
|
+
fail-fast: false
|
|
18
|
+
matrix:
|
|
19
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
20
|
+
env:
|
|
21
|
+
OS: ubuntu
|
|
22
|
+
|
|
23
|
+
steps:
|
|
24
|
+
- name: Checkout
|
|
25
|
+
uses: actions/checkout@v4
|
|
26
|
+
|
|
27
|
+
- name: Install uv
|
|
28
|
+
uses: astral-sh/setup-uv@v7
|
|
29
|
+
with:
|
|
30
|
+
enable-cache: true
|
|
31
|
+
version: ${{ env.UV_VERSION }}
|
|
32
|
+
python-version: ${{ matrix.python-version }}
|
|
33
|
+
|
|
34
|
+
- name: Install library
|
|
35
|
+
run: make install
|
|
36
|
+
|
|
37
|
+
- name: Verify lockfile is up-to-date
|
|
38
|
+
run: |
|
|
39
|
+
uv lock --check
|
|
40
|
+
|
|
41
|
+
- name: Run tests
|
|
42
|
+
run: make test
|
koza-0.0.0/.gitignore
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# Default output / Generated / Unpacked data files
|
|
2
|
+
output/
|
|
3
|
+
tests/resources/source-files/string.tsv*
|
|
4
|
+
.ruff_cache
|
|
5
|
+
|
|
6
|
+
# Byte-compiled / optimized / DLL files
|
|
7
|
+
__pycache__/
|
|
8
|
+
*.py[cod]
|
|
9
|
+
*$py.class
|
|
10
|
+
|
|
11
|
+
# C extensions
|
|
12
|
+
*.so
|
|
13
|
+
|
|
14
|
+
# Distribution / packaging
|
|
15
|
+
.Python
|
|
16
|
+
build/
|
|
17
|
+
develop-eggs/
|
|
18
|
+
dist/
|
|
19
|
+
downloads/
|
|
20
|
+
eggs/
|
|
21
|
+
.eggs/
|
|
22
|
+
lib/
|
|
23
|
+
lib64/
|
|
24
|
+
parts/
|
|
25
|
+
sdist/
|
|
26
|
+
var/
|
|
27
|
+
wheels/
|
|
28
|
+
pip-wheel-metadata/
|
|
29
|
+
share/python-wheels/
|
|
30
|
+
*.egg-info/
|
|
31
|
+
.installed.cfg
|
|
32
|
+
*.egg
|
|
33
|
+
MANIFEST
|
|
34
|
+
|
|
35
|
+
# PyInstaller
|
|
36
|
+
# Usually these files are written by a python script from a template
|
|
37
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
38
|
+
*.manifest
|
|
39
|
+
*.spec
|
|
40
|
+
|
|
41
|
+
# Installer logs
|
|
42
|
+
pip-log.txt
|
|
43
|
+
pip-delete-this-directory.txt
|
|
44
|
+
|
|
45
|
+
# Unit test / coverage reports
|
|
46
|
+
htmlcov/
|
|
47
|
+
.tox/
|
|
48
|
+
.nox/
|
|
49
|
+
.coverage
|
|
50
|
+
.coverage.*
|
|
51
|
+
.cache
|
|
52
|
+
nosetests.xml
|
|
53
|
+
coverage.xml
|
|
54
|
+
*.cover
|
|
55
|
+
*.py,cover
|
|
56
|
+
.hypothesis/
|
|
57
|
+
.pytest_cache/
|
|
58
|
+
|
|
59
|
+
# Translations
|
|
60
|
+
*.mo
|
|
61
|
+
*.pot
|
|
62
|
+
|
|
63
|
+
# Django stuff:
|
|
64
|
+
*.log
|
|
65
|
+
local_settings.py
|
|
66
|
+
db.sqlite3
|
|
67
|
+
db.sqlite3-journal
|
|
68
|
+
|
|
69
|
+
# Flask stuff:
|
|
70
|
+
instance/
|
|
71
|
+
.webassets-cache
|
|
72
|
+
|
|
73
|
+
# Scrapy stuff:
|
|
74
|
+
.scrapy
|
|
75
|
+
|
|
76
|
+
# Sphinx documentation
|
|
77
|
+
docs/_build/
|
|
78
|
+
|
|
79
|
+
# PyBuilder
|
|
80
|
+
target/
|
|
81
|
+
|
|
82
|
+
# Jupyter Notebook
|
|
83
|
+
.ipynb_checkpoints
|
|
84
|
+
|
|
85
|
+
# IPython
|
|
86
|
+
profile_default/
|
|
87
|
+
ipython_config.py
|
|
88
|
+
|
|
89
|
+
# pyenv
|
|
90
|
+
.python-version
|
|
91
|
+
|
|
92
|
+
# pipenv
|
|
93
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
94
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
95
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
96
|
+
# install all needed dependencies.
|
|
97
|
+
#Pipfile.lock
|
|
98
|
+
|
|
99
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
|
100
|
+
__pypackages__/
|
|
101
|
+
|
|
102
|
+
# Celery stuff
|
|
103
|
+
celerybeat-schedule
|
|
104
|
+
celerybeat.pid
|
|
105
|
+
|
|
106
|
+
# SageMath parsed files
|
|
107
|
+
*.sage.py
|
|
108
|
+
|
|
109
|
+
# Environments
|
|
110
|
+
.env
|
|
111
|
+
.venv
|
|
112
|
+
env/
|
|
113
|
+
venv/
|
|
114
|
+
ENV/
|
|
115
|
+
env.bak/
|
|
116
|
+
venv.bak/
|
|
117
|
+
|
|
118
|
+
# Spyder project settings
|
|
119
|
+
.spyderproject
|
|
120
|
+
.spyproject
|
|
121
|
+
|
|
122
|
+
# Rope project settings
|
|
123
|
+
.ropeproject
|
|
124
|
+
|
|
125
|
+
# mkdocs documentation
|
|
126
|
+
/site
|
|
127
|
+
|
|
128
|
+
# mypy
|
|
129
|
+
.mypy_cache/
|
|
130
|
+
.dmypy.json
|
|
131
|
+
dmypy.json
|
|
132
|
+
|
|
133
|
+
# Pyre type checker
|
|
134
|
+
.pyre/
|
|
135
|
+
|
|
136
|
+
# IDE
|
|
137
|
+
.idea
|
|
138
|
+
protein-links-detailed_edges.tsv
|
|
139
|
+
protein-links-detailed_nodes.tsv
|
|
140
|
+
uv.lock
|
|
141
|
+
uv.lock
|
koza-0.0.0/CITATION.cff
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
cff-version: '1.1.0'
|
|
2
|
+
message: 'Please cite the following works when using this software.'
|
|
3
|
+
abstract: 'Data transformation framework for LinkML data models'
|
|
4
|
+
authors:
|
|
5
|
+
- family-names: 'Schaper'
|
|
6
|
+
given-names: 'Kevin'
|
|
7
|
+
- family-names: 'Ships'
|
|
8
|
+
given-names: 'Glass'
|
|
9
|
+
- family-names: 'Shefchek'
|
|
10
|
+
given-names: 'Kent'
|
|
11
|
+
- family-names: 'Moxon'
|
|
12
|
+
given-names: 'Sierra'
|
|
13
|
+
- family-names: 'Mungall'
|
|
14
|
+
given-names: 'Chris'
|
|
15
|
+
date-released: 2022-06-15
|
|
16
|
+
identifiers:
|
|
17
|
+
- type: 'url'
|
|
18
|
+
value: 'https://github.com/monarch-initiative/koza'
|
|
19
|
+
title: 'monarch-initiative/koza'
|
|
20
|
+
url: 'https://github.com/monarch-initiative/koza'
|
|
21
|
+
version: '0.1.14'
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
##### Building locally
|
|
2
|
+
|
|
3
|
+
First create a virtual environment with your favorite tool and activate it, e.g.:
|
|
4
|
+
```bash
|
|
5
|
+
python3.10 -m venv venv
|
|
6
|
+
source venv/bin/activate
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
Install and test with make
|
|
10
|
+
```bash
|
|
11
|
+
make
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Or with flit
|
|
15
|
+
```
|
|
16
|
+
pip install flit
|
|
17
|
+
flit install --deps develop --symlink
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
##### Linting and Formatting
|
|
21
|
+
TODO - write some docs on linting and formatting
|
|
22
|
+
|
|
23
|
+
Lint with ruff (check and format check)
|
|
24
|
+
```bash
|
|
25
|
+
make lint
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Format with ruff (updates files in place)
|
|
29
|
+
```bash
|
|
30
|
+
make format
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
##### Build and Publish to PyPI
|
|
34
|
+
Building and publishing requires git >= 2.30
|
|
35
|
+
|
|
36
|
+
Build a wheel and an sdist (tarball) from the package:
|
|
37
|
+
```bash
|
|
38
|
+
make build
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Publish to PyPI
|
|
42
|
+
```bash
|
|
43
|
+
make publish
|
|
44
|
+
```
|
koza-0.0.0/LICENSE
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
BSD 3-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022, Monarch Initiative
|
|
4
|
+
All rights reserved.
|
|
5
|
+
|
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
|
8
|
+
|
|
9
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
10
|
+
list of conditions and the following disclaimer.
|
|
11
|
+
|
|
12
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
13
|
+
this list of conditions and the following disclaimer in the documentation
|
|
14
|
+
and/or other materials provided with the distribution.
|
|
15
|
+
|
|
16
|
+
3. Neither the name of the copyright holder nor the names of its
|
|
17
|
+
contributors may be used to endorse or promote products derived from
|
|
18
|
+
this software without specific prior written permission.
|
|
19
|
+
|
|
20
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
21
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
22
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
23
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
24
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
25
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
26
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
27
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
28
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
29
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
koza-0.0.0/Makefile
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# Note that uv is required, see https://docs.astral.sh/uv/getting-started/installation/
|
|
2
|
+
|
|
3
|
+
MAKEFLAGS += --warn-undefined-variables
|
|
4
|
+
MAKEFLAGS += --no-builtin-rules
|
|
5
|
+
MAKEFLAGS += --no-builtin-variables
|
|
6
|
+
|
|
7
|
+
ifneq (,$(wildcard ./.env))
|
|
8
|
+
include .env
|
|
9
|
+
export
|
|
10
|
+
endif
|
|
11
|
+
|
|
12
|
+
.DEFAULT_GOAL := all
|
|
13
|
+
SHELL := bash
|
|
14
|
+
RUN := uv run
|
|
15
|
+
|
|
16
|
+
# This grep/awk pipeline collects the comments headed by the double "#" symbols next to each target and prints them as the help message
|
|
17
|
+
.PHONY: help
|
|
18
|
+
help: ## Print this help message
|
|
19
|
+
@grep -hE '^[[:alnum:]_/.-]+:.*## ' $(MAKEFILE_LIST) | \
|
|
20
|
+
awk 'BEGIN {FS = ":.*## "}; {printf "\033[36m%-25s\033[0m %s\n", $$1, $$2}'
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
.PHONY: all
|
|
24
|
+
all: install test clean ## Install, test, and clean
|
|
25
|
+
|
|
26
|
+
.PHONY: install
|
|
27
|
+
install: ## Install development environment
|
|
28
|
+
uv venv --allow-existing
|
|
29
|
+
uv pip install -e .[dev]
|
|
30
|
+
uv lock
|
|
31
|
+
|
|
32
|
+
.PHONY: build
|
|
33
|
+
build: ## Build the package
|
|
34
|
+
uv build
|
|
35
|
+
|
|
36
|
+
.PHONY: test
|
|
37
|
+
test: ## Run the test suite
|
|
38
|
+
$(RUN) pytest tests
|
|
39
|
+
|
|
40
|
+
.PHONY: docs
|
|
41
|
+
docs: ## Build the documentation
|
|
42
|
+
$(RUN) typer src/koza/main.py utils docs --name koza --output docs/Usage/CLI.md
|
|
43
|
+
$(RUN) mkdocs build
|
|
44
|
+
|
|
45
|
+
.PHONY: clean
|
|
46
|
+
clean: ## Clean up build artifacts, etc.
|
|
47
|
+
rm -rf `find . -name __pycache__`
|
|
48
|
+
rm -f `find . -type f -name '*.py[co]' `
|
|
49
|
+
rm -rf .pytest_cache
|
|
50
|
+
rm -rf output test-output
|
|
51
|
+
rm -rf dist
|
|
52
|
+
|
|
53
|
+
.PHONY: coverage
|
|
54
|
+
coverage: ## Run the test suite with coverage reporting
|
|
55
|
+
-$(RUN) coverage run -m pytest tests
|
|
56
|
+
$(RUN) coverage report -m
|
|
57
|
+
|
|
58
|
+
.PHONY: lint
|
|
59
|
+
lint: ## Lint the codebase
|
|
60
|
+
$(RUN) ruff check --diff --exit-zero src/ tests/ examples/
|
|
61
|
+
$(RUN) ruff format --check --diff src/ tests/ examples/
|
|
62
|
+
|
|
63
|
+
.PHONY: format
|
|
64
|
+
format: ## Format the codebase
|
|
65
|
+
$(RUN) ruff check --fix --exit-zero src/ tests/ examples/
|
|
66
|
+
$(RUN) ruff format src/ tests/ examples/
|
koza-0.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: koza
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: Data transformation framework for LinkML data models
|
|
5
|
+
Author-email: The Monarch Initiative <info@monarchinitiative.org>, Kevin Schaper <kevinschaper@gmail.com>, Glass Elsarboukh <g.elsarboukh@gmail.com>, Kent Shefchek <kent@tislab.org>, Daniel Korn <daniel_korn@med.unc.edu>
|
|
6
|
+
License-Expression: BSD-3-Clause
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Python: <4,>=3.10
|
|
9
|
+
Requires-Dist: biolink-model>=4.3.6
|
|
10
|
+
Requires-Dist: coverage>=7.13.0
|
|
11
|
+
Requires-Dist: duckdb
|
|
12
|
+
Requires-Dist: linkml>=1.9.0
|
|
13
|
+
Requires-Dist: loguru
|
|
14
|
+
Requires-Dist: mergedeep==1.3.4
|
|
15
|
+
Requires-Dist: ordered-set>=4.1.0
|
|
16
|
+
Requires-Dist: pydantic>=2.12.5
|
|
17
|
+
Requires-Dist: pyyaml>=6.0.3
|
|
18
|
+
Requires-Dist: requests>=2.32.5
|
|
19
|
+
Requires-Dist: sssom>=0.4
|
|
20
|
+
Requires-Dist: tqdm>=4.67.1
|
|
21
|
+
Requires-Dist: typer>=0.20.0
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: mkdocs-material>=9.7.0; extra == 'dev'
|
|
24
|
+
Requires-Dist: mkdocs>=1.6.1; extra == 'dev'
|
|
25
|
+
Requires-Dist: mkdocstrings[python]>=1.0.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
27
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# Koza - Knowledge Graph Transformation and Operations Toolkit
|
|
31
|
+
|
|
32
|
+
[](https://pypi.python.org/pypi/koza)
|
|
33
|
+
[](https://pypi.python.org/pypi/koza)
|
|
34
|
+

|
|
35
|
+
|
|
36
|
+

|
|
37
|
+
|
|
38
|
+
[**Documentation**](https://koza.monarchinitiative.org/)
|
|
39
|
+
|
|
40
|
+
_Disclaimer_: Koza is in beta - we are looking for testers!
|
|
41
|
+
|
|
42
|
+
## Overview
|
|
43
|
+
|
|
44
|
+
Koza is a Python library and CLI tool for transforming biomedical data and performing graph operations on Knowledge Graph Exchange (KGX) files. It provides two main capabilities:
|
|
45
|
+
|
|
46
|
+
### **Graph Operations** (New!)
|
|
47
|
+
Powerful DuckDB-based operations for KGX knowledge graphs:
|
|
48
|
+
- **Join** multiple KGX files with schema harmonization
|
|
49
|
+
- **Split** files by field values with format conversion
|
|
50
|
+
- **Prune** dangling edges and handle singleton nodes
|
|
51
|
+
- **Append** new data to existing databases with schema evolution
|
|
52
|
+
- **Multi-format support** for TSV, JSONL, and Parquet files
|
|
53
|
+
|
|
54
|
+
### **Data Transformation** (Core)
|
|
55
|
+
Transform biomedical data sources into KGX format:
|
|
56
|
+
- Transform csv, json, yaml, jsonl, and xml to target formats
|
|
57
|
+
- Output in [KGX format](https://github.com/biolink/kgx/blob/master/specification/kgx-format.md#kgx-format-as-tsv)
|
|
58
|
+
- Write data transforms in semi-declarative Python
|
|
59
|
+
- Configure source files, columns/properties, and metadata in YAML
|
|
60
|
+
- Create mapping files and translation tables between vocabularies
|
|
61
|
+
|
|
62
|
+
## Installation
|
|
63
|
+
Koza is available on PyPI and can be installed via pip/pipx:
|
|
64
|
+
```
|
|
65
|
+
[pip|pipx] install koza
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Usage
|
|
69
|
+
|
|
70
|
+
### Quick Start with Graph Operations
|
|
71
|
+
|
|
72
|
+
Koza's graph operations work seamlessly across multiple KGX formats (TSV, JSONL, Parquet):
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
# Join multiple KGX files into a unified database
|
|
76
|
+
koza join --nodes genes.tsv pathways.jsonl --edges interactions.parquet --output merged_graph.duckdb
|
|
77
|
+
|
|
78
|
+
# Prune dangling edges and handle singleton nodes
|
|
79
|
+
koza prune --database merged_graph.duckdb --keep-singletons
|
|
80
|
+
|
|
81
|
+
# Append new data to existing database with schema evolution
|
|
82
|
+
koza append --database merged_graph.duckdb --nodes new_genes.tsv --edges new_interactions.jsonl
|
|
83
|
+
|
|
84
|
+
# Split database by source with format conversion
|
|
85
|
+
koza split --database merged_graph.duckdb --split-on provided_by --output-format parquet
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
**NOTE: As of version 0.2.0, there is a new method for getting your ingest's `KozaApp` instance. Please see the [updated documentation](https://koza.monarchinitiative.org/Usage/configuring_ingests/#transform-code) for details.**
|
|
89
|
+
|
|
90
|
+
See the [Koza documentation](https://koza.monarchinitiative.org/) for complete usage information
|
|
91
|
+
|
|
92
|
+
### Examples
|
|
93
|
+
|
|
94
|
+
#### Validate
|
|
95
|
+
|
|
96
|
+
Give Koza a local or remote csv file, and get some basic information (headers, number of rows)
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
koza validate \
|
|
100
|
+
--file https://raw.githubusercontent.com/monarch-initiative/koza/main/examples/data/string.tsv \
|
|
101
|
+
--delimiter ' '
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Sending a json or jsonl formatted file will confirm if the file is valid json or jsonl
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
koza validate \
|
|
108
|
+
--file ./examples/data/ZFIN_PHENOTYPE_0.jsonl.gz \
|
|
109
|
+
--format jsonl
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
koza validate \
|
|
114
|
+
--file ./examples/data/ddpheno.json.gz \
|
|
115
|
+
--format json
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
#### Transform
|
|
119
|
+
|
|
120
|
+
Run the example ingest, "string/protein-links-detailed"
|
|
121
|
+
```bash
|
|
122
|
+
koza transform \
|
|
123
|
+
--source examples/string/protein-links-detailed.yaml \
|
|
124
|
+
--global-table examples/translation_table.yaml
|
|
125
|
+
|
|
126
|
+
koza transform \
|
|
127
|
+
--source examples/string-declarative/protein-links-detailed.yaml \
|
|
128
|
+
--global-table examples/translation_table.yaml
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
**Note**:
|
|
132
|
+
Koza expects a directory structure as described in the above example
|
|
133
|
+
with the source config file and transform code in the same directory:
|
|
134
|
+
```
|
|
135
|
+
.
|
|
136
|
+
├── ...
|
|
137
|
+
│   ├── your_source
|
|
138
|
+
│   │   ├── your_ingest.yaml
|
|
139
|
+
│   │   └── your_ingest.py
|
|
140
|
+
│   └── some_translation_table.yaml
|
|
141
|
+
└── ...
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
#### Graph Operations
|
|
145
|
+
|
|
146
|
+
Create and manipulate knowledge graphs from existing KGX files:
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
# Join heterogeneous KGX files with automatic schema harmonization
|
|
150
|
+
koza join \
|
|
151
|
+
--nodes genes.tsv proteins.jsonl pathways.parquet \
|
|
152
|
+
--edges gene_protein.tsv protein_pathway.jsonl \
|
|
153
|
+
--output unified_graph.duckdb \
|
|
154
|
+
--schema-report
|
|
155
|
+
|
|
156
|
+
# Clean up graph integrity issues
|
|
157
|
+
koza prune \
|
|
158
|
+
--database unified_graph.duckdb \
|
|
159
|
+
--keep-singletons \
|
|
160
|
+
--dry-run # Preview changes before applying
|
|
161
|
+
|
|
162
|
+
# Incrementally add new data with schema evolution
|
|
163
|
+
koza append \
|
|
164
|
+
--database unified_graph.duckdb \
|
|
165
|
+
--nodes new_genes.tsv updated_pathways.jsonl \
|
|
166
|
+
--deduplicate \
|
|
167
|
+
--show-progress
|
|
168
|
+
|
|
169
|
+
# Export subsets with format conversion
|
|
170
|
+
koza split \
|
|
171
|
+
--database unified_graph.duckdb \
|
|
172
|
+
--split-on provided_by \
|
|
173
|
+
--output-format parquet \
|
|
174
|
+
--output-dir ./split_graphs
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Key Features
|
|
178
|
+
|
|
179
|
+
### 🔧 **Multi-Format Support**
|
|
180
|
+
- Native support for TSV, JSONL, and Parquet KGX files
|
|
181
|
+
- Automatic format detection and conversion
|
|
182
|
+
- Mixed-format operations in single commands
|
|
183
|
+
|
|
184
|
+
### 🛡️ **Schema Flexibility**
|
|
185
|
+
- Automatic schema harmonization across heterogeneous files
|
|
186
|
+
- Schema evolution with backward compatibility
|
|
187
|
+
- Comprehensive schema reporting and validation
|
|
188
|
+
|
|
189
|
+
### ⚡ **High Performance**
|
|
190
|
+
- DuckDB-powered operations for fast bulk processing
|
|
191
|
+
- Memory-efficient handling of large knowledge graphs
|
|
192
|
+
- Parallel processing and streaming where possible
|
|
193
|
+
|
|
194
|
+
### **Rich CLI Experience**
|
|
195
|
+
- Progress indicators for long-running operations
|
|
196
|
+
- Detailed statistics and operation summaries
|
|
197
|
+
- Dry-run modes for safe operation preview
|
|
198
|
+
|
|
199
|
+
### 🧹 **Data Integrity**
|
|
200
|
+
- Dangling edge detection and preservation
|
|
201
|
+
- Duplicate detection and removal strategies
|
|
202
|
+
- Non-destructive operations with data archiving
|