koza 0.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. koza-0.0.0/.github/dependabot.yaml +10 -0
  2. koza-0.0.0/.github/workflows/documentation.yaml +51 -0
  3. koza-0.0.0/.github/workflows/publish.yaml +40 -0
  4. koza-0.0.0/.github/workflows/test.yaml +42 -0
  5. koza-0.0.0/.gitignore +141 -0
  6. koza-0.0.0/CITATION.cff +21 -0
  7. koza-0.0.0/CONTRIBUTING.md +44 -0
  8. koza-0.0.0/LICENSE +29 -0
  9. koza-0.0.0/Makefile +66 -0
  10. koza-0.0.0/PKG-INFO +202 -0
  11. koza-0.0.0/README.md +173 -0
  12. koza-0.0.0/docs/CNAME +1 -0
  13. koza-0.0.0/docs/Ingests/index.md +12 -0
  14. koza-0.0.0/docs/Ingests/koza_config.md +416 -0
  15. koza-0.0.0/docs/Ingests/mapping.md +62 -0
  16. koza-0.0.0/docs/Ingests/testing.md +87 -0
  17. koza-0.0.0/docs/Ingests/transform.md +53 -0
  18. koza-0.0.0/docs/Usage/CLI.md +65 -0
  19. koza-0.0.0/docs/Usage/Module.md +1 -0
  20. koza-0.0.0/docs/cli-reference.md +564 -0
  21. koza-0.0.0/docs/graph-operations/explanation/architecture.md +243 -0
  22. koza-0.0.0/docs/graph-operations/explanation/biolink-compliance.md +330 -0
  23. koza-0.0.0/docs/graph-operations/explanation/data-integrity.md +280 -0
  24. koza-0.0.0/docs/graph-operations/explanation/index.md +71 -0
  25. koza-0.0.0/docs/graph-operations/explanation/schema-handling.md +229 -0
  26. koza-0.0.0/docs/graph-operations/how-to/clean-graph.md +304 -0
  27. koza-0.0.0/docs/graph-operations/how-to/export-formats.md +373 -0
  28. koza-0.0.0/docs/graph-operations/how-to/generate-reports.md +434 -0
  29. koza-0.0.0/docs/graph-operations/how-to/incremental-updates.md +296 -0
  30. koza-0.0.0/docs/graph-operations/how-to/index.md +52 -0
  31. koza-0.0.0/docs/graph-operations/how-to/join-files.md +238 -0
  32. koza-0.0.0/docs/graph-operations/how-to/normalize-ids.md +302 -0
  33. koza-0.0.0/docs/graph-operations/how-to/split-graph.md +235 -0
  34. koza-0.0.0/docs/graph-operations/index.md +67 -0
  35. koza-0.0.0/docs/graph-operations/reference/api.md +168 -0
  36. koza-0.0.0/docs/graph-operations/reference/cli.md +797 -0
  37. koza-0.0.0/docs/graph-operations/reference/configuration.md +791 -0
  38. koza-0.0.0/docs/graph-operations/reference/index.md +80 -0
  39. koza-0.0.0/docs/graph-operations/tutorials/first-graph.md +471 -0
  40. koza-0.0.0/docs/graph-operations/tutorials/index.md +43 -0
  41. koza-0.0.0/docs/graph-operations/tutorials/merge-pipeline.md +480 -0
  42. koza-0.0.0/docs/graph-operations.md +34 -0
  43. koza-0.0.0/docs/img/docs-comming-soon.jpg +0 -0
  44. koza-0.0.0/docs/img/favicon.ico +0 -0
  45. koza-0.0.0/docs/img/pupa.png +0 -0
  46. koza-0.0.0/docs/index.md +87 -0
  47. koza-0.0.0/examples/data/additional-entrez-2-string.tsv +11 -0
  48. koza-0.0.0/examples/data/entrez-2-string.tsv +11 -0
  49. koza-0.0.0/examples/data/string.tsv +5 -0
  50. koza-0.0.0/examples/data/string2.tsv +15 -0
  51. koza-0.0.0/examples/maps/custom-entrez-2-string.py +11 -0
  52. koza-0.0.0/examples/maps/custom-entrez-2-string.yaml +27 -0
  53. koza-0.0.0/examples/maps/entrez-2-string.yaml +27 -0
  54. koza-0.0.0/examples/maps/genepage-2-gene.yaml +26 -0
  55. koza-0.0.0/examples/minimal.py +6 -0
  56. koza-0.0.0/examples/standards/gpi.yaml +10 -0
  57. koza-0.0.0/examples/standards/oban.yaml +13 -0
  58. koza-0.0.0/examples/standards/string.yaml +10 -0
  59. koza-0.0.0/examples/string/metadata.yaml +4 -0
  60. koza-0.0.0/examples/string/protein-links-detailed.py +25 -0
  61. koza-0.0.0/examples/string/protein-links-detailed.yaml +33 -0
  62. koza-0.0.0/examples/string-declarative/declarative-protein-links-detailed.py +25 -0
  63. koza-0.0.0/examples/string-declarative/declarative-protein-links-detailed.yaml +50 -0
  64. koza-0.0.0/examples/string-declarative/metadata.yaml +4 -0
  65. koza-0.0.0/examples/string-file-archive/metadata.yaml +4 -0
  66. koza-0.0.0/examples/string-file-archive/protein-links-file-archive.py +24 -0
  67. koza-0.0.0/examples/string-file-archive/protein-links-file-archive.yaml +31 -0
  68. koza-0.0.0/examples/string-w-custom-map/custom-map-protein-links-detailed.py +27 -0
  69. koza-0.0.0/examples/string-w-custom-map/custom-map-protein-links-detailed.yaml +48 -0
  70. koza-0.0.0/examples/string-w-custom-map/metadata.yaml +4 -0
  71. koza-0.0.0/examples/string-w-map/map-protein-links-detailed.py +27 -0
  72. koza-0.0.0/examples/string-w-map/map-protein-links-detailed.yaml +47 -0
  73. koza-0.0.0/examples/string-w-map/metadata.yaml +4 -0
  74. koza-0.0.0/examples/string-w-state/metadata.yaml +4 -0
  75. koza-0.0.0/examples/string-w-state/protein-links-detailed.py +35 -0
  76. koza-0.0.0/examples/string-w-state/protein-links-detailed.yaml +33 -0
  77. koza-0.0.0/examples/string-yield/metadata.yaml +4 -0
  78. koza-0.0.0/examples/string-yield/protein-links-yield.py +26 -0
  79. koza-0.0.0/examples/string-yield/protein-links-yield.yaml +30 -0
  80. koza-0.0.0/examples/translation_table.yaml +819 -0
  81. koza-0.0.0/mkdocs.yaml +130 -0
  82. koza-0.0.0/pyproject.toml +82 -0
  83. koza-0.0.0/src/koza/__init__.py +18 -0
  84. koza-0.0.0/src/koza/converter/__init__.py +0 -0
  85. koza-0.0.0/src/koza/converter/kgx_converter.py +54 -0
  86. koza-0.0.0/src/koza/decorators.py +169 -0
  87. koza-0.0.0/src/koza/graph_operations/__init__.py +51 -0
  88. koza-0.0.0/src/koza/graph_operations/append.py +523 -0
  89. koza-0.0.0/src/koza/graph_operations/deduplicate.py +346 -0
  90. koza-0.0.0/src/koza/graph_operations/join.py +260 -0
  91. koza-0.0.0/src/koza/graph_operations/merge.py +466 -0
  92. koza-0.0.0/src/koza/graph_operations/normalize.py +559 -0
  93. koza-0.0.0/src/koza/graph_operations/prune.py +399 -0
  94. koza-0.0.0/src/koza/graph_operations/report.py +1604 -0
  95. koza-0.0.0/src/koza/graph_operations/schema.py +143 -0
  96. koza-0.0.0/src/koza/graph_operations/schema_utils.py +179 -0
  97. koza-0.0.0/src/koza/graph_operations/split.py +351 -0
  98. koza-0.0.0/src/koza/graph_operations/utils.py +609 -0
  99. koza-0.0.0/src/koza/io/__init__.py +8 -0
  100. koza-0.0.0/src/koza/io/reader/__init__.py +3 -0
  101. koza-0.0.0/src/koza/io/reader/csv_reader.py +240 -0
  102. koza-0.0.0/src/koza/io/reader/json_reader.py +60 -0
  103. koza-0.0.0/src/koza/io/reader/jsonl_reader.py +43 -0
  104. koza-0.0.0/src/koza/io/utils.py +336 -0
  105. koza-0.0.0/src/koza/io/writer/__init__.py +0 -0
  106. koza-0.0.0/src/koza/io/writer/jsonl_writer.py +74 -0
  107. koza-0.0.0/src/koza/io/writer/passthrough_writer.py +26 -0
  108. koza-0.0.0/src/koza/io/writer/tsv_writer.py +136 -0
  109. koza-0.0.0/src/koza/io/writer/writer.py +31 -0
  110. koza-0.0.0/src/koza/io/yaml_loader.py +66 -0
  111. koza-0.0.0/src/koza/main.py +1295 -0
  112. koza-0.0.0/src/koza/model/__init__.py +0 -0
  113. koza-0.0.0/src/koza/model/config/__init__.py +0 -0
  114. koza-0.0.0/src/koza/model/config/pydantic_config.py +12 -0
  115. koza-0.0.0/src/koza/model/config/source_config.py +146 -0
  116. koza-0.0.0/src/koza/model/config/sssom_config.py +171 -0
  117. koza-0.0.0/src/koza/model/curie_cleaner.py +11 -0
  118. koza-0.0.0/src/koza/model/filters.py +68 -0
  119. koza-0.0.0/src/koza/model/formats.py +24 -0
  120. koza-0.0.0/src/koza/model/graph_operations.py +747 -0
  121. koza-0.0.0/src/koza/model/graphs.py +16 -0
  122. koza-0.0.0/src/koza/model/koza.py +66 -0
  123. koza-0.0.0/src/koza/model/reader.py +139 -0
  124. koza-0.0.0/src/koza/model/source.py +180 -0
  125. koza-0.0.0/src/koza/model/transform.py +61 -0
  126. koza-0.0.0/src/koza/model/writer.py +17 -0
  127. koza-0.0.0/src/koza/runner.py +408 -0
  128. koza-0.0.0/src/koza/tools/split_file.py +73 -0
  129. koza-0.0.0/src/koza/transform.py +99 -0
  130. koza-0.0.0/src/koza/utils/__init__ +0 -0
  131. koza-0.0.0/src/koza/utils/exceptions.py +18 -0
  132. koza-0.0.0/src/koza/utils/log_utils.py +26 -0
  133. koza-0.0.0/src/koza/utils/row_filter.py +83 -0
  134. koza-0.0.0/tests/conftest.py +9 -0
  135. koza-0.0.0/tests/integration/test_examples.py +46 -0
  136. koza-0.0.0/tests/integration/test_multi_file_pipeline.py +321 -0
  137. koza-0.0.0/tests/integration/test_row_limit.py +58 -0
  138. koza-0.0.0/tests/integration/test_validator.py +57 -0
  139. koza-0.0.0/tests/resources/module_cache_test_source_a/data.tsv +3 -0
  140. koza-0.0.0/tests/resources/module_cache_test_source_a/gene.py +13 -0
  141. koza-0.0.0/tests/resources/module_cache_test_source_a/gene.yaml +16 -0
  142. koza-0.0.0/tests/resources/module_cache_test_source_b/data.tsv +3 -0
  143. koza-0.0.0/tests/resources/module_cache_test_source_b/gene.py +13 -0
  144. koza-0.0.0/tests/resources/module_cache_test_source_b/gene.yaml +16 -0
  145. koza-0.0.0/tests/resources/multifile.yaml +35 -0
  146. koza-0.0.0/tests/resources/source-files/ZFIN_PHENOTYPE_0.jsonl.gz +0 -0
  147. koza-0.0.0/tests/resources/source-files/ddpheno.json.gz +0 -0
  148. koza-0.0.0/tests/resources/source-files/string-split.tar.gz +0 -0
  149. koza-0.0.0/tests/resources/source-files/string-split.zip +0 -0
  150. koza-0.0.0/tests/resources/source-files/string.tar.gz +0 -0
  151. koza-0.0.0/tests/resources/source-files/test_BGI_ZFIN.json.gz +0 -0
  152. koza-0.0.0/tests/resources/source-files/tsv-with-footer.tsv +9 -0
  153. koza-0.0.0/tests/resources/sssom/testmapping.sssom.tsv +9 -0
  154. koza-0.0.0/tests/resources/sssom/testmapping2.sssom.tsv +10 -0
  155. koza-0.0.0/tests/resources/string.yaml +45 -0
  156. koza-0.0.0/tests/resources/translation_table.yaml +819 -0
  157. koza-0.0.0/tests/test_append.py +492 -0
  158. koza-0.0.0/tests/test_deduplicate.py +756 -0
  159. koza-0.0.0/tests/test_export.py +340 -0
  160. koza-0.0.0/tests/test_join.py +754 -0
  161. koza-0.0.0/tests/test_merge.py +720 -0
  162. koza-0.0.0/tests/test_normalize.py +384 -0
  163. koza-0.0.0/tests/test_prune.py +503 -0
  164. koza-0.0.0/tests/test_schema.py +373 -0
  165. koza-0.0.0/tests/test_split.py +442 -0
  166. koza-0.0.0/tests/test_utils.py +377 -0
  167. koza-0.0.0/tests/unit/resources/primary-source.yaml +37 -0
  168. koza-0.0.0/tests/unit/test_cli.py +33 -0
  169. koza-0.0.0/tests/unit/test_config.py +152 -0
  170. koza-0.0.0/tests/unit/test_csvreader.py +208 -0
  171. koza-0.0.0/tests/unit/test_filter.py +150 -0
  172. koza-0.0.0/tests/unit/test_io_utils.py +208 -0
  173. koza-0.0.0/tests/unit/test_jsonl_writer_lazy_handles.py +49 -0
  174. koza-0.0.0/tests/unit/test_jsonlreader.py +42 -0
  175. koza-0.0.0/tests/unit/test_jsonreader.py +46 -0
  176. koza-0.0.0/tests/unit/test_kgx_converter.py +98 -0
  177. koza-0.0.0/tests/unit/test_module_caching.py +54 -0
  178. koza-0.0.0/tests/unit/test_multifile.py +33 -0
  179. koza-0.0.0/tests/unit/test_runner.py +255 -0
  180. koza-0.0.0/tests/unit/test_sssom_mapping.py +55 -0
  181. koza-0.0.0/tests/unit/test_transform_command.py +85 -0
  182. koza-0.0.0/tests/unit/test_tsvwriter_node_and_edge.py +91 -0
  183. koza-0.0.0/tests/unit/test_tsvwriter_node_only.py +30 -0
@@ -0,0 +1,10 @@
1
+ # Set update schedule for GitHub Actions
2
+
3
+ version: 2
4
+ updates:
5
+
6
+ - package-ecosystem: "github-actions"
7
+ directory: "/"
8
+ schedule:
9
+ # Check for updates to GitHub Actions every week
10
+ interval: "weekly"
@@ -0,0 +1,51 @@
1
+ name: Build and Deploy Docs to GitHub Pages
2
+ on:
3
+ workflow_dispatch:
4
+ pull_request:
5
+ push:
6
+ branches:
7
+ - main
8
+
9
+ env:
10
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
11
+ UV_VERSION: "0.7.x"
12
+
13
+ jobs:
14
+ build-docs:
15
+ runs-on: ubuntu-latest
16
+ steps:
17
+ - name: Checkout
18
+ uses: actions/checkout@main
19
+ with:
20
+ fetch-depth: 0 # otherwise, you will fail to push refs to the destination repo
21
+
22
+ - name: Set up Python 3
23
+ uses: actions/setup-python@main
24
+ with:
25
+ python-version: "3.11"
26
+
27
+ - name: Install uv
28
+ uses: astral-sh/setup-uv@v7
29
+ with:
30
+ enable-cache: true
31
+ version: ${{ env.UV_VERSION }}
32
+
33
+ - name: Install library
34
+ run: |
35
+ make install
36
+
37
+ - name: Verify lockfile is up-to-date
38
+ run: |
39
+ uv lock --check
40
+
41
+ - name: Build Documentation
42
+ run: make docs
43
+
44
+ - name: Deploy to gh-pages
45
+ uses: JamesIves/github-pages-deploy-action@v4
46
+ if: github.ref == 'refs/heads/main'
47
+ with:
48
+ folder: site
49
+ target-folder: docs
50
+ clean: true
51
+
@@ -0,0 +1,40 @@
1
+ name: publish on pypi
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ pull_request:
6
+ release:
7
+ types: [published]
8
+
9
+ env:
10
+ UV_VERSION: "0.7.x"
11
+
12
+ jobs:
13
+ publish:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - name: Checkout sources
17
+ uses: actions/checkout@v4
18
+
19
+ - name: Install uv
20
+ uses: astral-sh/setup-uv@v7
21
+ with:
22
+ enable-cache: true
23
+ version: ${{ env.UV_VERSION }}
24
+
25
+ - name: Install
26
+ run: make install
27
+
28
+ - name: Verify lockfile is up-to-date
29
+ run: |
30
+ uv lock --check
31
+
32
+ - name: Build
33
+ run: make build
34
+
35
+ - name: Publish to PyPi
36
+ if: github.event_name == 'release' || github.event_name == 'workflow_dispatch'
37
+ env:
38
+ UV_PUBLISH_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
39
+ run: |
40
+ uv publish --token $UV_PUBLISH_TOKEN
@@ -0,0 +1,42 @@
1
+ name: Test Koza
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ pull_request:
6
+ push:
7
+ branches: [main]
8
+
9
+ env:
10
+ UV_VERSION: "0.7.x"
11
+
12
+ jobs:
13
+ test-python3-ubuntu-latest:
14
+ name: test py${{ matrix.python-version }} on linux
15
+ runs-on: ubuntu-latest
16
+ strategy:
17
+ fail-fast: false
18
+ matrix:
19
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
20
+ env:
21
+ OS: ubuntu
22
+
23
+ steps:
24
+ - name: Checkout
25
+ uses: actions/checkout@v4
26
+
27
+ - name: Install uv
28
+ uses: astral-sh/setup-uv@v7
29
+ with:
30
+ enable-cache: true
31
+ version: ${{ env.UV_VERSION }}
32
+ python-version: ${{ matrix.python-version }}
33
+
34
+ - name: Install library
35
+ run: make install
36
+
37
+ - name: Verify lockfile is up-to-date
38
+ run: |
39
+ uv lock --check
40
+
41
+ - name: Run tests
42
+ run: make test
koza-0.0.0/.gitignore ADDED
@@ -0,0 +1,141 @@
1
+ # Default output / Generated / Unpacked data files
2
+ output/
3
+ tests/resources/source-files/string.tsv*
4
+ .ruff_cache
5
+
6
+ # Byte-compiled / optimized / DLL files
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+
11
+ # C extensions
12
+ *.so
13
+
14
+ # Distribution / packaging
15
+ .Python
16
+ build/
17
+ develop-eggs/
18
+ dist/
19
+ downloads/
20
+ eggs/
21
+ .eggs/
22
+ lib/
23
+ lib64/
24
+ parts/
25
+ sdist/
26
+ var/
27
+ wheels/
28
+ pip-wheel-metadata/
29
+ share/python-wheels/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+ MANIFEST
34
+
35
+ # PyInstaller
36
+ # Usually these files are written by a python script from a template
37
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
38
+ *.manifest
39
+ *.spec
40
+
41
+ # Installer logs
42
+ pip-log.txt
43
+ pip-delete-this-directory.txt
44
+
45
+ # Unit test / coverage reports
46
+ htmlcov/
47
+ .tox/
48
+ .nox/
49
+ .coverage
50
+ .coverage.*
51
+ .cache
52
+ nosetests.xml
53
+ coverage.xml
54
+ *.cover
55
+ *.py,cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+
59
+ # Translations
60
+ *.mo
61
+ *.pot
62
+
63
+ # Django stuff:
64
+ *.log
65
+ local_settings.py
66
+ db.sqlite3
67
+ db.sqlite3-journal
68
+
69
+ # Flask stuff:
70
+ instance/
71
+ .webassets-cache
72
+
73
+ # Scrapy stuff:
74
+ .scrapy
75
+
76
+ # Sphinx documentation
77
+ docs/_build/
78
+
79
+ # PyBuilder
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ .python-version
91
+
92
+ # pipenv
93
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
95
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
96
+ # install all needed dependencies.
97
+ #Pipfile.lock
98
+
99
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
100
+ __pypackages__/
101
+
102
+ # Celery stuff
103
+ celerybeat-schedule
104
+ celerybeat.pid
105
+
106
+ # SageMath parsed files
107
+ *.sage.py
108
+
109
+ # Environments
110
+ .env
111
+ .venv
112
+ env/
113
+ venv/
114
+ ENV/
115
+ env.bak/
116
+ venv.bak/
117
+
118
+ # Spyder project settings
119
+ .spyderproject
120
+ .spyproject
121
+
122
+ # Rope project settings
123
+ .ropeproject
124
+
125
+ # mkdocs documentation
126
+ /site
127
+
128
+ # mypy
129
+ .mypy_cache/
130
+ .dmypy.json
131
+ dmypy.json
132
+
133
+ # Pyre type checker
134
+ .pyre/
135
+
136
+ # IDE
137
+ .idea
138
+ protein-links-detailed_edges.tsv
139
+ protein-links-detailed_nodes.tsv
140
+ uv.lock
141
+ uv.lock
@@ -0,0 +1,21 @@
1
+ cff-version: '1.1.0'
2
+ message: 'Please cite the following works when using this software.'
3
+ abstract: 'Data transformation framework for LinkML data models'
4
+ authors:
5
+ - family-names: 'Schaper'
6
+ given-names: 'Kevin'
7
+ - family-names: 'Ships'
8
+ given-names: 'Glass'
9
+ - family-names: 'Shefchek'
10
+ given-names: 'Kent'
11
+ - family-names: 'Moxon'
12
+ given-names: 'Sierra'
13
+ - family-names: 'Mungall'
14
+ given-names: 'Chris'
15
+ date-released: 2022-06-15
16
+ identifiers:
17
+ - type: 'url'
18
+ value: 'https://github.com/monarch-initiative/koza'
19
+ title: 'monarch-initiative/koza'
20
+ url: 'https://github.com/monarch-initiative/koza'
21
+ version: '0.1.14'
@@ -0,0 +1,44 @@
1
+ ##### Building locally
2
+
3
+ First create a virtual environment with your favorite tool and activate it, e.g.:
4
+ ```bash
5
+ python3 -m venv venv  # Koza requires Python >= 3.10
6
+ source venv/bin/activate
7
+ ```
8
+
9
+ Install and test with make
10
+ ```bash
11
+ make
12
+ ```
13
+
14
+ Or with flit
15
+ ```
16
+ pip install flit
17
+ flit install --deps develop --symlink
18
+ ```
19
+
20
+ ##### Linting and Formatting
21
+ TODO - write some docs on linting and formatting
22
+
23
+ Lint with ruff (reports issues without modifying files)
24
+ ```bash
25
+ make lint
26
+ ```
27
+
28
+ Format with ruff (updates files in place)
29
+ ```bash
30
+ make format
31
+ ```
32
+
33
+ ##### Build and Publish to PyPI
34
+ Building and publishing requires git >= 2.30
35
+
36
+ Build a wheel and an sdist (tarball) from the package:
37
+ ```bash
38
+ make build
39
+ ```
40
+
41
+ Publish to PyPI
42
+ ```bash
43
+ make publish
44
+ ```
koza-0.0.0/LICENSE ADDED
@@ -0,0 +1,29 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2022, Monarch Initiative
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ 1. Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ 3. Neither the name of the copyright holder nor the names of its
17
+ contributors may be used to endorse or promote products derived from
18
+ this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
koza-0.0.0/Makefile ADDED
@@ -0,0 +1,66 @@
1
+ # Note that uv is required, see https://docs.astral.sh/uv/getting-started/installation/
2
+
3
+ MAKEFLAGS += --warn-undefined-variables
4
+ MAKEFLAGS += --no-builtin-rules
5
+ MAKEFLAGS += --no-builtin-variables
6
+
7
+ ifneq (,$(wildcard ./.env))
8
+ include .env
9
+ export
10
+ endif
11
+
12
+ .DEFAULT_GOAL := all
13
+ SHELL := bash
14
+ RUN := uv run
15
+
16
+ # This grep/awk pipeline collects the comments introduced by a double "#" next to each target and prints them as the help text
17
+ .PHONY: help
18
+ help: ## Print this help message
19
+ @grep -hE '^[[:alnum:]_/.-]+:.*## ' $(MAKEFILE_LIST) | \
20
+ awk 'BEGIN {FS = ":.*## "}; {printf "\033[36m%-25s\033[0m %s\n", $$1, $$2}'
21
+
22
+
23
+ .PHONY: all
24
+ all: install test clean ## Install, test, and clean
25
+
26
+ .PHONY: install
27
+ install: ## Install development environment
28
+ uv venv --allow-existing
29
+ uv pip install -e .[dev]
30
+ uv lock
31
+
32
+ .PHONY: build
33
+ build: ## Build the package
34
+ uv build
35
+
36
+ .PHONY: test
37
+ test: ## Run the test suite
38
+ $(RUN) pytest tests
39
+
40
+ .PHONY: docs
41
+ docs: ## Build the documentation
42
+ $(RUN) typer src/koza/main.py utils docs --name koza --output docs/Usage/CLI.md
43
+ $(RUN) mkdocs build
44
+
45
+ .PHONY: clean
46
+ clean: ## Clean up build artifacts, etc.
47
+ rm -rf `find . -name __pycache__`
48
+ rm -f `find . -type f -name '*.py[co]' `
49
+ rm -rf .pytest_cache
50
+ rm -rf output test-output
51
+ rm -rf dist
52
+
53
+ .PHONY: coverage
54
+ coverage: ## Run the test suite with coverage reporting
55
+ -$(RUN) coverage run -m pytest tests
56
+ $(RUN) coverage report -m
57
+
58
+ .PHONY: lint
59
+ lint: ## Lint the codebase
60
+ $(RUN) ruff check --diff --exit-zero src/ tests/ examples/
61
+ $(RUN) ruff format --check --diff src/ tests/ examples/
62
+
63
+ .PHONY: format
64
+ format: ## Format the codebase
65
+ $(RUN) ruff check --fix --exit-zero src/ tests/ examples/
66
+ $(RUN) ruff format src/ tests/ examples/
koza-0.0.0/PKG-INFO ADDED
@@ -0,0 +1,202 @@
1
+ Metadata-Version: 2.4
2
+ Name: koza
3
+ Version: 0.0.0
4
+ Summary: Data transformation framework for LinkML data models
5
+ Author-email: The Monarch Initiative <info@monarchinitiative.org>, Kevin Schaper <kevinschaper@gmail.com>, Glass Elsarboukh <g.elsarboukh@gmail.com>, Kent Shefchek <kent@tislab.org>, Daniel Korn <daniel_korn@med.unc.edu>
6
+ License-Expression: BSD-3-Clause
7
+ License-File: LICENSE
8
+ Requires-Python: <4,>=3.10
9
+ Requires-Dist: biolink-model>=4.3.6
10
+ Requires-Dist: coverage>=7.13.0
11
+ Requires-Dist: duckdb
12
+ Requires-Dist: linkml>=1.9.0
13
+ Requires-Dist: loguru
14
+ Requires-Dist: mergedeep==1.3.4
15
+ Requires-Dist: ordered-set>=4.1.0
16
+ Requires-Dist: pydantic>=2.12.5
17
+ Requires-Dist: pyyaml>=6.0.3
18
+ Requires-Dist: requests>=2.32.5
19
+ Requires-Dist: sssom>=0.4
20
+ Requires-Dist: tqdm>=4.67.1
21
+ Requires-Dist: typer>=0.20.0
22
+ Provides-Extra: dev
23
+ Requires-Dist: mkdocs-material>=9.7.0; extra == 'dev'
24
+ Requires-Dist: mkdocs>=1.6.1; extra == 'dev'
25
+ Requires-Dist: mkdocstrings[python]>=1.0.0; extra == 'dev'
26
+ Requires-Dist: pytest; extra == 'dev'
27
+ Requires-Dist: ruff; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # Koza - Knowledge Graph Transformation and Operations Toolkit
31
+
32
+ [![Pyversions](https://img.shields.io/pypi/pyversions/koza.svg)](https://pypi.python.org/pypi/koza)
33
+ [![PyPi](https://img.shields.io/pypi/v/koza.svg)](https://pypi.python.org/pypi/koza)
34
+ ![Github Action](https://github.com/monarch-initiative/koza/actions/workflows/test.yaml/badge.svg)
35
+
36
+ ![pupa](docs/img/pupa.png)
37
+
38
+ [**Documentation**](https://koza.monarchinitiative.org/)
39
+
40
+ _Disclaimer_: Koza is in beta - we are looking for testers!
41
+
42
+ ## Overview
43
+
44
+ Koza is a Python library and CLI tool for transforming biomedical data and performing graph operations on Knowledge Graph Exchange (KGX) files. It provides two main capabilities:
45
+
46
+ ### 📊 **Graph Operations** (New!)
47
+ Powerful DuckDB-based operations for KGX knowledge graphs:
48
+ - **Join** multiple KGX files with schema harmonization
49
+ - **Split** files by field values with format conversion
50
+ - **Prune** dangling edges and handle singleton nodes
51
+ - **Append** new data to existing databases with schema evolution
52
+ - **Multi-format support** for TSV, JSONL, and Parquet files
53
+
54
+ ### 🔄 **Data Transformation** (Core)
55
+ Transform biomedical data sources into KGX format:
56
+ - Transform csv, json, yaml, jsonl, and xml to target formats
57
+ - Output in [KGX format](https://github.com/biolink/kgx/blob/master/specification/kgx-format.md#kgx-format-as-tsv)
58
+ - Write data transforms in semi-declarative Python
59
+ - Configure source files, columns/properties, and metadata in YAML
60
+ - Create mapping files and translation tables between vocabularies
61
+
62
+ ## Installation
63
+ Koza is available on PyPI and can be installed via pip/pipx:
64
+ ```
65
+ [pip|pipx] install koza
66
+ ```
67
+
68
+ ## Usage
69
+
70
+ ### Quick Start with Graph Operations
71
+
72
+ Koza's graph operations work seamlessly across multiple KGX formats (TSV, JSONL, Parquet):
73
+
74
+ ```bash
75
+ # Join multiple KGX files into a unified database
76
+ koza join --nodes genes.tsv pathways.jsonl --edges interactions.parquet --output merged_graph.duckdb
77
+
78
+ # Prune dangling edges and handle singleton nodes
79
+ koza prune --database merged_graph.duckdb --keep-singletons
80
+
81
+ # Append new data to existing database with schema evolution
82
+ koza append --database merged_graph.duckdb --nodes new_genes.tsv --edges new_interactions.jsonl
83
+
84
+ # Split database by source with format conversion
85
+ koza split --database merged_graph.duckdb --split-on provided_by --output-format parquet
86
+ ```
87
+
88
+ **NOTE: As of version 0.2.0, there is a new method for getting your ingest's `KozaApp` instance. Please see the [updated documentation](https://koza.monarchinitiative.org/Usage/configuring_ingests/#transform-code) for details.**
89
+
90
+ See the [Koza documentation](https://koza.monarchinitiative.org/) for complete usage information
91
+
92
+ ### Examples
93
+
94
+ #### Validate
95
+
96
+ Give Koza a local or remote csv file, and get some basic information (headers, number of rows)
97
+
98
+ ```bash
99
+ koza validate \
100
+ --file https://raw.githubusercontent.com/monarch-initiative/koza/main/examples/data/string.tsv \
101
+ --delimiter ' '
102
+ ```
103
+
104
+ Sending a json or jsonl formatted file will confirm if the file is valid json or jsonl
105
+
106
+ ```bash
107
+ koza validate \
108
+ --file ./tests/resources/source-files/ZFIN_PHENOTYPE_0.jsonl.gz \
109
+ --format jsonl
110
+ ```
111
+
112
+ ```bash
113
+ koza validate \
114
+ --file ./tests/resources/source-files/ddpheno.json.gz \
115
+ --format json
116
+ ```
117
+
118
+ #### Transform
119
+
120
+ Run the example ingest, "string/protein-links-detailed"
121
+ ```bash
122
+ koza transform \
123
+ --source examples/string/protein-links-detailed.yaml \
124
+ --global-table examples/translation_table.yaml
125
+
126
+ koza transform \
127
+ --source examples/string-declarative/declarative-protein-links-detailed.yaml \
128
+ --global-table examples/translation_table.yaml
129
+ ```
130
+
131
+ **Note**:
132
+ Koza expects a directory structure as described in the above example
133
+ with the source config file and transform code in the same directory:
134
+ ```
135
+ .
136
+ ├── ...
137
+ │ ├── your_source
138
+ │ │ ├── your_ingest.yaml
139
+ │ │ └── your_ingest.py
140
+ │ └── some_translation_table.yaml
141
+ └── ...
142
+ ```
143
+
144
+ #### Graph Operations
145
+
146
+ Create and manipulate knowledge graphs from existing KGX files:
147
+
148
+ ```bash
149
+ # Join heterogeneous KGX files with automatic schema harmonization
150
+ koza join \
151
+ --nodes genes.tsv proteins.jsonl pathways.parquet \
152
+ --edges gene_protein.tsv protein_pathway.jsonl \
153
+ --output unified_graph.duckdb \
154
+ --schema-report
155
+
156
+ # Clean up graph integrity issues
157
+ koza prune \
158
+ --database unified_graph.duckdb \
159
+ --keep-singletons \
160
+ --dry-run # Preview changes before applying
161
+
162
+ # Incrementally add new data with schema evolution
163
+ koza append \
164
+ --database unified_graph.duckdb \
165
+ --nodes new_genes.tsv updated_pathways.jsonl \
166
+ --deduplicate \
167
+ --show-progress
168
+
169
+ # Export subsets with format conversion
170
+ koza split \
171
+ --database unified_graph.duckdb \
172
+ --split-on provided_by \
173
+ --output-format parquet \
174
+ --output-dir ./split_graphs
175
+ ```
176
+
177
+ ## Key Features
178
+
179
+ ### 🔧 **Multi-Format Support**
180
+ - Native support for TSV, JSONL, and Parquet KGX files
181
+ - Automatic format detection and conversion
182
+ - Mixed-format operations in single commands
183
+
184
+ ### 🛡️ **Schema Flexibility**
185
+ - Automatic schema harmonization across heterogeneous files
186
+ - Schema evolution with backward compatibility
187
+ - Comprehensive schema reporting and validation
188
+
189
+ ### ⚡ **High Performance**
190
+ - DuckDB-powered operations for fast bulk processing
191
+ - Memory-efficient handling of large knowledge graphs
192
+ - Parallel processing and streaming where possible
193
+
194
+ ### 🔍 **Rich CLI Experience**
195
+ - Progress indicators for long-running operations
196
+ - Detailed statistics and operation summaries
197
+ - Dry-run modes for safe operation preview
198
+
199
+ ### 🧹 **Data Integrity**
200
+ - Dangling edge detection and preservation
201
+ - Duplicate detection and removal strategies
202
+ - Non-destructive operations with data archiving