omop-emb 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omop_emb-0.2.1/.github/workflows/docs.yml +31 -0
- omop_emb-0.2.1/.github/workflows/lint-pr.yml +17 -0
- omop_emb-0.2.1/.github/workflows/release.yml +99 -0
- omop_emb-0.2.1/.gitignore +207 -0
- omop_emb-0.2.1/.releaserc.json +22 -0
- omop_emb-0.2.1/CHANGELOG.md +20 -0
- omop_emb-0.2.1/PKG-INFO +70 -0
- omop_emb-0.2.1/README.md +37 -0
- omop_emb-0.2.1/docs/index.md +42 -0
- omop_emb-0.2.1/docs/usage/backend-selection.md +89 -0
- omop_emb-0.2.1/docs/usage/cli.md +55 -0
- omop_emb-0.2.1/docs/usage/installation.md +71 -0
- omop_emb-0.2.1/mkdocs.yml +52 -0
- omop_emb-0.2.1/pyproject.toml +96 -0
- omop_emb-0.2.1/src/omop_emb/__init__.py +13 -0
- omop_emb-0.2.1/src/omop_emb/backends/__init__.py +47 -0
- omop_emb-0.2.1/src/omop_emb/backends/base.py +443 -0
- omop_emb-0.2.1/src/omop_emb/backends/config.py +61 -0
- omop_emb-0.2.1/src/omop_emb/backends/embedding_utils.py +121 -0
- omop_emb-0.2.1/src/omop_emb/backends/errors.py +22 -0
- omop_emb-0.2.1/src/omop_emb/backends/factory.py +73 -0
- omop_emb-0.2.1/src/omop_emb/backends/faiss/__init__.py +1 -0
- omop_emb-0.2.1/src/omop_emb/backends/faiss/faiss_backend.py +401 -0
- omop_emb-0.2.1/src/omop_emb/backends/faiss/faiss_sql.py +109 -0
- omop_emb-0.2.1/src/omop_emb/backends/faiss/index_manager.py +257 -0
- omop_emb-0.2.1/src/omop_emb/backends/faiss/storage_manager.py +285 -0
- omop_emb-0.2.1/src/omop_emb/backends/pgvector/__init__.py +1 -0
- omop_emb-0.2.1/src/omop_emb/backends/pgvector/pgvector_backend.py +188 -0
- omop_emb-0.2.1/src/omop_emb/backends/pgvector/pgvector_sql.py +251 -0
- omop_emb-0.2.1/src/omop_emb/backends/registry.py +58 -0
- omop_emb-0.2.1/src/omop_emb/cli.py +139 -0
- omop_emb-0.2.1/src/omop_emb/interface.py +363 -0
- omop_emb-0.2.1/tests/README.md +118 -0
- omop_emb-0.2.1/tests/__init__.py +1 -0
- omop_emb-0.2.1/tests/conftest.py +301 -0
- omop_emb-0.2.1/tests/shared_backend_tests.py +357 -0
- omop_emb-0.2.1/tests/test_config.py +37 -0
- omop_emb-0.2.1/tests/test_dummy.py +3 -0
- omop_emb-0.2.1/tests/test_faiss.py +16 -0
- omop_emb-0.2.1/tests/test_fixtures.py +74 -0
- omop_emb-0.2.1/tests/test_interface.py +175 -0
- omop_emb-0.2.1/tests/test_pgvector.py +16 -0
- omop_emb-0.2.1/uv.lock +2853 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
name: docs
|
|
2
|
+
on:
|
|
3
|
+
push:
|
|
4
|
+
branches:
|
|
5
|
+
- master
|
|
6
|
+
- main
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: write
|
|
11
|
+
jobs:
|
|
12
|
+
deploy:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- name: Configure Git Credentials
|
|
17
|
+
run: |
|
|
18
|
+
git config user.name github-actions[bot]
|
|
19
|
+
git config user.email 41898282+github-actions[bot]@users.noreply.github.com
|
|
20
|
+
- uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: 3.x
|
|
23
|
+
- run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
|
|
24
|
+
- uses: actions/cache@v4
|
|
25
|
+
with:
|
|
26
|
+
key: mkdocs-material-${{ env.cache_id }}
|
|
27
|
+
path: ~/.cache
|
|
28
|
+
restore-keys: |
|
|
29
|
+
mkdocs-material-
|
|
30
|
+
- run: pip install mkdocs mkdocs-material mkdocstrings-python mkdocs-mermaid2-plugin mkdocs-table-reader-plugin
|
|
31
|
+
- run: mkdocs gh-deploy --force
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# This workflow runs a linter on pull requests to ensure the right prefix is used for semantic-versioning
|
|
2
|
+
name: "Lint PR"
|
|
3
|
+
|
|
4
|
+
on:
|
|
5
|
+
pull_request:
|
|
6
|
+
types:
|
|
7
|
+
- opened
|
|
8
|
+
- edited
|
|
9
|
+
- synchronize
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
main:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
steps:
|
|
15
|
+
- uses: amannn/action-semantic-pull-request@v5
|
|
16
|
+
env:
|
|
17
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- main
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
|
|
12
|
+
services:
|
|
13
|
+
postgres:
|
|
14
|
+
image: pgvector/pgvector:pg16 # Have the pgvector extension pre-installed
|
|
15
|
+
env:
|
|
16
|
+
POSTGRES_USER: postgres
|
|
17
|
+
POSTGRES_PASSWORD: postgres
|
|
18
|
+
POSTGRES_DB: postgres
|
|
19
|
+
ports:
|
|
20
|
+
- 5432:5432
|
|
21
|
+
options: >-
|
|
22
|
+
--health-cmd "pg_isready -U postgres -d postgres"
|
|
23
|
+
--health-interval 10s
|
|
24
|
+
--health-timeout 5s
|
|
25
|
+
--health-retries 10
|
|
26
|
+
|
|
27
|
+
env:
|
|
28
|
+
TEST_DB_HOST: localhost
|
|
29
|
+
TEST_DB_PORT: 5432
|
|
30
|
+
POSTGRES_USER: postgres
|
|
31
|
+
POSTGRES_PASSWORD: postgres
|
|
32
|
+
TEST_DATABASE_NAME: test_omop_emb
|
|
33
|
+
TEST_DB_USERNAME: test
|
|
34
|
+
TEST_DB_PASSWORD: test
|
|
35
|
+
|
|
36
|
+
steps:
|
|
37
|
+
- uses: actions/checkout@v5
|
|
38
|
+
|
|
39
|
+
- name: Install uv
|
|
40
|
+
uses: astral-sh/setup-uv@v7
|
|
41
|
+
with:
|
|
42
|
+
enable-cache: true
|
|
43
|
+
python-version: "3.12"
|
|
44
|
+
|
|
45
|
+
- name: Install dependencies
|
|
46
|
+
run: uv sync --all-extras --dev
|
|
47
|
+
|
|
48
|
+
- name: Run tests
|
|
49
|
+
run: uv run pytest
|
|
50
|
+
|
|
51
|
+
release:
|
|
52
|
+
needs: test
|
|
53
|
+
runs-on: ubuntu-latest
|
|
54
|
+
concurrency: release
|
|
55
|
+
permissions:
|
|
56
|
+
id-token: write # Required for Trusted Publishing to PyPI
|
|
57
|
+
contents: write # Required for Semantic Release to push tags/labels
|
|
58
|
+
|
|
59
|
+
environment:
|
|
60
|
+
name: pypi
|
|
61
|
+
url: https://pypi.org/p/omop-emb
|
|
62
|
+
|
|
63
|
+
steps:
|
|
64
|
+
- name: Checkout
|
|
65
|
+
uses: actions/checkout@v5
|
|
66
|
+
with:
|
|
67
|
+
fetch-depth: 0
|
|
68
|
+
# Use GITHUB_TOKEN for standard releases
|
|
69
|
+
token: ${{ secrets.GITHUB_TOKEN }}
|
|
70
|
+
|
|
71
|
+
- name: Semantic Release
|
|
72
|
+
id: semantic
|
|
73
|
+
uses: cycjimmy/semantic-release-action@v6
|
|
74
|
+
with:
|
|
75
|
+
extra_plugins: |
|
|
76
|
+
@semantic-release/changelog
|
|
77
|
+
@semantic-release/git
|
|
78
|
+
@semantic-release/exec
|
|
79
|
+
env:
|
|
80
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
81
|
+
|
|
82
|
+
- name: Install uv
|
|
83
|
+
if: steps.semantic.outputs.new_release_published == 'true'
|
|
84
|
+
uses: astral-sh/setup-uv@v7
|
|
85
|
+
with:
|
|
86
|
+
python-version: "3.12"
|
|
87
|
+
|
|
88
|
+
- name: Build and publish to PyPI
|
|
89
|
+
if: steps.semantic.outputs.new_release_published == 'true'
|
|
90
|
+
run: |
|
|
91
|
+
uv build
|
|
92
|
+
# uv uses the id-token permission for OIDC "Trusted Publishing"
|
|
93
|
+
uv publish
|
|
94
|
+
|
|
95
|
+
- name: Trigger docs update
|
|
96
|
+
if: steps.semantic.outputs.new_release_published == 'true'
|
|
97
|
+
run: gh workflow run docs.yml
|
|
98
|
+
env:
|
|
99
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
#uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
#poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
#pdm.lock
|
|
116
|
+
#pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
#pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# SageMath parsed files
|
|
135
|
+
*.sage.py
|
|
136
|
+
|
|
137
|
+
# Environments
|
|
138
|
+
.env
|
|
139
|
+
.envrc
|
|
140
|
+
.venv
|
|
141
|
+
env/
|
|
142
|
+
venv/
|
|
143
|
+
ENV/
|
|
144
|
+
env.bak/
|
|
145
|
+
venv.bak/
|
|
146
|
+
|
|
147
|
+
# Spyder project settings
|
|
148
|
+
.spyderproject
|
|
149
|
+
.spyproject
|
|
150
|
+
|
|
151
|
+
# Rope project settings
|
|
152
|
+
.ropeproject
|
|
153
|
+
|
|
154
|
+
# mkdocs documentation
|
|
155
|
+
/site
|
|
156
|
+
|
|
157
|
+
# mypy
|
|
158
|
+
.mypy_cache/
|
|
159
|
+
.dmypy.json
|
|
160
|
+
dmypy.json
|
|
161
|
+
|
|
162
|
+
# Pyre type checker
|
|
163
|
+
.pyre/
|
|
164
|
+
|
|
165
|
+
# pytype static type analyzer
|
|
166
|
+
.pytype/
|
|
167
|
+
|
|
168
|
+
# Cython debug symbols
|
|
169
|
+
cython_debug/
|
|
170
|
+
|
|
171
|
+
# PyCharm
|
|
172
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
173
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
174
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
175
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
176
|
+
#.idea/
|
|
177
|
+
|
|
178
|
+
# Abstra
|
|
179
|
+
# Abstra is an AI-powered process automation framework.
|
|
180
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
181
|
+
# Learn more at https://abstra.io/docs
|
|
182
|
+
.abstra/
|
|
183
|
+
|
|
184
|
+
# Visual Studio Code
|
|
185
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
186
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
188
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
189
|
+
.vscode/
|
|
190
|
+
|
|
191
|
+
# Ruff stuff:
|
|
192
|
+
.ruff_cache/
|
|
193
|
+
|
|
194
|
+
# PyPI configuration file
|
|
195
|
+
.pypirc
|
|
196
|
+
|
|
197
|
+
# Cursor
|
|
198
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
|
199
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
|
200
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
|
201
|
+
.cursorignore
|
|
202
|
+
.cursorindexingignore
|
|
203
|
+
|
|
204
|
+
# Marimo
|
|
205
|
+
marimo/_static/
|
|
206
|
+
marimo/_lsp/
|
|
207
|
+
__marimo__/
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
{
|
|
2
|
+
"branches": ["main"],
|
|
3
|
+
"plugins": [
|
|
4
|
+
"@semantic-release/commit-analyzer",
|
|
5
|
+
"@semantic-release/release-notes-generator",
|
|
6
|
+
"@semantic-release/changelog",
|
|
7
|
+
[
|
|
8
|
+
"@semantic-release/exec",
|
|
9
|
+
{
|
|
10
|
+
"prepareCmd": "sed -i 's/version = \".*\"/version = \"${nextRelease.version}\"/' pyproject.toml"
|
|
11
|
+
}
|
|
12
|
+
],
|
|
13
|
+
[
|
|
14
|
+
"@semantic-release/git",
|
|
15
|
+
{
|
|
16
|
+
"assets": ["CHANGELOG.md", "pyproject.toml"],
|
|
17
|
+
"message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
|
|
18
|
+
}
|
|
19
|
+
],
|
|
20
|
+
"@semantic-release/github"
|
|
21
|
+
]
|
|
22
|
+
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
## [0.2.1](https://github.com/AustralianCancerDataNetwork/omop-emb/compare/v0.2.0...v0.2.1) (2026-04-01)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Bug Fixes
|
|
5
|
+
|
|
6
|
+
* trigger PyPI publish after OIDC config ([2fb4b40](https://github.com/AustralianCancerDataNetwork/omop-emb/commit/2fb4b40c4d9221ceac1ae1f3a25f7059380b53bd))
|
|
7
|
+
|
|
8
|
+
# [0.2.0](https://github.com/AustralianCancerDataNetwork/omop-emb/compare/v0.1.0...v0.2.0) (2026-04-01)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Bug Fixes
|
|
12
|
+
|
|
13
|
+
* pull newest omop-llm ([c3fb805](https://github.com/AustralianCancerDataNetwork/omop-emb/commit/c3fb8050d84804f48a44966e5d4271465485652d))
|
|
14
|
+
* remove duplicate optional-dep ([c7a58c1](https://github.com/AustralianCancerDataNetwork/omop-emb/commit/c7a58c16078615a42e5dc6787e0abb44449ab273))
|
|
15
|
+
* Remove duplicate "scripts" key after PR ([9d8369d](https://github.com/AustralianCancerDataNetwork/omop-emb/commit/9d8369d9c54aecddb39efcefada186748eb6ff0f))
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
### Features
|
|
19
|
+
|
|
20
|
+
* diverse interface for embedding backends ([3d696dc](https://github.com/AustralianCancerDataNetwork/omop-emb/commit/3d696dc906ac7a94ee7464b82459b0a9b9db3ed2))
|
omop_emb-0.2.1/PKG-INFO
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: omop-emb
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: Embedding extension to omop-graph
|
|
5
|
+
Author-email: Nico Loesch <n.loesch@unsw.edu.au>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Keywords: LLM-grounding,OHDSI,OMOP,clinical-data,health-informatics,knowledge-graph,sqlalchemy
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Healthcare Industry
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
|
+
Requires-Dist: numpy>=1.26
|
|
17
|
+
Requires-Dist: omop-alchemy>=0.5.7
|
|
18
|
+
Requires-Dist: omop-llm
|
|
19
|
+
Requires-Dist: orm-loader>=0.3.15
|
|
20
|
+
Requires-Dist: psycopg2-binary>=2.9.11
|
|
21
|
+
Requires-Dist: sqlalchemy>=2.0.45
|
|
22
|
+
Requires-Dist: typing-extensions>=4.15.0
|
|
23
|
+
Provides-Extra: all
|
|
24
|
+
Requires-Dist: faiss-cpu>=1.8.0; extra == 'all'
|
|
25
|
+
Requires-Dist: h5py; extra == 'all'
|
|
26
|
+
Requires-Dist: pgvector; extra == 'all'
|
|
27
|
+
Provides-Extra: faiss
|
|
28
|
+
Requires-Dist: faiss-cpu>=1.8.0; extra == 'faiss'
|
|
29
|
+
Requires-Dist: h5py; extra == 'faiss'
|
|
30
|
+
Provides-Extra: pgvector
|
|
31
|
+
Requires-Dist: pgvector; extra == 'pgvector'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# omop-emb
|
|
35
|
+
Embedding layer for OMOP CDM.
|
|
36
|
+
|
|
37
|
+
## Installation
|
|
38
|
+
|
|
39
|
+
`omop-emb` now exposes backend-specific optional dependencies so installation
|
|
40
|
+
can match the embedding backend you actually intend to use.
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install "omop-emb[pgvector]"
|
|
44
|
+
pip install "omop-emb[faiss]"
|
|
45
|
+
pip install "omop-emb[all]"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Notes:
|
|
49
|
+
|
|
50
|
+
- `pgvector` installs the PostgreSQL/pgvector dependencies.
|
|
51
|
+
- `faiss` installs the FAISS-based backend dependencies. This currently only includes CPU support.
|
|
52
|
+
- `all` installs both backend stacks for development or mixed environments.
|
|
53
|
+
- A plain `pip install omop-emb` installs the shared core package only.
|
|
54
|
+
- PostgreSQL-specific embedding dependencies are now optional, but `omop-emb`
|
|
55
|
+
still requires some database backend for OMOP access and model registration.
|
|
56
|
+
- Non-PostgreSQL database backends have not yet been tested.
|
|
57
|
+
|
|
58
|
+
Extended documentation can be found [here](https://AustralianCancerDataNetwork.github.io/omop-emb).
|
|
59
|
+
|
|
60
|
+
# Project Roadmap
|
|
61
|
+
|
|
62
|
+
- [x] Interface for postgres storage of vectors
|
|
63
|
+
- [x] Interface for FAISS storage of embeddings
|
|
64
|
+
- [ ] Extensive unit testing
|
|
65
|
+
- [ ] Backend testing
|
|
66
|
+
- [ ] Corruption and restoration of DB testing
|
|
67
|
+
- [ ] Support non-Flat indices for each backend
|
|
68
|
+
- [ ] `faiss` GPU support
|
|
69
|
+
- [ ] [`pgvectorscale`](https://github.com/timescale/pgvectorscale) support
|
|
70
|
+
- [ ] Vector-quantisation for more efficient storage
|
omop_emb-0.2.1/README.md
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# omop-emb
|
|
2
|
+
Embedding layer for OMOP CDM.
|
|
3
|
+
|
|
4
|
+
## Installation
|
|
5
|
+
|
|
6
|
+
`omop-emb` now exposes backend-specific optional dependencies so installation
|
|
7
|
+
can match the embedding backend you actually intend to use.
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install "omop-emb[pgvector]"
|
|
11
|
+
pip install "omop-emb[faiss]"
|
|
12
|
+
pip install "omop-emb[all]"
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Notes:
|
|
16
|
+
|
|
17
|
+
- `pgvector` installs the PostgreSQL/pgvector dependencies.
|
|
18
|
+
- `faiss` installs the FAISS-based backend dependencies. This currently only includes CPU support.
|
|
19
|
+
- `all` installs both backend stacks for development or mixed environments.
|
|
20
|
+
- A plain `pip install omop-emb` installs the shared core package only.
|
|
21
|
+
- PostgreSQL-specific embedding dependencies are now optional, but `omop-emb`
|
|
22
|
+
still requires some database backend for OMOP access and model registration.
|
|
23
|
+
- Non-PostgreSQL database backends have not yet been tested.
|
|
24
|
+
|
|
25
|
+
Extended documentation can be found [here](https://AustralianCancerDataNetwork.github.io/omop-emb).
|
|
26
|
+
|
|
27
|
+
# Project Roadmap
|
|
28
|
+
|
|
29
|
+
- [x] Interface for postgres storage of vectors
|
|
30
|
+
- [x] Interface for FAISS storage of embeddings
|
|
31
|
+
- [ ] Extensive unit testing
|
|
32
|
+
- [ ] Backend testing
|
|
33
|
+
- [ ] Corruption and restoration of DB testing
|
|
34
|
+
- [ ] Support non-Flat indices for each backend
|
|
35
|
+
- [ ] `faiss` GPU support
|
|
36
|
+
- [ ] [`pgvectorscale`](https://github.com/timescale/pgvectorscale) support
|
|
37
|
+
- [ ] Vector-quantisation for more efficient storage
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# OMOP Embeddings
|
|
2
|
+
|
|
3
|
+
`omop-emb` is an optional package to super-charge [`omop-graph`](https://AustralianCancerDataNetwork.github.io/omop-graph) and provide additional graph reasoning tools for information retrieval and RAG-based knowledge extraction.
|
|
4
|
+
|
|
5
|
+
The package currently supports:
|
|
6
|
+
|
|
7
|
+
- dynamic embedding model registration
|
|
8
|
+
- multiple embedding models can be stored in the respective backend
|
|
9
|
+
- embedding and lookup for OMOP concepts
|
|
10
|
+
- supports various backends with a PostgreSQL linker
|
|
11
|
+
- [pgvector](https://github.com/pgvector/pgvector): storage in the original OMOP database
|
|
12
|
+
- [FAISS](https://github.com/facebookresearch/faiss): efficient storage on disk for low-RAM applications
|
|
13
|
+
- Extension to [`omop-alchemy`](https://AustralianCancerDataNetwork.github.io/OMOP_Alchemy/) to support new tables
|
|
14
|
+
- CLI scripts to add embeddings to an already existing OMOP CDM
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
Install the backend you actually want to use:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install "omop-emb[pgvector]"
|
|
22
|
+
pip install "omop-emb[faiss]"
|
|
23
|
+
pip install "omop-emb[all]"
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
A plain `pip install omop-emb` installs only the shared core package.
|
|
27
|
+
|
|
28
|
+
At runtime, backend choice should also be explicit. The intended direction is:
|
|
29
|
+
|
|
30
|
+
- install-time choice via extras
|
|
31
|
+
- runtime choice via config such as `OMOP_EMB_BACKEND=postgres` or `OMOP_EMB_BACKEND=faiss` or passing it as an argument to the respective interface (e.g. see [CLI reference](usage/cli.md))
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
!!! info "Important caveats"
|
|
35
|
+
|
|
36
|
+
- `omop-emb` depends on an OMOP PostgreSQL database for storage of embeddings (pgvector) or to keep track of already embedded concepts.
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
## Documentation overview
|
|
40
|
+
- [Installation](usage/installation.md)
|
|
41
|
+
- [Backend Selection](usage/backend-selection.md)
|
|
42
|
+
- [CLI Reference](usage/cli.md)
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# Backend Selection
|
|
2
|
+
|
|
3
|
+
`omop-emb` now has a backend abstraction layer so embedding storage and
|
|
4
|
+
retrieval can be selected explicitly instead of being inferred implicitly from
|
|
5
|
+
whatever happens to be installed.
|
|
6
|
+
|
|
7
|
+
## Supported backend names
|
|
8
|
+
|
|
9
|
+
The current backend factory recognizes:
|
|
10
|
+
|
|
11
|
+
- `pgvector`: The [pgvector](https://github.com/pgvector/pgvector) extension to a standard postgres database to store embeddings directly in the database.
|
|
12
|
+
- `faiss`: The [FAISS](https://github.com/facebookresearch/faiss) storage solution for on-disk storage.
|
|
13
|
+
|
|
14
|
+
The default backend name is currently `postgres`.
|
|
15
|
+
|
|
16
|
+
## Runtime selection
|
|
17
|
+
|
|
18
|
+
The intended pattern is:
|
|
19
|
+
|
|
20
|
+
1. choose the backend at install time with package extras
|
|
21
|
+
2. choose the backend again at runtime explicitly
|
|
22
|
+
|
|
23
|
+
Examples:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
export OMOP_EMB_BACKEND=postgres
|
|
27
|
+
export OMOP_EMB_BACKEND=faiss
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
You can also pass the backend name directly in Python.
|
|
31
|
+
|
|
32
|
+
## Python factory
|
|
33
|
+
|
|
34
|
+
The backend factory lives in `omop_emb.backends`:
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from omop_emb.backends import get_embedding_backend
|
|
38
|
+
|
|
39
|
+
backend = get_embedding_backend("postgres")
|
|
40
|
+
backend = get_embedding_backend("faiss")
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
The factory currently exposes:
|
|
44
|
+
|
|
45
|
+
- `get_embedding_backend(...)`
|
|
46
|
+
- `normalize_backend_name(...)`
|
|
47
|
+
|
|
48
|
+
## Why explicit selection is necessary
|
|
49
|
+
|
|
50
|
+
Explicit backend selection improves clarity in a multi-backend world:
|
|
51
|
+
|
|
52
|
+
- users can see which backend they intended to use
|
|
53
|
+
- missing optional dependencies fail clearly
|
|
54
|
+
- the system avoids silent fallback between incompatible storage implementations
|
|
55
|
+
|
|
56
|
+
This is especially important when embeddings affect retrieval behavior, because
|
|
57
|
+
silent fallback can make users think semantic retrieval is active when it is
|
|
58
|
+
not.
|
|
59
|
+
|
|
60
|
+
## Dependency errors
|
|
61
|
+
|
|
62
|
+
If a backend is requested but its optional dependencies are missing, the
|
|
63
|
+
factory raises an explicit backend dependency error rather than falling back to
|
|
64
|
+
another backend.
|
|
65
|
+
|
|
66
|
+
This is the intended behavior.
|
|
67
|
+
|
|
68
|
+
Examples of the error classes exposed by the backend layer:
|
|
69
|
+
|
|
70
|
+
- `EmbeddingBackendDependencyError`
|
|
71
|
+
- `UnknownEmbeddingBackendError`
|
|
72
|
+
- `EmbeddingBackendConfigurationError`
|
|
73
|
+
|
|
74
|
+
## Current scope
|
|
75
|
+
|
|
76
|
+
At the moment:
|
|
77
|
+
|
|
78
|
+
- the backend abstraction and backend factory exist
|
|
79
|
+
- PostgreSQL and FAISS backend classes exist
|
|
80
|
+
- the production CLI path still targets the PostgreSQL embedding workflow
|
|
81
|
+
- PostgreSQL-specific embedding dependencies are optional, but a database
|
|
82
|
+
backend is still required for OMOP access and model registration
|
|
83
|
+
- model registration is intended to remain shared and database-backed even when
|
|
84
|
+
FAISS is used for vector storage and retrieval
|
|
85
|
+
- database backends other than PostgreSQL have not yet been tested
|
|
86
|
+
|
|
87
|
+
So this page documents the selection model and Python interface shape now, even
|
|
88
|
+
before every runtime path has been migrated to delegate through the backend
|
|
89
|
+
factory.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Embedding Generation CLI
|
|
2
|
+
|
|
3
|
+
This tool generates vector embeddings for OMOP CDM concepts and stores them in the configured embedding backend.
|
|
4
|
+
|
|
5
|
+
At present, the production CLI path is PostgreSQL-oriented and stores embeddings in Postgres/pgvector-backed model tables. It specifically targets concepts that do not yet have embeddings and processes them in batches.
|
|
6
|
+
|
|
7
|
+
!!! note "Supported Models"
|
|
8
|
+
|
|
9
|
+
Currently, only Ollama models are supported.
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## Prerequisites
|
|
13
|
+
|
|
14
|
+
- **Installation**: install the PostgreSQL backend dependencies:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install "omop-emb[pgvector]"
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
- **Database**: Postgres implementation of OMOP CDM. See the [`omop-graph` documentation](https://AustralianCancerDataNetwork.github.io/omop-graph) for information on how to set it up.
|
|
21
|
+
- **Environment**: `OMOP_DATABASE_URL` must be exported or defined in the `.env` file (e.g., `postgresql://user:pass@localhost:5432/omop`).
|
|
22
|
+
- **Connectivity**: Access to an OpenAI-compatible embeddings endpoint. *Currently only Ollama is supported*.
|
|
23
|
+
|
|
24
|
+
!!! note "Backend Scope"
|
|
25
|
+
|
|
26
|
+
`omop-emb` now defines a backend abstraction layer for both PostgreSQL and FAISS-style storage.
|
|
27
|
+
The current `add-embeddings` CLI still targets the PostgreSQL backend path.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## `add-embeddings`
|
|
32
|
+
|
|
33
|
+
### Usage
|
|
34
|
+
```bash
|
|
35
|
+
omop-emb add-embeddings --api-base <URL> --api-key <KEY> [OPTIONS]
|
|
36
|
+
```
|
|
37
|
+
where `[OPTIONS]` are optional arguments that can be specified as described below.
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
### Command Options
|
|
41
|
+
|
|
42
|
+
### Command Options
|
|
43
|
+
|
|
44
|
+
| Option | Short | Type | Default | Description |
|
|
45
|
+
| :--- | :--- | :--- | :--- | :--- |
|
|
46
|
+
| **`--api-base`** | | `String` | **Required** | Base URL for the embedding API service. |
|
|
47
|
+
| **`--api-key`** | | `String` | **Required** | API key for the embedding API provider. |
|
|
48
|
+
| **`--index-type`** | | `IndexType` | `FLAT` | The storage index for the embeddings for retrieval. Currently supported: `FLAT`. |
|
|
49
|
+
| **`--batch-size`** | `-b` | `Integer` | `100` | Number of concepts to process in each chunk. |
|
|
50
|
+
| **`--model`** | `-m` | `String` | `text-embedding-3-small` | Name of the embedding model to use for generating vectors. |
|
|
51
|
+
| **`--backend`** | | `Literal['pgvector', 'faiss']` | `None` | Embedding backend to use (can be replaced by `OMOP_EMB_BACKEND` env var). Requires the respective backend installed using `pip install omop-emb[pgvector or faiss]` |
|
|
52
|
+
| **`--faiss-base-dir`** | | `String` | `None` | Optional base directory for FAISS backend storage. |
|
|
53
|
+
| **`--standard-only`** | | `Boolean` | `False` | If set, only generate embeddings for OMOP standard concepts (`standard_concept = 'S'`). |
|
|
54
|
+
| **`--vocabulary`** | | `List[String]` | `None` | Filter to embed concepts only from specific OMOP vocabularies. |
|
|
55
|
+
| **`--num-embeddings`** | `-n` | `Integer` | `None` | Limit the number of concepts processed (useful for testing). |
|