omop-emb 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. omop_emb-0.2.1/.github/workflows/docs.yml +31 -0
  2. omop_emb-0.2.1/.github/workflows/lint-pr.yml +17 -0
  3. omop_emb-0.2.1/.github/workflows/release.yml +99 -0
  4. omop_emb-0.2.1/.gitignore +207 -0
  5. omop_emb-0.2.1/.releaserc.json +22 -0
  6. omop_emb-0.2.1/CHANGELOG.md +20 -0
  7. omop_emb-0.2.1/PKG-INFO +70 -0
  8. omop_emb-0.2.1/README.md +37 -0
  9. omop_emb-0.2.1/docs/index.md +42 -0
  10. omop_emb-0.2.1/docs/usage/backend-selection.md +89 -0
  11. omop_emb-0.2.1/docs/usage/cli.md +55 -0
  12. omop_emb-0.2.1/docs/usage/installation.md +71 -0
  13. omop_emb-0.2.1/mkdocs.yml +52 -0
  14. omop_emb-0.2.1/pyproject.toml +96 -0
  15. omop_emb-0.2.1/src/omop_emb/__init__.py +13 -0
  16. omop_emb-0.2.1/src/omop_emb/backends/__init__.py +47 -0
  17. omop_emb-0.2.1/src/omop_emb/backends/base.py +443 -0
  18. omop_emb-0.2.1/src/omop_emb/backends/config.py +61 -0
  19. omop_emb-0.2.1/src/omop_emb/backends/embedding_utils.py +121 -0
  20. omop_emb-0.2.1/src/omop_emb/backends/errors.py +22 -0
  21. omop_emb-0.2.1/src/omop_emb/backends/factory.py +73 -0
  22. omop_emb-0.2.1/src/omop_emb/backends/faiss/__init__.py +1 -0
  23. omop_emb-0.2.1/src/omop_emb/backends/faiss/faiss_backend.py +401 -0
  24. omop_emb-0.2.1/src/omop_emb/backends/faiss/faiss_sql.py +109 -0
  25. omop_emb-0.2.1/src/omop_emb/backends/faiss/index_manager.py +257 -0
  26. omop_emb-0.2.1/src/omop_emb/backends/faiss/storage_manager.py +285 -0
  27. omop_emb-0.2.1/src/omop_emb/backends/pgvector/__init__.py +1 -0
  28. omop_emb-0.2.1/src/omop_emb/backends/pgvector/pgvector_backend.py +188 -0
  29. omop_emb-0.2.1/src/omop_emb/backends/pgvector/pgvector_sql.py +251 -0
  30. omop_emb-0.2.1/src/omop_emb/backends/registry.py +58 -0
  31. omop_emb-0.2.1/src/omop_emb/cli.py +139 -0
  32. omop_emb-0.2.1/src/omop_emb/interface.py +363 -0
  33. omop_emb-0.2.1/tests/README.md +118 -0
  34. omop_emb-0.2.1/tests/__init__.py +1 -0
  35. omop_emb-0.2.1/tests/conftest.py +301 -0
  36. omop_emb-0.2.1/tests/shared_backend_tests.py +357 -0
  37. omop_emb-0.2.1/tests/test_config.py +37 -0
  38. omop_emb-0.2.1/tests/test_dummy.py +3 -0
  39. omop_emb-0.2.1/tests/test_faiss.py +16 -0
  40. omop_emb-0.2.1/tests/test_fixtures.py +74 -0
  41. omop_emb-0.2.1/tests/test_interface.py +175 -0
  42. omop_emb-0.2.1/tests/test_pgvector.py +16 -0
  43. omop_emb-0.2.1/uv.lock +2853 -0
@@ -0,0 +1,31 @@
1
+ name: docs
2
+ on:
3
+ push:
4
+ branches:
5
+ - master
6
+ - main
7
+ workflow_dispatch:
8
+
9
+ permissions:
10
+ contents: write
11
+ jobs:
12
+ deploy:
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - name: Configure Git Credentials
17
+ run: |
18
+ git config user.name github-actions[bot]
19
+ git config user.email 41898282+github-actions[bot]@users.noreply.github.com
20
+ - uses: actions/setup-python@v5
21
+ with:
22
+ python-version: 3.x
23
+ - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
24
+ - uses: actions/cache@v4
25
+ with:
26
+ key: mkdocs-material-${{ env.cache_id }}
27
+ path: ~/.cache
28
+ restore-keys: |
29
+ mkdocs-material-
30
+ - run: pip install mkdocs mkdocs-material mkdocstrings-python mkdocs-mermaid2-plugin mkdocs-table-reader-plugin
31
+ - run: mkdocs gh-deploy --force
@@ -0,0 +1,17 @@
1
+ # This workflow runs a linter on pull requests to ensure the right prefix is used for semantic-versioning
2
+ name: "Lint PR"
3
+
4
+ on:
5
+ pull_request:
6
+ types:
7
+ - opened
8
+ - edited
9
+ - synchronize
10
+
11
+ jobs:
12
+ main:
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - uses: amannn/action-semantic-pull-request@v5
16
+ env:
17
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -0,0 +1,99 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+
12
+ services:
13
+ postgres:
14
+ image: pgvector/pgvector:pg16 # Have the pgvector extension pre-installed
15
+ env:
16
+ POSTGRES_USER: postgres
17
+ POSTGRES_PASSWORD: postgres
18
+ POSTGRES_DB: postgres
19
+ ports:
20
+ - 5432:5432
21
+ options: >-
22
+ --health-cmd "pg_isready -U postgres -d postgres"
23
+ --health-interval 10s
24
+ --health-timeout 5s
25
+ --health-retries 10
26
+
27
+ env:
28
+ TEST_DB_HOST: localhost
29
+ TEST_DB_PORT: 5432
30
+ POSTGRES_USER: postgres
31
+ POSTGRES_PASSWORD: postgres
32
+ TEST_DATABASE_NAME: test_omop_emb
33
+ TEST_DB_USERNAME: test
34
+ TEST_DB_PASSWORD: test
35
+
36
+ steps:
37
+ - uses: actions/checkout@v5
38
+
39
+ - name: Install uv
40
+ uses: astral-sh/setup-uv@v7
41
+ with:
42
+ enable-cache: true
43
+ python-version: "3.12"
44
+
45
+ - name: Install dependencies
46
+ run: uv sync --all-extras --dev
47
+
48
+ - name: Run tests
49
+ run: uv run pytest
50
+
51
+ release:
52
+ needs: test
53
+ runs-on: ubuntu-latest
54
+ concurrency: release
55
+ permissions:
56
+ id-token: write # Required for Trusted Publishing to PyPI
57
+ contents: write # Required for Semantic Release to push tags/labels
58
+
59
+ environment:
60
+ name: pypi
61
+ url: https://pypi.org/p/omop-emb
62
+
63
+ steps:
64
+ - name: Checkout
65
+ uses: actions/checkout@v5
66
+ with:
67
+ fetch-depth: 0
68
+ # Use GITHUB_TOKEN for standard releases
69
+ token: ${{ secrets.GITHUB_TOKEN }}
70
+
71
+ - name: Semantic Release
72
+ id: semantic
73
+ uses: cycjimmy/semantic-release-action@v6
74
+ with:
75
+ extra_plugins: |
76
+ @semantic-release/changelog
77
+ @semantic-release/git
78
+ @semantic-release/exec
79
+ env:
80
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
81
+
82
+ - name: Install uv
83
+ if: steps.semantic.outputs.new_release_published == 'true'
84
+ uses: astral-sh/setup-uv@v7
85
+ with:
86
+ python-version: "3.12"
87
+
88
+ - name: Build and publish to PyPI
89
+ if: steps.semantic.outputs.new_release_published == 'true'
90
+ run: |
91
+ uv build
92
+ # uv uses the id-token permission for OIDC "Trusted Publishing"
93
+ uv publish
94
+
95
+ - name: Trigger docs update
96
+ if: steps.semantic.outputs.new_release_published == 'true'
97
+ run: gh workflow run docs.yml
98
+ env:
99
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -0,0 +1,207 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
@@ -0,0 +1,22 @@
1
+ {
2
+ "branches": ["main"],
3
+ "plugins": [
4
+ "@semantic-release/commit-analyzer",
5
+ "@semantic-release/release-notes-generator",
6
+ "@semantic-release/changelog",
7
+ [
8
+ "@semantic-release/exec",
9
+ {
10
+ "prepareCmd": "sed -i 's/version = \".*\"/version = \"${nextRelease.version}\"/' pyproject.toml"
11
+ }
12
+ ],
13
+ [
14
+ "@semantic-release/git",
15
+ {
16
+ "assets": ["CHANGELOG.md", "pyproject.toml"],
17
+ "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
18
+ }
19
+ ],
20
+ "@semantic-release/github"
21
+ ]
22
+ }
@@ -0,0 +1,20 @@
1
+ ## [0.2.1](https://github.com/AustralianCancerDataNetwork/omop-emb/compare/v0.2.0...v0.2.1) (2026-04-01)
2
+
3
+
4
+ ### Bug Fixes
5
+
6
+ * trigger PyPI publish after OIDC config ([2fb4b40](https://github.com/AustralianCancerDataNetwork/omop-emb/commit/2fb4b40c4d9221ceac1ae1f3a25f7059380b53bd))
7
+
8
+ # [0.2.0](https://github.com/AustralianCancerDataNetwork/omop-emb/compare/v0.1.0...v0.2.0) (2026-04-01)
9
+
10
+
11
+ ### Bug Fixes
12
+
13
+ * pull newest omop-llm ([c3fb805](https://github.com/AustralianCancerDataNetwork/omop-emb/commit/c3fb8050d84804f48a44966e5d4271465485652d))
14
+ * remove duplicate optional-dep ([c7a58c1](https://github.com/AustralianCancerDataNetwork/omop-emb/commit/c7a58c16078615a42e5dc6787e0abb44449ab273))
15
+ * Remove duplicate "scripts" key after PR ([9d8369d](https://github.com/AustralianCancerDataNetwork/omop-emb/commit/9d8369d9c54aecddb39efcefada186748eb6ff0f))
16
+
17
+
18
+ ### Features
19
+
20
+ * diverse interface for embedding backends ([3d696dc](https://github.com/AustralianCancerDataNetwork/omop-emb/commit/3d696dc906ac7a94ee7464b82459b0a9b9db3ed2))
@@ -0,0 +1,70 @@
1
+ Metadata-Version: 2.4
2
+ Name: omop-emb
3
+ Version: 0.2.1
4
+ Summary: Embedding extension to omop-graph
5
+ Author-email: Nico Loesch <n.loesch@unsw.edu.au>
6
+ License-Expression: Apache-2.0
7
+ Keywords: LLM-grounding,OHDSI,OMOP,clinical-data,health-informatics,knowledge-graph,sqlalchemy
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Healthcare Industry
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
15
+ Requires-Python: >=3.12
16
+ Requires-Dist: numpy>=1.26
17
+ Requires-Dist: omop-alchemy>=0.5.7
18
+ Requires-Dist: omop-llm
19
+ Requires-Dist: orm-loader>=0.3.15
20
+ Requires-Dist: psycopg2-binary>=2.9.11
21
+ Requires-Dist: sqlalchemy>=2.0.45
22
+ Requires-Dist: typing-extensions>=4.15.0
23
+ Provides-Extra: all
24
+ Requires-Dist: faiss-cpu>=1.8.0; extra == 'all'
25
+ Requires-Dist: h5py; extra == 'all'
26
+ Requires-Dist: pgvector; extra == 'all'
27
+ Provides-Extra: faiss
28
+ Requires-Dist: faiss-cpu>=1.8.0; extra == 'faiss'
29
+ Requires-Dist: h5py; extra == 'faiss'
30
+ Provides-Extra: pgvector
31
+ Requires-Dist: pgvector; extra == 'pgvector'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # omop-emb
35
+ Embedding layer for OMOP CDM.
36
+
37
+ ## Installation
38
+
39
+ `omop-emb` now exposes backend-specific optional dependencies so installation
40
+ can match the embedding backend you actually intend to use.
41
+
42
+ ```bash
43
+ pip install "omop-emb[pgvector]"
44
+ pip install "omop-emb[faiss]"
45
+ pip install "omop-emb[all]"
46
+ ```
47
+
48
+ Notes:
49
+
50
+ - `pgvector` installs the PostgreSQL/pgvector dependencies.
51
+ - `faiss` installs the FAISS-based backend dependencies. This currently only includes CPU support.
52
+ - `all` installs both backend stacks for development or mixed environments.
53
+ - A plain `pip install omop-emb` installs the shared core package only.
54
+ - PostgreSQL-specific embedding dependencies are now optional, but `omop-emb`
55
+ still requires some database backend for OMOP access and model registration.
56
+ - Non-PostgreSQL database backends have not yet been tested.
57
+
58
+ Extended documentation can be found [here](https://AustralianCancerDataNetwork.github.io/omop-emb).
59
+
60
+ # Project Roadmap
61
+
62
+ - [x] Interface for postgres storage of vectors
63
+ - [x] Interface for FAISS storage of embeddings
64
+ - [ ] Extensive unit testing
65
+ - [ ] Backend testing
66
+ - [ ] Corruption and restoration of DB testing
67
+ - [ ] Support non-Flat indices for each backend
68
+ - [ ] `faiss` GPU support
69
+ - [ ] [`pgvectorscale`](https://github.com/timescale/pgvectorscale) support
70
+ - [ ] Vector-quantisation for more efficient storage
@@ -0,0 +1,37 @@
1
+ # omop-emb
2
+ Embedding layer for OMOP CDM.
3
+
4
+ ## Installation
5
+
6
+ `omop-emb` now exposes backend-specific optional dependencies so installation
7
+ can match the embedding backend you actually intend to use.
8
+
9
+ ```bash
10
+ pip install "omop-emb[pgvector]"
11
+ pip install "omop-emb[faiss]"
12
+ pip install "omop-emb[all]"
13
+ ```
14
+
15
+ Notes:
16
+
17
+ - `pgvector` installs the PostgreSQL/pgvector dependencies.
18
+ - `faiss` installs the FAISS-based backend dependencies. This currently only includes CPU support.
19
+ - `all` installs both backend stacks for development or mixed environments.
20
+ - A plain `pip install omop-emb` installs the shared core package only.
21
+ - PostgreSQL-specific embedding dependencies are now optional, but `omop-emb`
22
+ still requires some database backend for OMOP access and model registration.
23
+ - Non-PostgreSQL database backends have not yet been tested.
24
+
25
+ Extended documentation can be found [here](https://AustralianCancerDataNetwork.github.io/omop-emb).
26
+
27
+ # Project Roadmap
28
+
29
+ - [x] Interface for postgres storage of vectors
30
+ - [x] Interface for FAISS storage of embeddings
31
+ - [ ] Extensive unit testing
32
+ - [ ] Backend testing
33
+ - [ ] Corruption and restoration of DB testing
34
+ - [ ] Support non-Flat indices for each backend
35
+ - [ ] `faiss` GPU support
36
+ - [ ] [`pgvectorscale`](https://github.com/timescale/pgvectorscale) support
37
+ - [ ] Vector-quantisation for more efficient storage
@@ -0,0 +1,42 @@
1
+ # OMOP Embeddings
2
+
3
+ `omop-emb` is an optional package to super-charge [`omop-graph`](https://AustralianCancerDataNetwork.github.io/omop-graph) and provide additional graph reasoning tools for information retrieval and RAG-based knowledge extraction.
4
+
5
+ The package currently supports:
6
+
7
+ - dynamic embedding model registration
8
+ - multiple embedding models can be stored in the respective backend
9
+ - embedding and lookup for OMOP concepts
10
+ - supports various backends with a PostgreSQL linker
11
+ - [pgvector](https://github.com/pgvector/pgvector): storage in the original OMOP database
12
+ - [FAISS](https://github.com/facebookresearch/faiss): efficient storage on disk for low-RAM applications
13
+ - Extension to [`omop-alchemy`](https://AustralianCancerDataNetwork.github.io/OMOP_Alchemy/) to support new tables
14
+ - CLI scripts to add embeddings to an already existing OMOP CDM
15
+
16
+ ## Installation
17
+
18
+ Install the backend you actually want to use:
19
+
20
+ ```bash
21
+ pip install "omop-emb[pgvector]"
22
+ pip install "omop-emb[faiss]"
23
+ pip install "omop-emb[all]"
24
+ ```
25
+
26
+ A plain `pip install omop-emb` installs only the shared core package.
27
+
28
+ At runtime, backend choice should also be explicit. The intended direction is:
29
+
30
+ - install-time choice via extras
31
+ - runtime choice via config such as `OMOP_EMB_BACKEND=postgres` or `OMOP_EMB_BACKEND=faiss` or passing it as an argument to the respective interface (e.g. see [CLI reference](usage/cli.md))
32
+
33
+
34
+ !!! info "Important caveats"
35
+
36
+ - `omop-emb` depends on an OMOP PostgreSQL database for storage of embeddings (pgvector) or to keep track of already embedded concepts.
37
+
38
+
39
+ ## Documentation overview
40
+ - [Installation](usage/installation.md)
41
+ - [Backend Selection](usage/backend-selection.md)
42
+ - [CLI Reference](usage/cli.md)
@@ -0,0 +1,89 @@
1
+ # Backend Selection
2
+
3
+ `omop-emb` now has a backend abstraction layer so embedding storage and
4
+ retrieval can be selected explicitly instead of being inferred implicitly from
5
+ whatever happens to be installed.
6
+
7
+ ## Supported backend names
8
+
9
+ The current backend factory recognizes:
10
+
11
+ - `pgvector`: The [pgvector](https://github.com/pgvector/pgvector) extension to a standard postgres database to store embeddings directly in the database.
12
+ - `faiss`: The [FAISS](https://github.com/facebookresearch/faiss) storage solution for on-disk storage.
13
+
14
+ The default backend name is currently `postgres`.
15
+
16
+ ## Runtime selection
17
+
18
+ The intended pattern is:
19
+
20
+ 1. choose the backend at install time with package extras
21
+ 2. choose the backend again at runtime explicitly
22
+
23
+ Examples:
24
+
25
+ ```bash
26
+ export OMOP_EMB_BACKEND=postgres
27
+ export OMOP_EMB_BACKEND=faiss
28
+ ```
29
+
30
+ You can also pass the backend name directly in Python.
31
+
32
+ ## Python factory
33
+
34
+ The backend factory lives in `omop_emb.backends`:
35
+
36
+ ```python
37
+ from omop_emb.backends import get_embedding_backend
38
+
39
+ backend = get_embedding_backend("postgres")
40
+ backend = get_embedding_backend("faiss")
41
+ ```
42
+
43
+ The factory currently exposes:
44
+
45
+ - `get_embedding_backend(...)`
46
+ - `normalize_backend_name(...)`
47
+
48
+ ## Why explicit selection is necessary
49
+
50
+ Explicit backend selection improves clarity in a multi-backend world:
51
+
52
+ - users can see which backend they intended to use
53
+ - missing optional dependencies fail clearly
54
+ - the system avoids silent fallback between incompatible storage implementations
55
+
56
+ This is especially important when embeddings affect retrieval behavior, because
57
+ silent fallback can make users think semantic retrieval is active when it is
58
+ not.
59
+
60
+ ## Dependency errors
61
+
62
+ If a backend is requested but its optional dependencies are missing, the
63
+ factory raises an explicit backend dependency error rather than falling back to
64
+ another backend.
65
+
66
+ This is the intended behavior.
67
+
68
+ Examples of the error classes exposed by the backend layer:
69
+
70
+ - `EmbeddingBackendDependencyError`
71
+ - `UnknownEmbeddingBackendError`
72
+ - `EmbeddingBackendConfigurationError`
73
+
74
+ ## Current scope
75
+
76
+ At the moment:
77
+
78
+ - the backend abstraction and backend factory exist
79
+ - PostgreSQL and FAISS backend classes exist
80
+ - the production CLI path still targets the PostgreSQL embedding workflow
81
+ - PostgreSQL-specific embedding dependencies are optional, but a database
82
+ backend is still required for OMOP access and model registration
83
+ - model registration is intended to remain shared and database-backed even when
84
+ FAISS is used for vector storage and retrieval
85
+ - database backends other than PostgreSQL have not yet been tested
86
+
87
+ So this page documents the selection model and Python interface shape now, even
88
+ before every runtime path has been migrated to delegate through the backend
89
+ factory.
@@ -0,0 +1,55 @@
1
+ # Embedding Generation CLI
2
+
3
+ This tool generates vector embeddings for OMOP CDM concepts and stores them in the configured embedding backend.
4
+
5
+ At present, the production CLI path is PostgreSQL-oriented and stores embeddings in Postgres/pgvector-backed model tables. It specifically targets concepts that do not yet have embeddings and processes them in batches.
6
+
7
+ !!! note "Supported Models"
8
+
9
+ Only Ollama models are currently supported.
10
+ ---
11
+
12
+ ## Prerequisites
13
+
14
+ - **Installation**: install the PostgreSQL backend dependencies:
15
+
16
+ ```bash
17
+ pip install "omop-emb[pgvector]"
18
+ ```
19
+
20
+ - **Database**: Postgres implementation of OMOP CDM. See the [`omop-graph` documentation](https://AustralianCancerDataNetwork.github.io/omop-graph) for information on how to set it up.
21
+ - **Environment**: `OMOP_DATABASE_URL` must be exported or defined in the `.env` file (e.g., `postgresql://user:pass@localhost:5432/omop`).
22
+ - **Connectivity**: Access to an OpenAI-compatible embeddings endpoint. *Currently only Ollama is supported*.
23
+
24
+ !!! note "Backend Scope"
25
+
26
+ `omop-emb` now defines a backend abstraction layer for both PostgreSQL and FAISS-style storage.
27
+ The current `add-embeddings` CLI still targets the PostgreSQL backend path.
28
+
29
+ ---
30
+
31
+ ## `add-embeddings`
32
+
33
+ ### Usage
34
+ ```bash
35
+ omop-emb add-embeddings --api-base <URL> --api-key <KEY> [OPTIONS]
36
+ ```
37
+ where `[OPTIONS]` are optional arguments that can be specified as described below.
38
+
39
+
40
+ ### Command Options
41
+
42
+
43
+
44
+ | Option | Short | Type | Default | Description |
45
+ | :--- | :--- | :--- | :--- | :--- |
46
+ | **`--api-base`** | | `String` | **Required** | Base URL for the embedding API service. |
47
+ | **`--api-key`** | | `String` | **Required** | API key for the embedding API provider. |
48
+ | **`--index-type`** | | `IndexType` | `FLAT` | The storage index for the embeddings for retrieval. Currently supported: `FLAT`. |
49
+ | **`--batch-size`** | `-b` | `Integer` | `100` | Number of concepts to process in each chunk. |
50
+ | **`--model`** | `-m` | `String` | `text-embedding-3-small` | Name of the embedding model to use for generating vectors. |
51
+ | **`--backend`** | | `Literal['pgvector', 'faiss']` | `None` | Embedding backend to use (can be replaced by `OMOP_EMB_BACKEND` env var). Requires the respective backend installed using `pip install omop-emb[pgvector or faiss]` |
52
+ | **`--faiss-base-dir`** | | `String` | `None` | Optional base directory for FAISS backend storage. |
53
+ | **`--standard-only`** | | `Boolean` | `False` | If set, only generate embeddings for OMOP standard concepts (`standard_concept = 'S'`). |
54
+ | **`--vocabulary`** | | `List[String]` | `None` | Filter to embed concepts only from specific OMOP vocabularies. |
55
+ | **`--num-embeddings`** | `-n` | `Integer` | `None` | Limit the number of concepts processed (useful for testing). |