groundworkers 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groundworkers-0.1.0/PKG-INFO +116 -0
- groundworkers-0.1.0/README.md +96 -0
- groundworkers-0.1.0/pyproject.toml +46 -0
- groundworkers-0.1.0/setup.cfg +4 -0
- groundworkers-0.1.0/src/groundworkers/__init__.py +3 -0
- groundworkers-0.1.0/src/groundworkers/adapters/__init__.py +1 -0
- groundworkers-0.1.0/src/groundworkers/adapters/omop_emb.py +251 -0
- groundworkers-0.1.0/src/groundworkers/adapters/omop_graph.py +721 -0
- groundworkers-0.1.0/src/groundworkers/adapters/omop_vocab.py +582 -0
- groundworkers-0.1.0/src/groundworkers/base/__init__.py +17 -0
- groundworkers-0.1.0/src/groundworkers/base/errors.py +19 -0
- groundworkers-0.1.0/src/groundworkers/base/results.py +38 -0
- groundworkers-0.1.0/src/groundworkers/base/server.py +52 -0
- groundworkers-0.1.0/src/groundworkers/base/sql.py +109 -0
- groundworkers-0.1.0/src/groundworkers/config.py +139 -0
- groundworkers-0.1.0/src/groundworkers/server.py +127 -0
- groundworkers-0.1.0/src/groundworkers/tools/__init__.py +1 -0
- groundworkers-0.1.0/src/groundworkers/tools/concept_tools.py +237 -0
- groundworkers-0.1.0/src/groundworkers/tools/embedding_tools.py +83 -0
- groundworkers-0.1.0/src/groundworkers/tools/resolver_tools.py +90 -0
- groundworkers-0.1.0/src/groundworkers/tools/search_tools.py +163 -0
- groundworkers-0.1.0/src/groundworkers/tools/system_tools.py +67 -0
- groundworkers-0.1.0/src/groundworkers.egg-info/PKG-INFO +116 -0
- groundworkers-0.1.0/src/groundworkers.egg-info/SOURCES.txt +28 -0
- groundworkers-0.1.0/src/groundworkers.egg-info/dependency_links.txt +1 -0
- groundworkers-0.1.0/src/groundworkers.egg-info/entry_points.txt +2 -0
- groundworkers-0.1.0/src/groundworkers.egg-info/requires.txt +16 -0
- groundworkers-0.1.0/src/groundworkers.egg-info/top_level.txt +1 -0
- groundworkers-0.1.0/tests/test_server_registry.py +85 -0
- groundworkers-0.1.0/tests/test_sql_resource.py +61 -0
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: groundworkers
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Groundworkers MCP server — read-only agentive access to OMOP vocabularies, concept graphs, and embeddings.
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: mcp[cli]<2,>=1
|
|
8
|
+
Requires-Dist: pydantic<3,>=2
|
|
9
|
+
Requires-Dist: pyyaml<7,>=6
|
|
10
|
+
Requires-Dist: SQLAlchemy<3,>=2
|
|
11
|
+
Requires-Dist: psycopg[binary]<4,>=3.1
|
|
12
|
+
Requires-Dist: omop-graph>=1.1.0
|
|
13
|
+
Requires-Dist: omop-emb>=1.0.0
|
|
14
|
+
Provides-Extra: embedding-pgvector
|
|
15
|
+
Requires-Dist: omop-emb[pgvector]>=1.0.0; extra == "embedding-pgvector"
|
|
16
|
+
Provides-Extra: embedding-faiss
|
|
17
|
+
Requires-Dist: omop-emb[faiss-cpu]>=1.0.0; extra == "embedding-faiss"
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest<9,>=8; extra == "dev"
|
|
20
|
+
|
|
21
|
+
# groundworkers
|
|
22
|
+
|
|
23
|
+
**groundworkers** is an atomic, read-only MCP (Model Context Protocol) tool library for
|
|
24
|
+
navigating the OMOP vocabularies. It exposes OMOP vocabulary lookups, embedding similarity search,
|
|
25
|
+
cohort concept references, and system status as typed MCP tools that any MCP client can call —
|
|
26
|
+
including [groundcrew](https://github.com/AustralianCancerDataNetwork/groundcrew),
|
|
27
|
+
Claude Code, and autonomous agents.
|
|
28
|
+
|
|
29
|
+
Read-only. No patient-level data. No write operations.
|
|
30
|
+
|
|
31
|
+
## What it exposes
|
|
32
|
+
|
|
33
|
+
| Group | Tools |
|
|
34
|
+
|---|---|
|
|
35
|
+
| **Concept** | `concept_get`, `concept_by_code`, `concept_ancestors`, `concept_descendants`, `concept_relationships`, `concept_equivalency_path`, `concept_path`, `concept_map_to_standard`, `concept_neighbors` |
|
|
36
|
+
| **Resolver** | `concept_ground` (with `parent_ids`, scoring fields, and `grounding_explanation`) |
|
|
37
|
+
| **Search** | `concept_search_exact`, `concept_search_fulltext`, `concept_navigate_to_standard` |
|
|
38
|
+
| **Embedding** | `embedding_index_status`, `embedding_neighbours`, `embedding_search`, `embedding_encode` |
|
|
39
|
+
| **Cohort** | `cohort_find_concept_references` |
|
|
40
|
+
| **System** | `system_status`, `system_vocabulary_catalogue` |
|
|
41
|
+
|
|
42
|
+
Tools are registered conditionally — if an adapter is not configured, its tools are
|
|
43
|
+
simply not registered. `system_status` and `system_vocabulary_catalogue` are always
|
|
44
|
+
registered so clients can always query adapter availability.
|
|
45
|
+
|
|
46
|
+
## Quick start
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
uv venv
|
|
50
|
+
uv sync --extra dev --extra embedding-pgvector
|
|
51
|
+
uv run groundworkers --config config/groundworkers.example.yaml --describe
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Start the server:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
uv run groundworkers --config config/groundworkers.example.yaml
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Example config
|
|
61
|
+
|
|
62
|
+
```yaml
|
|
63
|
+
omop_graph:
|
|
64
|
+
db_url: "postgresql+psycopg://user:pass@localhost:5432/omop"
|
|
65
|
+
vocab_schema: omop_vocab
|
|
66
|
+
|
|
67
|
+
omop_emb:
|
|
68
|
+
enabled: true
|
|
69
|
+
backend_type: pgvector
|
|
70
|
+
db_url: "postgresql+psycopg://user:pass@localhost:5432/omop"
|
|
71
|
+
default_model_name: qwen3-embedding:0.6b
|
|
72
|
+
api_base: "http://localhost:11434/v1"
|
|
73
|
+
api_key: "ollama"
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Install matrix
|
|
77
|
+
|
|
78
|
+
| Use case | Extras |
|
|
79
|
+
|---|---|
|
|
80
|
+
| Core server only | none |
|
|
81
|
+
| Concept tools | `concept-tools` |
|
|
82
|
+
| Cohort tools | `cohort-tools` |
|
|
83
|
+
| Embedding tools (sqlite-vec) | `embedding-tools` |
|
|
84
|
+
| Embedding tools (pgvector) | `embedding-pgvector` |
|
|
85
|
+
| Embedding tools (FAISS sidecar) | `embedding-faiss` |
|
|
86
|
+
| All tool families | `all-tools` |
|
|
87
|
+
| All + pgvector embeddings | `all-tools-pgvector` |
|
|
88
|
+
| All + FAISS embeddings | `all-tools-faiss` |
|
|
89
|
+
| Development | `dev` |
|
|
90
|
+
| Development + all tools | `dev-all` |
|
|
91
|
+
|
|
92
|
+
## Layout
|
|
93
|
+
|
|
94
|
+
```
|
|
95
|
+
src/groundworkers/
|
|
96
|
+
adapters/ — omop_graph, omop_emb, oa_cohorts adapter classes
|
|
97
|
+
base/ — GroundcrewServer, errors, results, SQL helpers
|
|
98
|
+
tools/ — MCP tool registrations by domain
|
|
99
|
+
config.py — Pydantic config models (AppConfig, OmopGraphConfig, etc.)
|
|
100
|
+
server.py — Server factory and CLI entry point
|
|
101
|
+
config/ — Example YAML configs
|
|
102
|
+
_design/ — Architecture notes and spec documents
|
|
103
|
+
tests/ — Unit and integration tests
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## Adapter backends
|
|
107
|
+
|
|
108
|
+
- **omop-graph** — concept lookup, hierarchy traversal, full-text search
|
|
109
|
+
- **omop-emb** — embedding index (sqlite-vec, pgvector, or FAISS sidecar)
|
|
110
|
+
- **OpenAnalytics cohorts** — cohort concept reference queries (Phase N)
|
|
111
|
+
|
|
112
|
+
## Companion repos
|
|
113
|
+
|
|
114
|
+
- [groundcrew](https://github.com/AustralianCancerDataNetwork/groundcrew) — ACP orchestration layer that drives this tool substrate
|
|
115
|
+
- [omop-graph](https://australiancancerdatanetwork.github.io/omop-graph/) — OMOP virtual knowledge graph library
|
|
116
|
+
- [omop-emb](https://australiancancerdatanetwork.github.io/omop-emb/) — OMOP embedding index library
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# groundworkers
|
|
2
|
+
|
|
3
|
+
**groundworkers** is an atomic, read-only MCP (Model Context Protocol) tool library for
|
|
4
|
+
navigating the OMOP vocabularies. It exposes OMOP vocabulary lookups, embedding similarity search,
|
|
5
|
+
cohort concept references, and system status as typed MCP tools that any MCP client can call —
|
|
6
|
+
including [groundcrew](https://github.com/AustralianCancerDataNetwork/groundcrew),
|
|
7
|
+
Claude Code, and autonomous agents.
|
|
8
|
+
|
|
9
|
+
Read-only. No patient-level data. No write operations.
|
|
10
|
+
|
|
11
|
+
## What it exposes
|
|
12
|
+
|
|
13
|
+
| Group | Tools |
|
|
14
|
+
|---|---|
|
|
15
|
+
| **Concept** | `concept_get`, `concept_by_code`, `concept_ancestors`, `concept_descendants`, `concept_relationships`, `concept_equivalency_path`, `concept_path`, `concept_map_to_standard`, `concept_neighbors` |
|
|
16
|
+
| **Resolver** | `concept_ground` (with `parent_ids`, scoring fields, and `grounding_explanation`) |
|
|
17
|
+
| **Search** | `concept_search_exact`, `concept_search_fulltext`, `concept_navigate_to_standard` |
|
|
18
|
+
| **Embedding** | `embedding_index_status`, `embedding_neighbours`, `embedding_search`, `embedding_encode` |
|
|
19
|
+
| **Cohort** | `cohort_find_concept_references` |
|
|
20
|
+
| **System** | `system_status`, `system_vocabulary_catalogue` |
|
|
21
|
+
|
|
22
|
+
Tools are registered conditionally — if an adapter is not configured, its tools are
|
|
23
|
+
simply not registered. `system_status` and `system_vocabulary_catalogue` are always
|
|
24
|
+
registered so clients can always query adapter availability.
|
|
25
|
+
|
|
26
|
+
## Quick start
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
uv venv
|
|
30
|
+
uv sync --extra dev --extra embedding-pgvector
|
|
31
|
+
uv run groundworkers --config config/groundworkers.example.yaml --describe
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Start the server:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
uv run groundworkers --config config/groundworkers.example.yaml
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Example config
|
|
41
|
+
|
|
42
|
+
```yaml
|
|
43
|
+
omop_graph:
|
|
44
|
+
db_url: "postgresql+psycopg://user:pass@localhost:5432/omop"
|
|
45
|
+
vocab_schema: omop_vocab
|
|
46
|
+
|
|
47
|
+
omop_emb:
|
|
48
|
+
enabled: true
|
|
49
|
+
backend_type: pgvector
|
|
50
|
+
db_url: "postgresql+psycopg://user:pass@localhost:5432/omop"
|
|
51
|
+
default_model_name: qwen3-embedding:0.6b
|
|
52
|
+
api_base: "http://localhost:11434/v1"
|
|
53
|
+
api_key: "ollama"
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Install matrix
|
|
57
|
+
|
|
58
|
+
| Use case | Extras |
|
|
59
|
+
|---|---|
|
|
60
|
+
| Core server only | none |
|
|
61
|
+
| Concept tools | `concept-tools` |
|
|
62
|
+
| Cohort tools | `cohort-tools` |
|
|
63
|
+
| Embedding tools (sqlite-vec) | `embedding-tools` |
|
|
64
|
+
| Embedding tools (pgvector) | `embedding-pgvector` |
|
|
65
|
+
| Embedding tools (FAISS sidecar) | `embedding-faiss` |
|
|
66
|
+
| All tool families | `all-tools` |
|
|
67
|
+
| All + pgvector embeddings | `all-tools-pgvector` |
|
|
68
|
+
| All + FAISS embeddings | `all-tools-faiss` |
|
|
69
|
+
| Development | `dev` |
|
|
70
|
+
| Development + all tools | `dev-all` |
|
|
71
|
+
|
|
72
|
+
## Layout
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
src/groundworkers/
|
|
76
|
+
adapters/ — omop_graph, omop_emb, oa_cohorts adapter classes
|
|
77
|
+
base/ — GroundcrewServer, errors, results, SQL helpers
|
|
78
|
+
tools/ — MCP tool registrations by domain
|
|
79
|
+
config.py — Pydantic config models (AppConfig, OmopGraphConfig, etc.)
|
|
80
|
+
server.py — Server factory and CLI entry point
|
|
81
|
+
config/ — Example YAML configs
|
|
82
|
+
_design/ — Architecture notes and spec documents
|
|
83
|
+
tests/ — Unit and integration tests
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Adapter backends
|
|
87
|
+
|
|
88
|
+
- **omop-graph** — concept lookup, hierarchy traversal, full-text search
|
|
89
|
+
- **omop-emb** — embedding index (sqlite-vec, pgvector, or FAISS sidecar)
|
|
90
|
+
- **OpenAnalytics cohorts** — cohort concept reference queries (Phase N)
|
|
91
|
+
|
|
92
|
+
## Companion repos
|
|
93
|
+
|
|
94
|
+
- [groundcrew](https://github.com/AustralianCancerDataNetwork/groundcrew) — ACP orchestration layer that drives this tool substrate
|
|
95
|
+
- [omop-graph](https://australiancancerdatanetwork.github.io/omop-graph/) — OMOP virtual knowledge graph library
|
|
96
|
+
- [omop-emb](https://australiancancerdatanetwork.github.io/omop-emb/) — OMOP embedding index library
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "groundworkers"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Groundworkers MCP server — read-only agentive access to OMOP vocabularies, concept graphs, and embeddings."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"mcp[cli]>=1,<2",
|
|
9
|
+
"pydantic>=2,<3",
|
|
10
|
+
"pyyaml>=6,<7",
|
|
11
|
+
"SQLAlchemy>=2,<3",
|
|
12
|
+
"psycopg[binary]>=3.1,<4",
|
|
13
|
+
"omop-graph>=1.1.0",
|
|
14
|
+
"omop-emb>=1.0.0",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.optional-dependencies]
|
|
18
|
+
embedding-pgvector = [
|
|
19
|
+
"omop-emb[pgvector]>=1.0.0",
|
|
20
|
+
]
|
|
21
|
+
embedding-faiss = [
|
|
22
|
+
"omop-emb[faiss-cpu]>=1.0.0",
|
|
23
|
+
]
|
|
24
|
+
dev = [
|
|
25
|
+
"pytest>=8,<9",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.scripts]
|
|
29
|
+
groundworkers = "groundworkers.server:main"
|
|
30
|
+
|
|
31
|
+
[build-system]
|
|
32
|
+
requires = ["setuptools>=68", "wheel"]
|
|
33
|
+
build-backend = "setuptools.build_meta"
|
|
34
|
+
|
|
35
|
+
[tool.setuptools]
|
|
36
|
+
package-dir = {"" = "src"}
|
|
37
|
+
|
|
38
|
+
[tool.setuptools.packages.find]
|
|
39
|
+
where = ["src"]
|
|
40
|
+
|
|
41
|
+
[tool.pytest.ini_options]
|
|
42
|
+
testpaths = ["tests"]
|
|
43
|
+
markers = [
|
|
44
|
+
"integration: requires a live backend or database",
|
|
45
|
+
"requires_client: requires an embedding API client",
|
|
46
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from omop_emb import EmbeddingConceptFilter, EmbeddingReaderInterface
|
|
8
|
+
from omop_emb.config import MetricType
|
|
9
|
+
from omop_emb.embeddings.embedding_client import EmbeddingRole
|
|
10
|
+
from omop_emb.interface import list_registered_models
|
|
11
|
+
|
|
12
|
+
from groundworkers.base.errors import GroundworkersError
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class OmopEmbAdapter:
|
|
16
|
+
def __init__(
|
|
17
|
+
self,
|
|
18
|
+
*,
|
|
19
|
+
backend_factory: Callable[[], object],
|
|
20
|
+
backend_type: str | None,
|
|
21
|
+
default_model_name: str | None = None,
|
|
22
|
+
client_factory: Callable[[str], object] | None = None,
|
|
23
|
+
cdm_engine: object | None = None,
|
|
24
|
+
faiss_cache_dir: str | None = None,
|
|
25
|
+
) -> None:
|
|
26
|
+
self._backend_factory = backend_factory
|
|
27
|
+
self._backend_type = backend_type
|
|
28
|
+
self._default_model_name = default_model_name
|
|
29
|
+
self._client_factory = client_factory
|
|
30
|
+
self._cdm_engine = cdm_engine
|
|
31
|
+
self._faiss_cache_dir = faiss_cache_dir
|
|
32
|
+
self._backend: object | None = None
|
|
33
|
+
self._clients: dict[str, object] = {}
|
|
34
|
+
|
|
35
|
+
def is_available(self) -> bool:
|
|
36
|
+
return self.index_status()["available"]
|
|
37
|
+
|
|
38
|
+
def has_client(self) -> bool:
|
|
39
|
+
return self._client_factory is not None
|
|
40
|
+
|
|
41
|
+
def close(self) -> None:
|
|
42
|
+
backend = self._backend
|
|
43
|
+
if backend is not None and hasattr(backend, "close"):
|
|
44
|
+
backend.close()
|
|
45
|
+
self._backend = None
|
|
46
|
+
self._clients.clear()
|
|
47
|
+
|
|
48
|
+
def index_status(self) -> dict[str, Any]:
|
|
49
|
+
try:
|
|
50
|
+
backend = self._get_backend()
|
|
51
|
+
records = list_registered_models(backend=backend)
|
|
52
|
+
models: list[dict[str, Any]] = []
|
|
53
|
+
for record in records:
|
|
54
|
+
metric_type = record.metric_type or MetricType.COSINE
|
|
55
|
+
concept_count = backend.get_embedding_count(
|
|
56
|
+
model_name=record.model_name,
|
|
57
|
+
metric_type=metric_type,
|
|
58
|
+
)
|
|
59
|
+
models.append(
|
|
60
|
+
{
|
|
61
|
+
"model_name": record.model_name,
|
|
62
|
+
"provider": self._enum_value(record.provider_type),
|
|
63
|
+
"dimensions": int(record.dimensions),
|
|
64
|
+
"index_type": self._enum_value(record.index_type),
|
|
65
|
+
"concept_count": int(concept_count),
|
|
66
|
+
}
|
|
67
|
+
)
|
|
68
|
+
return {
|
|
69
|
+
"available": bool(models),
|
|
70
|
+
"backend_type": self._backend_type or self._backend_type_from_backend(backend),
|
|
71
|
+
"models": models,
|
|
72
|
+
}
|
|
73
|
+
except Exception:
|
|
74
|
+
return {
|
|
75
|
+
"available": False,
|
|
76
|
+
"backend_type": self._backend_type,
|
|
77
|
+
"models": [],
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
def get_neighbours(
|
|
81
|
+
self,
|
|
82
|
+
concept_id: int,
|
|
83
|
+
limit: int,
|
|
84
|
+
model_name: str | None,
|
|
85
|
+
) -> dict[str, Any]:
|
|
86
|
+
record = self._resolve_model_record(model_name)
|
|
87
|
+
reader = self._build_reader(record)
|
|
88
|
+
vectors = reader.get_embeddings_by_concept_ids((concept_id,))
|
|
89
|
+
if concept_id not in vectors:
|
|
90
|
+
raise GroundworkersError("NOT_FOUND", f"Concept {concept_id} is not present in the embedding index")
|
|
91
|
+
|
|
92
|
+
vector = np.asarray(vectors[concept_id], dtype=float).reshape(1, -1)
|
|
93
|
+
# Request limit+1 so that self-exclusion below still yields `limit` results.
|
|
94
|
+
concept_filter = self._build_concept_filter(limit=limit + 1)
|
|
95
|
+
raw = reader.get_nearest_concepts(
|
|
96
|
+
query_embedding=vector,
|
|
97
|
+
concept_filter=concept_filter,
|
|
98
|
+
k=limit + 1,
|
|
99
|
+
)
|
|
100
|
+
matches = raw[0] if raw else ()
|
|
101
|
+
results = [
|
|
102
|
+
self._serialise_nearest_match(match)
|
|
103
|
+
for match in matches
|
|
104
|
+
if getattr(match, "concept_id", None) != concept_id
|
|
105
|
+
][:limit]
|
|
106
|
+
return {
|
|
107
|
+
"query_concept_id": concept_id,
|
|
108
|
+
"model_name": record.model_name,
|
|
109
|
+
"results": results,
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
def search(
|
|
113
|
+
self,
|
|
114
|
+
query: str,
|
|
115
|
+
limit: int,
|
|
116
|
+
domain: str | None,
|
|
117
|
+
vocabulary: str | None,
|
|
118
|
+
standard_only: bool,
|
|
119
|
+
active_only: bool,
|
|
120
|
+
model_name: str | None,
|
|
121
|
+
) -> dict[str, Any]:
|
|
122
|
+
if not self.has_client():
|
|
123
|
+
raise GroundworkersError(
|
|
124
|
+
"BACKEND_UNAVAIL",
|
|
125
|
+
"on-the-fly embedding requires a configured model client",
|
|
126
|
+
)
|
|
127
|
+
record = self._resolve_model_record(model_name)
|
|
128
|
+
reader = self._build_reader(record)
|
|
129
|
+
client = self._get_client(record.model_name)
|
|
130
|
+
concept_filter = self._build_concept_filter(
|
|
131
|
+
limit=limit,
|
|
132
|
+
domain=domain,
|
|
133
|
+
vocabulary=vocabulary,
|
|
134
|
+
standard_only=standard_only,
|
|
135
|
+
active_only=active_only,
|
|
136
|
+
)
|
|
137
|
+
raw = reader.get_nearest_concepts_from_query_texts(
|
|
138
|
+
query_texts=(query,),
|
|
139
|
+
embedding_client=client,
|
|
140
|
+
concept_filter=concept_filter,
|
|
141
|
+
k=limit,
|
|
142
|
+
)
|
|
143
|
+
matches = raw[0] if raw else ()
|
|
144
|
+
return {
|
|
145
|
+
"query_text": query,
|
|
146
|
+
"model_name": record.model_name,
|
|
147
|
+
"results": [self._serialise_nearest_match(match) for match in matches],
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
def encode(self, text: str, model_name: str | None) -> dict[str, Any]:
|
|
151
|
+
if not self.has_client():
|
|
152
|
+
raise GroundworkersError("BACKEND_UNAVAIL", "embedding client is not configured")
|
|
153
|
+
record = self._resolve_model_record(model_name)
|
|
154
|
+
client = self._get_client(record.model_name)
|
|
155
|
+
vector = client.embeddings(text, embedding_role=EmbeddingRole.QUERY)
|
|
156
|
+
array = np.asarray(vector, dtype=float)
|
|
157
|
+
if array.ndim != 2 or array.shape[0] != 1:
|
|
158
|
+
raise GroundworkersError("QUERY_ERROR", f"Expected one embedding vector, got shape {array.shape}")
|
|
159
|
+
row = array[0]
|
|
160
|
+
return {
|
|
161
|
+
"text": text,
|
|
162
|
+
"model_name": record.model_name,
|
|
163
|
+
"dimensions": int(row.shape[0]),
|
|
164
|
+
"vector": row.tolist(),
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
def _get_backend(self) -> object:
|
|
168
|
+
if self._backend is None:
|
|
169
|
+
try:
|
|
170
|
+
self._backend = self._backend_factory()
|
|
171
|
+
except Exception as exc:
|
|
172
|
+
raise GroundworkersError("BACKEND_UNAVAIL", f"Embedding backend is unavailable: {exc}") from exc
|
|
173
|
+
return self._backend
|
|
174
|
+
|
|
175
|
+
def _resolve_model_record(self, model_name: str | None) -> object:
|
|
176
|
+
backend = self._get_backend()
|
|
177
|
+
requested_name = model_name or self._default_model_name
|
|
178
|
+
records = list_registered_models(backend=backend, model_name=requested_name)
|
|
179
|
+
if requested_name is not None:
|
|
180
|
+
if not records:
|
|
181
|
+
raise GroundworkersError("NOT_FOUND", f"Embedding model {requested_name!r} is not registered")
|
|
182
|
+
return records[0]
|
|
183
|
+
# No specific model requested — records contains all registered models.
|
|
184
|
+
if len(records) == 1:
|
|
185
|
+
return records[0]
|
|
186
|
+
if not records:
|
|
187
|
+
raise GroundworkersError("BACKEND_UNAVAIL", "No embedding models are registered in the backend")
|
|
188
|
+
raise GroundworkersError(
|
|
189
|
+
"BACKEND_UNAVAIL",
|
|
190
|
+
"No default embedding model is configured and multiple registered models are available",
|
|
191
|
+
)
|
|
192
|
+
|
|
193
|
+
def _build_reader(self, record: object) -> EmbeddingReaderInterface:
|
|
194
|
+
return EmbeddingReaderInterface(
|
|
195
|
+
model=record.model_name,
|
|
196
|
+
backend=self._get_backend(),
|
|
197
|
+
metric_type=record.metric_type or MetricType.COSINE,
|
|
198
|
+
omop_cdm_engine=self._cdm_engine,
|
|
199
|
+
provider_name_or_type=record.provider_type,
|
|
200
|
+
faiss_cache_dir=self._faiss_cache_dir,
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
def _get_client(self, model_name: str) -> object:
|
|
204
|
+
if self._client_factory is None:
|
|
205
|
+
raise GroundworkersError("BACKEND_UNAVAIL", "embedding client is not configured")
|
|
206
|
+
client = self._clients.get(model_name)
|
|
207
|
+
if client is None:
|
|
208
|
+
try:
|
|
209
|
+
client = self._client_factory(model_name)
|
|
210
|
+
except Exception as exc:
|
|
211
|
+
raise GroundworkersError("BACKEND_UNAVAIL", f"Embedding client is unavailable: {exc}") from exc
|
|
212
|
+
self._clients[model_name] = client
|
|
213
|
+
return client
|
|
214
|
+
|
|
215
|
+
def _build_concept_filter(
|
|
216
|
+
self,
|
|
217
|
+
*,
|
|
218
|
+
limit: int,
|
|
219
|
+
domain: str | None = None,
|
|
220
|
+
vocabulary: str | None = None,
|
|
221
|
+
standard_only: bool = False,
|
|
222
|
+
active_only: bool = False,
|
|
223
|
+
) -> EmbeddingConceptFilter:
|
|
224
|
+
domains = (domain,) if domain else None
|
|
225
|
+
vocabularies = (vocabulary,) if vocabulary else None
|
|
226
|
+
return EmbeddingConceptFilter(
|
|
227
|
+
domains=domains,
|
|
228
|
+
vocabularies=vocabularies,
|
|
229
|
+
require_standard=standard_only,
|
|
230
|
+
require_active=active_only,
|
|
231
|
+
limit=limit,
|
|
232
|
+
)
|
|
233
|
+
|
|
234
|
+
def _backend_type_from_backend(self, backend: object) -> str | None:
|
|
235
|
+
backend_type = getattr(backend, "backend_type", None)
|
|
236
|
+
return self._enum_value(backend_type)
|
|
237
|
+
|
|
238
|
+
def _serialise_nearest_match(self, match: object) -> dict[str, Any]:
|
|
239
|
+
return {
|
|
240
|
+
"concept_id": int(getattr(match, "concept_id")),
|
|
241
|
+
"concept_name": getattr(match, "concept_name", None),
|
|
242
|
+
"similarity": round(float(getattr(match, "similarity")), 6),
|
|
243
|
+
"is_standard": getattr(match, "is_standard", None),
|
|
244
|
+
"is_active": getattr(match, "is_active", None),
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
@staticmethod
|
|
248
|
+
def _enum_value(value: object) -> str | None:
|
|
249
|
+
if value is None:
|
|
250
|
+
return None
|
|
251
|
+
return getattr(value, "value", str(value))
|