groundworkers 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. groundworkers-0.2.0/PKG-INFO +208 -0
  2. groundworkers-0.2.0/README.md +178 -0
  3. {groundworkers-0.1.0 → groundworkers-0.2.0}/pyproject.toml +12 -2
  4. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/adapters/omop_emb.py +19 -11
  5. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/adapters/omop_graph.py +69 -10
  6. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/adapters/omop_vocab.py +272 -0
  7. groundworkers-0.1.0/src/groundworkers/server.py → groundworkers-0.2.0/src/groundworkers/app.py +50 -55
  8. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/config.py +14 -0
  9. groundworkers-0.2.0/src/groundworkers/server.py +62 -0
  10. groundworkers-0.2.0/src/groundworkers/services/__init__.py +3 -0
  11. groundworkers-0.2.0/src/groundworkers/services/mapping.py +600 -0
  12. groundworkers-0.2.0/src/groundworkers/tools/mapping_tools.py +225 -0
  13. groundworkers-0.2.0/src/groundworkers.egg-info/PKG-INFO +208 -0
  14. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers.egg-info/SOURCES.txt +4 -0
  15. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers.egg-info/requires.txt +11 -1
  16. {groundworkers-0.1.0 → groundworkers-0.2.0}/tests/test_server_registry.py +12 -0
  17. groundworkers-0.1.0/PKG-INFO +0 -116
  18. groundworkers-0.1.0/README.md +0 -96
  19. groundworkers-0.1.0/src/groundworkers.egg-info/PKG-INFO +0 -116
  20. {groundworkers-0.1.0 → groundworkers-0.2.0}/setup.cfg +0 -0
  21. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/__init__.py +0 -0
  22. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/adapters/__init__.py +0 -0
  23. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/base/__init__.py +0 -0
  24. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/base/errors.py +0 -0
  25. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/base/results.py +0 -0
  26. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/base/server.py +0 -0
  27. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/base/sql.py +0 -0
  28. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/tools/__init__.py +0 -0
  29. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/tools/concept_tools.py +0 -0
  30. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/tools/embedding_tools.py +0 -0
  31. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/tools/resolver_tools.py +0 -0
  32. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/tools/search_tools.py +0 -0
  33. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers/tools/system_tools.py +0 -0
  34. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers.egg-info/dependency_links.txt +0 -0
  35. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers.egg-info/entry_points.txt +0 -0
  36. {groundworkers-0.1.0 → groundworkers-0.2.0}/src/groundworkers.egg-info/top_level.txt +0 -0
  37. {groundworkers-0.1.0 → groundworkers-0.2.0}/tests/test_sql_resource.py +0 -0
@@ -0,0 +1,208 @@
1
+ Metadata-Version: 2.4
2
+ Name: groundworkers
3
+ Version: 0.2.0
4
+ Summary: Groundworkers MCP server — read-only agentive access to OMOP vocabularies, concept graphs, and embeddings.
5
+ Requires-Python: >=3.12
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: mcp[cli]<2,>=1
8
+ Requires-Dist: pydantic<3,>=2
9
+ Requires-Dist: pyyaml<7,>=6
10
+ Requires-Dist: SQLAlchemy<3,>=2
11
+ Requires-Dist: psycopg[binary]<4,>=3.1
12
+ Requires-Dist: omop-graph>=1.1.0
13
+ Requires-Dist: omop-emb>=1.0.0
14
+ Provides-Extra: embedding-pgvector
15
+ Requires-Dist: omop-emb[pgvector]>=1.0.0; extra == "embedding-pgvector"
16
+ Provides-Extra: embedding-faiss
17
+ Requires-Dist: omop-emb[faiss-cpu]>=1.0.0; extra == "embedding-faiss"
18
+ Provides-Extra: dev
19
+ Requires-Dist: ipython>=8.0; extra == "dev"
20
+ Requires-Dist: tornado>=6.5.5; extra == "dev"
21
+ Requires-Dist: pytest>=9.0.3; extra == "dev"
22
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
23
+ Requires-Dist: mypy>=1.8; extra == "dev"
24
+ Requires-Dist: ruff>=0.4; extra == "dev"
25
+ Requires-Dist: mkdocs-material>=9.7.1; extra == "dev"
26
+ Requires-Dist: mkdocstrings-python>=2.0.1; extra == "dev"
27
+ Requires-Dist: mkdocs>=1.6.1; extra == "dev"
28
+ Requires-Dist: requests>=2.33.0; extra == "dev"
29
+ Requires-Dist: mkdocs-mermaid2-plugin; extra == "dev"
30
+
31
+ # groundworkers
32
+
33
+ `groundworkers` is a read-only OMOP vocabulary integration package. You can use it
34
+ in two ways:
35
+
36
+ - as an **MCP server** for tool consumers such as `groundcrew`, Claude Code, and
37
+ other MCP clients
38
+ - as a **Python library** for applications that want to call mapping and retrieval
39
+ logic directly
40
+
41
+ No patient-level writes. No session state. No transport-specific business logic.
42
+
43
+ ## When to use it
44
+
45
+ Use `groundworkers` when you want:
46
+
47
+ - OMOP concept lookup and hierarchy navigation
48
+ - exact, full-text, and embedding-based concept retrieval
49
+ - mapping-oriented evidence bundles and context assembly
50
+ - one package that works both over MCP and in-process from Python
51
+
52
+ ## How it is organized
53
+
54
+ ```mermaid
55
+ flowchart LR
56
+ Client1[Python app] --> App[build_application]
57
+ Client2[MCP client] --> Server[groundworkers server]
58
+ App --> Services[services/]
59
+ Server --> Tools[tools/]
60
+ Tools --> Services
61
+ Services --> Adapters[adapters/]
62
+ Adapters --> OG[omop-graph]
63
+ Adapters --> OE[omop-emb]
64
+ Adapters --> DB[(OMOP DB)]
65
+ ```
66
+
67
+ - `adapters/` handle dependency-specific details
68
+ - `services/` handle reusable workflow logic
69
+ - `tools/` expose MCP-facing wrappers
70
+ - `app.py` and `server.py` wire those pieces together
71
+
72
+ ## What it exposes
73
+
74
+ | Group | Surface | Notes |
75
+ |---|---|---|
76
+ | Concept | `concept_get`, `concept_by_code`, `concept_ancestors`, `concept_descendants`, `concept_relationships`, `concept_equivalency_path`, `concept_path`, `concept_map_to_standard`, `concept_neighbors` | Backed by `OmopGraphAdapter` |
77
+ | Resolver | `concept_ground` | Best-answer grounding pipeline |
78
+ | Search | `concept_search_exact`, `concept_search_fulltext`, `concept_navigate_to_standard` | Low-level lexical primitives |
79
+ | Mapping | `concept_search_normalized`, `concept_candidate_bundle`, `concept_parent_backoff`, `concept_mapping_context`, `concept_map_to_value`, `concept_resolve_mapping_expression`, `mapping_evaluate_candidates` | High-level mapping workflows |
80
+ | Embedding | `embedding_index_status`, `embedding_neighbours`, `embedding_search`, `embedding_encode` | Backed by `OmopEmbAdapter` |
81
+ | System | `system_status`, `system_vocabulary_catalogue` | Always registered |
82
+
83
+ ## Quick start
84
+
85
+ ### MCP server
86
+
87
+ ```bash
88
+ uv venv
89
+ uv sync --extra dev --extra embedding-pgvector
90
+ uv run groundworkers --config config/groundworkers.example.yaml --describe
91
+ uv run groundworkers --config config/groundworkers.example.yaml
92
+ ```
93
+
94
+ ### Direct Python use
95
+
96
+ ```python
97
+ from groundworkers.app import build_application
98
+ from groundworkers.config import AppConfig
99
+
100
+ config = AppConfig.model_validate(
101
+ {
102
+ "omop_graph": {
103
+ "db_url": "postgresql+psycopg://user:pass@localhost:5432/omop",
104
+ "vocab_schema": "omop_vocab",
105
+ },
106
+ "omop_emb": {
107
+ "enabled": True,
108
+ "backend_type": "pgvector",
109
+ "db_url": "postgresql+psycopg://user:pass@localhost:5432/omop",
110
+ "default_model_name": "qwen3-embedding:0.6b",
111
+ "api_base": "http://localhost:11434/v1",
112
+ "api_key": "ollama",
113
+ },
114
+ }
115
+ )
116
+
117
+ app = build_application(config)
118
+ mapping = app.services.mapping
119
+ assert mapping is not None
120
+
121
+ bundle = mapping.concept_candidate_bundle(
122
+ "type 2 diabetes",
123
+ domain="Condition",
124
+ include_normalized=True,
125
+ include_fulltext=True,
126
+ include_embedding=True,
127
+ )
128
+ ```
129
+
130
+ ## Example config
131
+
132
+ ```yaml
133
+ omop_graph:
134
+ db_url: "postgresql+psycopg://user:pass@localhost:5432/omop"
135
+ vocab_schema: omop_vocab
136
+
137
+ omop_emb:
138
+ enabled: true
139
+ backend_type: pgvector
140
+ db_url: "postgresql+psycopg://user:pass@localhost:5432/omop"
141
+ default_model_name: qwen3-embedding:0.6b
142
+ api_base: "http://localhost:11434/v1"
143
+ api_key: "ollama"
144
+ ```
145
+
146
+ ## End-to-end examples
147
+
148
+ ### MCP consumer flow
149
+
150
+ ```mermaid
151
+ sequenceDiagram
152
+ participant C as MCP consumer
153
+ participant GW as groundworkers
154
+ participant M as MappingService
155
+ participant D as OMOP dependencies
156
+
157
+ C->>GW: call tool concept_candidate_bundle
158
+ GW->>M: invoke tool wrapper
159
+ M->>D: gather lexical, graph, and embedding evidence
160
+ D-->>M: candidate evidence
161
+ M-->>GW: assembled bundle
162
+ GW-->>C: MCP-safe JSON result
163
+ ```
164
+
165
+ Representative tool payload:
166
+
167
+ ```json
168
+ {
169
+ "tool": "concept_candidate_bundle",
170
+ "arguments": {
171
+ "query": "type 2 diabetes",
172
+ "domain": "Condition",
173
+ "include_normalized": true,
174
+ "include_fulltext": true,
175
+ "include_embedding": true,
176
+ "include_standard_mappings": true
177
+ }
178
+ }
179
+ ```
180
+
181
+ ### Direct Python flow
182
+
183
+ ```mermaid
184
+ sequenceDiagram
185
+ participant App as Python application
186
+ participant S as MappingService
187
+ participant A as Adapters
188
+ participant D as OMOP dependencies
189
+
190
+ App->>S: concept_mapping_context(...)
191
+ S->>A: coordinate graph / vocab / emb calls
192
+ A->>D: execute dependency queries
193
+ D-->>A: raw results
194
+ A-->>S: adapter-shaped results
195
+ S-->>App: domain result
196
+ ```
197
+
198
+ ## If you are using it as a library
199
+
200
+ Start with `build_application(config)` and `app.services.mapping` for higher-level
201
+ mapping workflows. Drop down to `app.adapters.*` when you want lower-level,
202
+ dependency-shaped operations.
203
+
204
+ ## Companion repos
205
+
206
+ - [groundcrew](https://github.com/AustralianCancerDataNetwork/groundcrew) for MCP-based orchestration
207
+ - [omop-graph](https://australiancancerdatanetwork.github.io/omop-graph/) for OMOP concept and hierarchy queries
208
+ - [omop-emb](https://australiancancerdatanetwork.github.io/omop-emb/) for embedding index and semantic retrieval
@@ -0,0 +1,178 @@
1
+ # groundworkers
2
+
3
+ `groundworkers` is a read-only OMOP vocabulary integration package. You can use it
4
+ in two ways:
5
+
6
+ - as an **MCP server** for tool consumers such as `groundcrew`, Claude Code, and
7
+ other MCP clients
8
+ - as a **Python library** for applications that want to call mapping and retrieval
9
+ logic directly
10
+
11
+ No patient-level writes. No session state. No transport-specific business logic.
12
+
13
+ ## When to use it
14
+
15
+ Use `groundworkers` when you want:
16
+
17
+ - OMOP concept lookup and hierarchy navigation
18
+ - exact, full-text, and embedding-based concept retrieval
19
+ - mapping-oriented evidence bundles and context assembly
20
+ - one package that works both over MCP and in-process from Python
21
+
22
+ ## How it is organized
23
+
24
+ ```mermaid
25
+ flowchart LR
26
+ Client1[Python app] --> App[build_application]
27
+ Client2[MCP client] --> Server[groundworkers server]
28
+ App --> Services[services/]
29
+ Server --> Tools[tools/]
30
+ Tools --> Services
31
+ Services --> Adapters[adapters/]
32
+ Adapters --> OG[omop-graph]
33
+ Adapters --> OE[omop-emb]
34
+ Adapters --> DB[(OMOP DB)]
35
+ ```
36
+
37
+ - `adapters/` handle dependency-specific details
38
+ - `services/` handle reusable workflow logic
39
+ - `tools/` expose MCP-facing wrappers
40
+ - `app.py` and `server.py` wire those pieces together
41
+
42
+ ## What it exposes
43
+
44
+ | Group | Surface | Notes |
45
+ |---|---|---|
46
+ | Concept | `concept_get`, `concept_by_code`, `concept_ancestors`, `concept_descendants`, `concept_relationships`, `concept_equivalency_path`, `concept_path`, `concept_map_to_standard`, `concept_neighbors` | Backed by `OmopGraphAdapter` |
47
+ | Resolver | `concept_ground` | Best-answer grounding pipeline |
48
+ | Search | `concept_search_exact`, `concept_search_fulltext`, `concept_navigate_to_standard` | Low-level lexical primitives |
49
+ | Mapping | `concept_search_normalized`, `concept_candidate_bundle`, `concept_parent_backoff`, `concept_mapping_context`, `concept_map_to_value`, `concept_resolve_mapping_expression`, `mapping_evaluate_candidates` | High-level mapping workflows |
50
+ | Embedding | `embedding_index_status`, `embedding_neighbours`, `embedding_search`, `embedding_encode` | Backed by `OmopEmbAdapter` |
51
+ | System | `system_status`, `system_vocabulary_catalogue` | Always registered |
52
+
53
+ ## Quick start
54
+
55
+ ### MCP server
56
+
57
+ ```bash
58
+ uv venv
59
+ uv sync --extra dev --extra embedding-pgvector
60
+ uv run groundworkers --config config/groundworkers.example.yaml --describe
61
+ uv run groundworkers --config config/groundworkers.example.yaml
62
+ ```
63
+
64
+ ### Direct Python use
65
+
66
+ ```python
67
+ from groundworkers.app import build_application
68
+ from groundworkers.config import AppConfig
69
+
70
+ config = AppConfig.model_validate(
71
+ {
72
+ "omop_graph": {
73
+ "db_url": "postgresql+psycopg://user:pass@localhost:5432/omop",
74
+ "vocab_schema": "omop_vocab",
75
+ },
76
+ "omop_emb": {
77
+ "enabled": True,
78
+ "backend_type": "pgvector",
79
+ "db_url": "postgresql+psycopg://user:pass@localhost:5432/omop",
80
+ "default_model_name": "qwen3-embedding:0.6b",
81
+ "api_base": "http://localhost:11434/v1",
82
+ "api_key": "ollama",
83
+ },
84
+ }
85
+ )
86
+
87
+ app = build_application(config)
88
+ mapping = app.services.mapping
89
+ assert mapping is not None
90
+
91
+ bundle = mapping.concept_candidate_bundle(
92
+ "type 2 diabetes",
93
+ domain="Condition",
94
+ include_normalized=True,
95
+ include_fulltext=True,
96
+ include_embedding=True,
97
+ )
98
+ ```
99
+
100
+ ## Example config
101
+
102
+ ```yaml
103
+ omop_graph:
104
+ db_url: "postgresql+psycopg://user:pass@localhost:5432/omop"
105
+ vocab_schema: omop_vocab
106
+
107
+ omop_emb:
108
+ enabled: true
109
+ backend_type: pgvector
110
+ db_url: "postgresql+psycopg://user:pass@localhost:5432/omop"
111
+ default_model_name: qwen3-embedding:0.6b
112
+ api_base: "http://localhost:11434/v1"
113
+ api_key: "ollama"
114
+ ```
115
+
116
+ ## End-to-end examples
117
+
118
+ ### MCP consumer flow
119
+
120
+ ```mermaid
121
+ sequenceDiagram
122
+ participant C as MCP consumer
123
+ participant GW as groundworkers
124
+ participant M as MappingService
125
+ participant D as OMOP dependencies
126
+
127
+ C->>GW: call tool concept_candidate_bundle
128
+ GW->>M: invoke tool wrapper
129
+ M->>D: gather lexical, graph, and embedding evidence
130
+ D-->>M: candidate evidence
131
+ M-->>GW: assembled bundle
132
+ GW-->>C: MCP-safe JSON result
133
+ ```
134
+
135
+ Representative tool payload:
136
+
137
+ ```json
138
+ {
139
+ "tool": "concept_candidate_bundle",
140
+ "arguments": {
141
+ "query": "type 2 diabetes",
142
+ "domain": "Condition",
143
+ "include_normalized": true,
144
+ "include_fulltext": true,
145
+ "include_embedding": true,
146
+ "include_standard_mappings": true
147
+ }
148
+ }
149
+ ```
150
+
151
+ ### Direct Python flow
152
+
153
+ ```mermaid
154
+ sequenceDiagram
155
+ participant App as Python application
156
+ participant S as MappingService
157
+ participant A as Adapters
158
+ participant D as OMOP dependencies
159
+
160
+ App->>S: concept_mapping_context(...)
161
+ S->>A: coordinate graph / vocab / emb calls
162
+ A->>D: execute dependency queries
163
+ D-->>A: raw results
164
+ A-->>S: adapter-shaped results
165
+ S-->>App: domain result
166
+ ```
167
+
168
+ ## If you are using it as a library
169
+
170
+ Start with `build_application(config)` and `app.services.mapping` for higher-level
171
+ mapping workflows. Drop down to `app.adapters.*` when you want lower-level,
172
+ dependency-shaped operations.
173
+
174
+ ## Companion repos
175
+
176
+ - [groundcrew](https://github.com/AustralianCancerDataNetwork/groundcrew) for MCP-based orchestration
177
+ - [omop-graph](https://australiancancerdatanetwork.github.io/omop-graph/) for OMOP concept and hierarchy queries
178
+ - [omop-emb](https://australiancancerdatanetwork.github.io/omop-emb/) for embedding index and semantic retrieval
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "groundworkers"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  description = "Groundworkers MCP server — read-only agentive access to OMOP vocabularies, concept graphs, and embeddings."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -22,7 +22,17 @@ embedding-faiss = [
22
22
  "omop-emb[faiss-cpu]>=1.0.0",
23
23
  ]
24
24
  dev = [
25
- "pytest>=8,<9",
25
+ "ipython>=8.0",
26
+ "tornado>=6.5.5",
27
+ "pytest>=9.0.3",
28
+ "pytest-cov>=4.0",
29
+ "mypy>=1.8",
30
+ "ruff>=0.4",
31
+ "mkdocs-material>=9.7.1",
32
+ "mkdocstrings-python>=2.0.1",
33
+ "mkdocs>=1.6.1",
34
+ "requests>=2.33.0",
35
+ "mkdocs-mermaid2-plugin"
26
36
  ]
27
37
 
28
38
  [project.scripts]
@@ -4,7 +4,15 @@ from collections.abc import Callable
4
4
  from typing import Any
5
5
 
6
6
  import numpy as np
7
- from omop_emb import EmbeddingConceptFilter, EmbeddingReaderInterface
7
+ from sqlalchemy.engine import Engine
8
+
9
+ from omop_emb import (
10
+ EmbeddingBackend,
11
+ EmbeddingClient,
12
+ EmbeddingConceptFilter,
13
+ EmbeddingModelRecord,
14
+ EmbeddingReaderInterface,
15
+ )
8
16
  from omop_emb.config import MetricType
9
17
  from omop_emb.embeddings.embedding_client import EmbeddingRole
10
18
  from omop_emb.interface import list_registered_models
@@ -16,11 +24,11 @@ class OmopEmbAdapter:
16
24
  def __init__(
17
25
  self,
18
26
  *,
19
- backend_factory: Callable[[], object],
27
+ backend_factory: Callable[[], EmbeddingBackend],
20
28
  backend_type: str | None,
21
29
  default_model_name: str | None = None,
22
- client_factory: Callable[[str], object] | None = None,
23
- cdm_engine: object | None = None,
30
+ client_factory: Callable[[str], EmbeddingClient] | None = None,
31
+ cdm_engine: Engine | None = None,
24
32
  faiss_cache_dir: str | None = None,
25
33
  ) -> None:
26
34
  self._backend_factory = backend_factory
@@ -29,8 +37,8 @@ class OmopEmbAdapter:
29
37
  self._client_factory = client_factory
30
38
  self._cdm_engine = cdm_engine
31
39
  self._faiss_cache_dir = faiss_cache_dir
32
- self._backend: object | None = None
33
- self._clients: dict[str, object] = {}
40
+ self._backend: EmbeddingBackend | None = None
41
+ self._clients: dict[str, EmbeddingClient] = {}
34
42
 
35
43
  def is_available(self) -> bool:
36
44
  return self.index_status()["available"]
@@ -164,7 +172,7 @@ class OmopEmbAdapter:
164
172
  "vector": row.tolist(),
165
173
  }
166
174
 
167
- def _get_backend(self) -> object:
175
+ def _get_backend(self) -> EmbeddingBackend:
168
176
  if self._backend is None:
169
177
  try:
170
178
  self._backend = self._backend_factory()
@@ -172,7 +180,7 @@ class OmopEmbAdapter:
172
180
  raise GroundworkersError("BACKEND_UNAVAIL", f"Embedding backend is unavailable: {exc}") from exc
173
181
  return self._backend
174
182
 
175
- def _resolve_model_record(self, model_name: str | None) -> object:
183
+ def _resolve_model_record(self, model_name: str | None) -> EmbeddingModelRecord:
176
184
  backend = self._get_backend()
177
185
  requested_name = model_name or self._default_model_name
178
186
  records = list_registered_models(backend=backend, model_name=requested_name)
@@ -190,7 +198,7 @@ class OmopEmbAdapter:
190
198
  "No default embedding model is configured and multiple registered models are available",
191
199
  )
192
200
 
193
- def _build_reader(self, record: object) -> EmbeddingReaderInterface:
201
+ def _build_reader(self, record: EmbeddingModelRecord) -> EmbeddingReaderInterface:
194
202
  return EmbeddingReaderInterface(
195
203
  model=record.model_name,
196
204
  backend=self._get_backend(),
@@ -200,7 +208,7 @@ class OmopEmbAdapter:
200
208
  faiss_cache_dir=self._faiss_cache_dir,
201
209
  )
202
210
 
203
- def _get_client(self, model_name: str) -> object:
211
+ def _get_client(self, model_name: str) -> EmbeddingClient:
204
212
  if self._client_factory is None:
205
213
  raise GroundworkersError("BACKEND_UNAVAIL", "embedding client is not configured")
206
214
  client = self._clients.get(model_name)
@@ -231,7 +239,7 @@ class OmopEmbAdapter:
231
239
  limit=limit,
232
240
  )
233
241
 
234
- def _backend_type_from_backend(self, backend: object) -> str | None:
242
+ def _backend_type_from_backend(self, backend: EmbeddingBackend) -> str | None:
235
243
  backend_type = getattr(backend, "backend_type", None)
236
244
  return self._enum_value(backend_type)
237
245
 
@@ -7,7 +7,7 @@ from typing import Any
7
7
 
8
8
  from omop_graph.extensions.omop_alchemy import PredicateKind
9
9
  from omop_graph.graph.constraints import SearchConstraintConcept
10
- from omop_graph.graph.kg import KnowledgeGraph
10
+ from omop_graph.graph.kg import KnowledgeGraph, KnowledgeGraphEmbeddingConfiguration
11
11
  from omop_graph.graph.paths import find_shortest_paths_batch
12
12
  from omop_graph.graph.traverse import traverse
13
13
  from omop_graph.reasoning.grounding import GroundingConstraints, ground_term
@@ -32,6 +32,8 @@ from sqlalchemy import func, select, text
32
32
  from sqlalchemy.engine import Engine
33
33
  from sqlalchemy.exc import NoResultFound
34
34
 
35
+ from omop_emb import EmbeddingClient
36
+
35
37
  from groundworkers.base.errors import GroundworkersError
36
38
 
37
39
  # TODO: some of this adapter logic really should be pushed back into
@@ -45,12 +47,28 @@ class OmopGraphAdapter:
45
47
  *,
46
48
  vocab_schema: str = "omop_vocab",
47
49
  emb_model_name: str | None = None,
50
+ embedding_client: EmbeddingClient | None = None,
51
+ min_fulltext_overlap: float = 0.0,
48
52
  ) -> None:
49
53
  self.engine = engine
50
54
  self.vocab_schema = vocab_schema
51
55
  self.emb_model_name = emb_model_name
56
+ self._embedding_client: EmbeddingClient | None = embedding_client
57
+ self.min_fulltext_overlap = min_fulltext_overlap
52
58
  self._kg: KnowledgeGraph | None = None
53
59
 
60
+ def set_embedding_client(self, client: EmbeddingClient, model_name: str | None = None) -> None:
61
+ """Inject an EmbeddingClient so concept_ground can encode query strings on-the-fly.
62
+
63
+ Call this after construction (e.g. once the omop_emb adapter has resolved
64
+ the default model from the registry). The client is handed to the
65
+ KnowledgeGraph through its embedding configuration when the graph is built;
66
+ ground_term itself is still called with query_embedding=None.
67
+ """
68
+ self._embedding_client = client
69
+ if model_name is not None:
70
+ self.emb_model_name = model_name
71
+
54
72
  def is_available(self) -> bool:
55
73
  try:
56
74
  self._get_kg()
@@ -157,22 +175,38 @@ class OmopGraphAdapter:
157
175
  (ExactLabelResolver(), ExactSynonymResolver()),
158
176
  (FullTextResolver(), FullTextSynonymResolver()),
159
177
  ]
160
- if self.emb_model_name:
178
+ if self.emb_model_name or self._embedding_client is not None:
161
179
  tiers.append((EmbeddingResolver(),))
162
180
  tiers.append((PartialLabelResolver(), PartialSynonymResolver()))
163
181
 
164
182
  results: list[Any] = []
165
183
  for tier in tiers:
184
+ is_fts_tier = any(
185
+ isinstance(r, (FullTextResolver, FullTextSynonymResolver)) for r in tier
186
+ )
166
187
  pipeline = ResolverPipeline(resolvers=tier)
167
188
  try:
168
- results = ground_term(
189
+ raw = ground_term(
169
190
  pipeline, kg, query,
170
- query_embedding=None,
191
+ query_embedding=None, # KG computes this via its emb_config
171
192
  constraints=constraints,
172
193
  max_candidates=limit,
173
194
  )
174
195
  except Exception as exc:
175
196
  raise self._wrap_graph_error(exc, default_code="QUERY_ERROR")
197
+ # Apply minimum token-overlap filter to FTS results: if fewer than
198
+ # min_fulltext_overlap of the query tokens appear in the matched
199
+ # concept name, drop the hit and fall through to a better tier.
200
+ if raw and is_fts_tier and self.min_fulltext_overlap > 0.0:
201
+ query_tokens = set(query.lower().split())
202
+ filtered = [
203
+ r for r in raw
204
+ if self._fts_overlap(query_tokens, r.matched_concept_label or "")
205
+ >= self.min_fulltext_overlap
206
+ ]
207
+ results = filtered
208
+ else:
209
+ results = list(raw)
176
210
  if results:
177
211
  break
178
212
 
@@ -535,6 +569,20 @@ class OmopGraphAdapter:
535
569
  "valid": edge.invalid_reason is None, # type: ignore[attr-defined]
536
570
  }
537
571
 
572
+ @staticmethod
573
+ def _fts_overlap(query_tokens: set[str], concept_label: str) -> float:
574
+ """Return the proportion of query tokens that appear in *concept_label*.
575
+
576
+ Both sides are lowercased and split on whitespace. A value of 1.0
577
+ means every query token was found; 0.0 means none were found.
578
+ Used to filter noisy fulltext results before falling through to the
579
+ embedding tier.
580
+ """
581
+ if not query_tokens:
582
+ return 1.0
583
+ label_tokens = set(concept_label.lower().split())
584
+ return len(query_tokens & label_tokens) / len(query_tokens)
585
+
538
586
  @staticmethod
539
587
  def _label_match_kind_name(match_kind: object) -> str:
540
588
  _MAP = {0: "EXACT", 1: "FULLTEXT", 2: "PARTIAL", 3: "EMBEDDING_NEAREST"}
@@ -588,7 +636,18 @@ class OmopGraphAdapter:
588
636
  raise GroundworkersError("DB_UNAVAILABLE", f"Cannot connect to database: {exc}") from exc
589
637
 
590
638
  try:
591
- self._kg = KnowledgeGraph(cdm_engine=self.engine)
639
+ emb_config: KnowledgeGraphEmbeddingConfiguration | None = None
640
+ if self._embedding_client is not None:
641
+ try:
642
+ from omop_emb.config import MetricType
643
+ emb_config = KnowledgeGraphEmbeddingConfiguration(
644
+ metric_type=MetricType.COSINE,
645
+ model_name=self.emb_model_name,
646
+ client=self._embedding_client,
647
+ )
648
+ except Exception:
649
+ emb_config = None # Non-fatal: grounding falls back to non-embedding tiers
650
+ self._kg = KnowledgeGraph(cdm_engine=self.engine, emb_config=emb_config)
592
651
  except Exception as exc:
593
652
  raise self._wrap_graph_error(exc, default_code="BACKEND_UNAVAIL")
594
653
  return self._kg
@@ -597,11 +656,11 @@ class OmopGraphAdapter:
597
656
  # These are consistent across all Athena vocabulary releases (concept_ids may differ
598
657
  # between instances, but concept_codes are stable).
599
658
  _DOMAIN_ROOT_CODES: dict[str, tuple[str, str]] = {
600
- "condition": ("SNOMED", "404684003"), # Clinical finding
601
- "procedure": ("SNOMED", "71388002"), # Procedure
602
- "drug": ("SNOMED", "373873005"), # Pharmaceutical / biologic product
603
- "measurement": ("SNOMED", "363787002"), # Observable entity
604
- "device": ("SNOMED", "260787004"), # Physical object
659
+ "Condition": ("SNOMED", "404684003"), # Clinical finding
660
+ "Procedure": ("SNOMED", "71388002"), # Procedure
661
+ "Drug": ("SNOMED", "373873005"), # Pharmaceutical / biologic product
662
+ "Measurement": ("SNOMED", "363787002"), # Observable entity
663
+ "Device": ("SNOMED", "260787004"), # Physical object
605
664
  }
606
665
 
607
666
  def _get_domain_root_ids(self, domain: str | None) -> tuple[int, ...]: