orchestrator-core 4.4.1__py3-none-any.whl → 4.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +17 -2
- orchestrator/agentic_app.py +103 -0
- orchestrator/api/api_v1/api.py +14 -2
- orchestrator/api/api_v1/endpoints/processes.py +2 -0
- orchestrator/api/api_v1/endpoints/search.py +296 -0
- orchestrator/app.py +32 -0
- orchestrator/cli/main.py +22 -1
- orchestrator/cli/search/__init__.py +32 -0
- orchestrator/cli/search/index_llm.py +73 -0
- orchestrator/cli/search/resize_embedding.py +135 -0
- orchestrator/cli/search/search_explore.py +208 -0
- orchestrator/cli/search/speedtest.py +151 -0
- orchestrator/db/models.py +37 -1
- orchestrator/devtools/populator.py +16 -0
- orchestrator/domain/base.py +2 -7
- orchestrator/domain/lifecycle.py +24 -7
- orchestrator/llm_settings.py +57 -0
- orchestrator/log_config.py +1 -0
- orchestrator/migrations/helpers.py +7 -1
- orchestrator/schemas/search.py +130 -0
- orchestrator/schemas/workflow.py +1 -0
- orchestrator/search/__init__.py +12 -0
- orchestrator/search/agent/__init__.py +21 -0
- orchestrator/search/agent/agent.py +62 -0
- orchestrator/search/agent/prompts.py +100 -0
- orchestrator/search/agent/state.py +21 -0
- orchestrator/search/agent/tools.py +258 -0
- orchestrator/search/core/__init__.py +12 -0
- orchestrator/search/core/embedding.py +73 -0
- orchestrator/search/core/exceptions.py +36 -0
- orchestrator/search/core/types.py +296 -0
- orchestrator/search/core/validators.py +40 -0
- orchestrator/search/docs/index.md +37 -0
- orchestrator/search/docs/running_local_text_embedding_inference.md +46 -0
- orchestrator/search/filters/__init__.py +40 -0
- orchestrator/search/filters/base.py +295 -0
- orchestrator/search/filters/date_filters.py +88 -0
- orchestrator/search/filters/definitions.py +107 -0
- orchestrator/search/filters/ltree_filters.py +56 -0
- orchestrator/search/filters/numeric_filter.py +73 -0
- orchestrator/search/indexing/__init__.py +16 -0
- orchestrator/search/indexing/indexer.py +334 -0
- orchestrator/search/indexing/registry.py +101 -0
- orchestrator/search/indexing/tasks.py +69 -0
- orchestrator/search/indexing/traverse.py +334 -0
- orchestrator/search/llm_migration.py +108 -0
- orchestrator/search/retrieval/__init__.py +16 -0
- orchestrator/search/retrieval/builder.py +123 -0
- orchestrator/search/retrieval/engine.py +154 -0
- orchestrator/search/retrieval/exceptions.py +90 -0
- orchestrator/search/retrieval/pagination.py +96 -0
- orchestrator/search/retrieval/retrievers/__init__.py +26 -0
- orchestrator/search/retrieval/retrievers/base.py +123 -0
- orchestrator/search/retrieval/retrievers/fuzzy.py +94 -0
- orchestrator/search/retrieval/retrievers/hybrid.py +277 -0
- orchestrator/search/retrieval/retrievers/semantic.py +94 -0
- orchestrator/search/retrieval/retrievers/structured.py +39 -0
- orchestrator/search/retrieval/utils.py +120 -0
- orchestrator/search/retrieval/validation.py +152 -0
- orchestrator/search/schemas/__init__.py +12 -0
- orchestrator/search/schemas/parameters.py +129 -0
- orchestrator/search/schemas/results.py +77 -0
- orchestrator/services/processes.py +2 -1
- orchestrator/services/settings_env_variables.py +2 -2
- orchestrator/settings.py +8 -1
- orchestrator/utils/state.py +6 -1
- orchestrator/workflows/steps.py +15 -1
- orchestrator/workflows/tasks/validate_products.py +1 -1
- {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.0.dist-info}/METADATA +15 -8
- {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.0.dist-info}/RECORD +72 -22
- {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.0.dist-info}/WHEEL +0 -0
- {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
14
|
+
from collections.abc import Awaitable, Callable
|
|
15
|
+
from typing import Any, TypeVar
|
|
16
|
+
|
|
17
|
+
import structlog
|
|
18
|
+
from ag_ui.core import EventType, StateSnapshotEvent
|
|
19
|
+
from pydantic_ai import RunContext
|
|
20
|
+
from pydantic_ai.ag_ui import StateDeps
|
|
21
|
+
from pydantic_ai.exceptions import ModelRetry
|
|
22
|
+
from pydantic_ai.messages import ModelRequest, UserPromptPart
|
|
23
|
+
from pydantic_ai.toolsets import FunctionToolset
|
|
24
|
+
|
|
25
|
+
from orchestrator.api.api_v1.endpoints.search import (
|
|
26
|
+
get_definitions,
|
|
27
|
+
list_paths,
|
|
28
|
+
search_processes,
|
|
29
|
+
search_products,
|
|
30
|
+
search_subscriptions,
|
|
31
|
+
search_workflows,
|
|
32
|
+
)
|
|
33
|
+
from orchestrator.schemas.search import SearchResultsSchema
|
|
34
|
+
from orchestrator.search.core.types import ActionType, EntityType, FilterOp
|
|
35
|
+
from orchestrator.search.filters import FilterTree
|
|
36
|
+
from orchestrator.search.retrieval.exceptions import FilterValidationError, PathNotFoundError
|
|
37
|
+
from orchestrator.search.retrieval.validation import validate_filter_tree
|
|
38
|
+
from orchestrator.search.schemas.parameters import PARAMETER_REGISTRY, BaseSearchParameters
|
|
39
|
+
|
|
40
|
+
from .state import SearchState
|
|
41
|
+
|
|
42
|
+
logger = structlog.get_logger(__name__)


# Any concrete search-parameter model accepted by the per-entity search endpoints.
P = TypeVar("P", bound=BaseSearchParameters)

# Common async signature shared by all entity search endpoints.
SearchFn = Callable[[P], Awaitable[SearchResultsSchema[Any]]]

# Dispatch table: entity type -> endpoint function that searches that entity.
SEARCH_FN_MAP: dict[EntityType, SearchFn] = {
    EntityType.SUBSCRIPTION: search_subscriptions,
    EntityType.WORKFLOW: search_workflows,
    EntityType.PRODUCT: search_products,
    EntityType.PROCESS: search_processes,
}

# Toolset exposed to the pydantic-ai agent; tools below register via its decorator.
search_toolset: FunctionToolset[StateDeps[SearchState]] = FunctionToolset(max_retries=1)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def last_user_message(ctx: RunContext[StateDeps[SearchState]]) -> str | None:
    """Return the text of the most recent user prompt in the run, or None if absent."""
    user_texts = (
        part.content
        for msg in reversed(ctx.messages)
        if isinstance(msg, ModelRequest)
        for part in msg.parts
        if isinstance(part, UserPromptPart) and isinstance(part.content, str)
    )
    return next(user_texts, None)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@search_toolset.tool
async def set_search_parameters(
    ctx: RunContext[StateDeps[SearchState]],
    entity_type: EntityType,
    action: str | ActionType = ActionType.SELECT,
) -> StateSnapshotEvent:
    """Sets the initial search context, like the entity type and the user's query.

    This MUST be the first tool called to start any new search.
    Warning: Calling this tool will erase any existing filters and search results from the state.
    """
    previous = ctx.deps.state.parameters or {}
    # A change of entity type means a brand-new search, so re-seed the query
    # from the latest user prompt; otherwise keep the query already in state.
    is_new_search = previous.get("entity_type") != entity_type.value
    if is_new_search:
        final_query = last_user_message(ctx) or ""
    else:
        final_query = previous.get("query", "")

    logger.debug(
        "Setting search parameters",
        entity_type=entity_type.value,
        action=action,
        is_new_search=is_new_search,
        query=final_query,
    )

    # Atomically replace parameters and drop any stale results.
    ctx.deps.state.parameters = {
        "action": action,
        "entity_type": entity_type,
        "filters": None,
        "query": final_query,
    }
    ctx.deps.state.results = []
    logger.debug("Search parameters set", parameters=ctx.deps.state.parameters)

    return StateSnapshotEvent(
        type=EventType.STATE_SNAPSHOT,
        snapshot=ctx.deps.state.model_dump(),
    )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@search_toolset.tool(retries=2)
async def set_filter_tree(
    ctx: RunContext[StateDeps[SearchState]],
    filters: FilterTree | None,
) -> StateSnapshotEvent:
    """Replace current filters atomically with a full FilterTree, or clear with None.

    Requirements:
    - Root/group operators must be 'AND' or 'OR' (uppercase).
    - Provide either PathFilters or nested groups under `children`.
    - See the FilterTree schema examples for the exact shape.
    """
    state = ctx.deps.state
    if state.parameters is None:
        raise ModelRetry("Search parameters are not initialized. Call set_search_parameters first.")

    entity_type = EntityType(state.parameters["entity_type"])

    if filters:
        summary = f"{len(filters.get_all_leaves())} filters"
    else:
        summary = "no filters"
    logger.debug(
        "Setting filter tree",
        entity_type=entity_type.value,
        has_filters=filters is not None,
        filter_summary=summary,
    )

    # Validate before touching state; invalid trees never replace the current one.
    try:
        await validate_filter_tree(filters, entity_type)
    except PathNotFoundError as e:
        logger.debug(f"{PathNotFoundError.__name__}: {str(e)}")
        raise ModelRetry(f"{str(e)} Use discover_filter_paths tool to find valid paths.")
    except FilterValidationError as e:
        # ModelRetry will trigger an agent retry, containing the specific validation error.
        logger.debug(f"Filter validation failed: {str(e)}")
        raise ModelRetry(str(e))
    except Exception as e:
        logger.error("Unexpected Filter validation exception", error=str(e))
        raise ModelRetry(f"Filter validation failed: {str(e)}. Please check your filter structure and try again.")

    state.parameters["filters"] = None if filters is None else filters.model_dump(mode="json", by_alias=True)
    return StateSnapshotEvent(type=EventType.STATE_SNAPSHOT, snapshot=state.model_dump())
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@search_toolset.tool
async def execute_search(
    ctx: RunContext[StateDeps[SearchState]],
    limit: int = 10,
) -> StateSnapshotEvent:
    """Execute the search with the current parameters."""
    state = ctx.deps.state
    if not state.parameters:
        raise ValueError("No search parameters set")

    entity_type = EntityType(state.parameters["entity_type"])
    param_class = PARAMETER_REGISTRY.get(entity_type)
    if param_class is None:
        raise ValueError(f"Unknown entity type: {entity_type}")

    # Materialize the typed parameter model from the raw state dict.
    params = param_class(**state.parameters)
    logger.debug(
        "Executing database search",
        search_entity_type=entity_type.value,
        limit=limit,
        has_filters=params.filters is not None,
        query=params.query,
        action=params.action,
    )
    if params.filters:
        logger.debug("Search filters", filters=params.filters)

    params.limit = limit

    # Dispatch to the endpoint registered for this entity type.
    search_fn = SEARCH_FN_MAP[entity_type]
    search_results = await search_fn(params)
    rows = search_results.data

    logger.debug(
        "Search completed",
        total_results=len(rows) if rows else 0,
    )

    state.results = rows
    return StateSnapshotEvent(type=EventType.STATE_SNAPSHOT, snapshot=state.model_dump())
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
@search_toolset.tool
async def discover_filter_paths(
    ctx: RunContext[StateDeps[SearchState]],
    field_names: list[str],
    entity_type: EntityType | None = None,
) -> dict[str, dict[str, Any]]:
    """Discovers available filter paths for a list of field names.

    Returns a dictionary where each key is a field_name from the input list and
    the value is its discovery result.

    Args:
        ctx: Agent run context carrying the shared search state.
        field_names: Candidate field names, matched case-insensitively against
            path leaves and components.
        entity_type: Entity type to search paths for; falls back to the entity
            type already set in the state, then to SUBSCRIPTION.
    """
    if entity_type is None and ctx.deps.state.parameters:
        # Guard against a missing key: EntityType(None) would raise ValueError.
        raw_entity_type = ctx.deps.state.parameters.get("entity_type")
        if raw_entity_type is not None:
            entity_type = EntityType(raw_entity_type)
    if entity_type is None:
        entity_type = EntityType.SUBSCRIPTION

    all_results: dict[str, dict[str, Any]] = {}
    for field_name in field_names:
        paths_response = await list_paths(prefix="", q=field_name, entity_type=entity_type, limit=100)
        needle = field_name.lower()  # hoist the case-fold out of the match loops

        matching_leaves = [
            {
                "name": leaf.name,
                "value_kind": leaf.ui_types,
                "paths": leaf.paths,
            }
            for leaf in paths_response.leaves
            if needle in leaf.name.lower()
        ]
        matching_components = [
            {
                "name": comp.name,
                "value_kind": comp.ui_types,
            }
            for comp in paths_response.components
            if needle in comp.name.lower()
        ]

        result_for_field: dict[str, Any]
        if not matching_leaves and not matching_components:
            result_for_field = {
                "status": "NOT_FOUND",
                "guidance": f"No filterable paths found containing '{field_name}'. Do not create a filter for this.",
                "leaves": [],
                "components": [],
            }
        else:
            result_for_field = {
                "status": "OK",
                "guidance": f"Found {len(matching_leaves)} field(s) and {len(matching_components)} component(s) for '{field_name}'.",
                "leaves": matching_leaves,
                "components": matching_components,
            }

        all_results[field_name] = result_for_field
    logger.debug("Returning found fieldname - path mapping", all_results=all_results)
    return all_results
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
@search_toolset.tool
async def get_valid_operators() -> dict[str, list[FilterOp]]:
    """Gets the mapping of field types to their valid filter operators."""
    definitions = await get_definitions()
    # Only type definitions that declare operators contribute an entry.
    return {
        ui_type.value: type_def.operators
        for ui_type, type_def in definitions.items()
        if hasattr(type_def, "operators")
    }
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
import structlog
|
|
16
|
+
from litellm import aembedding as llm_aembedding
|
|
17
|
+
from litellm import embedding as llm_embedding
|
|
18
|
+
from litellm import exceptions as llm_exc
|
|
19
|
+
|
|
20
|
+
from orchestrator.llm_settings import llm_settings
|
|
21
|
+
|
|
22
|
+
# Module-level structlog logger bound to this module's name.
logger = structlog.get_logger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class EmbeddingIndexer:
    """Synchronous helper that embeds batches of text for the indexing pipeline."""

    @classmethod
    def get_embeddings_from_api_batch(cls, texts: list[str], dry_run: bool) -> list[list[float]]:
        """Return one embedding per input text; empty vectors on dry-run or any API failure."""
        if not texts:
            return []
        if dry_run:
            logger.debug("Dry Run: returning empty embeddings")
            return [[] for _ in texts]

        try:
            response = llm_embedding(
                model=llm_settings.EMBEDDING_MODEL,
                input=[text.lower() for text in texts],  # normalize case before embedding
                api_key=llm_settings.OPENAI_API_KEY,
                api_base=llm_settings.OPENAI_BASE_URL,
                timeout=llm_settings.LLM_TIMEOUT,
                max_retries=llm_settings.LLM_MAX_RETRIES,
            )
            # Rows may come back out of order; restore input order via the index field.
            ordered_rows = sorted(response.data, key=lambda row: row["index"])
            return [row["embedding"] for row in ordered_rows]
        except (llm_exc.APIError, llm_exc.APIConnectionError, llm_exc.RateLimitError, llm_exc.Timeout) as exc:
            logger.error("Embedding request failed", error=str(exc))
            return [[] for _ in texts]
        except Exception as exc:
            logger.error("Unexpected embedding error", error=str(exc))
            return [[] for _ in texts]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class QueryEmbedder:
    """A stateless, async utility for embedding real-time user queries."""

    @classmethod
    async def generate_for_text_async(cls, text: str) -> list[float]:
        """Return the embedding for *text*, or [] on empty input or any failure."""
        if not text:
            return []
        try:
            response = await llm_aembedding(
                model=llm_settings.EMBEDDING_MODEL,
                input=[text.lower()],
                api_key=llm_settings.OPENAI_API_KEY,
                api_base=llm_settings.OPENAI_BASE_URL,
                timeout=5.0,
                max_retries=0,  # No retries, prioritize speed.
            )
            # Single-input request: the only row holds the vector.
            return response.data[0]["embedding"]
        except Exception as exc:
            logger.error("Async embedding generation failed", error=str(exc))
            return []
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SearchUtilsError(Exception):
    """Base exception for this module."""
    # A docstring already forms the class body; the original's trailing
    # `pass` was redundant and has been removed.
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ProductNotInRegistryError(SearchUtilsError):
    """Raised when a product is not found in the model registry."""
    # Redundant `pass` after the docstring removed; docstring is a valid body.
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ModelLoadError(SearchUtilsError):
    """Raised when a Pydantic model fails to load from a subscription."""
    # Redundant `pass` after the docstring removed; docstring is a valid body.
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class InvalidCursorError(SearchUtilsError):
    """Raised when cursor cannot be decoded."""
    # Redundant `pass` after the docstring removed; docstring is a valid body.
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from datetime import date, datetime
|
|
16
|
+
from enum import Enum, IntEnum
|
|
17
|
+
from typing import Annotated, Any, Literal, NamedTuple, TypeAlias, TypedDict, get_args, get_origin
|
|
18
|
+
from uuid import UUID
|
|
19
|
+
|
|
20
|
+
from sqlalchemy.orm.attributes import InstrumentedAttribute
|
|
21
|
+
from sqlalchemy.sql.elements import ColumnElement
|
|
22
|
+
from sqlalchemy_utils.types.ltree import Ltree
|
|
23
|
+
|
|
24
|
+
from orchestrator.types import filter_nonetype, get_origin_and_args, is_optional_type, is_union_type
|
|
25
|
+
|
|
26
|
+
from .validators import is_bool_string, is_iso_date, is_uuid
|
|
27
|
+
|
|
28
|
+
# Union of SQLAlchemy column-like objects accepted by the filter builders.
SQLAColumn: TypeAlias = ColumnElement[Any] | InstrumentedAttribute[Any]

# Segment separator used by PostgreSQL ltree paths.
LTREE_SEPARATOR = "."
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class SearchMetadata:
    """Metadata about the search operation performed."""

    # Short machine-readable kind of search executed.
    search_type: str
    # Human-readable explanation of how the search scored its results.
    description: str

    # Named constructors, one per retrieval strategy.

    @classmethod
    def structured(cls) -> "SearchMetadata":
        return cls(
            search_type="structured",
            description="This search performs a filter-based search using structured queries.",
        )

    @classmethod
    def fuzzy(cls) -> "SearchMetadata":
        return cls(
            search_type="fuzzy",
            description="This search performs a trigram similarity search.",
        )

    @classmethod
    def semantic(cls) -> "SearchMetadata":
        return cls(
            search_type="semantic",
            description="This search performs a vector similarity search, using L2 distance on embeddings with minimum distance scoring (normalized).",
        )

    @classmethod
    def hybrid(cls) -> "SearchMetadata":
        return cls(
            search_type="hybrid",
            description="This search performs reciprocal rank fusion combining trigram similarity, word_similarity, and L2 vector distance.",
        )

    @classmethod
    def empty(cls) -> "SearchMetadata":
        return cls(search_type="empty", description="Empty search - no criteria provided")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class BooleanOperator(str, Enum):
    """Logical connective for combining filter conditions in a FilterTree group."""

    AND = "AND"
    OR = "OR"
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class FilterOp(str, Enum):
    """Comparison and ltree-path operators available to filter conditions."""

    # Scalar comparison operators.
    EQ = "eq"
    NEQ = "neq"
    LT = "lt"
    LIKE = "like"
    LTE = "lte"
    GT = "gt"
    GTE = "gte"
    BETWEEN = "between"

    # Ltree operators acting on the whole path.
    MATCHES_LQUERY = "matches_lquery"  # The ~ operator for wildcard matching
    IS_ANCESTOR = "is_ancestor"  # The @> operator
    IS_DESCENDANT = "is_descendant"  # The <@ operator
    PATH_MATCH = "path_match"

    # Operators acting on individual path segments.
    HAS_COMPONENT = "has_component"  # Path contains this segment
    NOT_HAS_COMPONENT = "not_has_component"  # Path doesn't contain segment
    ENDS_WITH = "ends_with"
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class EntityType(str, Enum):
    """Searchable entity kinds; each maps to a dedicated search endpoint."""

    SUBSCRIPTION = "SUBSCRIPTION"
    PRODUCT = "PRODUCT"
    WORKFLOW = "WORKFLOW"
    PROCESS = "PROCESS"
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class ActionType(str, Enum):
    """Defines the explicit, safe actions the agent can request."""

    SELECT = "select"  # Retrieve a list of matching records.
    # COUNT = "count" # For phase1; the agent will not support this yet.
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class UIType(str, Enum):
    """Frontend rendering category for a field value."""

    STRING = "string"
    NUMBER = "number"
    BOOLEAN = "boolean"
    DATETIME = "datetime"
    COMPONENT = "component"

    @classmethod
    def from_field_type(cls, ft: "FieldType") -> "UIType":
        """Create a UIType from a backend FieldType to indicate how a value must be rendered."""
        rendering = {
            FieldType.INTEGER: cls.NUMBER,
            FieldType.FLOAT: cls.NUMBER,
            FieldType.BOOLEAN: cls.BOOLEAN,
            FieldType.DATETIME: cls.DATETIME,
        }
        # Everything else (uuid, block, resource_type, ...) renders as a string.
        return rendering.get(ft, cls.STRING)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class FieldType(str, Enum):
    """Backend value type of an indexed field, inferred from values or type hints."""

    STRING = "string"
    INTEGER = "integer"
    FLOAT = "float"
    BOOLEAN = "boolean"
    DATETIME = "datetime"
    UUID = "uuid"
    BLOCK = "block"
    RESOURCE_TYPE = "resource_type"

    @classmethod
    def infer(cls, val: Any) -> "FieldType":
        """Infer the FieldType of a runtime value; falls back to STRING."""
        if isinstance(val, TypedValue):
            return cls._infer_typed_value(val)

        # bool must be checked before int: bool is a subclass of int.
        if isinstance(val, bool):
            return cls.BOOLEAN
        if isinstance(val, int):
            return cls.INTEGER
        if isinstance(val, float):
            return cls.FLOAT
        # UUID here resolves to uuid.UUID from the module import, not the enum
        # member above (method bodies do not see class-body names).
        if isinstance(val, UUID):
            return cls.UUID
        if isinstance(val, (datetime, date)):
            return cls.DATETIME
        if isinstance(val, str):
            return cls._infer_from_str(val)

        return cls.STRING

    @classmethod
    def _infer_typed_value(cls, val: "TypedValue") -> "FieldType":
        """Map an explicitly typed wrapper to its declared type; only BLOCK and
        RESOURCE_TYPE are honored, everything else is treated as STRING."""
        if val.type == cls.BLOCK:
            return cls.BLOCK
        if val.type == cls.RESOURCE_TYPE:
            return cls.RESOURCE_TYPE
        return cls.STRING

    @classmethod
    def _infer_from_str(cls, val: str) -> "FieldType":
        """Heuristically type a string value. Order matters: uuid, ISO date and
        boolean forms are checked before numeric parsing."""
        if is_uuid(val):
            return cls.UUID
        if is_iso_date(val):
            return cls.DATETIME
        if is_bool_string(val):
            return cls.BOOLEAN
        # NOTE(review): str.isdigit also accepts non-ASCII digit forms (e.g. "³")
        # that int() would reject — confirm acceptable for downstream parsing.
        if val.isdigit():
            return cls.INTEGER
        try:
            float(val)
            return cls.FLOAT
        except ValueError:
            return cls.STRING

    @classmethod
    def from_type_hint(cls, type_hint: object) -> "FieldType":
        """Convert type hint to FieldType."""
        # Direct mapping for plain builtin/stdlib types.
        # NOTE(review): datetime.date hints fall through to STRING here, while
        # infer() maps date *values* to DATETIME — confirm this is intended.
        _type_mapping = {
            int: cls.INTEGER,
            float: cls.FLOAT,
            bool: cls.BOOLEAN,
            str: cls.STRING,
            datetime: cls.DATETIME,
            UUID: cls.UUID,
        }

        if type_hint in _type_mapping:
            return _type_mapping[type_hint]  # type: ignore[index]

        # Annotated[X, ...]: recurse into the underlying type X.
        if get_origin(type_hint) is Annotated:
            inner_type = get_args(type_hint)[0]
            return cls.from_type_hint(inner_type)

        origin, args = get_origin_and_args(type_hint)

        if origin is list:
            return cls._handle_list_type(args)

        if origin is Literal:
            return cls._handle_literal_type(args)

        if is_optional_type(type_hint) or is_union_type(type_hint):
            return cls._handle_union_type(args)

        if isinstance(type_hint, type):
            return cls._handle_class_type(type_hint)

        return cls.STRING

    @classmethod
    def _handle_list_type(cls, args: tuple) -> "FieldType":
        """Type a list hint by its element type; bare `list` falls back to STRING."""
        if args:
            element_type = args[0]
            return cls.from_type_hint(element_type)
        return cls.STRING

    @classmethod
    def _handle_literal_type(cls, args: tuple) -> "FieldType":
        """Type a Literal[...] hint from its first value's runtime type."""
        if not args:
            return cls.STRING
        first_value = args[0]
        # bool checked before int (bool is a subclass of int).
        if isinstance(first_value, bool):
            return cls.BOOLEAN
        if isinstance(first_value, int):
            return cls.INTEGER
        if isinstance(first_value, str):
            return cls.STRING
        if isinstance(first_value, float):
            return cls.FLOAT
        return cls.STRING

    @classmethod
    def _handle_union_type(cls, args: tuple) -> "FieldType":
        """Type a union/Optional hint from its first non-None member."""
        non_none_types = list(filter_nonetype(args))
        if non_none_types:
            return cls.from_type_hint(non_none_types[0])
        return cls.STRING

    @classmethod
    def _handle_class_type(cls, type_hint: type) -> "FieldType":
        """Type a plain class hint: IntEnum -> INTEGER, Enum -> STRING,
        ProductBlockModel -> BLOCK, anything else STRING."""
        # IntEnum checked before Enum: IntEnum is a subclass of Enum.
        if issubclass(type_hint, IntEnum):
            return cls.INTEGER
        if issubclass(type_hint, Enum):
            return cls.STRING

        # Deferred import — presumably to avoid a circular dependency with
        # orchestrator.domain.base; confirm before hoisting to module level.
        from orchestrator.domain.base import ProductBlockModel

        if issubclass(type_hint, ProductBlockModel):
            return cls.BLOCK

        return cls.STRING

    def is_embeddable(self, value: str | None) -> bool:
        """Check if a field should be embedded."""
        if value is None:
            return False

        # If inference suggests it's not actually a string, don't embed it
        return FieldType._infer_from_str(value) == FieldType.STRING
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
@dataclass(frozen=True)
class TypedValue:
    """Immutable wrapper pairing a raw value with an explicitly assigned FieldType."""

    # The wrapped raw value.
    value: Any
    # Explicit backend field type for this value; honored by FieldType.infer.
    type: FieldType
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
class ExtractedField(NamedTuple):
    """A flattened (path, value) pair extracted from an entity, with its inferred type."""

    path: str
    value: str
    value_type: FieldType

    @classmethod
    def from_raw(cls, path: str, raw_value: Any) -> "ExtractedField":
        """Build an ExtractedField from any raw value, unwrapping TypedValue wrappers."""
        if isinstance(raw_value, TypedValue):
            text = str(raw_value.value)
        else:
            text = str(raw_value)
        # Inference runs on the original (possibly wrapped) value, not the string form.
        inferred = FieldType.infer(raw_value)
        return cls(path=path, value=text, value_type=inferred)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
class IndexableRecord(TypedDict):
    """Row shape written to the search index table."""

    # Primary key of the indexed entity, as a string.
    entity_id: str
    # EntityType value naming the kind of entity.
    entity_type: str
    # Ltree path locating this field within the entity.
    path: Ltree
    value: Any
    value_type: Any
    # Hash of the indexed content, used to skip unchanged rows.
    content_hash: str
    # Embedding vector for the value; None when the field is not embeddable.
    embedding: list[float] | None
|