orchestrator-core 4.5.1a1__py3-none-any.whl → 4.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +3 -12
- orchestrator/agentic_app.py +48 -29
- orchestrator/api/api_v1/api.py +8 -6
- orchestrator/api/api_v1/endpoints/processes.py +2 -0
- orchestrator/api/api_v1/endpoints/search.py +26 -7
- orchestrator/cli/main.py +2 -2
- orchestrator/cli/search/__init__.py +32 -0
- orchestrator/devtools/populator.py +16 -0
- orchestrator/domain/base.py +2 -7
- orchestrator/domain/lifecycle.py +24 -7
- orchestrator/llm_settings.py +9 -3
- orchestrator/log_config.py +1 -0
- orchestrator/migrations/helpers.py +7 -1
- orchestrator/schemas/search.py +13 -0
- orchestrator/schemas/workflow.py +1 -0
- orchestrator/search/agent/__init__.py +15 -2
- orchestrator/search/agent/agent.py +30 -15
- orchestrator/search/agent/prompts.py +75 -37
- orchestrator/search/agent/state.py +13 -0
- orchestrator/search/agent/tools.py +148 -11
- orchestrator/search/core/__init__.py +12 -0
- orchestrator/search/core/embedding.py +13 -4
- orchestrator/search/core/exceptions.py +14 -0
- orchestrator/search/core/types.py +15 -0
- orchestrator/search/core/validators.py +13 -0
- orchestrator/search/docs/running_local_text_embedding_inference.md +1 -0
- orchestrator/search/filters/__init__.py +13 -0
- orchestrator/search/filters/base.py +84 -61
- orchestrator/search/filters/date_filters.py +13 -0
- orchestrator/search/filters/definitions.py +16 -2
- orchestrator/search/filters/ltree_filters.py +16 -3
- orchestrator/search/filters/numeric_filter.py +13 -0
- orchestrator/search/indexing/__init__.py +13 -0
- orchestrator/search/indexing/indexer.py +14 -3
- orchestrator/search/indexing/registry.py +13 -0
- orchestrator/search/indexing/tasks.py +17 -1
- orchestrator/search/indexing/traverse.py +17 -5
- orchestrator/search/llm_migration.py +108 -0
- orchestrator/search/retrieval/__init__.py +13 -0
- orchestrator/search/retrieval/builder.py +23 -8
- orchestrator/search/retrieval/engine.py +36 -34
- orchestrator/search/retrieval/exceptions.py +90 -0
- orchestrator/search/retrieval/pagination.py +13 -0
- orchestrator/search/retrieval/retrievers/__init__.py +26 -0
- orchestrator/search/retrieval/retrievers/base.py +123 -0
- orchestrator/search/retrieval/retrievers/fuzzy.py +94 -0
- orchestrator/search/retrieval/retrievers/hybrid.py +277 -0
- orchestrator/search/retrieval/retrievers/semantic.py +94 -0
- orchestrator/search/retrieval/retrievers/structured.py +39 -0
- orchestrator/search/retrieval/utils.py +21 -7
- orchestrator/search/retrieval/validation.py +54 -76
- orchestrator/search/schemas/__init__.py +12 -0
- orchestrator/search/schemas/parameters.py +13 -0
- orchestrator/search/schemas/results.py +15 -1
- orchestrator/services/processes.py +2 -1
- orchestrator/settings.py +7 -0
- orchestrator/utils/state.py +6 -1
- orchestrator/workflows/steps.py +16 -1
- {orchestrator_core-4.5.1a1.dist-info → orchestrator_core-4.5.2.dist-info}/METADATA +13 -11
- {orchestrator_core-4.5.1a1.dist-info → orchestrator_core-4.5.2.dist-info}/RECORD +66 -59
- orchestrator/migrations/versions/schema/2025-08-12_52b37b5b2714_search_index_model_for_llm_integration.py +0 -95
- orchestrator/search/retrieval/retriever.py +0 -447
- /orchestrator/cli/{index_llm.py → search/index_llm.py} +0 -0
- /orchestrator/cli/{resize_embedding.py → search/resize_embedding.py} +0 -0
- /orchestrator/cli/{search_explore.py → search/search_explore.py} +0 -0
- /orchestrator/cli/{speedtest.py → search/speedtest.py} +0 -0
- {orchestrator_core-4.5.1a1.dist-info → orchestrator_core-4.5.2.dist-info}/WHEEL +0 -0
- {orchestrator_core-4.5.1a1.dist-info → orchestrator_core-4.5.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
1
14
|
from __future__ import annotations
|
|
2
15
|
|
|
3
16
|
from itertools import count
|
|
@@ -45,12 +58,13 @@ class StringFilter(BaseModel):
|
|
|
45
58
|
return self
|
|
46
59
|
|
|
47
60
|
|
|
61
|
+
# Order matters! Ambiguous ops (like 'eq') are resolved by the first matching filter type in this union.
|
|
48
62
|
FilterCondition = (
|
|
49
63
|
DateFilter # DATETIME
|
|
50
64
|
| NumericFilter # INT/FLOAT
|
|
51
|
-
|
|
|
52
|
-
| StringFilter # STRING TODO: convert to hybrid search
|
|
65
|
+
| StringFilter # STRING TODO: convert to hybrid search?
|
|
53
66
|
| LtreeFilter # Path
|
|
67
|
+
| EqualityFilter # BOOLEAN/UUID/BLOCK/RESOURCE_TYPE - most generic, try last
|
|
54
68
|
)
|
|
55
69
|
|
|
56
70
|
|
|
@@ -64,28 +78,29 @@ class PathFilter(BaseModel):
|
|
|
64
78
|
model_config = ConfigDict(
|
|
65
79
|
json_schema_extra={
|
|
66
80
|
"examples": [
|
|
67
|
-
{
|
|
68
|
-
"path": "subscription.status",
|
|
69
|
-
"condition": {"op": "eq", "value": "active"},
|
|
70
|
-
},
|
|
81
|
+
{"path": "subscription.status", "condition": {"op": "eq", "value": "active"}, "value_kind": "string"},
|
|
71
82
|
{
|
|
72
83
|
"path": "subscription.customer_id",
|
|
73
|
-
"condition": {"op": "
|
|
84
|
+
"condition": {"op": "neq", "value": "acme"},
|
|
85
|
+
"value_kind": "string",
|
|
74
86
|
},
|
|
75
87
|
{
|
|
76
88
|
"path": "subscription.start_date",
|
|
77
89
|
"condition": {"op": "gt", "value": "2025-01-01"},
|
|
90
|
+
"value_kind": "datetime",
|
|
78
91
|
},
|
|
79
92
|
{
|
|
80
93
|
"path": "subscription.end_date",
|
|
81
94
|
"condition": {
|
|
82
95
|
"op": "between",
|
|
83
|
-
"value": {"
|
|
96
|
+
"value": {"start": "2025-06-01", "end": "2025-07-01"},
|
|
84
97
|
},
|
|
98
|
+
"value_kind": "datetime",
|
|
85
99
|
},
|
|
86
100
|
{
|
|
87
|
-
"path": "subscription
|
|
88
|
-
"condition": {"op": "
|
|
101
|
+
"path": "subscription",
|
|
102
|
+
"condition": {"op": "has_component", "value": "node"},
|
|
103
|
+
"value_kind": "component",
|
|
89
104
|
},
|
|
90
105
|
]
|
|
91
106
|
}
|
|
@@ -121,18 +136,14 @@ class PathFilter(BaseModel):
|
|
|
121
136
|
This method creates a type guard to ensure we only match compatible field types,
|
|
122
137
|
then delegates to the specific filter condition.
|
|
123
138
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
The SQLAlchemy column element representing the value to be filtered.
|
|
128
|
-
value_type_column : ColumnElement
|
|
129
|
-
The SQLAlchemy column element representing the field type.
|
|
139
|
+
Args:
|
|
140
|
+
value_column (ColumnElement): The SQLAlchemy column element representing the value to be filtered.
|
|
141
|
+
value_type_column (ColumnElement): The SQLAlchemy column element representing the field type.
|
|
130
142
|
|
|
131
143
|
Returns:
|
|
132
|
-
|
|
133
|
-
ColumnElement[bool]
|
|
134
|
-
A SQLAlchemy boolean expression that can be used in a ``WHERE`` clause.
|
|
144
|
+
ColumnElement[bool]: A SQLAlchemy boolean expression that can be used in a ``WHERE`` clause.
|
|
135
145
|
"""
|
|
146
|
+
|
|
136
147
|
# Type guard - only match compatible field types
|
|
137
148
|
allowed_field_types = [ft.value for ft in FieldType if UIType.from_field_type(ft) == self.value_kind]
|
|
138
149
|
type_guard = value_type_column.in_(allowed_field_types) if allowed_field_types else literal(True)
|
|
@@ -141,6 +152,14 @@ class PathFilter(BaseModel):
|
|
|
141
152
|
|
|
142
153
|
|
|
143
154
|
class FilterTree(BaseModel):
|
|
155
|
+
op: BooleanOperator = Field(
|
|
156
|
+
description="Operator for grouping conditions in uppercase.", default=BooleanOperator.AND
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
children: list[FilterTree | PathFilter] = Field(min_length=1, description="Path filters or nested groups.")
|
|
160
|
+
|
|
161
|
+
MAX_DEPTH: ClassVar[int] = 5
|
|
162
|
+
|
|
144
163
|
model_config = ConfigDict(
|
|
145
164
|
json_schema_extra={
|
|
146
165
|
"description": (
|
|
@@ -150,11 +169,11 @@ class FilterTree(BaseModel):
|
|
|
150
169
|
" • Leaf (PathFilter): {'path':'<ltree>', 'condition': {...}}\n"
|
|
151
170
|
"Rules:\n"
|
|
152
171
|
" • Do NOT put 'op' or 'children' inside a leaf 'condition'.\n"
|
|
153
|
-
" • Max depth =
|
|
154
|
-
" • Use from_flat_and() for a flat list of leaves."
|
|
172
|
+
f" • Max depth = {MAX_DEPTH}.\n"
|
|
155
173
|
),
|
|
156
174
|
"examples": [
|
|
157
175
|
{
|
|
176
|
+
"description": "Simple filters",
|
|
158
177
|
"op": "AND",
|
|
159
178
|
"children": [
|
|
160
179
|
{"path": "subscription.status", "condition": {"op": "eq", "value": "active"}},
|
|
@@ -162,13 +181,14 @@ class FilterTree(BaseModel):
|
|
|
162
181
|
],
|
|
163
182
|
},
|
|
164
183
|
{
|
|
184
|
+
"description": "Complex filters with OR group",
|
|
165
185
|
"op": "AND",
|
|
166
186
|
"children": [
|
|
167
187
|
{"path": "subscription.start_date", "condition": {"op": "gte", "value": "2024-01-01"}},
|
|
168
188
|
{
|
|
169
189
|
"op": "OR",
|
|
170
190
|
"children": [
|
|
171
|
-
{"path": "subscription.
|
|
191
|
+
{"path": "subscription.product.name", "condition": {"op": "like", "value": "%fiber%"}},
|
|
172
192
|
{"path": "subscription.customer_id", "condition": {"op": "eq", "value": "Surf"}},
|
|
173
193
|
],
|
|
174
194
|
},
|
|
@@ -178,14 +198,6 @@ class FilterTree(BaseModel):
|
|
|
178
198
|
}
|
|
179
199
|
)
|
|
180
200
|
|
|
181
|
-
op: BooleanOperator = Field(
|
|
182
|
-
description="Operator for grouping conditions in uppercase.", default=BooleanOperator.AND
|
|
183
|
-
)
|
|
184
|
-
|
|
185
|
-
children: list[FilterTree | PathFilter] = Field(min_length=1, description="Path filters or nested groups.")
|
|
186
|
-
|
|
187
|
-
MAX_DEPTH: ClassVar[int] = 5
|
|
188
|
-
|
|
189
201
|
@model_validator(mode="after")
|
|
190
202
|
def _validate_depth(self) -> FilterTree:
|
|
191
203
|
def depth(node: "FilterTree | PathFilter") -> int:
|
|
@@ -214,6 +226,38 @@ class FilterTree(BaseModel):
|
|
|
214
226
|
leaves.extend(child.get_all_leaves())
|
|
215
227
|
return leaves
|
|
216
228
|
|
|
229
|
+
@staticmethod
|
|
230
|
+
def _build_correlates(
|
|
231
|
+
alias: Any, entity_id_col: SQLAColumn, entity_type_value: str | None
|
|
232
|
+
) -> list[ColumnElement[bool]]:
|
|
233
|
+
"""Build the correlation predicates that link the subquery to the outer query."""
|
|
234
|
+
correlates = [alias.entity_id == entity_id_col]
|
|
235
|
+
if entity_type_value is not None:
|
|
236
|
+
correlates.append(alias.entity_type == entity_type_value)
|
|
237
|
+
return correlates
|
|
238
|
+
|
|
239
|
+
@staticmethod
|
|
240
|
+
def _handle_ltree_filter(pf: PathFilter, alias: Any, correlates: list[ColumnElement[bool]]) -> ColumnElement[bool]:
|
|
241
|
+
"""Handle path-only filters (has_component, not_has_component, ends_with)."""
|
|
242
|
+
# row-level predicate is always positive
|
|
243
|
+
positive = pf.condition.to_expression(alias.path, pf.path)
|
|
244
|
+
subq = select(1).select_from(alias).where(and_(*correlates, positive))
|
|
245
|
+
if pf.condition.op == FilterOp.NOT_HAS_COMPONENT:
|
|
246
|
+
return ~exists(subq) # NOT at the entity level
|
|
247
|
+
return exists(subq)
|
|
248
|
+
|
|
249
|
+
@staticmethod
|
|
250
|
+
def _handle_value_filter(pf: PathFilter, alias: Any, correlates: list[ColumnElement[bool]]) -> ColumnElement[bool]:
|
|
251
|
+
"""Handle value-based filters (equality, comparison, etc)."""
|
|
252
|
+
if "." not in pf.path:
|
|
253
|
+
path_pred = LtreeFilter(op=FilterOp.ENDS_WITH, value=pf.path).to_expression(alias.path, "")
|
|
254
|
+
else:
|
|
255
|
+
path_pred = alias.path == Ltree(pf.path)
|
|
256
|
+
|
|
257
|
+
value_pred = pf.to_expression(alias.value, alias.value_type)
|
|
258
|
+
subq = select(1).select_from(alias).where(and_(*correlates, path_pred, value_pred))
|
|
259
|
+
return exists(subq)
|
|
260
|
+
|
|
217
261
|
def to_expression(
|
|
218
262
|
self,
|
|
219
263
|
entity_id_col: SQLAColumn,
|
|
@@ -222,46 +266,25 @@ class FilterTree(BaseModel):
|
|
|
222
266
|
) -> ColumnElement[bool]:
|
|
223
267
|
"""Compile this tree into a SQLAlchemy boolean expression.
|
|
224
268
|
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
Column in the outer query representing the entity ID.
|
|
229
|
-
entity_type_value : str, optional
|
|
230
|
-
If provided, each subquery is additionally constrained to this entity type.
|
|
269
|
+
Args:
|
|
270
|
+
entity_id_col (SQLAColumn): Column in the outer query representing the entity ID.
|
|
271
|
+
entity_type_value (str, optional): If provided, each subquery is additionally constrained to this entity type.
|
|
231
272
|
|
|
232
273
|
Returns:
|
|
233
|
-
|
|
234
|
-
ColumnElement[bool]
|
|
235
|
-
A SQLAlchemy expression suitable for use in a WHERE clause.
|
|
274
|
+
ColumnElement[bool]: A SQLAlchemy expression suitable for use in a WHERE clause.
|
|
236
275
|
"""
|
|
276
|
+
from sqlalchemy.orm import aliased
|
|
277
|
+
|
|
237
278
|
alias_idx = count(1)
|
|
238
279
|
|
|
239
280
|
def leaf_exists(pf: PathFilter) -> ColumnElement[bool]:
|
|
240
|
-
|
|
241
|
-
|
|
281
|
+
"""Convert a PathFilter into an EXISTS subquery."""
|
|
242
282
|
alias = aliased(AiSearchIndex, name=f"flt_{next(alias_idx)}")
|
|
243
|
-
|
|
244
|
-
correlates = [alias.entity_id == entity_id_col]
|
|
245
|
-
if entity_type_value is not None:
|
|
246
|
-
correlates.append(alias.entity_type == entity_type_value)
|
|
283
|
+
correlates = self._build_correlates(alias, entity_id_col, entity_type_value)
|
|
247
284
|
|
|
248
285
|
if isinstance(pf.condition, LtreeFilter):
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
subq = select(1).select_from(alias).where(and_(*correlates, positive))
|
|
252
|
-
if pf.condition.op == FilterOp.NOT_HAS_COMPONENT:
|
|
253
|
-
return ~exists(subq) # NOT at the entity level
|
|
254
|
-
return exists(subq)
|
|
255
|
-
|
|
256
|
-
# value leaf: path predicate + typed value compare
|
|
257
|
-
if "." not in pf.path:
|
|
258
|
-
path_pred = LtreeFilter(op=FilterOp.ENDS_WITH, value=pf.path).to_expression(alias.path, "")
|
|
259
|
-
else:
|
|
260
|
-
path_pred = alias.path == Ltree(pf.path)
|
|
261
|
-
|
|
262
|
-
value_pred = pf.to_expression(alias.value, alias.value_type)
|
|
263
|
-
subq = select(1).select_from(alias).where(and_(*correlates, path_pred, value_pred))
|
|
264
|
-
return exists(subq)
|
|
286
|
+
return self._handle_ltree_filter(pf, alias, correlates)
|
|
287
|
+
return self._handle_value_filter(pf, alias, correlates)
|
|
265
288
|
|
|
266
289
|
def compile_node(node: FilterTree | PathFilter) -> ColumnElement[bool]:
|
|
267
290
|
if isinstance(node, FilterTree):
|
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
1
14
|
from datetime import date, datetime
|
|
2
15
|
from typing import Annotated, Any, Literal
|
|
3
16
|
|
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
1
14
|
from orchestrator.search.core.types import FieldType, FilterOp, UIType
|
|
2
15
|
from orchestrator.search.schemas.results import TypeDefinition, ValueSchema
|
|
3
16
|
|
|
@@ -60,6 +73,7 @@ def value_schema_for(ft: FieldType) -> dict[FilterOp, ValueSchema]:
|
|
|
60
73
|
return {
|
|
61
74
|
FilterOp.EQ: ValueSchema(kind=UIType.STRING),
|
|
62
75
|
FilterOp.NEQ: ValueSchema(kind=UIType.STRING),
|
|
76
|
+
FilterOp.LIKE: ValueSchema(kind=UIType.STRING),
|
|
63
77
|
}
|
|
64
78
|
|
|
65
79
|
|
|
@@ -73,7 +87,7 @@ def generate_definitions() -> dict[UIType, TypeDefinition]:
|
|
|
73
87
|
comp_ops = component_operators()
|
|
74
88
|
definitions[ui_type] = TypeDefinition(
|
|
75
89
|
operators=list(comp_ops.keys()),
|
|
76
|
-
|
|
90
|
+
value_schema=comp_ops,
|
|
77
91
|
)
|
|
78
92
|
else:
|
|
79
93
|
# Regular field types
|
|
@@ -88,6 +102,6 @@ def generate_definitions() -> dict[UIType, TypeDefinition]:
|
|
|
88
102
|
|
|
89
103
|
definitions[ui_type] = TypeDefinition(
|
|
90
104
|
operators=operators_for(rep_ft),
|
|
91
|
-
|
|
105
|
+
value_schema=value_schema_for(rep_ft),
|
|
92
106
|
)
|
|
93
107
|
return definitions
|
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
1
14
|
from typing import Literal
|
|
2
15
|
|
|
3
16
|
from pydantic import BaseModel, Field
|
|
@@ -5,7 +18,7 @@ from sqlalchemy import TEXT, bindparam
|
|
|
5
18
|
from sqlalchemy.sql.elements import ColumnElement
|
|
6
19
|
from sqlalchemy_utils.types.ltree import Ltree
|
|
7
20
|
|
|
8
|
-
from orchestrator.search.core.types import FilterOp, SQLAColumn
|
|
21
|
+
from orchestrator.search.core.types import LTREE_SEPARATOR, FilterOp, SQLAColumn
|
|
9
22
|
|
|
10
23
|
|
|
11
24
|
class LtreeFilter(BaseModel):
|
|
@@ -38,6 +51,6 @@ class LtreeFilter(BaseModel):
|
|
|
38
51
|
ltree_value = Ltree(path)
|
|
39
52
|
return column == ltree_value
|
|
40
53
|
case FilterOp.HAS_COMPONENT | FilterOp.NOT_HAS_COMPONENT:
|
|
41
|
-
return column.op("~")(bindparam(None, f"
|
|
54
|
+
return column.op("~")(bindparam(None, f"*{LTREE_SEPARATOR}{self.value}{LTREE_SEPARATOR}*", type_=TEXT))
|
|
42
55
|
case FilterOp.ENDS_WITH:
|
|
43
|
-
return column.op("~")(bindparam(None, f"
|
|
56
|
+
return column.op("~")(bindparam(None, f"*{LTREE_SEPARATOR}{self.value}", type_=TEXT))
|
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
1
14
|
from typing import Annotated, Any, Literal
|
|
2
15
|
|
|
3
16
|
from pydantic import BaseModel, Field, model_validator
|
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
1
14
|
from .tasks import run_indexing_for_entity
|
|
2
15
|
|
|
3
16
|
__all__ = ["run_indexing_for_entity"]
|
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
1
14
|
import hashlib
|
|
2
15
|
from collections.abc import Generator, Iterable, Iterator
|
|
3
16
|
from contextlib import contextmanager, nullcontext
|
|
@@ -213,9 +226,7 @@ class Indexer:
|
|
|
213
226
|
safe_margin = int(max_ctx * llm_settings.EMBEDDING_SAFE_MARGIN_PERCENT)
|
|
214
227
|
token_budget = max(1, max_ctx - safe_margin)
|
|
215
228
|
|
|
216
|
-
max_batch_size =
|
|
217
|
-
if llm_settings.OPENAI_BASE_URL: # We are using a local model
|
|
218
|
-
max_batch_size = llm_settings.EMBEDDING_MAX_BATCH_SIZE
|
|
229
|
+
max_batch_size = llm_settings.EMBEDDING_MAX_BATCH_SIZE
|
|
219
230
|
|
|
220
231
|
for entity_id, field in fields_to_upsert:
|
|
221
232
|
if field.value_type.is_embeddable(field.value):
|
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
1
14
|
from dataclasses import dataclass
|
|
2
15
|
from typing import Generic, TypeVar
|
|
3
16
|
from uuid import UUID
|
|
@@ -1,7 +1,21 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
1
14
|
import structlog
|
|
2
15
|
from sqlalchemy.orm import Query
|
|
3
16
|
|
|
4
17
|
from orchestrator.db import db
|
|
18
|
+
from orchestrator.domain.context_cache import cache_subscription_models
|
|
5
19
|
from orchestrator.search.core.types import EntityType
|
|
6
20
|
from orchestrator.search.indexing.indexer import Indexer
|
|
7
21
|
from orchestrator.search.indexing.registry import ENTITY_CONFIG_REGISTRY
|
|
@@ -50,4 +64,6 @@ def run_indexing_for_entity(
|
|
|
50
64
|
entities = db.session.execute(stmt).scalars()
|
|
51
65
|
|
|
52
66
|
indexer = Indexer(config=config, dry_run=dry_run, force_index=force_index, chunk_size=chunk_size)
|
|
53
|
-
|
|
67
|
+
|
|
68
|
+
with cache_subscription_models():
|
|
69
|
+
indexer.run(entities)
|
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
1
14
|
import re
|
|
2
15
|
from abc import ABC, abstractmethod
|
|
3
16
|
from collections.abc import Iterable
|
|
@@ -19,7 +32,7 @@ from orchestrator.domain.lifecycle import (
|
|
|
19
32
|
from orchestrator.schemas.process import ProcessSchema
|
|
20
33
|
from orchestrator.schemas.workflow import WorkflowSchema
|
|
21
34
|
from orchestrator.search.core.exceptions import ModelLoadError, ProductNotInRegistryError
|
|
22
|
-
from orchestrator.search.core.types import ExtractedField, FieldType
|
|
35
|
+
from orchestrator.search.core.types import LTREE_SEPARATOR, ExtractedField, FieldType
|
|
23
36
|
from orchestrator.types import SubscriptionLifecycle
|
|
24
37
|
|
|
25
38
|
logger = structlog.get_logger(__name__)
|
|
@@ -30,7 +43,6 @@ DatabaseEntity = SubscriptionTable | ProductTable | ProcessTable | WorkflowTable
|
|
|
30
43
|
class BaseTraverser(ABC):
|
|
31
44
|
"""Base class for traversing database models and extracting searchable fields."""
|
|
32
45
|
|
|
33
|
-
_LTREE_SEPARATOR = "."
|
|
34
46
|
_MAX_DEPTH = 40
|
|
35
47
|
|
|
36
48
|
@classmethod
|
|
@@ -62,7 +74,7 @@ class BaseTraverser(ABC):
|
|
|
62
74
|
except Exception as e:
|
|
63
75
|
logger.error(f"Failed to access field '{name}' on {model_class.__name__}", error=str(e))
|
|
64
76
|
continue
|
|
65
|
-
new_path = f"{path}{
|
|
77
|
+
new_path = f"{path}{LTREE_SEPARATOR}{name}" if path else name
|
|
66
78
|
annotation = field.annotation if hasattr(field, "annotation") else field.return_type
|
|
67
79
|
yield from cls._yield_fields_for_value(value, new_path, annotation)
|
|
68
80
|
|
|
@@ -197,7 +209,7 @@ class ProductTraverser(BaseTraverser):
|
|
|
197
209
|
fields = []
|
|
198
210
|
|
|
199
211
|
# Add the block itself as a BLOCK type
|
|
200
|
-
block_name = block_path.split(
|
|
212
|
+
block_name = block_path.split(LTREE_SEPARATOR)[-1]
|
|
201
213
|
fields.append(ExtractedField(path=block_path, value=block_name, value_type=FieldType.BLOCK))
|
|
202
214
|
|
|
203
215
|
# Extract all field names from the block as RESOURCE_TYPE
|
|
@@ -223,7 +235,7 @@ class ProductTraverser(BaseTraverser):
|
|
|
223
235
|
ExtractedField(path=field_path, value=field_name, value_type=FieldType.RESOURCE_TYPE)
|
|
224
236
|
)
|
|
225
237
|
# And potentially traverse the first item for schema
|
|
226
|
-
first_item_path = f"{field_path}{
|
|
238
|
+
first_item_path = f"{field_path}{LTREE_SEPARATOR}0"
|
|
227
239
|
nested_fields = cls._extract_block_schema(field_value[0], first_item_path)
|
|
228
240
|
fields.extend(nested_fields)
|
|
229
241
|
else:
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
14
|
+
"""Simple search migration function that runs when SEARCH_ENABLED = True."""
|
|
15
|
+
|
|
16
|
+
from sqlalchemy import text
|
|
17
|
+
from sqlalchemy.engine import Connection
|
|
18
|
+
from structlog import get_logger
|
|
19
|
+
|
|
20
|
+
from orchestrator.llm_settings import llm_settings
|
|
21
|
+
from orchestrator.search.core.types import FieldType
|
|
22
|
+
|
|
23
|
+
logger = get_logger(__name__)
|
|
24
|
+
|
|
25
|
+
TABLE = "ai_search_index"
|
|
26
|
+
TARGET_DIM = 1536
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def run_migration(connection: Connection) -> None:
|
|
30
|
+
"""Run the LLM migration idempotently, using IF NOT EXISTS guards for extensions, enum type, table and indexes."""
|
|
31
|
+
logger.info("Running LLM migration")
|
|
32
|
+
|
|
33
|
+
try:
|
|
34
|
+
# Test to see if the extension exists and then skip the migration; needed for certain situations where the db user
|
|
35
|
+
# has insufficient privileges to run the `CREATE EXTENSION ...` command.
|
|
36
|
+
res = connection.execute(text("SELECT * FROM pg_extension where extname = 'vector';"))
|
|
37
|
+
if llm_settings.LLM_FORCE_EXTENTION_MIGRATION or res.rowcount == 0:
|
|
38
|
+
# Create PostgreSQL extensions
|
|
39
|
+
logger.info("Attempting to run the extention creation;")
|
|
40
|
+
connection.execute(text("CREATE EXTENSION IF NOT EXISTS ltree;"))
|
|
41
|
+
connection.execute(text("CREATE EXTENSION IF NOT EXISTS unaccent;"))
|
|
42
|
+
connection.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm;"))
|
|
43
|
+
connection.execute(text("CREATE EXTENSION IF NOT EXISTS vector;"))
|
|
44
|
+
|
|
45
|
+
# Create field_type enum
|
|
46
|
+
field_type_values = "', '".join([ft.value for ft in FieldType])
|
|
47
|
+
connection.execute(
|
|
48
|
+
text(
|
|
49
|
+
f"""
|
|
50
|
+
DO $$
|
|
51
|
+
BEGIN
|
|
52
|
+
IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'field_type') THEN
|
|
53
|
+
CREATE TYPE field_type AS ENUM ('{field_type_values}');
|
|
54
|
+
END IF;
|
|
55
|
+
END $$;
|
|
56
|
+
"""
|
|
57
|
+
)
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
# Create the table only if it does not already exist (CREATE TABLE IF NOT EXISTS)
|
|
61
|
+
connection.execute(
|
|
62
|
+
text(
|
|
63
|
+
f"""
|
|
64
|
+
CREATE TABLE IF NOT EXISTS {TABLE} (
|
|
65
|
+
entity_type TEXT NOT NULL,
|
|
66
|
+
entity_id UUID NOT NULL,
|
|
67
|
+
path LTREE NOT NULL,
|
|
68
|
+
value TEXT NOT NULL,
|
|
69
|
+
embedding VECTOR({TARGET_DIM}),
|
|
70
|
+
content_hash VARCHAR(64) NOT NULL,
|
|
71
|
+
value_type field_type NOT NULL DEFAULT '{FieldType.STRING.value}',
|
|
72
|
+
CONSTRAINT pk_ai_search_index PRIMARY KEY (entity_id, path)
|
|
73
|
+
);
|
|
74
|
+
"""
|
|
75
|
+
)
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
# Drop default
|
|
79
|
+
connection.execute(text(f"ALTER TABLE {TABLE} ALTER COLUMN value_type DROP DEFAULT;"))
|
|
80
|
+
|
|
81
|
+
# Create indexes with IF NOT EXISTS
|
|
82
|
+
connection.execute(text(f"CREATE INDEX IF NOT EXISTS ix_ai_search_index_entity_id ON {TABLE} (entity_id);"))
|
|
83
|
+
connection.execute(
|
|
84
|
+
text(f"CREATE INDEX IF NOT EXISTS idx_ai_search_index_content_hash ON {TABLE} (content_hash);")
|
|
85
|
+
)
|
|
86
|
+
connection.execute(
|
|
87
|
+
text(f"CREATE INDEX IF NOT EXISTS ix_flat_path_gist ON {TABLE} USING GIST (path gist_ltree_ops);")
|
|
88
|
+
)
|
|
89
|
+
connection.execute(text(f"CREATE INDEX IF NOT EXISTS ix_flat_path_btree ON {TABLE} (path);"))
|
|
90
|
+
connection.execute(
|
|
91
|
+
text(f"CREATE INDEX IF NOT EXISTS ix_flat_value_trgm ON {TABLE} USING GIN (value gin_trgm_ops);")
|
|
92
|
+
)
|
|
93
|
+
connection.execute(
|
|
94
|
+
text(
|
|
95
|
+
f"CREATE INDEX IF NOT EXISTS ix_flat_embed_hnsw ON {TABLE} USING HNSW (embedding vector_l2_ops) WITH (m = 16, ef_construction = 64);"
|
|
96
|
+
)
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
connection.commit()
|
|
100
|
+
logger.info("LLM migration completed successfully")
|
|
101
|
+
|
|
102
|
+
except Exception as e:
|
|
103
|
+
logger.error("LLM migration failed", error=str(e))
|
|
104
|
+
raise Exception(
|
|
105
|
+
f"LLM migration failed. This likely means the pgvector extension "
|
|
106
|
+
f"is not installed. Please install pgvector and ensure your PostgreSQL "
|
|
107
|
+
f"version supports it. Error: {e}"
|
|
108
|
+
) from e
|
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
1
14
|
from .engine import execute_search
|
|
2
15
|
|
|
3
16
|
__all__ = ["execute_search"]
|