orchestrator-core 4.5.0a2__py3-none-any.whl → 4.5.0a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +1 -1
- orchestrator/api/api_v1/api.py +7 -5
- orchestrator/api/api_v1/endpoints/search.py +13 -0
- orchestrator/devtools/populator.py +16 -0
- orchestrator/log_config.py +1 -0
- orchestrator/migrations/helpers.py +1 -1
- orchestrator/schemas/search.py +13 -0
- orchestrator/schemas/workflow.py +1 -0
- orchestrator/search/agent/__init__.py +13 -0
- orchestrator/search/agent/agent.py +13 -0
- orchestrator/search/agent/prompts.py +13 -0
- orchestrator/search/agent/state.py +13 -0
- orchestrator/search/agent/tools.py +27 -5
- orchestrator/search/core/__init__.py +12 -0
- orchestrator/search/core/embedding.py +13 -4
- orchestrator/search/core/exceptions.py +14 -0
- orchestrator/search/core/types.py +15 -0
- orchestrator/search/core/validators.py +13 -0
- orchestrator/search/filters/__init__.py +13 -0
- orchestrator/search/filters/base.py +23 -18
- orchestrator/search/filters/date_filters.py +13 -0
- orchestrator/search/filters/definitions.py +16 -2
- orchestrator/search/filters/ltree_filters.py +16 -3
- orchestrator/search/filters/numeric_filter.py +13 -0
- orchestrator/search/indexing/__init__.py +13 -0
- orchestrator/search/indexing/indexer.py +13 -0
- orchestrator/search/indexing/registry.py +13 -0
- orchestrator/search/indexing/tasks.py +13 -0
- orchestrator/search/indexing/traverse.py +17 -5
- orchestrator/search/retrieval/__init__.py +13 -0
- orchestrator/search/retrieval/builder.py +17 -7
- orchestrator/search/retrieval/engine.py +35 -29
- orchestrator/search/retrieval/exceptions.py +90 -0
- orchestrator/search/retrieval/pagination.py +13 -0
- orchestrator/search/retrieval/retrievers/__init__.py +26 -0
- orchestrator/search/retrieval/retrievers/base.py +122 -0
- orchestrator/search/retrieval/retrievers/fuzzy.py +94 -0
- orchestrator/search/retrieval/retrievers/hybrid.py +188 -0
- orchestrator/search/retrieval/retrievers/semantic.py +94 -0
- orchestrator/search/retrieval/retrievers/structured.py +39 -0
- orchestrator/search/retrieval/utils.py +21 -7
- orchestrator/search/retrieval/validation.py +54 -76
- orchestrator/search/schemas/__init__.py +12 -0
- orchestrator/search/schemas/parameters.py +13 -0
- orchestrator/search/schemas/results.py +14 -1
- orchestrator/workflows/steps.py +15 -1
- orchestrator/workflows/tasks/validate_products.py +1 -1
- {orchestrator_core-4.5.0a2.dist-info → orchestrator_core-4.5.0a4.dist-info}/METADATA +6 -6
- {orchestrator_core-4.5.0a2.dist-info → orchestrator_core-4.5.0a4.dist-info}/RECORD +51 -45
- orchestrator/search/retrieval/retriever.py +0 -447
- {orchestrator_core-4.5.0a2.dist-info → orchestrator_core-4.5.0a4.dist-info}/WHEEL +0 -0
- {orchestrator_core-4.5.0a2.dist-info → orchestrator_core-4.5.0a4.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
14
|
+
from sqlalchemy import BindParameter, Select, and_, bindparam, case, cast, func, literal, or_, select
|
|
15
|
+
from sqlalchemy.sql.expression import ColumnElement
|
|
16
|
+
|
|
17
|
+
from orchestrator.db.models import AiSearchIndex
|
|
18
|
+
from orchestrator.search.core.types import SearchMetadata
|
|
19
|
+
|
|
20
|
+
from ..pagination import PaginationParams
|
|
21
|
+
from .base import Retriever
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class RrfHybridRetriever(Retriever):
    """Reciprocal Rank Fusion of semantic and fuzzy ranking with parent-child retrieval."""

    def __init__(
        self,
        q_vec: list[float],
        fuzzy_term: str,
        pagination_params: PaginationParams,
        k: int = 60,
        field_candidates_limit: int = 100,
    ) -> None:
        """Initialize the hybrid retriever.

        Args:
            q_vec: Query embedding vector used for semantic distance ranking.
            fuzzy_term: Text term used for trigram (word_similarity) fuzzy ranking.
            pagination_params: Keyset pagination cursor (score + entity id).
            k: RRF smoothing constant; larger k flattens rank contributions.
            field_candidates_limit: Cap on the number of index rows considered
                before per-entity aggregation.
        """
        self.q_vec = q_vec
        self.fuzzy_term = fuzzy_term
        self.page_after_score = pagination_params.page_after_score
        self.page_after_id = pagination_params.page_after_id
        self.k = k
        self.field_candidates_limit = field_candidates_limit

    def apply(self, candidate_query: Select) -> Select:
        """Rank the candidate entities with Reciprocal Rank Fusion.

        Builds a three-stage CTE pipeline:
        1. ``field_candidates``: index rows joined to the candidate set, filtered
           by the trigram ``<%`` operator, carrying semantic distance and fuzzy score.
        2. ``entity_scores``: per-entity averages of both signals.
        3. ``ranked_results``: dense ranks per signal, joined with one highlight
           (best-matching value/path) per entity.
        The final statement fuses the two ranks via RRF, boosts near-perfect
        fuzzy matches, normalizes to [0, 1], and applies keyset pagination.

        Args:
            candidate_query: Select producing the candidate ``entity_id`` set.

        Returns:
            Select yielding (entity_id, score, highlight_text, highlight_path,
            perfect_match) ordered by fused score descending.
        """
        cand = candidate_query.subquery()
        # Bind the query vector once; reused by the `<->` distance operator below.
        q_param: BindParameter[list[float]] = bindparam("q_vec", self.q_vec, type_=AiSearchIndex.embedding.type)

        best_similarity = func.word_similarity(self.fuzzy_term, AiSearchIndex.value)
        # NULL embeddings get no semantic distance; coalesced to worst-case 1.0 below.
        sem_expr = case(
            (AiSearchIndex.embedding.is_(None), None),
            else_=AiSearchIndex.embedding.op("<->")(q_param),
        )
        sem_val = func.coalesce(sem_expr, literal(1.0)).label("semantic_distance")

        # pg_trgm `<%` word-similarity filter: cheap index-assisted pre-filter.
        filter_condition = literal(self.fuzzy_term).op("<%")(AiSearchIndex.value)

        field_candidates = (
            select(
                AiSearchIndex.entity_id,
                AiSearchIndex.path,
                AiSearchIndex.value,
                sem_val,
                best_similarity.label("fuzzy_score"),
            )
            .select_from(AiSearchIndex)
            .join(cand, cand.c.entity_id == AiSearchIndex.entity_id)
            .where(
                and_(
                    AiSearchIndex.value_type.in_(self.SEARCHABLE_FIELD_TYPES),
                    filter_condition,
                )
            )
            .order_by(
                best_similarity.desc().nulls_last(),
                sem_expr.asc().nulls_last(),
                AiSearchIndex.entity_id.asc(),
            )
            .limit(self.field_candidates_limit)
        ).cte("field_candidates")

        entity_scores = (
            select(
                field_candidates.c.entity_id,
                func.avg(field_candidates.c.semantic_distance).label("avg_semantic_distance"),
                func.avg(field_candidates.c.fuzzy_score).label("avg_fuzzy_score"),
            ).group_by(field_candidates.c.entity_id)
        ).cte("entity_scores")

        # DISTINCT ON entity_id + first_value windows: one best highlight per entity,
        # preferring the highest fuzzy score, tie-broken by path.
        entity_highlights = (
            select(
                field_candidates.c.entity_id,
                func.first_value(field_candidates.c.value)
                .over(
                    partition_by=field_candidates.c.entity_id,
                    order_by=[field_candidates.c.fuzzy_score.desc(), field_candidates.c.path.asc()],
                )
                .label(self.HIGHLIGHT_TEXT_LABEL),
                func.first_value(field_candidates.c.path)
                .over(
                    partition_by=field_candidates.c.entity_id,
                    order_by=[field_candidates.c.fuzzy_score.desc(), field_candidates.c.path.asc()],
                )
                .label(self.HIGHLIGHT_PATH_LABEL),
            ).distinct(field_candidates.c.entity_id)
        ).cte("entity_highlights")

        ranked = (
            select(
                entity_scores.c.entity_id,
                entity_scores.c.avg_semantic_distance,
                entity_scores.c.avg_fuzzy_score,
                entity_highlights.c.highlight_text,
                entity_highlights.c.highlight_path,
                func.dense_rank()
                .over(
                    order_by=[entity_scores.c.avg_semantic_distance.asc().nulls_last(), entity_scores.c.entity_id.asc()]
                )
                .label("sem_rank"),
                func.dense_rank()
                .over(order_by=[entity_scores.c.avg_fuzzy_score.desc().nulls_last(), entity_scores.c.entity_id.asc()])
                .label("fuzzy_rank"),
            ).select_from(
                entity_scores.join(entity_highlights, entity_scores.c.entity_id == entity_highlights.c.entity_id)
            )
        ).cte("ranked_results")

        # RRF (rank-based): 1/(k + rank) summed over both signals.
        rrf_raw = (1.0 / (self.k + ranked.c.sem_rank)) + (1.0 / (self.k + ranked.c.fuzzy_rank))
        rrf_num = cast(rrf_raw, self.SCORE_NUMERIC_TYPE)

        # Perfect flag to boost near perfect fuzzy matches as this most likely indicates the desired record.
        perfect = case((ranked.c.avg_fuzzy_score >= 0.9, 1), else_=0).label("perfect_match")

        # Dynamic beta based on k (and number of sources)
        # rrf_max = n_sources / (k + 1)
        k_num = literal(float(self.k), type_=self.SCORE_NUMERIC_TYPE)
        n_sources = literal(2.0, type_=self.SCORE_NUMERIC_TYPE)  # semantic + fuzzy
        rrf_max = n_sources / (k_num + literal(1.0, type_=self.SCORE_NUMERIC_TYPE))

        # Choose a small positive margin above rrf_max to ensure strict separation
        # Keep it small to avoid compressing perfects near 1 after normalization
        margin = rrf_max * literal(0.05, type_=self.SCORE_NUMERIC_TYPE)  # 5% above bound
        beta = rrf_max + margin

        # Any perfect match outranks every non-perfect match (beta > rrf_max).
        fused_num = rrf_num + beta * cast(perfect, self.SCORE_NUMERIC_TYPE)

        # Normalize to [0,1] via the theoretical max (beta + rrf_max)
        norm_den = beta + rrf_max
        normalized_score = fused_num / norm_den

        # Rounded to SCORE_PRECISION so pagination comparisons see the same value
        # the client was handed.
        score = cast(
            func.round(cast(normalized_score, self.SCORE_NUMERIC_TYPE), self.SCORE_PRECISION),
            self.SCORE_NUMERIC_TYPE,
        ).label(self.SCORE_LABEL)

        stmt = select(
            ranked.c.entity_id,
            score,
            ranked.c.highlight_text,
            ranked.c.highlight_path,
            perfect.label("perfect_match"),
        ).select_from(ranked)

        stmt = self._apply_fused_pagination(stmt, score, ranked.c.entity_id)

        return stmt.order_by(
            score.desc().nulls_last(),
            ranked.c.entity_id.asc(),
        ).params(q_vec=self.q_vec)

    def _apply_fused_pagination(
        self,
        stmt: Select,
        score_column: ColumnElement,
        entity_id_column: ColumnElement,
    ) -> Select:
        """Keyset paginate by fused score + id."""
        if self.page_after_score is not None and self.page_after_id is not None:
            # Quantize the cursor score the same way the emitted score was rounded,
            # so equality comparisons match.
            score_param = self._quantize_score_for_pagination(self.page_after_score)
            stmt = stmt.where(
                or_(
                    score_column < score_param,
                    and_(score_column == score_param, entity_id_column > self.page_after_id),
                )
            )
        return stmt

    @property
    def metadata(self) -> SearchMetadata:
        """Metadata tag identifying this retriever as the hybrid strategy."""
        return SearchMetadata.hybrid()
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
14
|
+
from sqlalchemy import Select, and_, cast, func, literal, or_, select
|
|
15
|
+
from sqlalchemy.sql.expression import ColumnElement
|
|
16
|
+
|
|
17
|
+
from orchestrator.db.models import AiSearchIndex
|
|
18
|
+
from orchestrator.search.core.types import SearchMetadata
|
|
19
|
+
|
|
20
|
+
from ..pagination import PaginationParams
|
|
21
|
+
from .base import Retriever
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class SemanticRetriever(Retriever):
    """Ranks results based on the minimum semantic vector distance."""

    def __init__(self, vector_query: list[float], pagination_params: PaginationParams) -> None:
        """Store the query vector and the keyset-pagination cursor."""
        self.vector_query = vector_query
        self.page_after_score = pagination_params.page_after_score
        self.page_after_id = pagination_params.page_after_id

    def apply(self, candidate_query: Select) -> Select:
        """Score each candidate entity by its closest embedding and attach a highlight.

        Args:
            candidate_query: Select producing the candidate ``entity_id`` set.

        Returns:
            Select yielding (entity_id, score, highlight_text, highlight_path)
            ordered by score descending, with keyset pagination applied.
        """
        candidates = candidate_query.subquery()

        l2_dist = AiSearchIndex.embedding.l2_distance(self.vector_query)
        min_dist = func.min(l2_dist).over(partition_by=AiSearchIndex.entity_id)

        # Map distance into (0, 1] so that a smaller distance yields a higher
        # score, keeping the ordering convention of the other retrievers.
        similarity = literal(1.0, type_=self.SCORE_NUMERIC_TYPE) / (
            literal(1.0, type_=self.SCORE_NUMERIC_TYPE) + cast(min_dist, self.SCORE_NUMERIC_TYPE)
        )

        score = cast(
            func.round(cast(similarity, self.SCORE_NUMERIC_TYPE), self.SCORE_PRECISION), self.SCORE_NUMERIC_TYPE
        ).label(self.SCORE_LABEL)

        # Highlight = value/path of the single closest field per entity.
        highlight_text = (
            func.first_value(AiSearchIndex.value)
            .over(partition_by=AiSearchIndex.entity_id, order_by=[l2_dist.asc(), AiSearchIndex.path.asc()])
            .label(self.HIGHLIGHT_TEXT_LABEL)
        )
        highlight_path = (
            func.first_value(AiSearchIndex.path)
            .over(partition_by=AiSearchIndex.entity_id, order_by=[l2_dist.asc(), AiSearchIndex.path.asc()])
            .label(self.HIGHLIGHT_PATH_LABEL)
        )

        ranked = (
            select(AiSearchIndex.entity_id, score, highlight_text, highlight_path)
            .select_from(AiSearchIndex)
            .join(candidates, candidates.c.entity_id == AiSearchIndex.entity_id)
            .where(AiSearchIndex.embedding.isnot(None))  # rows without embeddings cannot be scored
            .distinct(AiSearchIndex.entity_id)
        ).subquery("ranked_semantic")

        paged = select(
            ranked.c.entity_id,
            ranked.c.score,
            ranked.c.highlight_text,
            ranked.c.highlight_path,
        ).select_from(ranked)

        paged = self._apply_semantic_pagination(paged, ranked.c.score, ranked.c.entity_id)

        return paged.order_by(ranked.c.score.desc().nulls_last(), ranked.c.entity_id.asc())

    @property
    def metadata(self) -> SearchMetadata:
        """Metadata tag identifying this retriever as the semantic strategy."""
        return SearchMetadata.semantic()

    def _apply_semantic_pagination(
        self, stmt: Select, score_column: ColumnElement, entity_id_column: ColumnElement
    ) -> Select:
        """Apply semantic score pagination with precise Decimal handling."""
        if self.page_after_score is None or self.page_after_id is None:
            return stmt

        # Quantize the cursor to the same precision as the emitted score so
        # equality comparisons in the keyset predicate behave as expected.
        cursor_score = self._quantize_score_for_pagination(self.page_after_score)
        keyset_predicate = or_(
            score_column < cursor_score,
            and_(score_column == cursor_score, entity_id_column > self.page_after_id),
        )
        return stmt.where(keyset_predicate)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
14
|
+
from sqlalchemy import Select, literal, select
|
|
15
|
+
|
|
16
|
+
from orchestrator.search.core.types import SearchMetadata
|
|
17
|
+
|
|
18
|
+
from ..pagination import PaginationParams
|
|
19
|
+
from .base import Retriever
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class StructuredRetriever(Retriever):
    """Applies a dummy score for purely structured searches with no text query."""

    def __init__(self, pagination_params: PaginationParams) -> None:
        """Keep only the id cursor; structured results have no meaningful score."""
        self.page_after_id = pagination_params.page_after_id

    def apply(self, candidate_query: Select) -> Select:
        """Return candidates with a constant score of 1.0, keyset-paged by id.

        Args:
            candidate_query: Select producing the candidate ``entity_id`` set.

        Returns:
            Select yielding (entity_id, score) ordered by entity_id ascending.
        """
        candidates = candidate_query.subquery()
        scored = select(candidates.c.entity_id, literal(1.0).label("score")).select_from(candidates)

        # With a constant score, entity_id alone is a total order, so the
        # keyset cursor is just "strictly after the last seen id".
        if self.page_after_id:
            scored = scored.where(candidates.c.entity_id > self.page_after_id)

        return scored.order_by(candidates.c.entity_id.asc())

    @property
    def metadata(self) -> SearchMetadata:
        """Metadata tag identifying this retriever as the structured strategy."""
        return SearchMetadata.structured()
|
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
1
14
|
import json
|
|
2
15
|
import re
|
|
3
16
|
|
|
@@ -16,7 +29,7 @@ logger = structlog.get_logger(__name__)
|
|
|
16
29
|
|
|
17
30
|
|
|
18
31
|
def generate_highlight_indices(text: str, term: str) -> list[tuple[int, int]]:
|
|
19
|
-
"""Finds all occurrences of individual words from the term
|
|
32
|
+
"""Finds all occurrences of individual words from the term, including both word boundary and substring matches."""
|
|
20
33
|
if not text or not term:
|
|
21
34
|
return []
|
|
22
35
|
|
|
@@ -24,14 +37,15 @@ def generate_highlight_indices(text: str, term: str) -> list[tuple[int, int]]:
|
|
|
24
37
|
words = [w.strip() for w in term.split() if w.strip()]
|
|
25
38
|
|
|
26
39
|
for word in words:
|
|
40
|
+
# First find word boundary matches
|
|
27
41
|
word_boundary_pattern = rf"\b{re.escape(word)}\b"
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
if not matches:
|
|
31
|
-
substring_pattern = re.escape(word)
|
|
32
|
-
matches = list(re.finditer(substring_pattern, text, re.IGNORECASE))
|
|
42
|
+
word_matches = list(re.finditer(word_boundary_pattern, text, re.IGNORECASE))
|
|
43
|
+
all_matches.extend([(m.start(), m.end()) for m in word_matches])
|
|
33
44
|
|
|
34
|
-
|
|
45
|
+
# Then find all substring matches
|
|
46
|
+
substring_pattern = re.escape(word)
|
|
47
|
+
substring_matches = list(re.finditer(substring_pattern, text, re.IGNORECASE))
|
|
48
|
+
all_matches.extend([(m.start(), m.end()) for m in substring_matches])
|
|
35
49
|
|
|
36
50
|
return sorted(set(all_matches))
|
|
37
51
|
|
|
@@ -1,4 +1,15 @@
|
|
|
1
|
-
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
2
13
|
|
|
3
14
|
from sqlalchemy import select, text
|
|
4
15
|
from sqlalchemy.exc import ProgrammingError
|
|
@@ -8,72 +19,48 @@ from orchestrator.db import db
|
|
|
8
19
|
from orchestrator.db.database import WrappedSession
|
|
9
20
|
from orchestrator.db.models import AiSearchIndex
|
|
10
21
|
from orchestrator.search.core.types import EntityType, FieldType
|
|
11
|
-
from orchestrator.search.filters import
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
NumericValueFilter,
|
|
20
|
-
PathFilter,
|
|
21
|
-
StringFilter,
|
|
22
|
+
from orchestrator.search.filters import FilterCondition, FilterTree, LtreeFilter, PathFilter
|
|
23
|
+
from orchestrator.search.filters.definitions import operators_for
|
|
24
|
+
from orchestrator.search.retrieval.exceptions import (
|
|
25
|
+
EmptyFilterPathError,
|
|
26
|
+
IncompatibleFilterTypeError,
|
|
27
|
+
InvalidEntityPrefixError,
|
|
28
|
+
InvalidLtreePatternError,
|
|
29
|
+
PathNotFoundError,
|
|
22
30
|
)
|
|
23
31
|
|
|
24
32
|
|
|
25
33
|
def is_filter_compatible_with_field_type(filter_condition: FilterCondition, field_type: FieldType) -> bool:
|
|
26
34
|
"""Check whether a filter condition is compatible with a given field type.
|
|
27
35
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
The filter condition instance to check.
|
|
32
|
-
field_type : FieldType
|
|
33
|
-
The type of field from the index schema.
|
|
36
|
+
Args:
|
|
37
|
+
filter_condition (FilterCondition): The filter condition instance to check.
|
|
38
|
+
field_type (FieldType): The type of field from the index schema.
|
|
34
39
|
|
|
35
40
|
Returns:
|
|
36
|
-
|
|
37
|
-
bool
|
|
38
|
-
True if the filter condition is valid for the given field type, False otherwise.
|
|
41
|
+
bool: True if the filter condition is valid for the given field type, False otherwise.
|
|
39
42
|
"""
|
|
40
43
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
case StringFilter():
|
|
49
|
-
return field_type == FieldType.STRING
|
|
50
|
-
case EqualityFilter():
|
|
51
|
-
return field_type in {
|
|
52
|
-
FieldType.BOOLEAN,
|
|
53
|
-
FieldType.UUID,
|
|
54
|
-
FieldType.BLOCK,
|
|
55
|
-
FieldType.RESOURCE_TYPE,
|
|
56
|
-
FieldType.STRING,
|
|
57
|
-
}
|
|
58
|
-
case _:
|
|
59
|
-
assert_never(filter_condition)
|
|
44
|
+
# LtreeFilter is for path filtering only and is thus compatible with all field types.
|
|
45
|
+
if isinstance(filter_condition, LtreeFilter):
|
|
46
|
+
return True
|
|
47
|
+
|
|
48
|
+
# Get valid operators for this field type and check if the filter's operator is valid.
|
|
49
|
+
valid_operators = operators_for(field_type)
|
|
50
|
+
return filter_condition.op in valid_operators
|
|
60
51
|
|
|
61
52
|
|
|
62
53
|
def is_lquery_syntactically_valid(pattern: str, db_session: WrappedSession) -> bool:
|
|
63
54
|
"""Validate whether a string is a syntactically correct `lquery` pattern.
|
|
64
55
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
The LTree lquery pattern string to validate.
|
|
69
|
-
db_session : WrappedSession
|
|
70
|
-
The database session used to test casting.
|
|
56
|
+
Args:
|
|
57
|
+
pattern (str): The LTree lquery pattern string to validate.
|
|
58
|
+
db_session (WrappedSession): The database session used to test casting.
|
|
71
59
|
|
|
72
60
|
Returns:
|
|
73
|
-
|
|
74
|
-
bool
|
|
75
|
-
True if the pattern is valid, False if it fails to cast in PostgreSQL.
|
|
61
|
+
bool: True if the pattern is valid, False if it fails to cast in PostgreSQL.
|
|
76
62
|
"""
|
|
63
|
+
|
|
77
64
|
try:
|
|
78
65
|
with db_session.begin_nested():
|
|
79
66
|
db_session.execute(text("SELECT CAST(:pattern AS lquery)"), {"pattern": pattern})
|
|
@@ -86,10 +73,9 @@ def get_structured_filter_schema() -> dict[str, str]:
|
|
|
86
73
|
"""Retrieve all distinct filterable paths and their field types from the index.
|
|
87
74
|
|
|
88
75
|
Returns:
|
|
89
|
-
|
|
90
|
-
Dict[str, str]
|
|
91
|
-
Mapping of path strings to their corresponding field type values.
|
|
76
|
+
Dict[str, str]: Mapping of path strings to their corresponding field type values.
|
|
92
77
|
"""
|
|
78
|
+
|
|
93
79
|
stmt = select(AiSearchIndex.path, AiSearchIndex.value_type).distinct().order_by(AiSearchIndex.path)
|
|
94
80
|
result = db.session.execute(stmt)
|
|
95
81
|
return {str(path): value_type.value for path, value_type in result}
|
|
@@ -98,16 +84,13 @@ def get_structured_filter_schema() -> dict[str, str]:
|
|
|
98
84
|
def validate_filter_path(path: str) -> str | None:
|
|
99
85
|
"""Check if a given path exists in the index and return its field type.
|
|
100
86
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
path : str
|
|
104
|
-
The fully qualified LTree path.
|
|
87
|
+
Args:
|
|
88
|
+
path (str): The fully qualified LTree path.
|
|
105
89
|
|
|
106
90
|
Returns:
|
|
107
|
-
|
|
108
|
-
Optional[str]
|
|
109
|
-
The value type of the field if found, otherwise None.
|
|
91
|
+
Optional[str]: The value type of the field if found, otherwise None.
|
|
110
92
|
"""
|
|
93
|
+
|
|
111
94
|
stmt = select(AiSearchIndex.value_type).where(AiSearchIndex.path == Ltree(path)).limit(1)
|
|
112
95
|
result = db.session.execute(stmt).scalar_one_or_none()
|
|
113
96
|
return result.value if result else None
|
|
@@ -123,47 +106,42 @@ async def complete_filter_validation(filter: PathFilter, entity_type: EntityType
|
|
|
123
106
|
4. Filter type matches the field's value_type
|
|
124
107
|
5. Path starts with the correct entity type prefix (unless wildcard)
|
|
125
108
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
The filter to validate.
|
|
130
|
-
entity_type : EntityType
|
|
131
|
-
The entity type being searched.
|
|
109
|
+
Args:
|
|
110
|
+
filter (PathFilter): The filter to validate.
|
|
111
|
+
entity_type (EntityType): The entity type being searched.
|
|
132
112
|
|
|
133
113
|
Raises:
|
|
134
|
-
|
|
135
|
-
ValueError
|
|
136
|
-
If any of the validation checks fail.
|
|
114
|
+
ValueError: If any of the validation checks fail.
|
|
137
115
|
"""
|
|
116
|
+
|
|
138
117
|
# Ltree is a special case
|
|
139
118
|
if isinstance(filter.condition, LtreeFilter):
|
|
140
119
|
lquery_pattern = filter.condition.value
|
|
141
120
|
if not is_lquery_syntactically_valid(lquery_pattern, db.session):
|
|
142
|
-
raise
|
|
121
|
+
raise InvalidLtreePatternError(lquery_pattern)
|
|
143
122
|
return
|
|
144
123
|
|
|
145
124
|
if not filter.path or not filter.path.strip():
|
|
146
|
-
raise
|
|
125
|
+
raise EmptyFilterPathError()
|
|
147
126
|
|
|
148
127
|
# 1. Check if path exists in database
|
|
149
128
|
db_field_type_str = validate_filter_path(filter.path)
|
|
150
129
|
if db_field_type_str is None:
|
|
151
|
-
raise
|
|
130
|
+
raise PathNotFoundError(filter.path)
|
|
152
131
|
|
|
153
132
|
db_field_type = FieldType(db_field_type_str)
|
|
154
133
|
|
|
155
134
|
# 2. Check filter compatibility with field type
|
|
156
135
|
if not is_filter_compatible_with_field_type(filter.condition, db_field_type):
|
|
157
|
-
|
|
158
|
-
|
|
136
|
+
expected_operators = operators_for(db_field_type)
|
|
137
|
+
raise IncompatibleFilterTypeError(
|
|
138
|
+
filter.condition.op.value, db_field_type.value, filter.path, expected_operators
|
|
159
139
|
)
|
|
160
140
|
|
|
161
141
|
# 3. Check entity type prefix requirements (unless it's a wildcard path)
|
|
162
142
|
expected_prefix = f"{entity_type.value.lower()}."
|
|
163
143
|
if not filter.path.startswith(expected_prefix) and not filter.path.startswith("*"):
|
|
164
|
-
raise
|
|
165
|
-
f"Filter path '{filter.path}' must start with '{expected_prefix}' for {entity_type.value} searches."
|
|
166
|
-
)
|
|
144
|
+
raise InvalidEntityPrefixError(filter.path, expected_prefix, entity_type.value)
|
|
167
145
|
|
|
168
146
|
|
|
169
147
|
async def validate_filter_tree(filters: FilterTree | None, entity_type: EntityType) -> None:
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
1
14
|
import uuid
|
|
2
15
|
from typing import Any, Literal
|
|
3
16
|
|
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
1
14
|
from typing import Literal
|
|
2
15
|
|
|
3
16
|
from pydantic import BaseModel, ConfigDict
|
|
@@ -59,6 +72,6 @@ class ComponentInfo(BaseModel):
|
|
|
59
72
|
|
|
60
73
|
class TypeDefinition(BaseModel):
|
|
61
74
|
operators: list[FilterOp]
|
|
62
|
-
|
|
75
|
+
value_schema: dict[FilterOp, ValueSchema]
|
|
63
76
|
|
|
64
77
|
model_config = ConfigDict(use_enum_values=True)
|
orchestrator/workflows/steps.py
CHANGED
|
@@ -15,9 +15,12 @@ from copy import deepcopy
|
|
|
15
15
|
import structlog
|
|
16
16
|
from pydantic import ValidationError
|
|
17
17
|
|
|
18
|
+
from orchestrator import llm_settings
|
|
18
19
|
from orchestrator.db import db
|
|
19
20
|
from orchestrator.db.models import ProcessSubscriptionTable
|
|
20
21
|
from orchestrator.domain.base import SubscriptionModel
|
|
22
|
+
from orchestrator.search.core.types import EntityType
|
|
23
|
+
from orchestrator.search.indexing import run_indexing_for_entity
|
|
21
24
|
from orchestrator.services.settings import reset_search_index
|
|
22
25
|
from orchestrator.services.subscriptions import get_subscription
|
|
23
26
|
from orchestrator.targets import Target
|
|
@@ -141,9 +144,20 @@ def set_status(status: SubscriptionLifecycle) -> Step:
|
|
|
141
144
|
|
|
142
145
|
|
|
143
146
|
@step("Refresh subscription search index")
|
|
144
|
-
def refresh_subscription_search_index() -> State:
|
|
147
|
+
def refresh_subscription_search_index(subscription: SubscriptionModel | None) -> State:
|
|
148
|
+
"""Refresh subscription search index.
|
|
149
|
+
|
|
150
|
+
Args:
|
|
151
|
+
subscription: Subscription to refresh search index.
|
|
152
|
+
|
|
153
|
+
Returns:
|
|
154
|
+
State of the workflow.
|
|
155
|
+
|
|
156
|
+
"""
|
|
145
157
|
try:
|
|
146
158
|
reset_search_index()
|
|
159
|
+
if llm_settings.LLM_ENABLED and subscription:
|
|
160
|
+
run_indexing_for_entity(EntityType.SUBSCRIPTION, str(subscription.subscription_id))
|
|
147
161
|
except Exception:
|
|
148
162
|
# Don't fail workflow in case of unexpected error
|
|
149
163
|
logger.warning("Error updated the subscriptions search index")
|
|
@@ -105,7 +105,7 @@ def check_that_products_have_create_modify_and_terminate_workflows() -> State:
|
|
|
105
105
|
product_data = get_products(filters=[ProductTable.status == "active"])
|
|
106
106
|
|
|
107
107
|
workflows_not_complete: list = []
|
|
108
|
-
targets = ["CREATE", "TERMINATE", "MODIFY", "VALIDATE"]
|
|
108
|
+
targets = ["CREATE", "TERMINATE", "MODIFY", "RECONCILE", "VALIDATE"]
|
|
109
109
|
for product in product_data:
|
|
110
110
|
workflows = {c.target for c in product.workflows if c.target in targets and c.name != "modify_note"}
|
|
111
111
|
if len(workflows) < len(targets):
|