orchestrator-core 4.5.3__py3-none-any.whl → 4.6.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +1 -1
- orchestrator/agentic_app.py +1 -21
- orchestrator/api/api_v1/api.py +5 -0
- orchestrator/api/api_v1/endpoints/agent.py +50 -0
- orchestrator/api/api_v1/endpoints/search.py +120 -201
- orchestrator/cli/database.py +3 -0
- orchestrator/cli/generate.py +11 -4
- orchestrator/cli/generator/generator/migration.py +7 -3
- orchestrator/cli/scheduler.py +15 -22
- orchestrator/cli/search/resize_embedding.py +28 -22
- orchestrator/cli/search/speedtest.py +4 -6
- orchestrator/db/__init__.py +6 -0
- orchestrator/db/models.py +75 -0
- orchestrator/migrations/helpers.py +46 -38
- orchestrator/schedules/scheduler.py +32 -15
- orchestrator/schedules/validate_products.py +1 -1
- orchestrator/schemas/search.py +8 -85
- orchestrator/search/agent/__init__.py +2 -2
- orchestrator/search/agent/agent.py +25 -29
- orchestrator/search/agent/json_patch.py +51 -0
- orchestrator/search/agent/prompts.py +35 -9
- orchestrator/search/agent/state.py +28 -2
- orchestrator/search/agent/tools.py +192 -53
- orchestrator/search/core/exceptions.py +6 -0
- orchestrator/search/core/types.py +1 -0
- orchestrator/search/export.py +199 -0
- orchestrator/search/indexing/indexer.py +13 -4
- orchestrator/search/indexing/registry.py +14 -1
- orchestrator/search/llm_migration.py +55 -0
- orchestrator/search/retrieval/__init__.py +3 -2
- orchestrator/search/retrieval/builder.py +5 -1
- orchestrator/search/retrieval/engine.py +66 -23
- orchestrator/search/retrieval/pagination.py +46 -56
- orchestrator/search/retrieval/query_state.py +61 -0
- orchestrator/search/retrieval/retrievers/base.py +26 -40
- orchestrator/search/retrieval/retrievers/fuzzy.py +10 -9
- orchestrator/search/retrieval/retrievers/hybrid.py +11 -8
- orchestrator/search/retrieval/retrievers/semantic.py +9 -8
- orchestrator/search/retrieval/retrievers/structured.py +6 -6
- orchestrator/search/schemas/parameters.py +17 -13
- orchestrator/search/schemas/results.py +4 -1
- orchestrator/settings.py +1 -0
- orchestrator/utils/auth.py +3 -2
- {orchestrator_core-4.5.3.dist-info → orchestrator_core-4.6.0rc2.dist-info}/METADATA +3 -3
- {orchestrator_core-4.5.3.dist-info → orchestrator_core-4.6.0rc2.dist-info}/RECORD +47 -43
- {orchestrator_core-4.5.3.dist-info → orchestrator_core-4.6.0rc2.dist-info}/WHEEL +0 -0
- {orchestrator_core-4.5.3.dist-info → orchestrator_core-4.6.0rc2.dist-info}/licenses/LICENSE +0 -0
|
@@ -11,42 +11,21 @@
|
|
|
11
11
|
# See the License for the specific language governing permissions and
|
|
12
12
|
# limitations under the License.
|
|
13
13
|
|
|
14
|
-
import array
|
|
15
14
|
import base64
|
|
16
|
-
from dataclasses import dataclass
|
|
15
|
+
from uuid import UUID
|
|
17
16
|
|
|
18
17
|
from pydantic import BaseModel
|
|
19
18
|
|
|
19
|
+
from orchestrator.db import SearchQueryTable, db
|
|
20
20
|
from orchestrator.search.core.exceptions import InvalidCursorError
|
|
21
|
-
from orchestrator.search.schemas.parameters import
|
|
22
|
-
from orchestrator.search.schemas.results import SearchResult
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
@dataclass
|
|
26
|
-
class PaginationParams:
|
|
27
|
-
"""Parameters for pagination in search queries."""
|
|
28
|
-
|
|
29
|
-
page_after_score: float | None = None
|
|
30
|
-
page_after_id: str | None = None
|
|
31
|
-
q_vec_override: list[float] | None = None
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def floats_to_b64(v: list[float]) -> str:
|
|
35
|
-
a = array.array("f", v)
|
|
36
|
-
return base64.urlsafe_b64encode(a.tobytes()).decode("ascii")
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def b64_to_floats(s: str) -> list[float]:
|
|
40
|
-
raw = base64.urlsafe_b64decode(s.encode("ascii"))
|
|
41
|
-
a = array.array("f")
|
|
42
|
-
a.frombytes(raw)
|
|
43
|
-
return list(a)
|
|
21
|
+
from orchestrator.search.schemas.parameters import SearchParameters
|
|
22
|
+
from orchestrator.search.schemas.results import SearchResponse
|
|
44
23
|
|
|
45
24
|
|
|
46
25
|
class PageCursor(BaseModel):
|
|
47
26
|
score: float
|
|
48
27
|
id: str
|
|
49
|
-
|
|
28
|
+
query_id: UUID
|
|
50
29
|
|
|
51
30
|
def encode(self) -> str:
|
|
52
31
|
"""Encode the cursor data into a URL-safe Base64 string."""
|
|
@@ -63,34 +42,45 @@ class PageCursor(BaseModel):
|
|
|
63
42
|
raise InvalidCursorError("Invalid pagination cursor") from e
|
|
64
43
|
|
|
65
44
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
return PaginationParams(
|
|
71
|
-
page_after_score=c.score,
|
|
72
|
-
page_after_id=c.id,
|
|
73
|
-
q_vec_override=b64_to_floats(c.q_vec_b64),
|
|
74
|
-
)
|
|
75
|
-
if search_params.vector_query:
|
|
76
|
-
from orchestrator.search.core.embedding import QueryEmbedder
|
|
77
|
-
|
|
78
|
-
q_vec_override = await QueryEmbedder.generate_for_text_async(search_params.vector_query)
|
|
79
|
-
return PaginationParams(q_vec_override=q_vec_override)
|
|
80
|
-
return PaginationParams()
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
def create_next_page_cursor(
|
|
84
|
-
search_results: list[SearchResult], pagination_params: PaginationParams, limit: int
|
|
45
|
+
def encode_next_page_cursor(
|
|
46
|
+
search_response: SearchResponse,
|
|
47
|
+
cursor: PageCursor | None,
|
|
48
|
+
search_params: SearchParameters,
|
|
85
49
|
) -> str | None:
|
|
86
|
-
"""Create next page cursor if there are more results.
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
50
|
+
"""Create next page cursor if there are more results.
|
|
51
|
+
|
|
52
|
+
On first page, saves the query to database and includes query_id in cursor
|
|
53
|
+
for subsequent pages to ensure consistent parameters across pagination.
|
|
54
|
+
|
|
55
|
+
Args:
|
|
56
|
+
search_response: SearchResponse containing results and query_embedding
|
|
57
|
+
cursor: Current page cursor (None for first page, PageCursor for subsequent pages)
|
|
58
|
+
search_params: Search parameters to save for pagination consistency
|
|
59
|
+
|
|
60
|
+
Returns:
|
|
61
|
+
Encoded cursor for next page, or None if no more results
|
|
62
|
+
"""
|
|
63
|
+
from orchestrator.search.retrieval.query_state import SearchQueryState
|
|
64
|
+
|
|
65
|
+
has_next_page = len(search_response.results) == search_params.limit and search_params.limit > 0
|
|
66
|
+
if not has_next_page:
|
|
67
|
+
return None
|
|
68
|
+
|
|
69
|
+
# If this is the first page, save query state to database
|
|
70
|
+
if cursor is None:
|
|
71
|
+
query_state = SearchQueryState(parameters=search_params, query_embedding=search_response.query_embedding)
|
|
72
|
+
search_query = SearchQueryTable.from_state(state=query_state)
|
|
73
|
+
|
|
74
|
+
db.session.add(search_query)
|
|
75
|
+
db.session.commit()
|
|
76
|
+
query_id = search_query.query_id
|
|
77
|
+
else:
|
|
78
|
+
query_id = cursor.query_id
|
|
79
|
+
|
|
80
|
+
last_item = search_response.results[-1]
|
|
81
|
+
cursor_data = PageCursor(
|
|
82
|
+
score=float(last_item.score),
|
|
83
|
+
id=last_item.entity_id,
|
|
84
|
+
query_id=query_id,
|
|
85
|
+
)
|
|
86
|
+
return cursor_data.encode()
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
14
|
+
from uuid import UUID
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
17
|
+
|
|
18
|
+
from orchestrator.db import SearchQueryTable, db
|
|
19
|
+
from orchestrator.search.core.exceptions import QueryStateNotFoundError
|
|
20
|
+
from orchestrator.search.schemas.parameters import SearchParameters
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SearchQueryState(BaseModel):
|
|
24
|
+
"""State of a search query including parameters and embedding.
|
|
25
|
+
|
|
26
|
+
This model provides a complete snapshot of what was searched and how.
|
|
27
|
+
Used for both agent and regular API searches.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
parameters: SearchParameters = Field(discriminator="entity_type")
|
|
31
|
+
query_embedding: list[float] | None = Field(default=None, description="The embedding vector for semantic search")
|
|
32
|
+
|
|
33
|
+
model_config = ConfigDict(from_attributes=True)
|
|
34
|
+
|
|
35
|
+
@classmethod
|
|
36
|
+
def load_from_id(cls, query_id: UUID | str) -> "SearchQueryState":
|
|
37
|
+
"""Load query state from database by query_id.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
query_id: UUID or string UUID of the saved query
|
|
41
|
+
|
|
42
|
+
Returns:
|
|
43
|
+
SearchQueryState loaded from database
|
|
44
|
+
|
|
45
|
+
Raises:
|
|
46
|
+
ValueError: If query_id format is invalid
|
|
47
|
+
QueryStateNotFoundError: If query not found in database
|
|
48
|
+
"""
|
|
49
|
+
if isinstance(query_id, UUID):
|
|
50
|
+
query_uuid = query_id
|
|
51
|
+
else:
|
|
52
|
+
try:
|
|
53
|
+
query_uuid = UUID(query_id)
|
|
54
|
+
except (ValueError, TypeError) as e:
|
|
55
|
+
raise ValueError(f"Invalid query_id format: {query_id}") from e
|
|
56
|
+
|
|
57
|
+
search_query = db.session.query(SearchQueryTable).filter_by(query_id=query_uuid).first()
|
|
58
|
+
if not search_query:
|
|
59
|
+
raise QueryStateNotFoundError(f"Query {query_uuid} not found in database")
|
|
60
|
+
|
|
61
|
+
return cls.model_validate(search_query)
|
|
@@ -20,7 +20,7 @@ from sqlalchemy import BindParameter, Numeric, Select, literal
|
|
|
20
20
|
from orchestrator.search.core.types import FieldType, SearchMetadata
|
|
21
21
|
from orchestrator.search.schemas.parameters import BaseSearchParameters
|
|
22
22
|
|
|
23
|
-
from ..pagination import PaginationParams
|
|
23
|
+
from ..pagination import PageCursor
|
|
24
24
|
|
|
25
25
|
logger = structlog.get_logger(__name__)
|
|
26
26
|
|
|
@@ -41,62 +41,48 @@ class Retriever(ABC):
|
|
|
41
41
|
]
|
|
42
42
|
|
|
43
43
|
@classmethod
|
|
44
|
-
async def
|
|
44
|
+
async def route(
|
|
45
45
|
cls,
|
|
46
46
|
params: BaseSearchParameters,
|
|
47
|
-
|
|
47
|
+
cursor: PageCursor | None,
|
|
48
|
+
query_embedding: list[float] | None = None,
|
|
48
49
|
) -> "Retriever":
|
|
49
|
-
"""
|
|
50
|
+
"""Route to the appropriate retriever instance based on search parameters.
|
|
51
|
+
|
|
52
|
+
Selects the retriever type based on available search criteria:
|
|
53
|
+
- Hybrid: both embedding and fuzzy term available
|
|
54
|
+
- Semantic: only embedding available
|
|
55
|
+
- Fuzzy: only text term available (or fallback when embedding generation fails)
|
|
56
|
+
- Structured: only filters available
|
|
50
57
|
|
|
51
58
|
Args:
|
|
52
|
-
params
|
|
53
|
-
|
|
59
|
+
params: Search parameters including vector queries, fuzzy terms, and filters
|
|
60
|
+
cursor: Pagination cursor for cursor-based paging
|
|
61
|
+
query_embedding: Query embedding for semantic search, or None if not available
|
|
54
62
|
|
|
55
63
|
Returns:
|
|
56
|
-
|
|
64
|
+
A concrete retriever instance based on available search criteria
|
|
57
65
|
"""
|
|
58
|
-
|
|
59
66
|
from .fuzzy import FuzzyRetriever
|
|
60
67
|
from .hybrid import RrfHybridRetriever
|
|
61
68
|
from .semantic import SemanticRetriever
|
|
62
69
|
from .structured import StructuredRetriever
|
|
63
70
|
|
|
64
71
|
fuzzy_term = params.fuzzy_term
|
|
65
|
-
q_vec = await cls._get_query_vector(params.vector_query, pagination_params.q_vec_override)
|
|
66
|
-
|
|
67
|
-
# If semantic search was attempted but failed, fall back to fuzzy with the full query
|
|
68
|
-
fallback_fuzzy_term = fuzzy_term
|
|
69
|
-
if q_vec is None and params.vector_query is not None and params.query is not None:
|
|
70
|
-
fallback_fuzzy_term = params.query
|
|
71
|
-
|
|
72
|
-
if q_vec is not None and fallback_fuzzy_term is not None:
|
|
73
|
-
return RrfHybridRetriever(q_vec, fallback_fuzzy_term, pagination_params)
|
|
74
|
-
if q_vec is not None:
|
|
75
|
-
return SemanticRetriever(q_vec, pagination_params)
|
|
76
|
-
if fallback_fuzzy_term is not None:
|
|
77
|
-
return FuzzyRetriever(fallback_fuzzy_term, pagination_params)
|
|
78
|
-
|
|
79
|
-
return StructuredRetriever(pagination_params)
|
|
80
|
-
|
|
81
|
-
@classmethod
|
|
82
|
-
async def _get_query_vector(
|
|
83
|
-
cls, vector_query: str | None, q_vec_override: list[float] | None
|
|
84
|
-
) -> list[float] | None:
|
|
85
|
-
"""Get query vector either from override or by generating from text."""
|
|
86
|
-
if q_vec_override:
|
|
87
|
-
return q_vec_override
|
|
88
|
-
|
|
89
|
-
if not vector_query:
|
|
90
|
-
return None
|
|
91
72
|
|
|
92
|
-
|
|
73
|
+
# If vector_query exists but embedding generation failed, fall back to fuzzy search with full query
|
|
74
|
+
if query_embedding is None and params.vector_query is not None and params.query is not None:
|
|
75
|
+
fuzzy_term = params.query
|
|
93
76
|
|
|
94
|
-
|
|
95
|
-
if not
|
|
96
|
-
|
|
97
|
-
|
|
77
|
+
# Select retriever based on available search criteria
|
|
78
|
+
if query_embedding is not None and fuzzy_term is not None:
|
|
79
|
+
return RrfHybridRetriever(query_embedding, fuzzy_term, cursor)
|
|
80
|
+
if query_embedding is not None:
|
|
81
|
+
return SemanticRetriever(query_embedding, cursor)
|
|
82
|
+
if fuzzy_term is not None:
|
|
83
|
+
return FuzzyRetriever(fuzzy_term, cursor)
|
|
98
84
|
|
|
99
|
-
return
|
|
85
|
+
return StructuredRetriever(cursor)
|
|
100
86
|
|
|
101
87
|
@abstractmethod
|
|
102
88
|
def apply(self, candidate_query: Select) -> Select:
|
|
@@ -17,17 +17,16 @@ from sqlalchemy.sql.expression import ColumnElement
|
|
|
17
17
|
from orchestrator.db.models import AiSearchIndex
|
|
18
18
|
from orchestrator.search.core.types import SearchMetadata
|
|
19
19
|
|
|
20
|
-
from ..pagination import PaginationParams
|
|
20
|
+
from ..pagination import PageCursor
|
|
21
21
|
from .base import Retriever
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class FuzzyRetriever(Retriever):
|
|
25
25
|
"""Ranks results based on the max of fuzzy text similarity scores."""
|
|
26
26
|
|
|
27
|
-
def __init__(self, fuzzy_term: str, pagination_params: PaginationParams) -> None:
|
|
27
|
+
def __init__(self, fuzzy_term: str, cursor: PageCursor | None) -> None:
|
|
28
28
|
self.fuzzy_term = fuzzy_term
|
|
29
|
-
self.
|
|
30
|
-
self.page_after_id = pagination_params.page_after_id
|
|
29
|
+
self.cursor = cursor
|
|
31
30
|
|
|
32
31
|
def apply(self, candidate_query: Select) -> Select:
|
|
33
32
|
cand = candidate_query.subquery()
|
|
@@ -42,6 +41,7 @@ class FuzzyRetriever(Retriever):
|
|
|
42
41
|
combined_query = (
|
|
43
42
|
select(
|
|
44
43
|
AiSearchIndex.entity_id,
|
|
44
|
+
AiSearchIndex.entity_title,
|
|
45
45
|
score,
|
|
46
46
|
func.first_value(AiSearchIndex.value)
|
|
47
47
|
.over(partition_by=AiSearchIndex.entity_id, order_by=[similarity_expr.desc(), AiSearchIndex.path.asc()])
|
|
@@ -58,12 +58,13 @@ class FuzzyRetriever(Retriever):
|
|
|
58
58
|
literal(self.fuzzy_term).op("<%")(AiSearchIndex.value),
|
|
59
59
|
)
|
|
60
60
|
)
|
|
61
|
-
.distinct(AiSearchIndex.entity_id)
|
|
61
|
+
.distinct(AiSearchIndex.entity_id, AiSearchIndex.entity_title)
|
|
62
62
|
)
|
|
63
63
|
final_query = combined_query.subquery("ranked_fuzzy")
|
|
64
64
|
|
|
65
65
|
stmt = select(
|
|
66
66
|
final_query.c.entity_id,
|
|
67
|
+
final_query.c.entity_title,
|
|
67
68
|
final_query.c.score,
|
|
68
69
|
final_query.c.highlight_text,
|
|
69
70
|
final_query.c.highlight_path,
|
|
@@ -81,13 +82,13 @@ class FuzzyRetriever(Retriever):
|
|
|
81
82
|
self, stmt: Select, score_column: ColumnElement, entity_id_column: ColumnElement
|
|
82
83
|
) -> Select:
|
|
83
84
|
"""Apply standard score + entity_id pagination."""
|
|
84
|
-
if self.
|
|
85
|
+
if self.cursor is not None:
|
|
85
86
|
stmt = stmt.where(
|
|
86
87
|
or_(
|
|
87
|
-
score_column < self.
|
|
88
|
+
score_column < self.cursor.score,
|
|
88
89
|
and_(
|
|
89
|
-
score_column == self.
|
|
90
|
-
entity_id_column > self.
|
|
90
|
+
score_column == self.cursor.score,
|
|
91
|
+
entity_id_column > self.cursor.id,
|
|
91
92
|
),
|
|
92
93
|
)
|
|
93
94
|
)
|
|
@@ -20,7 +20,7 @@ from sqlalchemy.types import TypeEngine
|
|
|
20
20
|
from orchestrator.db.models import AiSearchIndex
|
|
21
21
|
from orchestrator.search.core.types import SearchMetadata
|
|
22
22
|
|
|
23
|
-
from ..pagination import PaginationParams
|
|
23
|
+
from ..pagination import PageCursor
|
|
24
24
|
from .base import Retriever
|
|
25
25
|
|
|
26
26
|
|
|
@@ -127,14 +127,13 @@ class RrfHybridRetriever(Retriever):
|
|
|
127
127
|
self,
|
|
128
128
|
q_vec: list[float],
|
|
129
129
|
fuzzy_term: str,
|
|
130
|
-
|
|
130
|
+
cursor: PageCursor | None,
|
|
131
131
|
k: int = 60,
|
|
132
132
|
field_candidates_limit: int = 100,
|
|
133
133
|
) -> None:
|
|
134
134
|
self.q_vec = q_vec
|
|
135
135
|
self.fuzzy_term = fuzzy_term
|
|
136
|
-
self.
|
|
137
|
-
self.page_after_id = pagination_params.page_after_id
|
|
136
|
+
self.cursor = cursor
|
|
138
137
|
self.k = k
|
|
139
138
|
self.field_candidates_limit = field_candidates_limit
|
|
140
139
|
|
|
@@ -154,6 +153,7 @@ class RrfHybridRetriever(Retriever):
|
|
|
154
153
|
field_candidates = (
|
|
155
154
|
select(
|
|
156
155
|
AiSearchIndex.entity_id,
|
|
156
|
+
AiSearchIndex.entity_title,
|
|
157
157
|
AiSearchIndex.path,
|
|
158
158
|
AiSearchIndex.value,
|
|
159
159
|
sem_val,
|
|
@@ -178,9 +178,10 @@ class RrfHybridRetriever(Retriever):
|
|
|
178
178
|
entity_scores = (
|
|
179
179
|
select(
|
|
180
180
|
field_candidates.c.entity_id,
|
|
181
|
+
field_candidates.c.entity_title,
|
|
181
182
|
func.avg(field_candidates.c.semantic_distance).label("avg_semantic_distance"),
|
|
182
183
|
func.avg(field_candidates.c.fuzzy_score).label("avg_fuzzy_score"),
|
|
183
|
-
).group_by(field_candidates.c.entity_id)
|
|
184
|
+
).group_by(field_candidates.c.entity_id, field_candidates.c.entity_title)
|
|
184
185
|
).cte("entity_scores")
|
|
185
186
|
|
|
186
187
|
entity_highlights = (
|
|
@@ -204,6 +205,7 @@ class RrfHybridRetriever(Retriever):
|
|
|
204
205
|
ranked = (
|
|
205
206
|
select(
|
|
206
207
|
entity_scores.c.entity_id,
|
|
208
|
+
entity_scores.c.entity_title,
|
|
207
209
|
entity_scores.c.avg_semantic_distance,
|
|
208
210
|
entity_scores.c.avg_fuzzy_score,
|
|
209
211
|
entity_highlights.c.highlight_text,
|
|
@@ -242,6 +244,7 @@ class RrfHybridRetriever(Retriever):
|
|
|
242
244
|
|
|
243
245
|
stmt = select(
|
|
244
246
|
ranked.c.entity_id,
|
|
247
|
+
ranked.c.entity_title,
|
|
245
248
|
score,
|
|
246
249
|
ranked.c.highlight_text,
|
|
247
250
|
ranked.c.highlight_path,
|
|
@@ -262,12 +265,12 @@ class RrfHybridRetriever(Retriever):
|
|
|
262
265
|
entity_id_column: ColumnElement,
|
|
263
266
|
) -> Select:
|
|
264
267
|
"""Keyset paginate by fused score + id."""
|
|
265
|
-
if self.
|
|
266
|
-
score_param = self._quantize_score_for_pagination(self.
|
|
268
|
+
if self.cursor is not None:
|
|
269
|
+
score_param = self._quantize_score_for_pagination(self.cursor.score)
|
|
267
270
|
stmt = stmt.where(
|
|
268
271
|
or_(
|
|
269
272
|
score_column < score_param,
|
|
270
|
-
and_(score_column == score_param, entity_id_column > self.
|
|
273
|
+
and_(score_column == score_param, entity_id_column > self.cursor.id),
|
|
271
274
|
)
|
|
272
275
|
)
|
|
273
276
|
return stmt
|
|
@@ -17,17 +17,16 @@ from sqlalchemy.sql.expression import ColumnElement
|
|
|
17
17
|
from orchestrator.db.models import AiSearchIndex
|
|
18
18
|
from orchestrator.search.core.types import SearchMetadata
|
|
19
19
|
|
|
20
|
-
from ..pagination import PaginationParams
|
|
20
|
+
from ..pagination import PageCursor
|
|
21
21
|
from .base import Retriever
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class SemanticRetriever(Retriever):
|
|
25
25
|
"""Ranks results based on the minimum semantic vector distance."""
|
|
26
26
|
|
|
27
|
-
def __init__(self, vector_query: list[float], pagination_params: PaginationParams) -> None:
|
|
27
|
+
def __init__(self, vector_query: list[float], cursor: PageCursor | None) -> None:
|
|
28
28
|
self.vector_query = vector_query
|
|
29
|
-
self.
|
|
30
|
-
self.page_after_id = pagination_params.page_after_id
|
|
29
|
+
self.cursor = cursor
|
|
31
30
|
|
|
32
31
|
def apply(self, candidate_query: Select) -> Select:
|
|
33
32
|
cand = candidate_query.subquery()
|
|
@@ -49,6 +48,7 @@ class SemanticRetriever(Retriever):
|
|
|
49
48
|
combined_query = (
|
|
50
49
|
select(
|
|
51
50
|
AiSearchIndex.entity_id,
|
|
51
|
+
AiSearchIndex.entity_title,
|
|
52
52
|
score,
|
|
53
53
|
func.first_value(AiSearchIndex.value)
|
|
54
54
|
.over(partition_by=AiSearchIndex.entity_id, order_by=[dist.asc(), AiSearchIndex.path.asc()])
|
|
@@ -60,12 +60,13 @@ class SemanticRetriever(Retriever):
|
|
|
60
60
|
.select_from(AiSearchIndex)
|
|
61
61
|
.join(cand, cand.c.entity_id == AiSearchIndex.entity_id)
|
|
62
62
|
.where(AiSearchIndex.embedding.isnot(None))
|
|
63
|
-
.distinct(AiSearchIndex.entity_id)
|
|
63
|
+
.distinct(AiSearchIndex.entity_id, AiSearchIndex.entity_title)
|
|
64
64
|
)
|
|
65
65
|
final_query = combined_query.subquery("ranked_semantic")
|
|
66
66
|
|
|
67
67
|
stmt = select(
|
|
68
68
|
final_query.c.entity_id,
|
|
69
|
+
final_query.c.entity_title,
|
|
69
70
|
final_query.c.score,
|
|
70
71
|
final_query.c.highlight_text,
|
|
71
72
|
final_query.c.highlight_path,
|
|
@@ -83,12 +84,12 @@ class SemanticRetriever(Retriever):
|
|
|
83
84
|
self, stmt: Select, score_column: ColumnElement, entity_id_column: ColumnElement
|
|
84
85
|
) -> Select:
|
|
85
86
|
"""Apply semantic score pagination with precise Decimal handling."""
|
|
86
|
-
if self.
|
|
87
|
-
score_param = self._quantize_score_for_pagination(self.
|
|
87
|
+
if self.cursor is not None:
|
|
88
|
+
score_param = self._quantize_score_for_pagination(self.cursor.score)
|
|
88
89
|
stmt = stmt.where(
|
|
89
90
|
or_(
|
|
90
91
|
score_column < score_param,
|
|
91
|
-
and_(score_column == score_param, entity_id_column > self.
|
|
92
|
+
and_(score_column == score_param, entity_id_column > self.cursor.id),
|
|
92
93
|
)
|
|
93
94
|
)
|
|
94
95
|
return stmt
|
|
@@ -15,22 +15,22 @@ from sqlalchemy import Select, literal, select
|
|
|
15
15
|
|
|
16
16
|
from orchestrator.search.core.types import SearchMetadata
|
|
17
17
|
|
|
18
|
-
from ..pagination import PaginationParams
|
|
18
|
+
from ..pagination import PageCursor
|
|
19
19
|
from .base import Retriever
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
class StructuredRetriever(Retriever):
|
|
23
23
|
"""Applies a dummy score for purely structured searches with no text query."""
|
|
24
24
|
|
|
25
|
-
def __init__(self,
|
|
26
|
-
self.
|
|
25
|
+
def __init__(self, cursor: PageCursor | None) -> None:
|
|
26
|
+
self.cursor = cursor
|
|
27
27
|
|
|
28
28
|
def apply(self, candidate_query: Select) -> Select:
|
|
29
29
|
cand = candidate_query.subquery()
|
|
30
|
-
stmt = select(cand.c.entity_id, literal(1.0).label("score")).select_from(cand)
|
|
30
|
+
stmt = select(cand.c.entity_id, cand.c.entity_title, literal(1.0).label("score")).select_from(cand)
|
|
31
31
|
|
|
32
|
-
if self.
|
|
33
|
-
stmt = stmt.where(cand.c.entity_id > self.
|
|
32
|
+
if self.cursor is not None:
|
|
33
|
+
stmt = stmt.where(cand.c.entity_id > self.cursor.id)
|
|
34
34
|
|
|
35
35
|
return stmt.order_by(cand.c.entity_id.asc())
|
|
36
36
|
|
|
@@ -12,9 +12,9 @@
|
|
|
12
12
|
# limitations under the License.
|
|
13
13
|
|
|
14
14
|
import uuid
|
|
15
|
-
from typing import Any, Literal
|
|
15
|
+
from typing import Any, ClassVar, Literal
|
|
16
16
|
|
|
17
|
-
from pydantic import BaseModel, ConfigDict, Field
|
|
17
|
+
from pydantic import BaseModel, ConfigDict, Field, TypeAdapter
|
|
18
18
|
|
|
19
19
|
from orchestrator.search.core.types import ActionType, EntityType
|
|
20
20
|
from orchestrator.search.filters import FilterTree
|
|
@@ -23,6 +23,9 @@ from orchestrator.search.filters import FilterTree
|
|
|
23
23
|
class BaseSearchParameters(BaseModel):
|
|
24
24
|
"""Base model with common search parameters."""
|
|
25
25
|
|
|
26
|
+
DEFAULT_EXPORT_LIMIT: ClassVar[int] = 1000
|
|
27
|
+
MAX_EXPORT_LIMIT: ClassVar[int] = 10000
|
|
28
|
+
|
|
26
29
|
action: ActionType = Field(default=ActionType.SELECT, description="The action to perform.")
|
|
27
30
|
entity_type: EntityType
|
|
28
31
|
|
|
@@ -33,14 +36,18 @@ class BaseSearchParameters(BaseModel):
|
|
|
33
36
|
)
|
|
34
37
|
|
|
35
38
|
limit: int = Field(default=10, ge=1, le=30, description="Maximum number of search results to return.")
|
|
39
|
+
export_limit: int = Field(
|
|
40
|
+
default=DEFAULT_EXPORT_LIMIT, ge=1, le=MAX_EXPORT_LIMIT, description="Maximum number of results to export."
|
|
41
|
+
)
|
|
36
42
|
model_config = ConfigDict(extra="forbid")
|
|
37
43
|
|
|
38
44
|
@classmethod
|
|
39
|
-
def create(cls,
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
45
|
+
def create(cls, **kwargs: Any) -> "SearchParameters":
|
|
46
|
+
"""Create the correct search parameter subclass instance based on entity_type."""
|
|
47
|
+
from orchestrator.search.schemas.parameters import SearchParameters
|
|
48
|
+
|
|
49
|
+
adapter: TypeAdapter = TypeAdapter(SearchParameters)
|
|
50
|
+
return adapter.validate_python(kwargs)
|
|
44
51
|
|
|
45
52
|
@property
|
|
46
53
|
def vector_query(self) -> str | None:
|
|
@@ -121,9 +128,6 @@ class ProcessSearchParameters(BaseSearchParameters):
|
|
|
121
128
|
)
|
|
122
129
|
|
|
123
130
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
EntityType.WORKFLOW: WorkflowSearchParameters,
|
|
128
|
-
EntityType.PROCESS: ProcessSearchParameters,
|
|
129
|
-
}
|
|
131
|
+
SearchParameters = (
|
|
132
|
+
SubscriptionSearchParameters | ProductSearchParameters | WorkflowSearchParameters | ProcessSearchParameters
|
|
133
|
+
)
|
|
@@ -15,7 +15,7 @@ from typing import Literal
|
|
|
15
15
|
|
|
16
16
|
from pydantic import BaseModel, ConfigDict
|
|
17
17
|
|
|
18
|
-
from orchestrator.search.core.types import FilterOp, SearchMetadata, UIType
|
|
18
|
+
from orchestrator.search.core.types import EntityType, FilterOp, SearchMetadata, UIType
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class MatchingField(BaseModel):
|
|
@@ -30,6 +30,8 @@ class SearchResult(BaseModel):
|
|
|
30
30
|
"""Represents a single search result item."""
|
|
31
31
|
|
|
32
32
|
entity_id: str
|
|
33
|
+
entity_type: EntityType
|
|
34
|
+
entity_title: str
|
|
33
35
|
score: float
|
|
34
36
|
perfect_match: int = 0
|
|
35
37
|
matching_field: MatchingField | None = None
|
|
@@ -40,6 +42,7 @@ class SearchResponse(BaseModel):
|
|
|
40
42
|
|
|
41
43
|
results: list[SearchResult]
|
|
42
44
|
metadata: SearchMetadata
|
|
45
|
+
query_embedding: list[float] | None = None
|
|
43
46
|
|
|
44
47
|
|
|
45
48
|
class ValueSchema(BaseModel):
|
orchestrator/settings.py
CHANGED
|
@@ -57,6 +57,7 @@ class AppSettings(BaseSettings):
|
|
|
57
57
|
EXECUTOR: str = ExecutorType.THREADPOOL
|
|
58
58
|
WORKFLOWS_SWAGGER_HOST: str = "localhost"
|
|
59
59
|
WORKFLOWS_GUI_URI: str = "http://localhost:3000"
|
|
60
|
+
BASE_URL: str = "http://localhost:8080" # Base URL for the API (used for generating export URLs)
|
|
60
61
|
DATABASE_URI: PostgresDsn = "postgresql://nwa:nwa@localhost/orchestrator-core" # type: ignore
|
|
61
62
|
MAX_WORKERS: int = 5
|
|
62
63
|
MAIL_SERVER: str = "localhost"
|
orchestrator/utils/auth.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
from collections.abc import Callable
|
|
2
|
-
from typing import TypeAlias
|
|
2
|
+
from typing import TypeAlias, TypeVar
|
|
3
3
|
|
|
4
4
|
from oauth2_lib.fastapi import OIDCUserModel
|
|
5
5
|
|
|
6
6
|
# This file is broken out separately to avoid circular imports.
|
|
7
7
|
|
|
8
8
|
# Can instead use "type Authorizer = ..." in later Python versions.
|
|
9
|
-
|
|
9
|
+
T = TypeVar("T", bound=OIDCUserModel)
|
|
10
|
+
Authorizer: TypeAlias = Callable[[T | None], bool]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: orchestrator-core
|
|
3
|
-
Version: 4.5.3
|
|
3
|
+
Version: 4.6.0rc2
|
|
4
4
|
Summary: This is the orchestrator workflow engine.
|
|
5
5
|
Author-email: SURF <automation-beheer@surf.nl>
|
|
6
6
|
Requires-Python: >=3.11,<3.14
|
|
@@ -42,7 +42,7 @@ Requires-Dist: itsdangerous>=2.2.0
|
|
|
42
42
|
Requires-Dist: jinja2==3.1.6
|
|
43
43
|
Requires-Dist: more-itertools~=10.7.0
|
|
44
44
|
Requires-Dist: nwa-stdlib~=1.9.2
|
|
45
|
-
Requires-Dist: oauth2-lib
|
|
45
|
+
Requires-Dist: oauth2-lib==2.4.2
|
|
46
46
|
Requires-Dist: orjson==3.10.18
|
|
47
47
|
Requires-Dist: pgvector>=0.4.1
|
|
48
48
|
Requires-Dist: prometheus-client==0.22.1
|
|
@@ -63,7 +63,7 @@ Requires-Dist: structlog>=25.4.0
|
|
|
63
63
|
Requires-Dist: tabulate==0.9.0
|
|
64
64
|
Requires-Dist: typer==0.15.4
|
|
65
65
|
Requires-Dist: uvicorn[standard]~=0.34.0
|
|
66
|
-
Requires-Dist: pydantic-ai-slim
|
|
66
|
+
Requires-Dist: pydantic-ai-slim >=1.3.0 ; extra == "agent"
|
|
67
67
|
Requires-Dist: ag-ui-protocol>=0.1.8 ; extra == "agent"
|
|
68
68
|
Requires-Dist: litellm>=1.75.7 ; extra == "agent"
|
|
69
69
|
Requires-Dist: celery~=5.5.1 ; extra == "celery"
|