orchestrator-core 4.5.3__py3-none-any.whl → 4.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +2 -2
- orchestrator/agentic_app.py +3 -23
- orchestrator/api/api_v1/api.py +5 -0
- orchestrator/api/api_v1/endpoints/agent.py +49 -0
- orchestrator/api/api_v1/endpoints/search.py +120 -201
- orchestrator/app.py +1 -1
- orchestrator/cli/database.py +3 -0
- orchestrator/cli/generate.py +11 -4
- orchestrator/cli/generator/generator/migration.py +7 -3
- orchestrator/cli/main.py +1 -1
- orchestrator/cli/scheduler.py +15 -22
- orchestrator/cli/search/resize_embedding.py +28 -22
- orchestrator/cli/search/speedtest.py +4 -6
- orchestrator/db/__init__.py +6 -0
- orchestrator/db/models.py +75 -0
- orchestrator/llm_settings.py +18 -1
- orchestrator/migrations/helpers.py +47 -39
- orchestrator/schedules/scheduler.py +32 -15
- orchestrator/schedules/validate_products.py +1 -1
- orchestrator/schemas/search.py +8 -85
- orchestrator/search/agent/__init__.py +2 -2
- orchestrator/search/agent/agent.py +26 -30
- orchestrator/search/agent/json_patch.py +51 -0
- orchestrator/search/agent/prompts.py +35 -9
- orchestrator/search/agent/state.py +28 -2
- orchestrator/search/agent/tools.py +192 -53
- orchestrator/search/core/embedding.py +2 -2
- orchestrator/search/core/exceptions.py +6 -0
- orchestrator/search/core/types.py +1 -0
- orchestrator/search/export.py +199 -0
- orchestrator/search/indexing/indexer.py +13 -4
- orchestrator/search/indexing/registry.py +14 -1
- orchestrator/search/llm_migration.py +55 -0
- orchestrator/search/retrieval/__init__.py +3 -2
- orchestrator/search/retrieval/builder.py +5 -1
- orchestrator/search/retrieval/engine.py +66 -23
- orchestrator/search/retrieval/pagination.py +46 -56
- orchestrator/search/retrieval/query_state.py +61 -0
- orchestrator/search/retrieval/retrievers/base.py +26 -40
- orchestrator/search/retrieval/retrievers/fuzzy.py +10 -9
- orchestrator/search/retrieval/retrievers/hybrid.py +11 -8
- orchestrator/search/retrieval/retrievers/semantic.py +9 -8
- orchestrator/search/retrieval/retrievers/structured.py +6 -6
- orchestrator/search/schemas/parameters.py +17 -13
- orchestrator/search/schemas/results.py +4 -1
- orchestrator/settings.py +1 -0
- orchestrator/utils/auth.py +3 -2
- orchestrator/workflow.py +23 -6
- {orchestrator_core-4.5.3.dist-info → orchestrator_core-4.6.0.dist-info}/METADATA +16 -11
- {orchestrator_core-4.5.3.dist-info → orchestrator_core-4.6.0.dist-info}/RECORD +52 -48
- {orchestrator_core-4.5.3.dist-info → orchestrator_core-4.6.0.dist-info}/WHEEL +0 -0
- {orchestrator_core-4.5.3.dist-info → orchestrator_core-4.6.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -25,7 +25,7 @@ from orchestrator.db import (
|
|
|
25
25
|
WorkflowTable,
|
|
26
26
|
)
|
|
27
27
|
from orchestrator.db.database import BaseModel
|
|
28
|
-
from orchestrator.search.core.types import EntityType
|
|
28
|
+
from orchestrator.search.core.types import EntityType, ExtractedField
|
|
29
29
|
|
|
30
30
|
from .traverse import (
|
|
31
31
|
BaseTraverser,
|
|
@@ -48,6 +48,7 @@ class EntityConfig(Generic[ModelT]):
|
|
|
48
48
|
traverser: "type[BaseTraverser]"
|
|
49
49
|
pk_name: str
|
|
50
50
|
root_name: str
|
|
51
|
+
title_paths: list[str] # List of field paths to check for title (with fallback)
|
|
51
52
|
|
|
52
53
|
def get_all_query(self, entity_id: str | None = None) -> Query | Select:
|
|
53
54
|
query = self.table.query
|
|
@@ -56,6 +57,14 @@ class EntityConfig(Generic[ModelT]):
|
|
|
56
57
|
query = query.filter(pk_column == UUID(entity_id))
|
|
57
58
|
return query
|
|
58
59
|
|
|
60
|
+
def get_title_from_fields(self, fields: list[ExtractedField]) -> str:
|
|
61
|
+
"""Extract title from fields using configured paths."""
|
|
62
|
+
for title_path in self.title_paths:
|
|
63
|
+
for field in fields:
|
|
64
|
+
if field.path == title_path and field.value:
|
|
65
|
+
return str(field.value)
|
|
66
|
+
return "UNKNOWN"
|
|
67
|
+
|
|
59
68
|
|
|
60
69
|
@dataclass(frozen=True)
|
|
61
70
|
class WorkflowConfig(EntityConfig[WorkflowTable]):
|
|
@@ -76,6 +85,7 @@ ENTITY_CONFIG_REGISTRY: dict[EntityType, EntityConfig] = {
|
|
|
76
85
|
traverser=SubscriptionTraverser,
|
|
77
86
|
pk_name="subscription_id",
|
|
78
87
|
root_name="subscription",
|
|
88
|
+
title_paths=["subscription.description"],
|
|
79
89
|
),
|
|
80
90
|
EntityType.PRODUCT: EntityConfig(
|
|
81
91
|
entity_kind=EntityType.PRODUCT,
|
|
@@ -83,6 +93,7 @@ ENTITY_CONFIG_REGISTRY: dict[EntityType, EntityConfig] = {
|
|
|
83
93
|
traverser=ProductTraverser,
|
|
84
94
|
pk_name="product_id",
|
|
85
95
|
root_name="product",
|
|
96
|
+
title_paths=["product.description", "product.name"],
|
|
86
97
|
),
|
|
87
98
|
EntityType.PROCESS: EntityConfig(
|
|
88
99
|
entity_kind=EntityType.PROCESS,
|
|
@@ -90,6 +101,7 @@ ENTITY_CONFIG_REGISTRY: dict[EntityType, EntityConfig] = {
|
|
|
90
101
|
traverser=ProcessTraverser,
|
|
91
102
|
pk_name="process_id",
|
|
92
103
|
root_name="process",
|
|
104
|
+
title_paths=["process.workflow_name"],
|
|
93
105
|
),
|
|
94
106
|
EntityType.WORKFLOW: WorkflowConfig(
|
|
95
107
|
entity_kind=EntityType.WORKFLOW,
|
|
@@ -97,5 +109,6 @@ ENTITY_CONFIG_REGISTRY: dict[EntityType, EntityConfig] = {
|
|
|
97
109
|
traverser=WorkflowTraverser,
|
|
98
110
|
pk_name="workflow_id",
|
|
99
111
|
root_name="workflow",
|
|
112
|
+
title_paths=["workflow.description", "workflow.name"],
|
|
100
113
|
),
|
|
101
114
|
}
|
|
@@ -37,6 +37,7 @@ def run_migration(connection: Connection) -> None:
|
|
|
37
37
|
if llm_settings.LLM_FORCE_EXTENTION_MIGRATION or res.rowcount == 0:
|
|
38
38
|
# Create PostgreSQL extensions
|
|
39
39
|
logger.info("Attempting to run the extention creation;")
|
|
40
|
+
connection.execute(text('CREATE EXTENSION IF NOT EXISTS "uuid-ossp";'))
|
|
40
41
|
connection.execute(text("CREATE EXTENSION IF NOT EXISTS ltree;"))
|
|
41
42
|
connection.execute(text("CREATE EXTENSION IF NOT EXISTS unaccent;"))
|
|
42
43
|
connection.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm;"))
|
|
@@ -64,6 +65,7 @@ def run_migration(connection: Connection) -> None:
|
|
|
64
65
|
CREATE TABLE IF NOT EXISTS {TABLE} (
|
|
65
66
|
entity_type TEXT NOT NULL,
|
|
66
67
|
entity_id UUID NOT NULL,
|
|
68
|
+
entity_title TEXT,
|
|
67
69
|
path LTREE NOT NULL,
|
|
68
70
|
value TEXT NOT NULL,
|
|
69
71
|
embedding VECTOR({TARGET_DIM}),
|
|
@@ -78,6 +80,23 @@ def run_migration(connection: Connection) -> None:
|
|
|
78
80
|
# Drop default
|
|
79
81
|
connection.execute(text(f"ALTER TABLE {TABLE} ALTER COLUMN value_type DROP DEFAULT;"))
|
|
80
82
|
|
|
83
|
+
# Add entity_title column if it doesn't exist (for existing installations)
|
|
84
|
+
connection.execute(
|
|
85
|
+
text(
|
|
86
|
+
f"""
|
|
87
|
+
DO $$
|
|
88
|
+
BEGIN
|
|
89
|
+
IF NOT EXISTS (
|
|
90
|
+
SELECT 1 FROM information_schema.columns
|
|
91
|
+
WHERE table_name = '{TABLE}' AND column_name = 'entity_title'
|
|
92
|
+
) THEN
|
|
93
|
+
ALTER TABLE {TABLE} ADD COLUMN entity_title TEXT;
|
|
94
|
+
END IF;
|
|
95
|
+
END $$;
|
|
96
|
+
"""
|
|
97
|
+
)
|
|
98
|
+
)
|
|
99
|
+
|
|
81
100
|
# Create indexes with IF NOT EXISTS
|
|
82
101
|
connection.execute(text(f"CREATE INDEX IF NOT EXISTS ix_ai_search_index_entity_id ON {TABLE} (entity_id);"))
|
|
83
102
|
connection.execute(
|
|
@@ -96,6 +115,42 @@ def run_migration(connection: Connection) -> None:
|
|
|
96
115
|
)
|
|
97
116
|
)
|
|
98
117
|
|
|
118
|
+
# Create agent_runs table
|
|
119
|
+
connection.execute(
|
|
120
|
+
text(
|
|
121
|
+
"""
|
|
122
|
+
CREATE TABLE IF NOT EXISTS agent_runs (
|
|
123
|
+
run_id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
|
124
|
+
agent_type VARCHAR(50) NOT NULL,
|
|
125
|
+
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP NOT NULL
|
|
126
|
+
);
|
|
127
|
+
"""
|
|
128
|
+
)
|
|
129
|
+
)
|
|
130
|
+
connection.execute(text("CREATE INDEX IF NOT EXISTS ix_agent_runs_created_at ON agent_runs (created_at);"))
|
|
131
|
+
|
|
132
|
+
# Create search_queries table
|
|
133
|
+
connection.execute(
|
|
134
|
+
text(
|
|
135
|
+
f"""
|
|
136
|
+
CREATE TABLE IF NOT EXISTS search_queries (
|
|
137
|
+
query_id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
|
138
|
+
run_id UUID,
|
|
139
|
+
query_number INTEGER NOT NULL,
|
|
140
|
+
parameters JSONB NOT NULL,
|
|
141
|
+
query_embedding VECTOR({TARGET_DIM}),
|
|
142
|
+
executed_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP NOT NULL,
|
|
143
|
+
CONSTRAINT fk_search_queries_run_id FOREIGN KEY (run_id) REFERENCES agent_runs(run_id) ON DELETE CASCADE
|
|
144
|
+
);
|
|
145
|
+
"""
|
|
146
|
+
)
|
|
147
|
+
)
|
|
148
|
+
connection.execute(text("CREATE INDEX IF NOT EXISTS ix_search_queries_run_id ON search_queries (run_id);"))
|
|
149
|
+
connection.execute(
|
|
150
|
+
text("CREATE INDEX IF NOT EXISTS ix_search_queries_executed_at ON search_queries (executed_at);")
|
|
151
|
+
)
|
|
152
|
+
connection.execute(text("CREATE INDEX IF NOT EXISTS ix_search_queries_query_id ON search_queries (query_id);"))
|
|
153
|
+
|
|
99
154
|
connection.commit()
|
|
100
155
|
logger.info("LLM migration completed successfully")
|
|
101
156
|
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
# See the License for the specific language governing permissions and
|
|
12
12
|
# limitations under the License.
|
|
13
13
|
|
|
14
|
-
from .engine import execute_search
|
|
14
|
+
from .engine import execute_search, execute_search_for_export
|
|
15
|
+
from .query_state import SearchQueryState
|
|
15
16
|
|
|
16
|
-
__all__ = ["execute_search"]
|
|
17
|
+
__all__ = ["execute_search", "execute_search_for_export", "SearchQueryState"]
|
|
@@ -43,7 +43,11 @@ def build_candidate_query(params: BaseSearchParameters) -> Select:
|
|
|
43
43
|
Select: The SQLAlchemy `Select` object representing the query.
|
|
44
44
|
"""
|
|
45
45
|
|
|
46
|
-
stmt =
|
|
46
|
+
stmt = (
|
|
47
|
+
select(AiSearchIndex.entity_id, AiSearchIndex.entity_title)
|
|
48
|
+
.where(AiSearchIndex.entity_type == params.entity_type.value)
|
|
49
|
+
.distinct()
|
|
50
|
+
)
|
|
47
51
|
|
|
48
52
|
if params.filters is not None:
|
|
49
53
|
entity_id_col = AiSearchIndex.entity_id
|
|
@@ -17,13 +17,15 @@ import structlog
|
|
|
17
17
|
from sqlalchemy.engine.row import RowMapping
|
|
18
18
|
from sqlalchemy.orm import Session
|
|
19
19
|
|
|
20
|
+
from orchestrator.search.core.embedding import QueryEmbedder
|
|
20
21
|
from orchestrator.search.core.types import FilterOp, SearchMetadata
|
|
21
22
|
from orchestrator.search.filters import FilterTree, LtreeFilter
|
|
22
23
|
from orchestrator.search.schemas.parameters import BaseSearchParameters
|
|
23
24
|
from orchestrator.search.schemas.results import MatchingField, SearchResponse, SearchResult
|
|
24
25
|
|
|
25
26
|
from .builder import build_candidate_query
|
|
26
|
-
from .pagination import
|
|
27
|
+
from .pagination import PageCursor
|
|
28
|
+
from .query_state import SearchQueryState
|
|
27
29
|
from .retrievers import Retriever
|
|
28
30
|
from .utils import generate_highlight_indices
|
|
29
31
|
|
|
@@ -74,9 +76,15 @@ def _format_response(
|
|
|
74
76
|
# Structured search (filter-only)
|
|
75
77
|
matching_field = _extract_matching_field_from_filters(search_params.filters)
|
|
76
78
|
|
|
79
|
+
entity_title = row.get("entity_title", "")
|
|
80
|
+
if not isinstance(entity_title, str):
|
|
81
|
+
entity_title = str(entity_title) if entity_title is not None else ""
|
|
82
|
+
|
|
77
83
|
results.append(
|
|
78
84
|
SearchResult(
|
|
79
85
|
entity_id=str(row.entity_id),
|
|
86
|
+
entity_type=search_params.entity_type,
|
|
87
|
+
entity_title=entity_title,
|
|
80
88
|
score=row.score,
|
|
81
89
|
perfect_match=row.get("perfect_match", 0),
|
|
82
90
|
matching_field=matching_field,
|
|
@@ -110,45 +118,80 @@ def _extract_matching_field_from_filters(filters: FilterTree) -> MatchingField |
|
|
|
110
118
|
return MatchingField(text=text, path=pf.path, highlight_indices=[(0, len(text))])
|
|
111
119
|
|
|
112
120
|
|
|
113
|
-
async def
|
|
121
|
+
async def _execute_search_internal(
|
|
114
122
|
search_params: BaseSearchParameters,
|
|
115
123
|
db_session: Session,
|
|
116
|
-
|
|
124
|
+
limit: int,
|
|
125
|
+
cursor: PageCursor | None = None,
|
|
126
|
+
query_embedding: list[float] | None = None,
|
|
117
127
|
) -> SearchResponse:
|
|
118
|
-
"""
|
|
119
|
-
|
|
120
|
-
Builds a candidate entity query based on the given search parameters,
|
|
121
|
-
applies the appropriate ranking strategy, and executes the final ranked
|
|
122
|
-
query to retrieve results.
|
|
128
|
+
"""Internal function to execute search with specified parameters.
|
|
123
129
|
|
|
124
130
|
Args:
|
|
125
|
-
search_params
|
|
126
|
-
db_session
|
|
127
|
-
|
|
128
|
-
|
|
131
|
+
search_params: The search parameters specifying vector, fuzzy, or filter criteria.
|
|
132
|
+
db_session: The active SQLAlchemy session for executing the query.
|
|
133
|
+
limit: Maximum number of results to return.
|
|
134
|
+
cursor: Optional pagination cursor.
|
|
135
|
+
query_embedding: Optional pre-computed query embedding to use instead of generating a new one.
|
|
129
136
|
|
|
130
137
|
Returns:
|
|
131
|
-
SearchResponse
|
|
132
|
-
and optional highlight metadata.
|
|
133
|
-
|
|
134
|
-
Notes:
|
|
135
|
-
If no vector query, filters, or fuzzy term are provided, a warning is logged
|
|
136
|
-
and an empty result set is returned.
|
|
138
|
+
SearchResponse with results and embedding (for internal use).
|
|
137
139
|
"""
|
|
138
|
-
|
|
139
140
|
if not search_params.vector_query and not search_params.filters and not search_params.fuzzy_term:
|
|
140
141
|
logger.warning("No search criteria provided (vector_query, fuzzy_term, or filters).")
|
|
141
142
|
return SearchResponse(results=[], metadata=SearchMetadata.empty())
|
|
142
143
|
|
|
143
144
|
candidate_query = build_candidate_query(search_params)
|
|
144
145
|
|
|
145
|
-
|
|
146
|
-
|
|
146
|
+
if search_params.vector_query and not query_embedding:
|
|
147
|
+
|
|
148
|
+
query_embedding = await QueryEmbedder.generate_for_text_async(search_params.vector_query)
|
|
149
|
+
|
|
150
|
+
retriever = await Retriever.route(search_params, cursor, query_embedding)
|
|
147
151
|
logger.debug("Using retriever", retriever_type=retriever.__class__.__name__)
|
|
148
152
|
|
|
149
153
|
final_stmt = retriever.apply(candidate_query)
|
|
150
|
-
final_stmt = final_stmt.limit(
|
|
154
|
+
final_stmt = final_stmt.limit(limit)
|
|
151
155
|
logger.debug(final_stmt)
|
|
152
156
|
result = db_session.execute(final_stmt).mappings().all()
|
|
153
157
|
|
|
154
|
-
|
|
158
|
+
response = _format_response(result, search_params, retriever.metadata)
|
|
159
|
+
# Store embedding in response for agent to save to DB
|
|
160
|
+
response.query_embedding = query_embedding
|
|
161
|
+
return response
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
async def execute_search(
|
|
165
|
+
search_params: BaseSearchParameters,
|
|
166
|
+
db_session: Session,
|
|
167
|
+
cursor: PageCursor | None = None,
|
|
168
|
+
query_embedding: list[float] | None = None,
|
|
169
|
+
) -> SearchResponse:
|
|
170
|
+
"""Execute a search and return ranked results."""
|
|
171
|
+
return await _execute_search_internal(search_params, db_session, search_params.limit, cursor, query_embedding)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
async def execute_search_for_export(
|
|
175
|
+
query_state: SearchQueryState,
|
|
176
|
+
db_session: Session,
|
|
177
|
+
) -> list[dict]:
|
|
178
|
+
"""Execute a search for export and fetch flattened entity data.
|
|
179
|
+
|
|
180
|
+
Args:
|
|
181
|
+
query_state: Query state containing parameters and query_embedding.
|
|
182
|
+
db_session: The active SQLAlchemy session for executing the query.
|
|
183
|
+
|
|
184
|
+
Returns:
|
|
185
|
+
List of flattened entity records suitable for export.
|
|
186
|
+
"""
|
|
187
|
+
from orchestrator.search.export import fetch_export_data
|
|
188
|
+
|
|
189
|
+
search_response = await _execute_search_internal(
|
|
190
|
+
search_params=query_state.parameters,
|
|
191
|
+
db_session=db_session,
|
|
192
|
+
limit=query_state.parameters.export_limit,
|
|
193
|
+
query_embedding=query_state.query_embedding,
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
entity_ids = [res.entity_id for res in search_response.results]
|
|
197
|
+
return fetch_export_data(query_state.parameters.entity_type, entity_ids)
|
|
@@ -11,42 +11,21 @@
|
|
|
11
11
|
# See the License for the specific language governing permissions and
|
|
12
12
|
# limitations under the License.
|
|
13
13
|
|
|
14
|
-
import array
|
|
15
14
|
import base64
|
|
16
|
-
from
|
|
15
|
+
from uuid import UUID
|
|
17
16
|
|
|
18
17
|
from pydantic import BaseModel
|
|
19
18
|
|
|
19
|
+
from orchestrator.db import SearchQueryTable, db
|
|
20
20
|
from orchestrator.search.core.exceptions import InvalidCursorError
|
|
21
|
-
from orchestrator.search.schemas.parameters import
|
|
22
|
-
from orchestrator.search.schemas.results import
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
@dataclass
|
|
26
|
-
class PaginationParams:
|
|
27
|
-
"""Parameters for pagination in search queries."""
|
|
28
|
-
|
|
29
|
-
page_after_score: float | None = None
|
|
30
|
-
page_after_id: str | None = None
|
|
31
|
-
q_vec_override: list[float] | None = None
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def floats_to_b64(v: list[float]) -> str:
|
|
35
|
-
a = array.array("f", v)
|
|
36
|
-
return base64.urlsafe_b64encode(a.tobytes()).decode("ascii")
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def b64_to_floats(s: str) -> list[float]:
|
|
40
|
-
raw = base64.urlsafe_b64decode(s.encode("ascii"))
|
|
41
|
-
a = array.array("f")
|
|
42
|
-
a.frombytes(raw)
|
|
43
|
-
return list(a)
|
|
21
|
+
from orchestrator.search.schemas.parameters import SearchParameters
|
|
22
|
+
from orchestrator.search.schemas.results import SearchResponse
|
|
44
23
|
|
|
45
24
|
|
|
46
25
|
class PageCursor(BaseModel):
|
|
47
26
|
score: float
|
|
48
27
|
id: str
|
|
49
|
-
|
|
28
|
+
query_id: UUID
|
|
50
29
|
|
|
51
30
|
def encode(self) -> str:
|
|
52
31
|
"""Encode the cursor data into a URL-safe Base64 string."""
|
|
@@ -63,34 +42,45 @@ class PageCursor(BaseModel):
|
|
|
63
42
|
raise InvalidCursorError("Invalid pagination cursor") from e
|
|
64
43
|
|
|
65
44
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
return PaginationParams(
|
|
71
|
-
page_after_score=c.score,
|
|
72
|
-
page_after_id=c.id,
|
|
73
|
-
q_vec_override=b64_to_floats(c.q_vec_b64),
|
|
74
|
-
)
|
|
75
|
-
if search_params.vector_query:
|
|
76
|
-
from orchestrator.search.core.embedding import QueryEmbedder
|
|
77
|
-
|
|
78
|
-
q_vec_override = await QueryEmbedder.generate_for_text_async(search_params.vector_query)
|
|
79
|
-
return PaginationParams(q_vec_override=q_vec_override)
|
|
80
|
-
return PaginationParams()
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
def create_next_page_cursor(
|
|
84
|
-
search_results: list[SearchResult], pagination_params: PaginationParams, limit: int
|
|
45
|
+
def encode_next_page_cursor(
|
|
46
|
+
search_response: SearchResponse,
|
|
47
|
+
cursor: PageCursor | None,
|
|
48
|
+
search_params: SearchParameters,
|
|
85
49
|
) -> str | None:
|
|
86
|
-
"""Create next page cursor if there are more results.
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
50
|
+
"""Create next page cursor if there are more results.
|
|
51
|
+
|
|
52
|
+
On first page, saves the query to database and includes query_id in cursor
|
|
53
|
+
for subsequent pages to ensure consistent parameters across pagination.
|
|
54
|
+
|
|
55
|
+
Args:
|
|
56
|
+
search_response: SearchResponse containing results and query_embedding
|
|
57
|
+
cursor: Current page cursor (None for first page, PageCursor for subsequent pages)
|
|
58
|
+
search_params: Search parameters to save for pagination consistency
|
|
59
|
+
|
|
60
|
+
Returns:
|
|
61
|
+
Encoded cursor for next page, or None if no more results
|
|
62
|
+
"""
|
|
63
|
+
from orchestrator.search.retrieval.query_state import SearchQueryState
|
|
64
|
+
|
|
65
|
+
has_next_page = len(search_response.results) == search_params.limit and search_params.limit > 0
|
|
66
|
+
if not has_next_page:
|
|
67
|
+
return None
|
|
68
|
+
|
|
69
|
+
# If this is the first page, save query state to database
|
|
70
|
+
if cursor is None:
|
|
71
|
+
query_state = SearchQueryState(parameters=search_params, query_embedding=search_response.query_embedding)
|
|
72
|
+
search_query = SearchQueryTable.from_state(state=query_state)
|
|
73
|
+
|
|
74
|
+
db.session.add(search_query)
|
|
75
|
+
db.session.commit()
|
|
76
|
+
query_id = search_query.query_id
|
|
77
|
+
else:
|
|
78
|
+
query_id = cursor.query_id
|
|
79
|
+
|
|
80
|
+
last_item = search_response.results[-1]
|
|
81
|
+
cursor_data = PageCursor(
|
|
82
|
+
score=float(last_item.score),
|
|
83
|
+
id=last_item.entity_id,
|
|
84
|
+
query_id=query_id,
|
|
85
|
+
)
|
|
86
|
+
return cursor_data.encode()
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# Copyright 2019-2025 SURF, GÉANT.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
|
|
14
|
+
from uuid import UUID
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
17
|
+
|
|
18
|
+
from orchestrator.db import SearchQueryTable, db
|
|
19
|
+
from orchestrator.search.core.exceptions import QueryStateNotFoundError
|
|
20
|
+
from orchestrator.search.schemas.parameters import SearchParameters
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SearchQueryState(BaseModel):
|
|
24
|
+
"""State of a search query including parameters and embedding.
|
|
25
|
+
|
|
26
|
+
This model provides a complete snapshot of what was searched and how.
|
|
27
|
+
Used for both agent and regular API searches.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
parameters: SearchParameters = Field(discriminator="entity_type")
|
|
31
|
+
query_embedding: list[float] | None = Field(default=None, description="The embedding vector for semantic search")
|
|
32
|
+
|
|
33
|
+
model_config = ConfigDict(from_attributes=True)
|
|
34
|
+
|
|
35
|
+
@classmethod
|
|
36
|
+
def load_from_id(cls, query_id: UUID | str) -> "SearchQueryState":
|
|
37
|
+
"""Load query state from database by query_id.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
query_id: UUID or string UUID of the saved query
|
|
41
|
+
|
|
42
|
+
Returns:
|
|
43
|
+
SearchQueryState loaded from database
|
|
44
|
+
|
|
45
|
+
Raises:
|
|
46
|
+
ValueError: If query_id format is invalid
|
|
47
|
+
QueryStateNotFoundError: If query not found in database
|
|
48
|
+
"""
|
|
49
|
+
if isinstance(query_id, UUID):
|
|
50
|
+
query_uuid = query_id
|
|
51
|
+
else:
|
|
52
|
+
try:
|
|
53
|
+
query_uuid = UUID(query_id)
|
|
54
|
+
except (ValueError, TypeError) as e:
|
|
55
|
+
raise ValueError(f"Invalid query_id format: {query_id}") from e
|
|
56
|
+
|
|
57
|
+
search_query = db.session.query(SearchQueryTable).filter_by(query_id=query_uuid).first()
|
|
58
|
+
if not search_query:
|
|
59
|
+
raise QueryStateNotFoundError(f"Query {query_uuid} not found in database")
|
|
60
|
+
|
|
61
|
+
return cls.model_validate(search_query)
|
|
@@ -20,7 +20,7 @@ from sqlalchemy import BindParameter, Numeric, Select, literal
|
|
|
20
20
|
from orchestrator.search.core.types import FieldType, SearchMetadata
|
|
21
21
|
from orchestrator.search.schemas.parameters import BaseSearchParameters
|
|
22
22
|
|
|
23
|
-
from ..pagination import
|
|
23
|
+
from ..pagination import PageCursor
|
|
24
24
|
|
|
25
25
|
logger = structlog.get_logger(__name__)
|
|
26
26
|
|
|
@@ -41,62 +41,48 @@ class Retriever(ABC):
|
|
|
41
41
|
]
|
|
42
42
|
|
|
43
43
|
@classmethod
|
|
44
|
-
async def
|
|
44
|
+
async def route(
|
|
45
45
|
cls,
|
|
46
46
|
params: BaseSearchParameters,
|
|
47
|
-
|
|
47
|
+
cursor: PageCursor | None,
|
|
48
|
+
query_embedding: list[float] | None = None,
|
|
48
49
|
) -> "Retriever":
|
|
49
|
-
"""
|
|
50
|
+
"""Route to the appropriate retriever instance based on search parameters.
|
|
51
|
+
|
|
52
|
+
Selects the retriever type based on available search criteria:
|
|
53
|
+
- Hybrid: both embedding and fuzzy term available
|
|
54
|
+
- Semantic: only embedding available
|
|
55
|
+
- Fuzzy: only text term available (or fallback when embedding generation fails)
|
|
56
|
+
- Structured: only filters available
|
|
50
57
|
|
|
51
58
|
Args:
|
|
52
|
-
params
|
|
53
|
-
|
|
59
|
+
params: Search parameters including vector queries, fuzzy terms, and filters
|
|
60
|
+
cursor: Pagination cursor for cursor-based paging
|
|
61
|
+
query_embedding: Query embedding for semantic search, or None if not available
|
|
54
62
|
|
|
55
63
|
Returns:
|
|
56
|
-
|
|
64
|
+
A concrete retriever instance based on available search criteria
|
|
57
65
|
"""
|
|
58
|
-
|
|
59
66
|
from .fuzzy import FuzzyRetriever
|
|
60
67
|
from .hybrid import RrfHybridRetriever
|
|
61
68
|
from .semantic import SemanticRetriever
|
|
62
69
|
from .structured import StructuredRetriever
|
|
63
70
|
|
|
64
71
|
fuzzy_term = params.fuzzy_term
|
|
65
|
-
q_vec = await cls._get_query_vector(params.vector_query, pagination_params.q_vec_override)
|
|
66
|
-
|
|
67
|
-
# If semantic search was attempted but failed, fall back to fuzzy with the full query
|
|
68
|
-
fallback_fuzzy_term = fuzzy_term
|
|
69
|
-
if q_vec is None and params.vector_query is not None and params.query is not None:
|
|
70
|
-
fallback_fuzzy_term = params.query
|
|
71
|
-
|
|
72
|
-
if q_vec is not None and fallback_fuzzy_term is not None:
|
|
73
|
-
return RrfHybridRetriever(q_vec, fallback_fuzzy_term, pagination_params)
|
|
74
|
-
if q_vec is not None:
|
|
75
|
-
return SemanticRetriever(q_vec, pagination_params)
|
|
76
|
-
if fallback_fuzzy_term is not None:
|
|
77
|
-
return FuzzyRetriever(fallback_fuzzy_term, pagination_params)
|
|
78
|
-
|
|
79
|
-
return StructuredRetriever(pagination_params)
|
|
80
|
-
|
|
81
|
-
@classmethod
|
|
82
|
-
async def _get_query_vector(
|
|
83
|
-
cls, vector_query: str | None, q_vec_override: list[float] | None
|
|
84
|
-
) -> list[float] | None:
|
|
85
|
-
"""Get query vector either from override or by generating from text."""
|
|
86
|
-
if q_vec_override:
|
|
87
|
-
return q_vec_override
|
|
88
|
-
|
|
89
|
-
if not vector_query:
|
|
90
|
-
return None
|
|
91
72
|
|
|
92
|
-
|
|
73
|
+
# If vector_query exists but embedding generation failed, fall back to fuzzy search with full query
|
|
74
|
+
if query_embedding is None and params.vector_query is not None and params.query is not None:
|
|
75
|
+
fuzzy_term = params.query
|
|
93
76
|
|
|
94
|
-
|
|
95
|
-
if not
|
|
96
|
-
|
|
97
|
-
|
|
77
|
+
# Select retriever based on available search criteria
|
|
78
|
+
if query_embedding is not None and fuzzy_term is not None:
|
|
79
|
+
return RrfHybridRetriever(query_embedding, fuzzy_term, cursor)
|
|
80
|
+
if query_embedding is not None:
|
|
81
|
+
return SemanticRetriever(query_embedding, cursor)
|
|
82
|
+
if fuzzy_term is not None:
|
|
83
|
+
return FuzzyRetriever(fuzzy_term, cursor)
|
|
98
84
|
|
|
99
|
-
return
|
|
85
|
+
return StructuredRetriever(cursor)
|
|
100
86
|
|
|
101
87
|
@abstractmethod
|
|
102
88
|
def apply(self, candidate_query: Select) -> Select:
|
|
@@ -17,17 +17,16 @@ from sqlalchemy.sql.expression import ColumnElement
|
|
|
17
17
|
from orchestrator.db.models import AiSearchIndex
|
|
18
18
|
from orchestrator.search.core.types import SearchMetadata
|
|
19
19
|
|
|
20
|
-
from ..pagination import
|
|
20
|
+
from ..pagination import PageCursor
|
|
21
21
|
from .base import Retriever
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class FuzzyRetriever(Retriever):
|
|
25
25
|
"""Ranks results based on the max of fuzzy text similarity scores."""
|
|
26
26
|
|
|
27
|
-
def __init__(self, fuzzy_term: str,
|
|
27
|
+
def __init__(self, fuzzy_term: str, cursor: PageCursor | None) -> None:
|
|
28
28
|
self.fuzzy_term = fuzzy_term
|
|
29
|
-
self.
|
|
30
|
-
self.page_after_id = pagination_params.page_after_id
|
|
29
|
+
self.cursor = cursor
|
|
31
30
|
|
|
32
31
|
def apply(self, candidate_query: Select) -> Select:
|
|
33
32
|
cand = candidate_query.subquery()
|
|
@@ -42,6 +41,7 @@ class FuzzyRetriever(Retriever):
|
|
|
42
41
|
combined_query = (
|
|
43
42
|
select(
|
|
44
43
|
AiSearchIndex.entity_id,
|
|
44
|
+
AiSearchIndex.entity_title,
|
|
45
45
|
score,
|
|
46
46
|
func.first_value(AiSearchIndex.value)
|
|
47
47
|
.over(partition_by=AiSearchIndex.entity_id, order_by=[similarity_expr.desc(), AiSearchIndex.path.asc()])
|
|
@@ -58,12 +58,13 @@ class FuzzyRetriever(Retriever):
|
|
|
58
58
|
literal(self.fuzzy_term).op("<%")(AiSearchIndex.value),
|
|
59
59
|
)
|
|
60
60
|
)
|
|
61
|
-
.distinct(AiSearchIndex.entity_id)
|
|
61
|
+
.distinct(AiSearchIndex.entity_id, AiSearchIndex.entity_title)
|
|
62
62
|
)
|
|
63
63
|
final_query = combined_query.subquery("ranked_fuzzy")
|
|
64
64
|
|
|
65
65
|
stmt = select(
|
|
66
66
|
final_query.c.entity_id,
|
|
67
|
+
final_query.c.entity_title,
|
|
67
68
|
final_query.c.score,
|
|
68
69
|
final_query.c.highlight_text,
|
|
69
70
|
final_query.c.highlight_path,
|
|
@@ -81,13 +82,13 @@ class FuzzyRetriever(Retriever):
|
|
|
81
82
|
self, stmt: Select, score_column: ColumnElement, entity_id_column: ColumnElement
|
|
82
83
|
) -> Select:
|
|
83
84
|
"""Apply standard score + entity_id pagination."""
|
|
84
|
-
if self.
|
|
85
|
+
if self.cursor is not None:
|
|
85
86
|
stmt = stmt.where(
|
|
86
87
|
or_(
|
|
87
|
-
score_column < self.
|
|
88
|
+
score_column < self.cursor.score,
|
|
88
89
|
and_(
|
|
89
|
-
score_column == self.
|
|
90
|
-
entity_id_column > self.
|
|
90
|
+
score_column == self.cursor.score,
|
|
91
|
+
entity_id_column > self.cursor.id,
|
|
91
92
|
),
|
|
92
93
|
)
|
|
93
94
|
)
|