orchestrator-core 4.5.2__py3-none-any.whl → 4.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. orchestrator/__init__.py +2 -2
  2. orchestrator/agentic_app.py +3 -23
  3. orchestrator/api/api_v1/api.py +5 -0
  4. orchestrator/api/api_v1/endpoints/agent.py +49 -0
  5. orchestrator/api/api_v1/endpoints/search.py +120 -201
  6. orchestrator/app.py +1 -1
  7. orchestrator/cli/database.py +3 -0
  8. orchestrator/cli/generate.py +11 -4
  9. orchestrator/cli/generator/generator/migration.py +7 -3
  10. orchestrator/cli/main.py +1 -1
  11. orchestrator/cli/scheduler.py +15 -22
  12. orchestrator/cli/search/resize_embedding.py +28 -22
  13. orchestrator/cli/search/speedtest.py +4 -6
  14. orchestrator/db/__init__.py +6 -0
  15. orchestrator/db/models.py +75 -0
  16. orchestrator/llm_settings.py +18 -1
  17. orchestrator/migrations/helpers.py +47 -39
  18. orchestrator/schedules/scheduler.py +32 -15
  19. orchestrator/schedules/validate_products.py +1 -1
  20. orchestrator/schemas/search.py +8 -85
  21. orchestrator/search/agent/__init__.py +2 -2
  22. orchestrator/search/agent/agent.py +26 -30
  23. orchestrator/search/agent/json_patch.py +51 -0
  24. orchestrator/search/agent/prompts.py +35 -9
  25. orchestrator/search/agent/state.py +28 -2
  26. orchestrator/search/agent/tools.py +192 -53
  27. orchestrator/search/core/embedding.py +2 -2
  28. orchestrator/search/core/exceptions.py +6 -0
  29. orchestrator/search/core/types.py +1 -0
  30. orchestrator/search/export.py +199 -0
  31. orchestrator/search/indexing/indexer.py +13 -4
  32. orchestrator/search/indexing/registry.py +14 -1
  33. orchestrator/search/llm_migration.py +55 -0
  34. orchestrator/search/retrieval/__init__.py +3 -2
  35. orchestrator/search/retrieval/builder.py +5 -1
  36. orchestrator/search/retrieval/engine.py +66 -23
  37. orchestrator/search/retrieval/pagination.py +46 -56
  38. orchestrator/search/retrieval/query_state.py +61 -0
  39. orchestrator/search/retrieval/retrievers/base.py +26 -40
  40. orchestrator/search/retrieval/retrievers/fuzzy.py +10 -9
  41. orchestrator/search/retrieval/retrievers/hybrid.py +11 -8
  42. orchestrator/search/retrieval/retrievers/semantic.py +9 -8
  43. orchestrator/search/retrieval/retrievers/structured.py +6 -6
  44. orchestrator/search/schemas/parameters.py +17 -13
  45. orchestrator/search/schemas/results.py +4 -1
  46. orchestrator/settings.py +1 -0
  47. orchestrator/utils/auth.py +3 -2
  48. orchestrator/workflow.py +23 -6
  49. orchestrator/workflows/tasks/validate_product_type.py +3 -3
  50. {orchestrator_core-4.5.2.dist-info → orchestrator_core-4.6.0.dist-info}/METADATA +17 -12
  51. {orchestrator_core-4.5.2.dist-info → orchestrator_core-4.6.0.dist-info}/RECORD +53 -49
  52. {orchestrator_core-4.5.2.dist-info → orchestrator_core-4.6.0.dist-info}/WHEEL +0 -0
  53. {orchestrator_core-4.5.2.dist-info → orchestrator_core-4.6.0.dist-info}/licenses/LICENSE +0 -0
orchestrator/search/indexing/registry.py
@@ -25,7 +25,7 @@ from orchestrator.db import (
     WorkflowTable,
 )
 from orchestrator.db.database import BaseModel
-from orchestrator.search.core.types import EntityType
+from orchestrator.search.core.types import EntityType, ExtractedField
 
 from .traverse import (
     BaseTraverser,
@@ -48,6 +48,7 @@ class EntityConfig(Generic[ModelT]):
     traverser: "type[BaseTraverser]"
     pk_name: str
     root_name: str
+    title_paths: list[str]  # List of field paths to check for title (with fallback)
 
     def get_all_query(self, entity_id: str | None = None) -> Query | Select:
         query = self.table.query
@@ -56,6 +57,14 @@
             query = query.filter(pk_column == UUID(entity_id))
         return query
 
+    def get_title_from_fields(self, fields: list[ExtractedField]) -> str:
+        """Extract title from fields using configured paths."""
+        for title_path in self.title_paths:
+            for field in fields:
+                if field.path == title_path and field.value:
+                    return str(field.value)
+        return "UNKNOWN"
+
 
 @dataclass(frozen=True)
 class WorkflowConfig(EntityConfig[WorkflowTable]):
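
To illustrate the fallback behaviour, a minimal sketch (the Field namedtuple is a stand-in for ExtractedField, of which only the path and value attributes are read here; the paths mirror the PRODUCT entry registered below):

from collections import namedtuple

Field = namedtuple("Field", ["path", "value"])  # stand-in for ExtractedField

title_paths = ["product.description", "product.name"]  # as configured for EntityType.PRODUCT


def get_title_from_fields(fields):
    # Same logic as EntityConfig.get_title_from_fields: the first configured path with a truthy value wins.
    for title_path in title_paths:
        for field in fields:
            if field.path == title_path and field.value:
                return str(field.value)
    return "UNKNOWN"


print(get_title_from_fields([Field("product.name", "l2vpn"), Field("product.description", "")]))  # -> l2vpn
print(get_title_from_fields([]))  # -> UNKNOWN
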
@@ -76,6 +85,7 @@ ENTITY_CONFIG_REGISTRY: dict[EntityType, EntityConfig] = {
         traverser=SubscriptionTraverser,
         pk_name="subscription_id",
         root_name="subscription",
+        title_paths=["subscription.description"],
     ),
     EntityType.PRODUCT: EntityConfig(
         entity_kind=EntityType.PRODUCT,
@@ -83,6 +93,7 @@ ENTITY_CONFIG_REGISTRY: dict[EntityType, EntityConfig] = {
         traverser=ProductTraverser,
         pk_name="product_id",
         root_name="product",
+        title_paths=["product.description", "product.name"],
     ),
     EntityType.PROCESS: EntityConfig(
         entity_kind=EntityType.PROCESS,
@@ -90,6 +101,7 @@ ENTITY_CONFIG_REGISTRY: dict[EntityType, EntityConfig] = {
         traverser=ProcessTraverser,
         pk_name="process_id",
         root_name="process",
+        title_paths=["process.workflow_name"],
     ),
     EntityType.WORKFLOW: WorkflowConfig(
         entity_kind=EntityType.WORKFLOW,
@@ -97,5 +109,6 @@ ENTITY_CONFIG_REGISTRY: dict[EntityType, EntityConfig] = {
         traverser=WorkflowTraverser,
         pk_name="workflow_id",
         root_name="workflow",
+        title_paths=["workflow.description", "workflow.name"],
     ),
 }
orchestrator/search/llm_migration.py
@@ -37,6 +37,7 @@ def run_migration(connection: Connection) -> None:
     if llm_settings.LLM_FORCE_EXTENTION_MIGRATION or res.rowcount == 0:
         # Create PostgreSQL extensions
         logger.info("Attempting to run the extention creation;")
+        connection.execute(text('CREATE EXTENSION IF NOT EXISTS "uuid-ossp";'))
         connection.execute(text("CREATE EXTENSION IF NOT EXISTS ltree;"))
         connection.execute(text("CREATE EXTENSION IF NOT EXISTS unaccent;"))
         connection.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm;"))
@@ -64,6 +65,7 @@ def run_migration(connection: Connection) -> None:
             CREATE TABLE IF NOT EXISTS {TABLE} (
                 entity_type TEXT NOT NULL,
                 entity_id UUID NOT NULL,
+                entity_title TEXT,
                 path LTREE NOT NULL,
                 value TEXT NOT NULL,
                 embedding VECTOR({TARGET_DIM}),
@@ -78,6 +80,23 @@ def run_migration(connection: Connection) -> None:
     # Drop default
     connection.execute(text(f"ALTER TABLE {TABLE} ALTER COLUMN value_type DROP DEFAULT;"))
 
+    # Add entity_title column if it doesn't exist (for existing installations)
+    connection.execute(
+        text(
+            f"""
+            DO $$
+            BEGIN
+                IF NOT EXISTS (
+                    SELECT 1 FROM information_schema.columns
+                    WHERE table_name = '{TABLE}' AND column_name = 'entity_title'
+                ) THEN
+                    ALTER TABLE {TABLE} ADD COLUMN entity_title TEXT;
+                END IF;
+            END $$;
+            """
+        )
+    )
+
     # Create indexes with IF NOT EXISTS
     connection.execute(text(f"CREATE INDEX IF NOT EXISTS ix_ai_search_index_entity_id ON {TABLE} (entity_id);"))
     connection.execute(
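
The DO $$ block keeps the migration idempotent for installations that created the table before entity_title existed. The same guard expressed in plain Python, as a sketch (the DSN is hypothetical and TABLE is assumed to resolve to ai_search_index, as the index names suggest):

from sqlalchemy import create_engine, text

engine = create_engine("postgresql+psycopg2://localhost/orchestrator-core")  # hypothetical DSN

with engine.connect() as conn:
    exists = conn.execute(
        text(
            "SELECT 1 FROM information_schema.columns "
            "WHERE table_name = :table AND column_name = 'entity_title'"
        ),
        {"table": "ai_search_index"},
    ).first()
    if not exists:
        # Only add the column when it is missing, mirroring the IF NOT EXISTS guard above.
        conn.execute(text("ALTER TABLE ai_search_index ADD COLUMN entity_title TEXT;"))
        conn.commit()
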
@@ -96,6 +115,42 @@ def run_migration(connection: Connection) -> None:
         )
     )
 
+    # Create agent_runs table
+    connection.execute(
+        text(
+            """
+            CREATE TABLE IF NOT EXISTS agent_runs (
+                run_id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+                agent_type VARCHAR(50) NOT NULL,
+                created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP NOT NULL
+            );
+            """
+        )
+    )
+    connection.execute(text("CREATE INDEX IF NOT EXISTS ix_agent_runs_created_at ON agent_runs (created_at);"))
+
+    # Create search_queries table
+    connection.execute(
+        text(
+            f"""
+            CREATE TABLE IF NOT EXISTS search_queries (
+                query_id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+                run_id UUID,
+                query_number INTEGER NOT NULL,
+                parameters JSONB NOT NULL,
+                query_embedding VECTOR({TARGET_DIM}),
+                executed_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP NOT NULL,
+                CONSTRAINT fk_search_queries_run_id FOREIGN KEY (run_id) REFERENCES agent_runs(run_id) ON DELETE CASCADE
+            );
+            """
+        )
+    )
+    connection.execute(text("CREATE INDEX IF NOT EXISTS ix_search_queries_run_id ON search_queries (run_id);"))
+    connection.execute(
+        text("CREATE INDEX IF NOT EXISTS ix_search_queries_executed_at ON search_queries (executed_at);")
+    )
+    connection.execute(text("CREATE INDEX IF NOT EXISTS ix_search_queries_query_id ON search_queries (query_id);"))
+
     connection.commit()
     logger.info("LLM migration completed successfully")
 
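A small sketch of how the two new tables relate (raw SQL through SQLAlchemy; conn stands for an open SQLAlchemy connection and the inserted values are purely illustrative):

import uuid

from sqlalchemy import text

run_id = str(uuid.uuid4())

conn.execute(
    text("INSERT INTO agent_runs (run_id, agent_type) VALUES (CAST(:run_id AS uuid), 'search')"),
    {"run_id": run_id},
)
# Queries are numbered within a run; deleting the run cascades to its search_queries rows.
conn.execute(
    text(
        "INSERT INTO search_queries (run_id, query_number, parameters) "
        "VALUES (CAST(:run_id AS uuid), 1, CAST(:params AS jsonb))"
    ),
    {"run_id": run_id, "params": '{"entity_type": "SUBSCRIPTION", "query": "10G links"}'},
)
conn.commit()
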
orchestrator/search/retrieval/__init__.py
@@ -11,6 +11,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .engine import execute_search
+from .engine import execute_search, execute_search_for_export
+from .query_state import SearchQueryState
 
-__all__ = ["execute_search"]
+__all__ = ["execute_search", "execute_search_for_export", "SearchQueryState"]
orchestrator/search/retrieval/builder.py
@@ -43,7 +43,11 @@ def build_candidate_query(params: BaseSearchParameters) -> Select:
         Select: The SQLAlchemy `Select` object representing the query.
     """
 
-    stmt = select(AiSearchIndex.entity_id).where(AiSearchIndex.entity_type == params.entity_type.value).distinct()
+    stmt = (
+        select(AiSearchIndex.entity_id, AiSearchIndex.entity_title)
+        .where(AiSearchIndex.entity_type == params.entity_type.value)
+        .distinct()
+    )
 
     if params.filters is not None:
         entity_id_col = AiSearchIndex.entity_id
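
With the extra column, the candidate statement compiles to roughly the following shape; a sketch for inspection (the table name is assumed to be ai_search_index, as the migration above suggests):

from sqlalchemy import select

from orchestrator.db.models import AiSearchIndex

stmt = (
    select(AiSearchIndex.entity_id, AiSearchIndex.entity_title)
    .where(AiSearchIndex.entity_type == "SUBSCRIPTION")
    .distinct()
)
print(stmt)
# SELECT DISTINCT ai_search_index.entity_id, ai_search_index.entity_title
# FROM ai_search_index
# WHERE ai_search_index.entity_type = :entity_type_1
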
orchestrator/search/retrieval/engine.py
@@ -17,13 +17,15 @@ import structlog
 from sqlalchemy.engine.row import RowMapping
 from sqlalchemy.orm import Session
 
+from orchestrator.search.core.embedding import QueryEmbedder
 from orchestrator.search.core.types import FilterOp, SearchMetadata
 from orchestrator.search.filters import FilterTree, LtreeFilter
 from orchestrator.search.schemas.parameters import BaseSearchParameters
 from orchestrator.search.schemas.results import MatchingField, SearchResponse, SearchResult
 
 from .builder import build_candidate_query
-from .pagination import PaginationParams
+from .pagination import PageCursor
+from .query_state import SearchQueryState
 from .retrievers import Retriever
 from .utils import generate_highlight_indices
 
@@ -74,9 +76,15 @@ def _format_response(
         # Structured search (filter-only)
         matching_field = _extract_matching_field_from_filters(search_params.filters)
 
+        entity_title = row.get("entity_title", "")
+        if not isinstance(entity_title, str):
+            entity_title = str(entity_title) if entity_title is not None else ""
+
         results.append(
             SearchResult(
                 entity_id=str(row.entity_id),
+                entity_type=search_params.entity_type,
+                entity_title=entity_title,
                 score=row.score,
                 perfect_match=row.get("perfect_match", 0),
                 matching_field=matching_field,
@@ -110,45 +118,80 @@ def _extract_matching_field_from_filters(filters: FilterTree) -> MatchingField |
     return MatchingField(text=text, path=pf.path, highlight_indices=[(0, len(text))])
 
 
-async def execute_search(
+async def _execute_search_internal(
     search_params: BaseSearchParameters,
     db_session: Session,
-    pagination_params: PaginationParams | None = None,
+    limit: int,
+    cursor: PageCursor | None = None,
+    query_embedding: list[float] | None = None,
 ) -> SearchResponse:
-    """Execute a hybrid search and return ranked results.
-
-    Builds a candidate entity query based on the given search parameters,
-    applies the appropriate ranking strategy, and executes the final ranked
-    query to retrieve results.
+    """Internal function to execute search with specified parameters.
 
     Args:
-        search_params (BaseSearchParameters): The search parameters specifying vector, fuzzy, or filter criteria.
-        db_session (Session): The active SQLAlchemy session for executing the query.
-        pagination_params (PaginationParams): Parameters controlling pagination of the search results.
-        limit (int, optional): The maximum number of search results to return, by default 5.
+        search_params: The search parameters specifying vector, fuzzy, or filter criteria.
+        db_session: The active SQLAlchemy session for executing the query.
+        limit: Maximum number of results to return.
+        cursor: Optional pagination cursor.
+        query_embedding: Optional pre-computed query embedding to use instead of generating a new one.
 
     Returns:
-        SearchResponse: A list of `SearchResult` objects containing entity IDs, scores,
-        and optional highlight metadata.
-
-    Notes:
-        If no vector query, filters, or fuzzy term are provided, a warning is logged
-        and an empty result set is returned.
+        SearchResponse with results and embedding (for internal use).
     """
-
     if not search_params.vector_query and not search_params.filters and not search_params.fuzzy_term:
         logger.warning("No search criteria provided (vector_query, fuzzy_term, or filters).")
         return SearchResponse(results=[], metadata=SearchMetadata.empty())
 
     candidate_query = build_candidate_query(search_params)
 
-    pagination_params = pagination_params or PaginationParams()
-    retriever = await Retriever.from_params(search_params, pagination_params)
+    if search_params.vector_query and not query_embedding:
+
+        query_embedding = await QueryEmbedder.generate_for_text_async(search_params.vector_query)
+
+    retriever = await Retriever.route(search_params, cursor, query_embedding)
     logger.debug("Using retriever", retriever_type=retriever.__class__.__name__)
 
     final_stmt = retriever.apply(candidate_query)
-    final_stmt = final_stmt.limit(search_params.limit)
+    final_stmt = final_stmt.limit(limit)
     logger.debug(final_stmt)
     result = db_session.execute(final_stmt).mappings().all()
 
-    return _format_response(result, search_params, retriever.metadata)
+    response = _format_response(result, search_params, retriever.metadata)
+    # Store embedding in response for agent to save to DB
+    response.query_embedding = query_embedding
+    return response
+
+
+async def execute_search(
+    search_params: BaseSearchParameters,
+    db_session: Session,
+    cursor: PageCursor | None = None,
+    query_embedding: list[float] | None = None,
+) -> SearchResponse:
+    """Execute a search and return ranked results."""
+    return await _execute_search_internal(search_params, db_session, search_params.limit, cursor, query_embedding)
+
+
+async def execute_search_for_export(
+    query_state: SearchQueryState,
+    db_session: Session,
+) -> list[dict]:
+    """Execute a search for export and fetch flattened entity data.
+
+    Args:
+        query_state: Query state containing parameters and query_embedding.
+        db_session: The active SQLAlchemy session for executing the query.
+
+    Returns:
+        List of flattened entity records suitable for export.
+    """
+    from orchestrator.search.export import fetch_export_data
+
+    search_response = await _execute_search_internal(
+        search_params=query_state.parameters,
+        db_session=db_session,
+        limit=query_state.parameters.export_limit,
+        query_embedding=query_state.query_embedding,
+    )
+
+    entity_ids = [res.entity_id for res in search_response.results]
+    return fetch_export_data(query_state.parameters.entity_type, entity_ids)
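
For orientation, a usage sketch of the two entry points exported above (illustrative only: search_params and query_id stand for values the caller already has, and an initialised orchestrator database session is assumed):

from orchestrator.db import db
from orchestrator.search.retrieval import SearchQueryState, execute_search, execute_search_for_export


async def demo(search_params, query_id):
    # Interactive search: the embedding is generated once and returned on the response,
    # so callers (for example the agent) can persist it instead of re-embedding per page.
    response = await execute_search(search_params, db.session)
    for result in response.results:
        print(result.entity_id, result.entity_title, result.score)

    # Export: replay a previously saved query (same parameters and embedding) and
    # fetch flattened records, capped by parameters.export_limit.
    state = SearchQueryState.load_from_id(query_id)
    return await execute_search_for_export(state, db.session)
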
orchestrator/search/retrieval/pagination.py
@@ -11,42 +11,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import array
 import base64
-from dataclasses import dataclass
+from uuid import UUID
 
 from pydantic import BaseModel
 
+from orchestrator.db import SearchQueryTable, db
 from orchestrator.search.core.exceptions import InvalidCursorError
-from orchestrator.search.schemas.parameters import BaseSearchParameters
-from orchestrator.search.schemas.results import SearchResult
-
-
-@dataclass
-class PaginationParams:
-    """Parameters for pagination in search queries."""
-
-    page_after_score: float | None = None
-    page_after_id: str | None = None
-    q_vec_override: list[float] | None = None
-
-
-def floats_to_b64(v: list[float]) -> str:
-    a = array.array("f", v)
-    return base64.urlsafe_b64encode(a.tobytes()).decode("ascii")
-
-
-def b64_to_floats(s: str) -> list[float]:
-    raw = base64.urlsafe_b64decode(s.encode("ascii"))
-    a = array.array("f")
-    a.frombytes(raw)
-    return list(a)
+from orchestrator.search.schemas.parameters import SearchParameters
+from orchestrator.search.schemas.results import SearchResponse
 
 
 class PageCursor(BaseModel):
     score: float
     id: str
-    q_vec_b64: str
+    query_id: UUID
 
     def encode(self) -> str:
         """Encode the cursor data into a URL-safe Base64 string."""
@@ -63,34 +42,45 @@
             raise InvalidCursorError("Invalid pagination cursor") from e
 
 
-async def process_pagination_cursor(cursor: str | None, search_params: BaseSearchParameters) -> PaginationParams:
-    """Process pagination cursor and return pagination parameters."""
-    if cursor:
-        c = PageCursor.decode(cursor)
-        return PaginationParams(
-            page_after_score=c.score,
-            page_after_id=c.id,
-            q_vec_override=b64_to_floats(c.q_vec_b64),
-        )
-    if search_params.vector_query:
-        from orchestrator.search.core.embedding import QueryEmbedder
-
-        q_vec_override = await QueryEmbedder.generate_for_text_async(search_params.vector_query)
-        return PaginationParams(q_vec_override=q_vec_override)
-    return PaginationParams()
-
-
-def create_next_page_cursor(
-    search_results: list[SearchResult], pagination_params: PaginationParams, limit: int
+def encode_next_page_cursor(
+    search_response: SearchResponse,
+    cursor: PageCursor | None,
+    search_params: SearchParameters,
 ) -> str | None:
-    """Create next page cursor if there are more results."""
-    has_next_page = len(search_results) == limit and limit > 0
-    if has_next_page:
-        last_item = search_results[-1]
-        cursor_data = PageCursor(
-            score=float(last_item.score),
-            id=last_item.entity_id,
-            q_vec_b64=floats_to_b64(pagination_params.q_vec_override or []),
-        )
-        return cursor_data.encode()
-    return None
+    """Create next page cursor if there are more results.
+
+    On first page, saves the query to database and includes query_id in cursor
+    for subsequent pages to ensure consistent parameters across pagination.
+
+    Args:
+        search_response: SearchResponse containing results and query_embedding
+        cursor: Current page cursor (None for first page, PageCursor for subsequent pages)
+        search_params: Search parameters to save for pagination consistency
+
+    Returns:
+        Encoded cursor for next page, or None if no more results
+    """
+    from orchestrator.search.retrieval.query_state import SearchQueryState
+
+    has_next_page = len(search_response.results) == search_params.limit and search_params.limit > 0
+    if not has_next_page:
+        return None
+
+    # If this is the first page, save query state to database
+    if cursor is None:
+        query_state = SearchQueryState(parameters=search_params, query_embedding=search_response.query_embedding)
+        search_query = SearchQueryTable.from_state(state=query_state)
+
+        db.session.add(search_query)
+        db.session.commit()
+        query_id = search_query.query_id
+    else:
+        query_id = cursor.query_id
+
+    last_item = search_response.results[-1]
+    cursor_data = PageCursor(
+        score=float(last_item.score),
+        id=last_item.entity_id,
+        query_id=query_id,
+    )
+    return cursor_data.encode()
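
Presumably this is how an endpoint wires the cursor round-trip together; a sketch under that assumption (db.session and params are placeholders for the caller's session and validated search parameters):

from orchestrator.db import db
from orchestrator.search.retrieval import SearchQueryState, execute_search
from orchestrator.search.retrieval.pagination import PageCursor, encode_next_page_cursor


async def first_page(params):
    # No cursor yet: encoding the next-page cursor persists the query state and
    # embeds its query_id in the opaque token returned to the client.
    response = await execute_search(params, db.session)
    return response, encode_next_page_cursor(response, cursor=None, search_params=params)


async def next_page(token):
    # Subsequent pages: decode the token, reload the saved parameters and embedding,
    # and resume strictly after the last (score, entity_id) pair the client saw.
    cursor = PageCursor.decode(token)
    state = SearchQueryState.load_from_id(cursor.query_id)
    response = await execute_search(state.parameters, db.session, cursor=cursor, query_embedding=state.query_embedding)
    return response, encode_next_page_cursor(response, cursor=cursor, search_params=state.parameters)
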
orchestrator/search/retrieval/query_state.py (new file)
@@ -0,0 +1,61 @@
+# Copyright 2019-2025 SURF, GÉANT.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from uuid import UUID
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from orchestrator.db import SearchQueryTable, db
+from orchestrator.search.core.exceptions import QueryStateNotFoundError
+from orchestrator.search.schemas.parameters import SearchParameters
+
+
+class SearchQueryState(BaseModel):
+    """State of a search query including parameters and embedding.
+
+    This model provides a complete snapshot of what was searched and how.
+    Used for both agent and regular API searches.
+    """
+
+    parameters: SearchParameters = Field(discriminator="entity_type")
+    query_embedding: list[float] | None = Field(default=None, description="The embedding vector for semantic search")
+
+    model_config = ConfigDict(from_attributes=True)
+
+    @classmethod
+    def load_from_id(cls, query_id: UUID | str) -> "SearchQueryState":
+        """Load query state from database by query_id.
+
+        Args:
+            query_id: UUID or string UUID of the saved query
+
+        Returns:
+            SearchQueryState loaded from database
+
+        Raises:
+            ValueError: If query_id format is invalid
+            QueryStateNotFoundError: If query not found in database
+        """
+        if isinstance(query_id, UUID):
+            query_uuid = query_id
+        else:
+            try:
+                query_uuid = UUID(query_id)
+            except (ValueError, TypeError) as e:
+                raise ValueError(f"Invalid query_id format: {query_id}") from e
+
+        search_query = db.session.query(SearchQueryTable).filter_by(query_id=query_uuid).first()
+        if not search_query:
+            raise QueryStateNotFoundError(f"Query {query_uuid} not found in database")
+
+        return cls.model_validate(search_query)
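
A minimal sketch of loading a saved query state and the two failure modes the docstring lists (an initialised database session is assumed; the UUID below is illustrative):

from orchestrator.search.core.exceptions import QueryStateNotFoundError
from orchestrator.search.retrieval import SearchQueryState

try:
    state = SearchQueryState.load_from_id("not-a-uuid")
except ValueError:
    pass  # malformed query_id

try:
    state = SearchQueryState.load_from_id("00000000-0000-0000-0000-000000000000")
except QueryStateNotFoundError:
    pass  # no query saved under that id
else:
    print(state.parameters.entity_type, state.query_embedding is not None)
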
orchestrator/search/retrieval/retrievers/base.py
@@ -20,7 +20,7 @@ from sqlalchemy import BindParameter, Numeric, Select, literal
 from orchestrator.search.core.types import FieldType, SearchMetadata
 from orchestrator.search.schemas.parameters import BaseSearchParameters
 
-from ..pagination import PaginationParams
+from ..pagination import PageCursor
 
 logger = structlog.get_logger(__name__)
 
@@ -41,62 +41,48 @@ class Retriever(ABC):
     ]
 
     @classmethod
-    async def from_params(
+    async def route(
         cls,
         params: BaseSearchParameters,
-        pagination_params: PaginationParams,
+        cursor: PageCursor | None,
+        query_embedding: list[float] | None = None,
     ) -> "Retriever":
-        """Create the appropriate retriever instance from search parameters.
+        """Route to the appropriate retriever instance based on search parameters.
+
+        Selects the retriever type based on available search criteria:
+        - Hybrid: both embedding and fuzzy term available
+        - Semantic: only embedding available
+        - Fuzzy: only text term available (or fallback when embedding generation fails)
+        - Structured: only filters available
 
         Args:
-            params (BaseSearchParameters): Search parameters including vector queries, fuzzy terms, and filters.
-            pagination_params (PaginationParams): Pagination parameters for cursor-based paging.
+            params: Search parameters including vector queries, fuzzy terms, and filters
+            cursor: Pagination cursor for cursor-based paging
+            query_embedding: Query embedding for semantic search, or None if not available
 
         Returns:
-            Retriever: A concrete retriever instance (semantic, fuzzy, hybrid, or structured).
+            A concrete retriever instance based on available search criteria
         """
-
         from .fuzzy import FuzzyRetriever
         from .hybrid import RrfHybridRetriever
         from .semantic import SemanticRetriever
         from .structured import StructuredRetriever
 
         fuzzy_term = params.fuzzy_term
-        q_vec = await cls._get_query_vector(params.vector_query, pagination_params.q_vec_override)
-
-        # If semantic search was attempted but failed, fall back to fuzzy with the full query
-        fallback_fuzzy_term = fuzzy_term
-        if q_vec is None and params.vector_query is not None and params.query is not None:
-            fallback_fuzzy_term = params.query
-
-        if q_vec is not None and fallback_fuzzy_term is not None:
-            return RrfHybridRetriever(q_vec, fallback_fuzzy_term, pagination_params)
-        if q_vec is not None:
-            return SemanticRetriever(q_vec, pagination_params)
-        if fallback_fuzzy_term is not None:
-            return FuzzyRetriever(fallback_fuzzy_term, pagination_params)
-
-        return StructuredRetriever(pagination_params)
-
-    @classmethod
-    async def _get_query_vector(
-        cls, vector_query: str | None, q_vec_override: list[float] | None
-    ) -> list[float] | None:
-        """Get query vector either from override or by generating from text."""
-        if q_vec_override:
-            return q_vec_override
-
-        if not vector_query:
-            return None
-
-        from orchestrator.search.core.embedding import QueryEmbedder
-
-        q_vec = await QueryEmbedder.generate_for_text_async(vector_query)
-        if not q_vec:
-            logger.warning("Embedding generation failed; using non-semantic retriever")
-            return None
-
-        return q_vec
+        # If vector_query exists but embedding generation failed, fall back to fuzzy search with full query
+        if query_embedding is None and params.vector_query is not None and params.query is not None:
+            fuzzy_term = params.query
+
+        # Select retriever based on available search criteria
+        if query_embedding is not None and fuzzy_term is not None:
+            return RrfHybridRetriever(query_embedding, fuzzy_term, cursor)
+        if query_embedding is not None:
+            return SemanticRetriever(query_embedding, cursor)
+        if fuzzy_term is not None:
+            return FuzzyRetriever(fuzzy_term, cursor)
+
+        return StructuredRetriever(cursor)
 
     @abstractmethod
     def apply(self, candidate_query: Select) -> Select:
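
A sketch of the dispatch rules (SimpleNamespace stands in for a real parameters object, since route only reads fuzzy_term, vector_query and query; the embedding length is illustrative):

from types import SimpleNamespace

from orchestrator.search.retrieval.retrievers.base import Retriever


async def demo():
    params = SimpleNamespace(fuzzy_term="l2vpn", vector_query="10 gigabit links", query="10 gigabit links l2vpn")

    hybrid = await Retriever.route(params, cursor=None, query_embedding=[0.1] * 8)  # RrfHybridRetriever
    fuzzy = await Retriever.route(params, cursor=None, query_embedding=None)  # FuzzyRetriever, falling back to the full query
    print(type(hybrid).__name__, type(fuzzy).__name__)
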
orchestrator/search/retrieval/retrievers/fuzzy.py
@@ -17,17 +17,16 @@ from sqlalchemy.sql.expression import ColumnElement
 from orchestrator.db.models import AiSearchIndex
 from orchestrator.search.core.types import SearchMetadata
 
-from ..pagination import PaginationParams
+from ..pagination import PageCursor
 from .base import Retriever
 
 
 class FuzzyRetriever(Retriever):
     """Ranks results based on the max of fuzzy text similarity scores."""
 
-    def __init__(self, fuzzy_term: str, pagination_params: PaginationParams) -> None:
+    def __init__(self, fuzzy_term: str, cursor: PageCursor | None) -> None:
         self.fuzzy_term = fuzzy_term
-        self.page_after_score = pagination_params.page_after_score
-        self.page_after_id = pagination_params.page_after_id
+        self.cursor = cursor
 
     def apply(self, candidate_query: Select) -> Select:
         cand = candidate_query.subquery()
@@ -42,6 +41,7 @@
         combined_query = (
             select(
                 AiSearchIndex.entity_id,
+                AiSearchIndex.entity_title,
                 score,
                 func.first_value(AiSearchIndex.value)
                 .over(partition_by=AiSearchIndex.entity_id, order_by=[similarity_expr.desc(), AiSearchIndex.path.asc()])
@@ -58,12 +58,13 @@
                     literal(self.fuzzy_term).op("<%")(AiSearchIndex.value),
                 )
             )
-            .distinct(AiSearchIndex.entity_id)
+            .distinct(AiSearchIndex.entity_id, AiSearchIndex.entity_title)
         )
         final_query = combined_query.subquery("ranked_fuzzy")
 
         stmt = select(
             final_query.c.entity_id,
+            final_query.c.entity_title,
             final_query.c.score,
             final_query.c.highlight_text,
             final_query.c.highlight_path,
@@ -81,13 +82,13 @@
         self, stmt: Select, score_column: ColumnElement, entity_id_column: ColumnElement
     ) -> Select:
         """Apply standard score + entity_id pagination."""
-        if self.page_after_score is not None and self.page_after_id is not None:
+        if self.cursor is not None:
             stmt = stmt.where(
                 or_(
-                    score_column < self.page_after_score,
+                    score_column < self.cursor.score,
                     and_(
-                        score_column == self.page_after_score,
-                        entity_id_column > self.page_after_id,
+                        score_column == self.cursor.score,
+                        entity_id_column > self.cursor.id,
                     ),
                 )
             )
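
The predicate above is keyset pagination; a self-contained illustration of the page boundary it encodes (plain Python, no database; rows are (score, entity_id) tuples already ordered by score DESC, entity_id ASC, the same ordering the retrievers use):

rows = [(0.9, "a"), (0.9, "b"), (0.7, "c"), (0.5, "d")]


def next_page(rows, after_score, after_id, limit=2):
    # Keep only rows strictly after the cursor position, mirroring
    # score < :score OR (score = :score AND entity_id > :id).
    remaining = [r for r in rows if r[0] < after_score or (r[0] == after_score and r[1] > after_id)]
    return remaining[:limit]


print(next_page(rows, 0.9, "a"))  # [(0.9, 'b'), (0.7, 'c')]
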