orchestrator-core 4.4.1__py3-none-any.whl → 4.5.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. orchestrator/__init__.py +26 -2
  2. orchestrator/agentic_app.py +84 -0
  3. orchestrator/api/api_v1/api.py +10 -0
  4. orchestrator/api/api_v1/endpoints/search.py +290 -0
  5. orchestrator/app.py +32 -0
  6. orchestrator/cli/index_llm.py +73 -0
  7. orchestrator/cli/main.py +22 -1
  8. orchestrator/cli/resize_embedding.py +135 -0
  9. orchestrator/cli/search_explore.py +208 -0
  10. orchestrator/cli/speedtest.py +151 -0
  11. orchestrator/db/models.py +37 -1
  12. orchestrator/devtools/populator.py +16 -0
  13. orchestrator/llm_settings.py +51 -0
  14. orchestrator/log_config.py +1 -0
  15. orchestrator/migrations/helpers.py +1 -1
  16. orchestrator/migrations/versions/schema/2025-08-12_52b37b5b2714_search_index_model_for_llm_integration.py +95 -0
  17. orchestrator/schemas/search.py +130 -0
  18. orchestrator/schemas/workflow.py +1 -0
  19. orchestrator/search/__init__.py +12 -0
  20. orchestrator/search/agent/__init__.py +21 -0
  21. orchestrator/search/agent/agent.py +60 -0
  22. orchestrator/search/agent/prompts.py +100 -0
  23. orchestrator/search/agent/state.py +21 -0
  24. orchestrator/search/agent/tools.py +258 -0
  25. orchestrator/search/core/__init__.py +12 -0
  26. orchestrator/search/core/embedding.py +73 -0
  27. orchestrator/search/core/exceptions.py +36 -0
  28. orchestrator/search/core/types.py +296 -0
  29. orchestrator/search/core/validators.py +40 -0
  30. orchestrator/search/docs/index.md +37 -0
  31. orchestrator/search/docs/running_local_text_embedding_inference.md +45 -0
  32. orchestrator/search/filters/__init__.py +40 -0
  33. orchestrator/search/filters/base.py +280 -0
  34. orchestrator/search/filters/date_filters.py +88 -0
  35. orchestrator/search/filters/definitions.py +107 -0
  36. orchestrator/search/filters/ltree_filters.py +56 -0
  37. orchestrator/search/filters/numeric_filter.py +73 -0
  38. orchestrator/search/indexing/__init__.py +16 -0
  39. orchestrator/search/indexing/indexer.py +336 -0
  40. orchestrator/search/indexing/registry.py +101 -0
  41. orchestrator/search/indexing/tasks.py +66 -0
  42. orchestrator/search/indexing/traverse.py +334 -0
  43. orchestrator/search/retrieval/__init__.py +16 -0
  44. orchestrator/search/retrieval/builder.py +123 -0
  45. orchestrator/search/retrieval/engine.py +158 -0
  46. orchestrator/search/retrieval/exceptions.py +90 -0
  47. orchestrator/search/retrieval/pagination.py +96 -0
  48. orchestrator/search/retrieval/retrievers/__init__.py +26 -0
  49. orchestrator/search/retrieval/retrievers/base.py +122 -0
  50. orchestrator/search/retrieval/retrievers/fuzzy.py +94 -0
  51. orchestrator/search/retrieval/retrievers/hybrid.py +188 -0
  52. orchestrator/search/retrieval/retrievers/semantic.py +94 -0
  53. orchestrator/search/retrieval/retrievers/structured.py +39 -0
  54. orchestrator/search/retrieval/utils.py +120 -0
  55. orchestrator/search/retrieval/validation.py +152 -0
  56. orchestrator/search/schemas/__init__.py +12 -0
  57. orchestrator/search/schemas/parameters.py +129 -0
  58. orchestrator/search/schemas/results.py +77 -0
  59. orchestrator/services/settings_env_variables.py +2 -2
  60. orchestrator/settings.py +1 -1
  61. orchestrator/workflows/tasks/validate_products.py +1 -1
  62. {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.0a3.dist-info}/METADATA +9 -4
  63. {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.0a3.dist-info}/RECORD +65 -16
  64. {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.0a3.dist-info}/WHEEL +0 -0
  65. {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.0a3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,90 @@
1
+ # Copyright 2019-2025 SURF, GÉANT.
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+
14
+ from orchestrator.search.core.types import FilterOp
15
+
16
+
17
class FilterValidationError(Exception):
    """Root of the filter validation error hierarchy.

    The more specific filter validation errors defined in this module derive
    from this class, so handling this type covers all of them.
    """
21
+
22
+
23
class InvalidLtreePatternError(FilterValidationError):
    """Signals that an ltree pattern is not valid ltree query syntax.

    Args:
        pattern: The rejected pattern; it is quoted verbatim in the error message.
    """

    def __init__(self, pattern: str) -> None:
        super().__init__(
            f"Ltree pattern '{pattern}' has invalid syntax. Use valid PostgreSQL ltree lquery syntax."
        )
29
+
30
+
31
class EmptyFilterPathError(FilterValidationError):
    """Signals that a filter path is empty or consists solely of whitespace.

    The error takes no arguments; its message is fixed and includes example paths.
    """

    def __init__(self) -> None:
        super().__init__(
            "Filter path cannot be empty. Provide a valid path like 'subscription.product.name' or 'workflow.name'."
        )
39
+
40
+
41
class PathNotFoundError(FilterValidationError):
    """Signals that a filter path does not exist in the database schema.

    Args:
        path: The filter path that could not be found; quoted in the error message.

    Examples:
        Using a non-existent filter path:

        >>> print(PathNotFoundError('subscription.nonexistent.field'))
        Path 'subscription.nonexistent.field' does not exist in the database.
    """

    def __init__(self, path: str) -> None:
        super().__init__(f"Path '{path}' does not exist in the database.")
54
+
55
+
56
class IncompatibleFilterTypeError(FilterValidationError):
    """Signals that a filter operator cannot be used with the data type of the targeted field.

    Args:
        operator: The operator that was requested.
        field_type: The data type of the field the filter targets.
        path: The filter path the operator was applied to.
        expected_operators: The operators that are valid for ``field_type``; their
            ``value`` attributes are listed in the error message.

    Examples:
        Using a numeric comparison operator on a string field:

        >>> print(IncompatibleFilterTypeError(
        ...     operator='gt',
        ...     field_type='string',
        ...     path='subscription.customer_name',
        ...     expected_operators=[FilterOp.EQ, FilterOp.NEQ, FilterOp.LIKE],
        ... ))
        Operator 'gt' is not compatible with field type 'string' for path 'subscription.customer_name'. Valid operators for 'string': [eq, neq, like]
    """

    def __init__(self, operator: str, field_type: str, path: str, expected_operators: list[FilterOp]) -> None:
        allowed = ", ".join(op.value for op in expected_operators)
        super().__init__(
            f"Operator '{operator}' is not compatible with field type '{field_type}' for path '{path}'. "
            f"Valid operators for '{field_type}': [{allowed}]"
        )
76
+
77
+
78
class InvalidEntityPrefixError(FilterValidationError):
    """Signals that a filter path does not begin with the prefix required for the entity type.

    Args:
        path: The filter path that was supplied.
        expected_prefix: The prefix the path has to start with.
        entity_type: Name of the entity type being searched, as shown in the message.

    Examples:
        Using wrong entity prefix in filter path:

        >>> print(InvalidEntityPrefixError('workflow.name', 'subscription.', 'SUBSCRIPTION'))
        Filter path 'workflow.name' must start with 'subscription.' for SUBSCRIPTION searches, or use '*' for wildcard paths.
    """

    def __init__(self, path: str, expected_prefix: str, entity_type: str) -> None:
        super().__init__(
            f"Filter path '{path}' must start with '{expected_prefix}' for {entity_type} searches, "
            "or use '*' for wildcard paths."
        )
@@ -0,0 +1,96 @@
1
+ # Copyright 2019-2025 SURF, GÉANT.
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+
14
+ import array
15
+ import base64
16
+ from dataclasses import dataclass
17
+
18
+ from pydantic import BaseModel
19
+
20
+ from orchestrator.search.core.exceptions import InvalidCursorError
21
+ from orchestrator.search.schemas.parameters import BaseSearchParameters
22
+ from orchestrator.search.schemas.results import SearchResult
23
+
24
+
25
+ @dataclass
26
+ class PaginationParams:
27
+ """Parameters for pagination in search queries."""
28
+
29
+ page_after_score: float | None = None
30
+ page_after_id: str | None = None
31
+ q_vec_override: list[float] | None = None
32
+
33
+
34
def floats_to_b64(v: list[float]) -> str:
    """Pack a list of floats into a URL-safe Base64 string.

    The values are stored as ``array`` type code ``"f"`` items (C floats in the
    machine's native representation) before being Base64-encoded.

    Args:
        v: The floats to encode; an empty list yields an empty string.

    Returns:
        The ASCII Base64 text of the packed bytes.
    """
    packed = array.array("f", v).tobytes()
    encoded = base64.urlsafe_b64encode(packed)
    return encoded.decode("ascii")
37
+
38
+
39
def b64_to_floats(s: str) -> list[float]:
    """Unpack a URL-safe Base64 string into a list of floats.

    The decoded bytes are read as ``array`` type code ``"f"`` items.

    Args:
        s: ASCII Base64 text; an empty string yields an empty list.

    Returns:
        The decoded float values.

    Raises:
        ValueError: If ``s`` is not ASCII, is not valid Base64, or decodes to a
            byte count that is not a whole number of items.
    """
    values = array.array("f")
    values.frombytes(base64.urlsafe_b64decode(s.encode("ascii")))
    return values.tolist()
44
+
45
+
46
class PageCursor(BaseModel):
    """Pagination cursor that round-trips through a URL-safe Base64 string.

    The cursor is the model's JSON representation, UTF-8 encoded and then
    Base64-encoded.
    """

    # Score of the item the cursor points at.
    score: float
    # Identifier of the item the cursor points at.
    id: str
    # Base64-packed query vector carried along with the cursor.
    q_vec_b64: str

    def encode(self) -> str:
        """Serialize this cursor to a URL-safe Base64 string."""
        payload = self.model_dump_json().encode("utf-8")
        return base64.urlsafe_b64encode(payload).decode("utf-8")

    @classmethod
    def decode(cls, cursor: str) -> "PageCursor":
        """Rebuild a PageCursor from its URL-safe Base64 representation.

        Raises:
            InvalidCursorError: If the string cannot be Base64-decoded, is not
                UTF-8, or does not validate as a PageCursor.
        """
        try:
            payload = base64.urlsafe_b64decode(cursor)
            return cls.model_validate_json(payload.decode("utf-8"))
        except Exception as e:
            raise InvalidCursorError("Invalid pagination cursor") from e
64
+
65
+
66
async def process_pagination_cursor(cursor: str | None, search_params: BaseSearchParameters) -> PaginationParams:
    """Process pagination cursor and return pagination parameters.

    Args:
        cursor: Encoded cursor of the previous page, or ``None``/empty for a first page.
        search_params: Search parameters; ``vector_query`` is embedded when no cursor is given.

    Returns:
        PaginationParams: With a cursor, the position and query vector stored in it.
        Without a cursor, the freshly generated query vector when ``vector_query``
        is set, otherwise empty pagination parameters.

    Raises:
        InvalidCursorError: If the cursor, or the query vector embedded in it,
            cannot be decoded.
    """
    if cursor:
        c = PageCursor.decode(cursor)
        try:
            q_vec = b64_to_floats(c.q_vec_b64)
        except ValueError as e:
            # The cursor comes from the client. Bad Base64 (binascii.Error),
            # non-ASCII text (UnicodeEncodeError) and a truncated float buffer
            # are all ValueError subclasses; report them like any other
            # malformed cursor instead of leaking a raw exception.
            raise InvalidCursorError("Invalid pagination cursor") from e
        return PaginationParams(
            page_after_score=c.score,
            page_after_id=c.id,
            q_vec_override=q_vec,
        )
    if search_params.vector_query:
        # Imported lazily, as in the rest of this package's retrieval code.
        from orchestrator.search.core.embedding import QueryEmbedder

        q_vec_override = await QueryEmbedder.generate_for_text_async(search_params.vector_query)
        return PaginationParams(q_vec_override=q_vec_override)
    return PaginationParams()
81
+
82
+
83
def create_next_page_cursor(
    search_results: list[SearchResult], pagination_params: PaginationParams, limit: int
) -> str | None:
    """Build the cursor for the following page, if one may exist.

    A next page is assumed only when the current page is completely filled,
    i.e. it holds exactly ``limit`` results and ``limit`` is positive.

    Args:
        search_results: Results of the current page, in ranked order.
        pagination_params: Pagination state whose query vector is carried over into the cursor.
        limit: Requested page size.

    Returns:
        The encoded cursor pointing at the last result, or ``None`` when the
        page is not full.
    """
    if len(search_results) != limit or limit <= 0:
        return None

    tail = search_results[-1]
    query_vector = pagination_params.q_vec_override or []
    cursor = PageCursor(
        score=float(tail.score),
        id=tail.entity_id,
        q_vec_b64=floats_to_b64(query_vector),
    )
    return cursor.encode()
@@ -0,0 +1,26 @@
1
+ # Copyright 2019-2025 SURF, GÉANT.
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+
14
+ from .base import Retriever
15
+ from .fuzzy import FuzzyRetriever
16
+ from .hybrid import RrfHybridRetriever
17
+ from .semantic import SemanticRetriever
18
+ from .structured import StructuredRetriever
19
+
20
# Public names of this package: the abstract Retriever plus the concrete
# retrievers imported from the sibling modules above.
__all__ = [
    "Retriever",
    "FuzzyRetriever",
    "RrfHybridRetriever",
    "SemanticRetriever",
    "StructuredRetriever",
]
@@ -0,0 +1,122 @@
1
+ # Copyright 2019-2025 SURF, GÉANT.
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+
14
+ from abc import ABC, abstractmethod
15
+ from decimal import Decimal
16
+
17
+ import structlog
18
+ from sqlalchemy import BindParameter, Numeric, Select, literal
19
+
20
+ from orchestrator.search.core.types import FieldType, SearchMetadata
21
+ from orchestrator.search.schemas.parameters import BaseSearchParameters
22
+
23
+ from ..pagination import PaginationParams
24
+
25
+ logger = structlog.get_logger(__name__)
26
+
27
+
28
class Retriever(ABC):
    """Abstract base class for applying a ranking strategy to a search query.

    Concrete retrievers implement ``apply`` to turn a candidate query into a
    ranked query, and ``metadata`` to describe the strategy. ``from_params``
    selects the concrete retriever that fits a set of search parameters.
    """

    # Number of decimal places scores are rounded to. The NUMERIC type and the
    # pagination quantum are both derived from this value so they cannot drift apart.
    SCORE_PRECISION = 12
    SCORE_NUMERIC_TYPE = Numeric(38, SCORE_PRECISION)
    HIGHLIGHT_TEXT_LABEL = "highlight_text"
    HIGHLIGHT_PATH_LABEL = "highlight_path"
    SCORE_LABEL = "score"
    # Index value types that text-based retrievers match against.
    SEARCHABLE_FIELD_TYPES = [
        FieldType.STRING.value,
        FieldType.UUID.value,
        FieldType.BLOCK.value,
        FieldType.RESOURCE_TYPE.value,
    ]

    @classmethod
    async def from_params(
        cls,
        params: BaseSearchParameters,
        pagination_params: PaginationParams,
    ) -> "Retriever":
        """Create the appropriate retriever instance from search parameters.

        Args:
            params (BaseSearchParameters): Search parameters including vector queries, fuzzy terms, and filters.
            pagination_params (PaginationParams): Pagination parameters for cursor-based paging.

        Returns:
            Retriever: A concrete retriever instance (semantic, fuzzy, hybrid, or structured).
        """

        # Imported here rather than at module level: the concrete retrievers
        # import this module for the base class.
        from .fuzzy import FuzzyRetriever
        from .hybrid import RrfHybridRetriever
        from .semantic import SemanticRetriever
        from .structured import StructuredRetriever

        fuzzy_term = params.fuzzy_term
        q_vec = await cls._get_query_vector(params.vector_query, pagination_params.q_vec_override)

        # If semantic search was attempted but failed, fall back to fuzzy with the full query
        fallback_fuzzy_term = fuzzy_term
        if q_vec is None and params.vector_query is not None and params.query is not None:
            fallback_fuzzy_term = params.query

        if q_vec is not None and fallback_fuzzy_term is not None:
            return RrfHybridRetriever(q_vec, fallback_fuzzy_term, pagination_params)
        if q_vec is not None:
            return SemanticRetriever(q_vec, pagination_params)
        if fallback_fuzzy_term is not None:
            return FuzzyRetriever(fallback_fuzzy_term, pagination_params)

        return StructuredRetriever(pagination_params)

    @classmethod
    async def _get_query_vector(
        cls, vector_query: str | None, q_vec_override: list[float] | None
    ) -> list[float] | None:
        """Get query vector either from override or by generating from text.

        Args:
            vector_query: Text to embed when no override is available.
            q_vec_override: Previously computed vector; used as-is when non-empty.

        Returns:
            The query vector, or ``None`` when there is nothing to embed or the
            embedder returned an empty result.
        """
        if q_vec_override:
            return q_vec_override

        if not vector_query:
            return None

        from orchestrator.search.core.embedding import QueryEmbedder

        q_vec = await QueryEmbedder.generate_for_text_async(vector_query)
        if not q_vec:
            logger.warning("Embedding generation failed; using non-semantic retriever")
            return None

        return q_vec

    @abstractmethod
    def apply(self, candidate_query: Select) -> Select:
        """Apply the ranking logic to the given candidate query.

        Args:
            candidate_query (Select): A SQLAlchemy `Select` statement returning candidate entity IDs.

        Returns:
            Select: A new `Select` statement with ranking expressions applied.
        """
        ...

    def _quantize_score_for_pagination(self, score_value: float) -> BindParameter[Decimal]:
        """Convert score value to properly quantized Decimal parameter for pagination.

        Args:
            score_value: Score taken from a pagination cursor.

        Returns:
            A literal typed as ``SCORE_NUMERIC_TYPE`` holding the score rounded
            to ``SCORE_PRECISION`` decimal places.
        """
        # 10 ** -SCORE_PRECISION, e.g. Decimal("1E-12") for a precision of 12.
        quantum = Decimal(1).scaleb(-self.SCORE_PRECISION)
        pas_dec = Decimal(str(score_value)).quantize(quantum)
        return literal(pas_dec, type_=self.SCORE_NUMERIC_TYPE)

    @property
    @abstractmethod
    def metadata(self) -> SearchMetadata:
        """Return metadata describing this search strategy."""
        ...
@@ -0,0 +1,94 @@
1
+ # Copyright 2019-2025 SURF, GÉANT.
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+
14
+ from sqlalchemy import Select, and_, cast, func, literal, or_, select
15
+ from sqlalchemy.sql.expression import ColumnElement
16
+
17
+ from orchestrator.db.models import AiSearchIndex
18
+ from orchestrator.search.core.types import SearchMetadata
19
+
20
+ from ..pagination import PaginationParams
21
+ from .base import Retriever
22
+
23
+
24
class FuzzyRetriever(Retriever):
    """Ranks results based on the max of fuzzy text similarity scores.

    For every candidate entity the score is the highest ``word_similarity``
    between the fuzzy term and any of its searchable index values; the value
    and path of that best-matching row are returned as the highlight.
    """

    def __init__(self, fuzzy_term: str, pagination_params: PaginationParams) -> None:
        """Store the search term and the keyset position of the previous page.

        Args:
            fuzzy_term: Text to match against indexed values.
            pagination_params: Supplies ``page_after_score`` and ``page_after_id``.
        """
        self.fuzzy_term = fuzzy_term
        self.page_after_score = pagination_params.page_after_score
        self.page_after_id = pagination_params.page_after_id

    def apply(self, candidate_query: Select) -> Select:
        """Rank the candidate entities by their best fuzzy match.

        Args:
            candidate_query: Select statement yielding the candidate ``entity_id`` values.

        Returns:
            Select: Statement producing entity id, score, highlight text and
            highlight path, ordered by score (descending) and then entity id.
        """
        cand = candidate_query.subquery()

        similarity_expr = func.word_similarity(self.fuzzy_term, AiSearchIndex.value)

        # Best similarity per entity, rounded to the shared score precision so
        # that it can be compared exactly during keyset pagination.
        raw_max = func.max(similarity_expr).over(partition_by=AiSearchIndex.entity_id)
        score = cast(
            func.round(cast(raw_max, self.SCORE_NUMERIC_TYPE), self.SCORE_PRECISION), self.SCORE_NUMERIC_TYPE
        ).label(self.SCORE_LABEL)

        combined_query = (
            select(
                AiSearchIndex.entity_id,
                score,
                # Value and path of the best-matching row; ties are broken by path.
                func.first_value(AiSearchIndex.value)
                .over(partition_by=AiSearchIndex.entity_id, order_by=[similarity_expr.desc(), AiSearchIndex.path.asc()])
                .label(self.HIGHLIGHT_TEXT_LABEL),
                func.first_value(AiSearchIndex.path)
                .over(partition_by=AiSearchIndex.entity_id, order_by=[similarity_expr.desc(), AiSearchIndex.path.asc()])
                .label(self.HIGHLIGHT_PATH_LABEL),
            )
            .select_from(AiSearchIndex)
            .join(cand, cand.c.entity_id == AiSearchIndex.entity_id)
            .where(
                and_(
                    AiSearchIndex.value_type.in_(self.SEARCHABLE_FIELD_TYPES),
                    literal(self.fuzzy_term).op("<%")(AiSearchIndex.value),
                )
            )
            # All rows of an entity carry the same window results, so one row per entity suffices.
            .distinct(AiSearchIndex.entity_id)
        )
        final_query = combined_query.subquery("ranked_fuzzy")

        stmt = select(
            final_query.c.entity_id,
            final_query.c.score,
            final_query.c.highlight_text,
            final_query.c.highlight_path,
        ).select_from(final_query)

        stmt = self._apply_score_pagination(stmt, final_query.c.score, final_query.c.entity_id)

        return stmt.order_by(final_query.c.score.desc().nulls_last(), final_query.c.entity_id.asc())

    @property
    def metadata(self) -> SearchMetadata:
        """Return the metadata describing the fuzzy search strategy."""
        return SearchMetadata.fuzzy()

    def _apply_score_pagination(
        self, stmt: Select, score_column: ColumnElement, entity_id_column: ColumnElement
    ) -> Select:
        """Apply standard score + entity_id pagination.

        Args:
            stmt: Ranked statement to restrict.
            score_column: Column holding the rounded score.
            entity_id_column: Column holding the entity id used as tie-breaker.

        Returns:
            Select: ``stmt`` limited to rows after the stored page position, or
            ``stmt`` unchanged when no position is stored.
        """
        if self.page_after_score is not None and self.page_after_id is not None:
            # Compare against a Decimal quantized to the score column's precision,
            # as the hybrid retriever does, instead of a raw Python float.
            score_param = self._quantize_score_for_pagination(self.page_after_score)
            stmt = stmt.where(
                or_(
                    score_column < score_param,
                    and_(
                        score_column == score_param,
                        entity_id_column > self.page_after_id,
                    ),
                )
            )
        return stmt
@@ -0,0 +1,188 @@
1
+ # Copyright 2019-2025 SURF, GÉANT.
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+
14
+ from sqlalchemy import BindParameter, Select, and_, bindparam, case, cast, func, literal, or_, select
15
+ from sqlalchemy.sql.expression import ColumnElement
16
+
17
+ from orchestrator.db.models import AiSearchIndex
18
+ from orchestrator.search.core.types import SearchMetadata
19
+
20
+ from ..pagination import PaginationParams
21
+ from .base import Retriever
22
+
23
+
24
class RrfHybridRetriever(Retriever):
    """Reciprocal Rank Fusion of semantic and fuzzy ranking with parent-child retrieval.

    Field rows that match the fuzzy term are scored on both fuzzy similarity and
    semantic distance, aggregated per entity, ranked separately on each signal,
    and fused into one normalized score. Entities whose average fuzzy score is
    at least 0.9 receive a boost that places them above all other results.
    """

    def __init__(
        self,
        q_vec: list[float],
        fuzzy_term: str,
        pagination_params: PaginationParams,
        k: int = 60,
        field_candidates_limit: int = 100,
    ) -> None:
        """Store the query inputs and tuning parameters.

        Args:
            q_vec: Query embedding compared against the stored embeddings.
            fuzzy_term: Text matched against indexed values.
            pagination_params: Supplies ``page_after_score`` and ``page_after_id``.
            k: Constant added to each rank in the RRF formula.
            field_candidates_limit: Maximum number of field rows taken into account.
        """
        self.q_vec = q_vec
        self.fuzzy_term = fuzzy_term
        self.page_after_score = pagination_params.page_after_score
        self.page_after_id = pagination_params.page_after_id
        self.k = k
        self.field_candidates_limit = field_candidates_limit

    def apply(self, candidate_query: Select) -> Select:
        """Rank the candidate entities by fused semantic and fuzzy score.

        Args:
            candidate_query: Select statement yielding the candidate ``entity_id`` values.

        Returns:
            Select: Statement producing entity id, score, highlight text, highlight
            path and the perfect-match flag, ordered by score (descending) and
            then entity id.
        """
        cand = candidate_query.subquery()
        q_param: BindParameter[list[float]] = bindparam("q_vec", self.q_vec, type_=AiSearchIndex.embedding.type)

        best_similarity = func.word_similarity(self.fuzzy_term, AiSearchIndex.value)
        # NOTE(review): `<->` is presumably the vector distance operator of the
        # embedding column's type — confirm against the AiSearchIndex model.
        sem_expr = case(
            (AiSearchIndex.embedding.is_(None), None),
            else_=AiSearchIndex.embedding.op("<->")(q_param),
        )
        # Rows without an embedding get a distance of 1.0.
        sem_val = func.coalesce(sem_expr, literal(1.0)).label("semantic_distance")

        filter_condition = literal(self.fuzzy_term).op("<%")(AiSearchIndex.value)

        # Field-level candidates: searchable rows of candidate entities that pass
        # the fuzzy filter, best matches first, capped at field_candidates_limit.
        field_candidates = (
            select(
                AiSearchIndex.entity_id,
                AiSearchIndex.path,
                AiSearchIndex.value,
                sem_val,
                best_similarity.label("fuzzy_score"),
            )
            .select_from(AiSearchIndex)
            .join(cand, cand.c.entity_id == AiSearchIndex.entity_id)
            .where(
                and_(
                    AiSearchIndex.value_type.in_(self.SEARCHABLE_FIELD_TYPES),
                    filter_condition,
                )
            )
            .order_by(
                best_similarity.desc().nulls_last(),
                sem_expr.asc().nulls_last(),
                AiSearchIndex.entity_id.asc(),
            )
            .limit(self.field_candidates_limit)
        ).cte("field_candidates")

        # Entity-level scores: average of both signals over the entity's field rows.
        entity_scores = (
            select(
                field_candidates.c.entity_id,
                func.avg(field_candidates.c.semantic_distance).label("avg_semantic_distance"),
                func.avg(field_candidates.c.fuzzy_score).label("avg_fuzzy_score"),
            ).group_by(field_candidates.c.entity_id)
        ).cte("entity_scores")

        # Highlight per entity: value and path of the row with the highest fuzzy
        # score, ties broken by path.
        entity_highlights = (
            select(
                field_candidates.c.entity_id,
                func.first_value(field_candidates.c.value)
                .over(
                    partition_by=field_candidates.c.entity_id,
                    order_by=[field_candidates.c.fuzzy_score.desc(), field_candidates.c.path.asc()],
                )
                .label(self.HIGHLIGHT_TEXT_LABEL),
                func.first_value(field_candidates.c.path)
                .over(
                    partition_by=field_candidates.c.entity_id,
                    order_by=[field_candidates.c.fuzzy_score.desc(), field_candidates.c.path.asc()],
                )
                .label(self.HIGHLIGHT_PATH_LABEL),
            ).distinct(field_candidates.c.entity_id)
        ).cte("entity_highlights")

        # One rank per signal; entity_id in the ordering makes ties deterministic.
        ranked = (
            select(
                entity_scores.c.entity_id,
                entity_scores.c.avg_semantic_distance,
                entity_scores.c.avg_fuzzy_score,
                entity_highlights.c.highlight_text,
                entity_highlights.c.highlight_path,
                func.dense_rank()
                .over(
                    order_by=[entity_scores.c.avg_semantic_distance.asc().nulls_last(), entity_scores.c.entity_id.asc()]
                )
                .label("sem_rank"),
                func.dense_rank()
                .over(order_by=[entity_scores.c.avg_fuzzy_score.desc().nulls_last(), entity_scores.c.entity_id.asc()])
                .label("fuzzy_rank"),
            ).select_from(
                entity_scores.join(entity_highlights, entity_scores.c.entity_id == entity_highlights.c.entity_id)
            )
        ).cte("ranked_results")

        # RRF (rank-based)
        rrf_raw = (1.0 / (self.k + ranked.c.sem_rank)) + (1.0 / (self.k + ranked.c.fuzzy_rank))
        rrf_num = cast(rrf_raw, self.SCORE_NUMERIC_TYPE)

        # Perfect flag to boost near perfect fuzzy matches as this most likely indicates the desired record.
        perfect = case((ranked.c.avg_fuzzy_score >= 0.9, 1), else_=0).label("perfect_match")

        # Dynamic beta based on k (and number of sources)
        # rrf_max = n_sources / (k + 1)
        k_num = literal(float(self.k), type_=self.SCORE_NUMERIC_TYPE)
        n_sources = literal(2.0, type_=self.SCORE_NUMERIC_TYPE)  # semantic + fuzzy
        rrf_max = n_sources / (k_num + literal(1.0, type_=self.SCORE_NUMERIC_TYPE))

        # Choose a small positive margin above rrf_max to ensure strict separation
        # Keep it small to avoid compressing perfects near 1 after normalization
        margin = rrf_max * literal(0.05, type_=self.SCORE_NUMERIC_TYPE)  # 5% above bound
        beta = rrf_max + margin

        # beta is added only for perfect matches (perfect is 0 or 1).
        fused_num = rrf_num + beta * cast(perfect, self.SCORE_NUMERIC_TYPE)

        # Normalize to [0,1] via the theoretical max (beta + rrf_max)
        norm_den = beta + rrf_max
        normalized_score = fused_num / norm_den

        # Round to the shared score precision so the value can be compared exactly
        # with the quantized cursor score during pagination.
        score = cast(
            func.round(cast(normalized_score, self.SCORE_NUMERIC_TYPE), self.SCORE_PRECISION),
            self.SCORE_NUMERIC_TYPE,
        ).label(self.SCORE_LABEL)

        stmt = select(
            ranked.c.entity_id,
            score,
            ranked.c.highlight_text,
            ranked.c.highlight_path,
            perfect.label("perfect_match"),
        ).select_from(ranked)

        stmt = self._apply_fused_pagination(stmt, score, ranked.c.entity_id)

        return stmt.order_by(
            score.desc().nulls_last(),
            ranked.c.entity_id.asc(),
        ).params(q_vec=self.q_vec)

    def _apply_fused_pagination(
        self,
        stmt: Select,
        score_column: ColumnElement,
        entity_id_column: ColumnElement,
    ) -> Select:
        """Keyset paginate by fused score + id.

        Args:
            stmt: Ranked statement to restrict.
            score_column: Expression holding the rounded fused score.
            entity_id_column: Column holding the entity id used as tie-breaker.

        Returns:
            Select: ``stmt`` limited to rows after the stored page position, or
            ``stmt`` unchanged when no position is stored.
        """
        if self.page_after_score is not None and self.page_after_id is not None:
            # Quantize the cursor score to the score column's precision before comparing.
            score_param = self._quantize_score_for_pagination(self.page_after_score)
            stmt = stmt.where(
                or_(
                    score_column < score_param,
                    and_(score_column == score_param, entity_id_column > self.page_after_id),
                )
            )
        return stmt

    @property
    def metadata(self) -> SearchMetadata:
        """Return the metadata describing the hybrid search strategy."""
        return SearchMetadata.hybrid()