orchestrator-core 4.4.0rc2__py3-none-any.whl → 5.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +1 -1
- orchestrator/api/api_v1/api.py +7 -0
- orchestrator/api/api_v1/endpoints/agent.py +62 -0
- orchestrator/api/api_v1/endpoints/processes.py +6 -12
- orchestrator/api/api_v1/endpoints/search.py +197 -0
- orchestrator/api/api_v1/endpoints/subscriptions.py +0 -1
- orchestrator/app.py +4 -0
- orchestrator/cli/index_llm.py +73 -0
- orchestrator/cli/main.py +8 -1
- orchestrator/cli/resize_embedding.py +136 -0
- orchestrator/cli/scheduler.py +29 -40
- orchestrator/cli/search_explore.py +203 -0
- orchestrator/db/models.py +37 -1
- orchestrator/graphql/schema.py +0 -5
- orchestrator/graphql/schemas/process.py +2 -2
- orchestrator/graphql/utils/create_resolver_error_handler.py +1 -1
- orchestrator/migrations/versions/schema/2025-08-12_52b37b5b2714_search_index_model_for_llm_integration.py +95 -0
- orchestrator/schedules/__init__.py +2 -1
- orchestrator/schedules/resume_workflows.py +2 -2
- orchestrator/schedules/scheduling.py +24 -64
- orchestrator/schedules/task_vacuum.py +2 -2
- orchestrator/schedules/validate_products.py +2 -8
- orchestrator/schedules/validate_subscriptions.py +2 -2
- orchestrator/schemas/search.py +101 -0
- orchestrator/search/__init__.py +0 -0
- orchestrator/search/agent/__init__.py +1 -0
- orchestrator/search/agent/prompts.py +62 -0
- orchestrator/search/agent/state.py +8 -0
- orchestrator/search/agent/tools.py +122 -0
- orchestrator/search/core/__init__.py +0 -0
- orchestrator/search/core/embedding.py +64 -0
- orchestrator/search/core/exceptions.py +16 -0
- orchestrator/search/core/types.py +162 -0
- orchestrator/search/core/validators.py +27 -0
- orchestrator/search/docs/index.md +37 -0
- orchestrator/search/docs/running_local_text_embedding_inference.md +45 -0
- orchestrator/search/filters/__init__.py +27 -0
- orchestrator/search/filters/base.py +236 -0
- orchestrator/search/filters/date_filters.py +75 -0
- orchestrator/search/filters/definitions.py +76 -0
- orchestrator/search/filters/ltree_filters.py +31 -0
- orchestrator/search/filters/numeric_filter.py +60 -0
- orchestrator/search/indexing/__init__.py +3 -0
- orchestrator/search/indexing/indexer.py +316 -0
- orchestrator/search/indexing/registry.py +88 -0
- orchestrator/search/indexing/tasks.py +53 -0
- orchestrator/search/indexing/traverse.py +209 -0
- orchestrator/search/retrieval/__init__.py +3 -0
- orchestrator/search/retrieval/builder.py +64 -0
- orchestrator/search/retrieval/engine.py +96 -0
- orchestrator/search/retrieval/ranker.py +202 -0
- orchestrator/search/retrieval/utils.py +88 -0
- orchestrator/search/retrieval/validation.py +174 -0
- orchestrator/search/schemas/__init__.py +0 -0
- orchestrator/search/schemas/parameters.py +114 -0
- orchestrator/search/schemas/results.py +47 -0
- orchestrator/services/processes.py +11 -16
- orchestrator/services/subscriptions.py +0 -4
- orchestrator/settings.py +29 -1
- orchestrator/targets.py +0 -1
- orchestrator/workflow.py +1 -8
- orchestrator/workflows/utils.py +1 -48
- {orchestrator_core-4.4.0rc2.dist-info → orchestrator_core-5.0.0a1.dist-info}/METADATA +6 -3
- {orchestrator_core-4.4.0rc2.dist-info → orchestrator_core-5.0.0a1.dist-info}/RECORD +66 -30
- orchestrator/graphql/resolvers/scheduled_tasks.py +0 -36
- orchestrator/graphql/schemas/scheduled_task.py +0 -8
- orchestrator/schedules/scheduler.py +0 -163
- {orchestrator_core-4.4.0rc2.dist-info → orchestrator_core-5.0.0a1.dist-info}/WHEEL +0 -0
- {orchestrator_core-4.4.0rc2.dist-info → orchestrator_core-5.0.0a1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from sqlalchemy import Select, String, cast, func, select
|
|
2
|
+
|
|
3
|
+
from orchestrator.db.models import AiSearchIndex
|
|
4
|
+
from orchestrator.search.core.types import EntityType, FilterOp
|
|
5
|
+
from orchestrator.search.filters import LtreeFilter
|
|
6
|
+
from orchestrator.search.schemas.parameters import BaseSearchParameters
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def create_path_autocomplete_lquery(prefix: str) -> str:
    """Build an lquery pattern for multi-level path autocompletion.

    The pattern matches any ltree path whose label at the prefix position
    starts with ``prefix`` and which has at least one further level below it.
    """
    return prefix + "*.*"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def build_candidate_query(params: BaseSearchParameters) -> Select:
    """Build the base query for retrieving candidate entities.

    Selects distinct ``entity_id`` values from the index table for the
    given entity type and, when structured filters are present on the
    search parameters, narrows the result with their combined expression.

    Parameters
    ----------
    params : BaseSearchParameters
        The search parameters containing the entity type and optional filters.

    Returns
    -------
    Select
        The SQLAlchemy ``Select`` object representing the query.
    """
    entity_type_value = params.entity_type.value
    query = select(AiSearchIndex.entity_id).where(AiSearchIndex.entity_type == entity_type_value).distinct()

    if params.filters is None:
        return query

    filter_expr = params.filters.to_expression(
        AiSearchIndex.entity_id,
        entity_type_value=entity_type_value,
    )
    return query.where(filter_expr)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def build_paths_query(entity_type: EntityType, prefix: str | None = None, q: str | None = None) -> Select:
    """Build the query for retrieving distinct index paths.

    Optionally restricts results to paths matching an autocomplete
    ``prefix`` (via an lquery pattern), and/or ranks them by trigram
    similarity to the free-text term ``q``.
    """
    query = select(AiSearchIndex.path, AiSearchIndex.value_type).where(AiSearchIndex.entity_type == entity_type.value)

    if prefix:
        # Narrow to paths underneath the typed prefix using an ltree lquery match.
        pattern = create_path_autocomplete_lquery(prefix)
        path_filter = LtreeFilter(op=FilterOp.MATCHES_LQUERY, value=pattern)
        query = query.where(path_filter.to_expression(AiSearchIndex.path, path=""))

    if not q:
        return query.group_by(AiSearchIndex.path, AiSearchIndex.value_type).order_by(AiSearchIndex.path)

    # Rank paths by trigram similarity of their textual form to the query term.
    score = func.similarity(cast(AiSearchIndex.path, String), q).label("score")
    query = query.add_columns(score)
    query = query.group_by(AiSearchIndex.path, AiSearchIndex.value_type, score)
    return query.order_by(score.desc(), AiSearchIndex.path)
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
2
|
+
|
|
3
|
+
import structlog
|
|
4
|
+
from sqlalchemy.engine.row import RowMapping
|
|
5
|
+
from sqlalchemy.orm import Session
|
|
6
|
+
|
|
7
|
+
from orchestrator.search.schemas.parameters import BaseSearchParameters
|
|
8
|
+
from orchestrator.search.schemas.results import Highlight, SearchResponse, SearchResult
|
|
9
|
+
|
|
10
|
+
from .builder import build_candidate_query
|
|
11
|
+
from .ranker import Ranker
|
|
12
|
+
from .utils import generate_highlight_indices
|
|
13
|
+
|
|
14
|
+
logger = structlog.get_logger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _format_response(db_rows: Sequence[RowMapping], search_params: BaseSearchParameters) -> SearchResponse:
    """Convert raw database rows into a ``SearchResponse``.

    Each row becomes a ``SearchResult``; when the search carried a fuzzy
    term and the row exposes a ``highlight_text`` column, a ``Highlight``
    with the matching character spans is attached.

    Parameters
    ----------
    db_rows : Sequence[RowMapping]
        The rows returned from the executed SQLAlchemy query.
    search_params : BaseSearchParameters
        The parameters used for the search, including any fuzzy term for highlighting.

    Returns
    -------
    SearchResponse
        A list of ``SearchResult`` objects with entity IDs, scores, and
        optional highlight information.
    """
    fuzzy_term = search_params.fuzzy_term
    results: SearchResponse = []

    for row in db_rows:
        highlight = None
        text = row.get("highlight_text") if fuzzy_term else None
        if text:
            spans = generate_highlight_indices(text, fuzzy_term)
            if spans:
                highlight = Highlight(text=text, indices=spans)

        results.append(
            SearchResult(
                entity_id=str(row.entity_id),
                score=row.score,
                highlight=highlight,
            )
        )

    return results
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
async def execute_search(search_params: BaseSearchParameters, db_session: Session, limit: int = 5) -> SearchResponse:
    """Execute a hybrid search and return ranked results.

    Builds the candidate query from the search parameters, selects the
    matching ranking strategy, applies it, and executes the final ranked
    statement against the database.

    Parameters
    ----------
    search_params : BaseSearchParameters
        The search parameters specifying vector, fuzzy, or filter criteria.
    db_session : Session
        The active SQLAlchemy session for executing the query.
    limit : int, optional
        The maximum number of search results to return, by default 5.

    Returns
    -------
    SearchResponse
        A list of ``SearchResult`` objects with entity IDs, scores, and
        optional highlight metadata.

    Notes
    -----
    If no vector query, filters, or fuzzy term are provided, a warning is
    logged and an empty result set is returned.
    """
    if not (search_params.vector_query or search_params.filters or search_params.fuzzy_term):
        logger.warning("No search criteria provided (vector_query, fuzzy_term, or filters).")
        return []

    ranker = await Ranker.from_params(search_params)
    logger.debug("Using ranker", ranker_type=ranker.__class__.__name__)

    stmt = ranker.apply(build_candidate_query(search_params)).limit(limit)
    logger.debug(stmt)

    rows = db_session.execute(stmt).mappings().all()
    return _format_response(rows, search_params)
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
import structlog
|
|
5
|
+
from sqlalchemy import Select, bindparam, case, func, literal, select
|
|
6
|
+
from sqlalchemy.sql.expression import ColumnElement
|
|
7
|
+
|
|
8
|
+
from orchestrator.db.models import AiSearchIndex
|
|
9
|
+
from orchestrator.search.core.embedding import QueryEmbedder
|
|
10
|
+
from orchestrator.search.schemas.parameters import BaseSearchParameters
|
|
11
|
+
|
|
12
|
+
logger = structlog.get_logger(__name__)
|
|
13
|
+
Index = AiSearchIndex
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Ranker(ABC):
    """Abstract base class for applying a ranking strategy to a search query."""

    @classmethod
    async def from_params(cls, params: BaseSearchParameters, use_rrf: bool = True) -> "Ranker":
        """Select and build the ranker that matches the given search parameters.

        Parameters
        ----------
        params : BaseSearchParameters
            Search parameters including vector queries, fuzzy terms, and filters.
        use_rrf : bool, optional
            Whether to use Reciprocal Rank Fusion for hybrid searches, by default True.

        Returns
        -------
        Ranker
            A concrete ranker instance (semantic, fuzzy, hybrid, RRF hybrid, or structured).
        """
        vector_query = params.vector_query
        fuzzy_term = params.fuzzy_term

        embedding: list[float] | None = None
        if vector_query:
            embedding = await QueryEmbedder.generate_for_text_async(vector_query)
            if not embedding:
                # Fall back gracefully: drop the semantic leg and rank without it.
                logger.warning("Embedding generation failed; using non-semantic ranker")
                vector_query = None

        if vector_query and embedding is not None:
            if fuzzy_term:
                return RrfHybridRanker(embedding, fuzzy_term) if use_rrf else HybridRanker(embedding, fuzzy_term)
            return SemanticRanker(embedding)
        if fuzzy_term:
            return FuzzyRanker(fuzzy_term)
        return StructuredRanker()

    @abstractmethod
    def apply(self, candidate_query: Select) -> Select:
        """Return *candidate_query* augmented with this strategy's ranking.

        Parameters
        ----------
        candidate_query : Select
            A SQLAlchemy ``Select`` statement returning candidate entity IDs.

        Returns
        -------
        Select
            A new ``Select`` statement with ranking expressions applied.
        """
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class StructuredRanker(Ranker):
    """Assigns a constant score for purely structured searches with no text query."""

    def apply(self, candidate_query: Select) -> Select:
        candidates = candidate_query.subquery()
        # No text signal to rank on: every candidate gets the same score of 1.0,
        # ordered deterministically by entity id.
        constant_score = literal(1.0).label("score")
        stmt = select(candidates.c.entity_id, constant_score).select_from(candidates)
        return stmt.order_by(candidates.c.entity_id.asc())
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class FuzzyRanker(Ranker):
    """Ranks results by the best (maximum) fuzzy text similarity per entity."""

    def __init__(self, fuzzy_term: str) -> None:
        self.fuzzy_term = fuzzy_term

    def apply(self, candidate_query: Select) -> Select:
        candidates = candidate_query.subquery()
        # Best trigram similarity across all indexed values of the entity.
        best_similarity = func.max(func.similarity(Index.value, self.fuzzy_term))

        stmt = (
            select(Index.entity_id, best_similarity.label("score"))
            .select_from(Index)
            .join(candidates, candidates.c.entity_id == Index.entity_id)
            .group_by(Index.entity_id)
        )
        return stmt.order_by(best_similarity.desc().nulls_last(), Index.entity_id.asc())
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class SemanticRanker(Ranker):
    """Ranks results by the closest (minimum) semantic vector distance per entity."""

    def __init__(self, vector_query: list[float]) -> None:
        self.vector_query = vector_query

    def apply(self, candidate_query: Select) -> Select:
        candidates = candidate_query.subquery()

        # Smallest L2 distance across the entity's embedded rows wins.
        distance = Index.embedding.l2_distance(self.vector_query)
        best_distance = func.min(distance).label("score")

        stmt = (
            select(Index.entity_id, best_distance)
            .select_from(Index)
            .join(candidates, candidates.c.entity_id == Index.entity_id)
            .where(Index.embedding.isnot(None))
            .group_by(Index.entity_id)
        )
        return stmt.order_by(best_distance.asc().nulls_last(), Index.entity_id.asc())
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class HybridRanker(Ranker):
    """Ranks results by combining semantic distance and fuzzy similarity.

    Ordering favours the fuzzy score, with semantic distance breaking ties.
    The returned ``score`` column carries the semantic distance aggregate.
    """

    def __init__(self, q_vec: list[float], fuzzy_term: str) -> None:
        self.q_vec = q_vec
        self.fuzzy_term = fuzzy_term

    def apply(self, candidate_query: Select) -> Select:
        candidates = candidate_query.subquery()

        # Semantic leg: only rows that actually carry an embedding contribute.
        semantic_best = func.min(Index.embedding.l2_distance(self.q_vec)).filter(Index.embedding.isnot(None))
        # Fuzzy leg: every row counts (strings, uuids, etc.).
        fuzzy_best = func.max(func.similarity(Index.value, self.fuzzy_term))

        stmt = (
            select(Index.entity_id, semantic_best.label("score"))
            .select_from(Index)
            .join(candidates, candidates.c.entity_id == Index.entity_id)
            .group_by(Index.entity_id)
        )
        return stmt.order_by(
            fuzzy_best.desc().nulls_last(),
            semantic_best.asc().nulls_last(),
            Index.entity_id.asc(),
        )
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class RrfHybridRanker(Ranker):
    """Reciprocal Rank Fusion of semantic and fuzzy ranking.

    Scores each candidate on two independent legs (semantic distance to the
    query vector and fuzzy text similarity), ranks each leg separately with
    ``dense_rank``, and fuses the two ranks via RRF:
    ``1/(k + sem_rank) + 1/(k + fuzzy_rank)``.
    """

    def __init__(self, q_vec: list[float], fuzzy_term: str, k: int = 60) -> None:
        # q_vec: the pre-computed query embedding; fuzzy_term: the raw text term.
        # k dampens the influence of rank position; 60 is the conventional RRF default.
        self.q_vec = q_vec
        self.fuzzy_term = fuzzy_term
        self.k = k

    def apply(self, candidate_query: Select) -> Select:
        """Return a Select of (entity_id, score) ordered by fused RRF score.

        Exact matches (fuzzy_score >= 0.9) are always sorted ahead of the
        RRF ordering via the ``perfect`` case expression.
        """
        cand = candidate_query.subquery()

        # centroid over rows that have embeddings
        # NOTE(review): avg() of the entity's embeddings is compared to the query
        # vector, i.e. one distance per entity rather than per row — presumably a
        # deliberate centroid approximation; confirm with the pgvector schema.
        q_param: ColumnElement[Any] = bindparam("q_vec", self.q_vec, type_=Index.embedding.type)
        avg_vec = func.avg(Index.embedding).filter(Index.embedding.isnot(None))
        sem_dist = avg_vec.op("<->")(q_param)

        # fuzzy over ALL rows, substring-friendly for partial UUIDs
        # word_similarity catches partial-token matches that plain similarity misses.
        sim_base = func.similarity(Index.value, self.fuzzy_term)
        sim_word = func.word_similarity(self.fuzzy_term, Index.value)
        fuzzy_agg = func.max(func.greatest(sim_base, sim_word))

        # Stage 1: one row per candidate entity with both raw leg scores.
        scores = (
            select(
                Index.entity_id,
                sem_dist.label("semantic_distance"),
                fuzzy_agg.label("fuzzy_score"),
            )
            .select_from(Index)
            .join(cand, cand.c.entity_id == Index.entity_id)
            .group_by(Index.entity_id)
            .cte("scores")
        )

        # Stage 2: rank each leg independently; NULLs (e.g. no embedding) sort last.
        ranked = select(
            scores.c.entity_id,
            scores.c.semantic_distance,
            scores.c.fuzzy_score,
            func.dense_rank().over(order_by=scores.c.semantic_distance.asc().nulls_last()).label("sem_rank"),
            func.dense_rank().over(order_by=scores.c.fuzzy_score.desc().nulls_last()).label("fuzzy_rank"),
        ).cte("ranked_results")

        # Stage 3: fuse ranks; near-perfect fuzzy hits (>= 0.9) jump the queue.
        rrf = (1.0 / (self.k + ranked.c.sem_rank)) + (1.0 / (self.k + ranked.c.fuzzy_rank))
        score_expr = rrf.label("score")
        perfect = case((ranked.c.fuzzy_score >= 0.9, 0), else_=1)

        return (
            select(ranked.c.entity_id, score_expr)
            .select_from(ranked)
            .order_by(perfect.asc(), score_expr.desc(), ranked.c.entity_id.asc())
            .params(q_vec=self.q_vec)
        )
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
import structlog
|
|
4
|
+
from sqlalchemy import and_
|
|
5
|
+
from sqlalchemy_utils.types.ltree import Ltree
|
|
6
|
+
|
|
7
|
+
from orchestrator.db.database import WrappedSession
|
|
8
|
+
from orchestrator.db.models import AiSearchIndex
|
|
9
|
+
from orchestrator.search.core.types import EntityType
|
|
10
|
+
from orchestrator.search.indexing.registry import ENTITY_CONFIG_REGISTRY
|
|
11
|
+
from orchestrator.search.schemas.parameters import BaseSearchParameters
|
|
12
|
+
from orchestrator.search.schemas.results import SearchResult
|
|
13
|
+
|
|
14
|
+
logger = structlog.get_logger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def generate_highlight_indices(text: str, term: str) -> list[tuple[int, int]]:
    """Return (start, end) spans for every case-insensitive occurrence of *term* in *text*.

    The previous implementation stopped after the first match even though it
    returns a list; this finds all non-overlapping matches in left-to-right
    order so every occurrence can be highlighted.

    Parameters
    ----------
    text : str
        The text to scan for highlight spans.
    term : str
        The search term to locate; matching is case-insensitive.

    Returns
    -------
    list[tuple[int, int]]
        Half-open ``(start, end)`` index pairs into *text*; empty when either
        argument is empty or no match is found.
    """
    if not text or not term:
        return []
    haystack = text.lower()
    needle = term.lower()
    indices: list[tuple[int, int]] = []
    start = haystack.find(needle)
    while start != -1:
        end = start + len(term)
        indices.append((start, end))
        # Resume after the current match so spans never overlap.
        start = haystack.find(needle, end)
    return indices
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def display_filtered_paths_only(
    results: list[SearchResult], search_params: BaseSearchParameters, db_session: WrappedSession
) -> None:
    """Log, for each result, only the index paths that were used as filters."""
    if not results:
        logger.info("No results found.")
        return

    logger.info("--- Search Results ---")

    filtered_paths = search_params.filters.get_all_paths() if search_params.filters else []
    if not filtered_paths:
        return

    for res in results:
        # Look up each filtered path's indexed value for this entity.
        for filter_path in filtered_paths:
            record = (
                db_session.query(AiSearchIndex)
                .filter(and_(AiSearchIndex.entity_id == res.entity_id, AiSearchIndex.path == Ltree(filter_path)))
                .first()
            )
            if record is not None:
                logger.info(f" {record.path}: {record.value}")

        logger.info("-" * 40)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def display_results(
    results: list[SearchResult],
    db_session: WrappedSession,
    score_label: str = "Score",
) -> None:
    """Finds the original DB record for each search result and logs its traversed fields.

    For each result: loads the entity's index rows, resolves its entity type
    and registry config, fetches the source DB record, and logs the traversed
    field paths/values as pretty-printed JSON followed by the score line.
    Entities that cannot be resolved are logged as warnings and skipped.
    """
    if not results:
        logger.info("No results found.")
        return

    logger.info("--- Search Results ---")
    for result in results:
        entity_id = result.entity_id
        score = result.score

        # All index rows for one entity share the same entity_type, so the
        # first row is enough to identify the kind.
        index_records = db_session.query(AiSearchIndex).filter(AiSearchIndex.entity_id == entity_id).all()
        if not index_records:
            logger.warning(f"Could not find indexed records for entity_id={entity_id}")
            continue

        first_record = index_records[0]
        kind = EntityType(first_record.entity_type)
        config = ENTITY_CONFIG_REGISTRY[kind]

        # config.table may be absent for kinds without a backing ORM table.
        db_entity = db_session.get(config.table, entity_id) if config.table else None

        if db_entity and config.traverser:
            # NOTE(review): get_fields appears to yield (path, value, type)
            # triples; the third element is discarded here — confirm against
            # the traverser implementation.
            fields = config.traverser.get_fields(db_entity, config.pk_name, config.root_name)
            result_obj = {p: v for p, v, _ in fields}
            logger.info(json.dumps(result_obj, indent=2, default=str))
            logger.info(f"{score_label}: {score:.4f}\n" + "-" * 20)
        else:
            logger.warning(f"Could not display entity {kind.value} with id={entity_id}")
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
from typing import assert_never
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import select, text
|
|
4
|
+
from sqlalchemy.exc import ProgrammingError
|
|
5
|
+
from sqlalchemy_utils import Ltree
|
|
6
|
+
|
|
7
|
+
from orchestrator.db import db
|
|
8
|
+
from orchestrator.db.database import WrappedSession
|
|
9
|
+
from orchestrator.db.models import AiSearchIndex
|
|
10
|
+
from orchestrator.search.core.types import EntityType, FieldType
|
|
11
|
+
from orchestrator.search.filters import (
|
|
12
|
+
DateRangeFilter,
|
|
13
|
+
DateValueFilter,
|
|
14
|
+
EqualityFilter,
|
|
15
|
+
FilterCondition,
|
|
16
|
+
FilterTree,
|
|
17
|
+
LtreeFilter,
|
|
18
|
+
NumericRangeFilter,
|
|
19
|
+
NumericValueFilter,
|
|
20
|
+
PathFilter,
|
|
21
|
+
StringFilter,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def is_filter_compatible_with_field_type(filter_condition: FilterCondition, field_type: FieldType) -> bool:
    """Check whether a filter condition is compatible with a given field type.

    Parameters
    ----------
    filter_condition : FilterCondition
        The filter condition instance to check.
    field_type : FieldType
        The type of field from the index schema.

    Returns
    -------
    bool
        True if the filter condition is valid for the given field type, False otherwise.
    """
    # Check order matters: StringFilter must be tested before EqualityFilter,
    # mirroring the original match-statement ordering.
    if isinstance(filter_condition, LtreeFilter):
        return True  # Filters for path only
    if isinstance(filter_condition, (DateRangeFilter, DateValueFilter)):
        return field_type == FieldType.DATETIME
    if isinstance(filter_condition, (NumericRangeFilter, NumericValueFilter)):
        return field_type in {FieldType.INTEGER, FieldType.FLOAT}
    if isinstance(filter_condition, StringFilter):
        return field_type == FieldType.STRING
    if isinstance(filter_condition, EqualityFilter):
        return field_type in {
            FieldType.BOOLEAN,
            FieldType.UUID,
            FieldType.BLOCK,
            FieldType.RESOURCE_TYPE,
            FieldType.STRING,
        }
    assert_never(filter_condition)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def is_lquery_syntactically_valid(pattern: str, db_session: WrappedSession) -> bool:
    """Validate whether a string is a syntactically correct ``lquery`` pattern.

    Parameters
    ----------
    pattern : str
        The LTree lquery pattern string to validate.
    db_session : WrappedSession
        The database session used to test casting.

    Returns
    -------
    bool
        True if the pattern is valid, False if it fails to cast in PostgreSQL.
    """
    try:
        # Run the cast inside a savepoint so a failed cast does not poison
        # the enclosing transaction.
        with db_session.begin_nested():
            db_session.execute(text("SELECT CAST(:pattern AS lquery)"), {"pattern": pattern})
    except ProgrammingError:
        return False
    return True
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def get_structured_filter_schema() -> dict[str, str]:
    """Retrieve all distinct filterable paths and their field types from the index.

    Returns
    -------
    dict[str, str]
        Mapping of path strings to their corresponding field type values.
    """
    stmt = select(AiSearchIndex.path, AiSearchIndex.value_type).distinct().order_by(AiSearchIndex.path)
    rows = db.session.execute(stmt)

    schema: dict[str, str] = {}
    for path, value_type in rows:
        schema[str(path)] = value_type.value
    return schema
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def validate_filter_path(path: str) -> str | None:
    """Check if a given path exists in the index and return its field type.

    Parameters
    ----------
    path : str
        The fully qualified LTree path.

    Returns
    -------
    str | None
        The value type of the field if found, otherwise None.
    """
    stmt = select(AiSearchIndex.value_type).where(AiSearchIndex.path == Ltree(path)).limit(1)
    value_type = db.session.execute(stmt).scalar_one_or_none()
    if not value_type:
        return None
    return value_type.value
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
async def complete_filter_validation(filter: PathFilter, entity_type: EntityType) -> None:
    """Validate a PathFilter against the database schema and entity type.

    Checks performed:
    1. LTree filter syntax (for LtreeFilter only)
    2. Non-empty path
    3. Path exists in the database schema
    4. Filter type matches the field's value_type
    5. Path starts with the correct entity type prefix (unless wildcard)

    Parameters
    ----------
    filter : PathFilter
        The filter to validate.
    entity_type : EntityType
        The entity type being searched.

    Raises
    ------
    ValueError
        If any of the validation checks fail.
    """
    condition = filter.condition

    # LtreeFilter carries its own lquery pattern; syntax is its only check.
    if isinstance(condition, LtreeFilter):
        lquery_pattern = condition.value
        if not is_lquery_syntactically_valid(lquery_pattern, db.session):
            raise ValueError(f"Ltree pattern '{lquery_pattern}' has invalid syntax.")
        return

    if not filter.path or not filter.path.strip():
        raise ValueError("Filter path cannot be empty")

    # Path must exist in the indexed schema.
    db_field_type_str = validate_filter_path(filter.path)
    if db_field_type_str is None:
        raise ValueError(f"Path '{filter.path}' does not exist in database schema")

    # The filter kind must be usable against the field's stored value type.
    db_field_type = FieldType(db_field_type_str)
    if not is_filter_compatible_with_field_type(filter.condition, db_field_type):
        raise ValueError(
            f"Filter '{type(filter.condition).__name__}' not compatible with field type '{db_field_type.value}'"
        )

    # Paths are rooted at the entity type, unless the caller used a wildcard.
    expected_prefix = f"{entity_type.value.lower()}."
    if not filter.path.startswith(expected_prefix) and not filter.path.startswith("*"):
        raise ValueError(
            f"Filter path '{filter.path}' must start with '{expected_prefix}' for {entity_type.value} searches."
        )
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
async def validate_filter_tree(filters: FilterTree | None, entity_type: EntityType) -> None:
    """Validate all PathFilter leaves in a FilterTree; a ``None`` tree is a no-op."""
    if filters is None:
        return
    for path_filter in filters.get_all_leaves():
        await complete_filter_validation(path_filter, entity_type)
|
|
File without changes
|