orchestrator-core 4.4.1__py3-none-any.whl → 4.5.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +26 -2
- orchestrator/agentic_app.py +84 -0
- orchestrator/api/api_v1/api.py +10 -0
- orchestrator/api/api_v1/endpoints/search.py +277 -0
- orchestrator/app.py +32 -0
- orchestrator/cli/index_llm.py +73 -0
- orchestrator/cli/main.py +22 -1
- orchestrator/cli/resize_embedding.py +135 -0
- orchestrator/cli/search_explore.py +208 -0
- orchestrator/cli/speedtest.py +151 -0
- orchestrator/db/models.py +37 -1
- orchestrator/llm_settings.py +51 -0
- orchestrator/migrations/versions/schema/2025-08-12_52b37b5b2714_search_index_model_for_llm_integration.py +95 -0
- orchestrator/schemas/search.py +117 -0
- orchestrator/search/__init__.py +12 -0
- orchestrator/search/agent/__init__.py +8 -0
- orchestrator/search/agent/agent.py +47 -0
- orchestrator/search/agent/prompts.py +62 -0
- orchestrator/search/agent/state.py +8 -0
- orchestrator/search/agent/tools.py +121 -0
- orchestrator/search/core/__init__.py +0 -0
- orchestrator/search/core/embedding.py +64 -0
- orchestrator/search/core/exceptions.py +22 -0
- orchestrator/search/core/types.py +281 -0
- orchestrator/search/core/validators.py +27 -0
- orchestrator/search/docs/index.md +37 -0
- orchestrator/search/docs/running_local_text_embedding_inference.md +45 -0
- orchestrator/search/filters/__init__.py +27 -0
- orchestrator/search/filters/base.py +272 -0
- orchestrator/search/filters/date_filters.py +75 -0
- orchestrator/search/filters/definitions.py +93 -0
- orchestrator/search/filters/ltree_filters.py +43 -0
- orchestrator/search/filters/numeric_filter.py +60 -0
- orchestrator/search/indexing/__init__.py +3 -0
- orchestrator/search/indexing/indexer.py +323 -0
- orchestrator/search/indexing/registry.py +88 -0
- orchestrator/search/indexing/tasks.py +53 -0
- orchestrator/search/indexing/traverse.py +322 -0
- orchestrator/search/retrieval/__init__.py +3 -0
- orchestrator/search/retrieval/builder.py +108 -0
- orchestrator/search/retrieval/engine.py +152 -0
- orchestrator/search/retrieval/pagination.py +83 -0
- orchestrator/search/retrieval/retriever.py +447 -0
- orchestrator/search/retrieval/utils.py +106 -0
- orchestrator/search/retrieval/validation.py +174 -0
- orchestrator/search/schemas/__init__.py +0 -0
- orchestrator/search/schemas/parameters.py +116 -0
- orchestrator/search/schemas/results.py +63 -0
- orchestrator/services/settings_env_variables.py +2 -2
- orchestrator/settings.py +1 -1
- {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.1a1.dist-info}/METADATA +8 -3
- {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.1a1.dist-info}/RECORD +54 -11
- {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.1a1.dist-info}/WHEEL +0 -0
- {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.1a1.dist-info}/licenses/LICENSE +0 -0

orchestrator/search/core/types.py
@@ -0,0 +1,281 @@
+from dataclasses import dataclass
+from datetime import date, datetime
+from enum import Enum, IntEnum
+from typing import Annotated, Any, Literal, NamedTuple, TypeAlias, TypedDict, get_args, get_origin
+from uuid import UUID
+
+from sqlalchemy.orm.attributes import InstrumentedAttribute
+from sqlalchemy.sql.elements import ColumnElement
+from sqlalchemy_utils.types.ltree import Ltree
+
+from orchestrator.types import filter_nonetype, get_origin_and_args, is_optional_type, is_union_type
+
+from .validators import is_bool_string, is_iso_date, is_uuid
+
+SQLAColumn: TypeAlias = ColumnElement[Any] | InstrumentedAttribute[Any]
+
+
+@dataclass
+class SearchMetadata:
+    """Metadata about the search operation performed."""
+
+    search_type: str
+    description: str
+
+    @classmethod
+    def structured(cls) -> "SearchMetadata":
+        return cls(
+            search_type="structured", description="This search performs a filter-based search using structured queries."
+        )
+
+    @classmethod
+    def fuzzy(cls) -> "SearchMetadata":
+        return cls(
+            search_type="fuzzy",
+            description="This search performs a trigram similarity search.",
+        )
+
+    @classmethod
+    def semantic(cls) -> "SearchMetadata":
+        return cls(
+            search_type="semantic",
+            description="This search performs a vector similarity search, using L2 distance on embeddings with minimum distance scoring (normalized).",
+        )
+
+    @classmethod
+    def hybrid(cls) -> "SearchMetadata":
+        return cls(
+            search_type="hybrid",
+            description="This search performs reciprocal rank fusion combining trigram similarity, word_similarity, and L2 vector distance.",
+        )
+
+    @classmethod
+    def empty(cls) -> "SearchMetadata":
+        return cls(search_type="empty", description="Empty search - no criteria provided")
+
+
+class BooleanOperator(str, Enum):
+    AND = "AND"
+    OR = "OR"
+
+
+class FilterOp(str, Enum):
+    EQ = "eq"
+    NEQ = "neq"
+    LT = "lt"
+    LIKE = "like"
+    LTE = "lte"
+    GT = "gt"
+    GTE = "gte"
+    BETWEEN = "between"
+
+    MATCHES_LQUERY = "matches_lquery"  # The ~ operator for wildcard matching
+    IS_ANCESTOR = "is_ancestor"  # The @> operator
+    IS_DESCENDANT = "is_descendant"  # The <@ operator
+    PATH_MATCH = "path_match"
+
+    HAS_COMPONENT = "has_component"  # Path contains this segment
+    NOT_HAS_COMPONENT = "not_has_component"  # Path doesn't contain segment
+    ENDS_WITH = "ends_with"
+
+
+class EntityType(str, Enum):
+    SUBSCRIPTION = "SUBSCRIPTION"
+    PRODUCT = "PRODUCT"
+    WORKFLOW = "WORKFLOW"
+    PROCESS = "PROCESS"
+
+
+class ActionType(str, Enum):
+    """Defines the explicit, safe actions the agent can request."""
+
+    SELECT = "select"  # Retrieve a list of matching records.
+    # COUNT = "count"  # For phase1; the agent will not support this yet.
+
+
+class UIType(str, Enum):
+    STRING = "string"
+    NUMBER = "number"
+    BOOLEAN = "boolean"
+    DATETIME = "datetime"
+    COMPONENT = "component"
+
+    @classmethod
+    def from_field_type(cls, ft: "FieldType") -> "UIType":
+        """Create a UIType from a backend FieldType to indicate how a value must be rendered."""
+        if ft in (FieldType.INTEGER, FieldType.FLOAT):
+            return cls.NUMBER
+        if ft == FieldType.BOOLEAN:
+            return cls.BOOLEAN
+        if ft == FieldType.DATETIME:
+            return cls.DATETIME
+        return cls.STRING
+
+
+class FieldType(str, Enum):
+    STRING = "string"
+    INTEGER = "integer"
+    FLOAT = "float"
+    BOOLEAN = "boolean"
+    DATETIME = "datetime"
+    UUID = "uuid"
+    BLOCK = "block"
+    RESOURCE_TYPE = "resource_type"
+
+    @classmethod
+    def infer(cls, val: Any) -> "FieldType":
+        if isinstance(val, TypedValue):
+            return cls._infer_typed_value(val)
+
+        if isinstance(val, bool):
+            return cls.BOOLEAN
+        if isinstance(val, int):
+            return cls.INTEGER
+        if isinstance(val, float):
+            return cls.FLOAT
+        if isinstance(val, UUID):
+            return cls.UUID
+        if isinstance(val, (datetime, date)):
+            return cls.DATETIME
+        if isinstance(val, str):
+            return cls._infer_from_str(val)
+
+        return cls.STRING
+
+    @classmethod
+    def _infer_typed_value(cls, val: "TypedValue") -> "FieldType":
+        if val.type == cls.BLOCK:
+            return cls.BLOCK
+        if val.type == cls.RESOURCE_TYPE:
+            return cls.RESOURCE_TYPE
+        return cls.STRING
+
+    @classmethod
+    def _infer_from_str(cls, val: str) -> "FieldType":
+        if is_uuid(val):
+            return cls.UUID
+        if is_iso_date(val):
+            return cls.DATETIME
+        if is_bool_string(val):
+            return cls.BOOLEAN
+        if val.isdigit():
+            return cls.INTEGER
+        try:
+            float(val)
+            return cls.FLOAT
+        except ValueError:
+            return cls.STRING
+
+    @classmethod
+    def from_type_hint(cls, type_hint: object) -> "FieldType":
+        """Convert type hint to FieldType."""
+        _type_mapping = {
+            int: cls.INTEGER,
+            float: cls.FLOAT,
+            bool: cls.BOOLEAN,
+            str: cls.STRING,
+            datetime: cls.DATETIME,
+            UUID: cls.UUID,
+        }
+
+        if type_hint in _type_mapping:
+            return _type_mapping[type_hint]  # type: ignore[index]
+
+        if get_origin(type_hint) is Annotated:
+            inner_type = get_args(type_hint)[0]
+            return cls.from_type_hint(inner_type)
+
+        origin, args = get_origin_and_args(type_hint)
+
+        if origin is list:
+            return cls._handle_list_type(args)
+
+        if origin is Literal:
+            return cls._handle_literal_type(args)
+
+        if is_optional_type(type_hint) or is_union_type(type_hint):
+            return cls._handle_union_type(args)
+
+        if isinstance(type_hint, type):
+            return cls._handle_class_type(type_hint)
+
+        return cls.STRING
+
+    @classmethod
+    def _handle_list_type(cls, args: tuple) -> "FieldType":
+        if args:
+            element_type = args[0]
+            return cls.from_type_hint(element_type)
+        return cls.STRING
+
+    @classmethod
+    def _handle_literal_type(cls, args: tuple) -> "FieldType":
+        if not args:
+            return cls.STRING
+        first_value = args[0]
+        if isinstance(first_value, bool):
+            return cls.BOOLEAN
+        if isinstance(first_value, int):
+            return cls.INTEGER
+        if isinstance(first_value, str):
+            return cls.STRING
+        if isinstance(first_value, float):
+            return cls.FLOAT
+        return cls.STRING
+
+    @classmethod
+    def _handle_union_type(cls, args: tuple) -> "FieldType":
+        non_none_types = list(filter_nonetype(args))
+        if non_none_types:
+            return cls.from_type_hint(non_none_types[0])
+        return cls.STRING
+
+    @classmethod
+    def _handle_class_type(cls, type_hint: type) -> "FieldType":
+        if issubclass(type_hint, IntEnum):
+            return cls.INTEGER
+        if issubclass(type_hint, Enum):
+            return cls.STRING
+
+        from orchestrator.domain.base import ProductBlockModel
+
+        if issubclass(type_hint, ProductBlockModel):
+            return cls.BLOCK
+
+        return cls.STRING
+
+    def is_embeddable(self, value: str | None) -> bool:
+        """Check if a field should be embedded."""
+        if value is None:
+            return False
+
+        # If inference suggests it's not actually a string, don't embed it
+        return FieldType._infer_from_str(value) == FieldType.STRING
+
+
+@dataclass(frozen=True)
+class TypedValue:
+    value: Any
+    type: FieldType
+
+
+class ExtractedField(NamedTuple):
+    path: str
+    value: str
+    value_type: FieldType
+
+    @classmethod
+    def from_raw(cls, path: str, raw_value: Any) -> "ExtractedField":
+        value = str(raw_value.value if isinstance(raw_value, TypedValue) else raw_value)
+        value_type = FieldType.infer(raw_value)
+        return cls(path=path, value=value, value_type=value_type)
+
+
+class IndexableRecord(TypedDict):
+    entity_id: str
+    entity_type: str
+    path: Ltree
+    value: Any
+    value_type: Any
+    content_hash: str
+    embedding: list[float] | None
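The `FieldType` and `UIType` helpers added above drive both indexing and filter rendering. The sketch below illustrates how inference resolves for a few representative values and type hints; it is an illustration only (not part of the package's tests) and assumes the origin/union helpers imported from `orchestrator.types` behave like their `typing` counterparts.

```python
# Illustration of the inference helpers in orchestrator/search/core/types.py.
from uuid import uuid4

from orchestrator.search.core.types import FieldType, UIType

print(FieldType.infer(uuid4()))        # FieldType.UUID
print(FieldType.infer(True))           # FieldType.BOOLEAN (bool is checked before int)
print(FieldType.infer(42))             # FieldType.INTEGER
print(FieldType.infer("2025-01-01"))   # FieldType.DATETIME (ISO date string)
print(FieldType.infer("true"))         # FieldType.BOOLEAN (explicit true/false string)
print(FieldType.infer("fiber 10G"))    # FieldType.STRING

# Type hints resolve through Annotated, list, Literal and unions to a scalar FieldType.
print(FieldType.from_type_hint(list[int]))   # FieldType.INTEGER
print(FieldType.from_type_hint(str | None))  # FieldType.STRING

# UIType maps a backend FieldType to how the value should be rendered in the UI.
print(UIType.from_field_type(FieldType.FLOAT))  # UIType.NUMBER
```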

orchestrator/search/core/validators.py
@@ -0,0 +1,27 @@
+import uuid
+
+from dateutil.parser import isoparse
+
+
+def is_uuid(value: str) -> bool:
+    """Check if a string is a valid UUID."""
+    try:
+        uuid.UUID(value)
+        return True
+    except (ValueError, TypeError):
+        return False
+
+
+def is_iso_date(value: str) -> bool:
+    """Check if a string is a valid ISO 8601 date."""
+    try:
+        isoparse(value)
+        return True
+    except (ValueError, TypeError):
+        return False
+
+
+def is_bool_string(value: str) -> bool:
+    """Check if a string explicitly represents a boolean value with true/false."""
+
+    return value.strip().lower() in {"true", "false"}
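These predicates back `FieldType._infer_from_str` above. A few hedged examples of what they accept and reject, based on standard `uuid.UUID` and `dateutil.parser.isoparse` behavior (illustrative, not taken from the package):

```python
from uuid import uuid4

from orchestrator.search.core.validators import is_bool_string, is_iso_date, is_uuid

print(is_uuid(str(uuid4())))                # True
print(is_uuid("not-a-uuid"))                # False
print(is_iso_date("2025-08-12T10:30:00Z"))  # True  (isoparse accepts ISO 8601 timestamps)
print(is_iso_date("yesterday"))             # False
print(is_bool_string(" True "))             # True  (value is stripped and lower-cased)
print(is_bool_string("1"))                  # False (only explicit true/false counts)
```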

orchestrator/search/docs/index.md
@@ -0,0 +1,37 @@
+# Search Indexing CLI
+
+Typer-based CLI for maintaining search indexes (subscriptions, products, processes, workflows).
+
+## Usage
+
+Run from project root:
+
+```
+dotenv run python main.py index [COMMAND] [OPTIONS]
+```
+
+### Commands
+
+- `subscriptions` – index `subscription_search_index`
+- `products` – index `product_search_index`
+- `processes` – index `process_search_index`
+- `workflows` – index `workflow_search_index`
+
+### Options
+
+- `--<id>` – UUID of a specific entity (default: all)
+- `--dry-run` – no DB writes
+- `--force-index` – re-index even if unchanged
+
+### Examples
+
+```
+# Index all subscriptions
+dotenv run python main.py index subscriptions
+
+# Re-index all subscriptions
+dotenv run python main.py index subscriptions --force-index
+
+# Index a single subscription
+dotenv run python main.py index subscriptions --subscription-id=<UUID>
+```

orchestrator/search/docs/running_local_text_embedding_inference.md
@@ -0,0 +1,45 @@
+# Running a local MiniLM embedding server with Hugging Face TEI
+
+Only **OpenAI-compatible endpoints** are supported locally.
+
+You can spin up a embedding API based on **sentence-transformers/all-MiniLM-L6-v2** using [Hugging Face TEI](https://github.com/huggingface/text-embeddings-inference):
+
+```bash
+docker run --rm -p 8080:80 ghcr.io/huggingface/text-embeddings-inference:cpu-1.8 \
+  --model-id sentence-transformers/all-MiniLM-L6-v2
+```
+
+---
+
+## Environment variables
+
+Point your backend to the local endpoint and declare the new vector size:
+
+```env
+OPENAI_BASE_URL=http://localhost:8080/v1
+EMBEDDING_DIMENSION=384
+```
+
+Depending on the model, you might want to change the `EMBEDDING_FALLBACK_MAX_TOKENS` and `EMBEDDING_MAX_BATCH_SIZE` settings, which are set conservatively and according to the requirements of the setup used in this example.
+
+---
+
+## Apply the schema change
+
+With these new settings run:
+
+```bash
+dotenv run python main.py embedding resize
+```
+
+**Note** that this will delete all records and you will have to re-index.
+
+---
+
+## Re-index embeddings
+
+```bash
+dotenv run python main.py index subscriptions
+```
+
+The search index now uses **384-dimension MiniLM vectors** served from your local Docker container. That’s it! 🚀
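As a quick sanity check of the local endpoint configured in this new doc, the sketch below calls the OpenAI-compatible embeddings route with the `openai` Python client. This is an illustration, not part of the package: the client dependency, the dummy API key, and the assumption that TEI passes the model name through are all assumptions on my part.

```python
# Verify the local TEI container answers on the OpenAI-compatible /v1/embeddings route
# and returns 384-dimensional vectors for all-MiniLM-L6-v2 (assumes `pip install openai`).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed-locally")
resp = client.embeddings.create(
    model="sentence-transformers/all-MiniLM-L6-v2",
    input="hello orchestrator",
)
print(len(resp.data[0].embedding))  # expect 384, matching EMBEDDING_DIMENSION
```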

orchestrator/search/filters/__init__.py
@@ -0,0 +1,27 @@
+from .base import (
+    EqualityFilter,
+    FilterCondition,
+    FilterTree,
+    PathFilter,
+    StringFilter,
+)
+from .date_filters import DateFilter, DateRangeFilter, DateValueFilter
+from .ltree_filters import LtreeFilter
+from .numeric_filter import NumericFilter, NumericRangeFilter, NumericValueFilter
+
+__all__ = [
+    # Base filter classes
+    "PathFilter",
+    "FilterTree",
+    "FilterCondition",
+    "StringFilter",
+    "EqualityFilter",
+    # Filters for specific value types
+    "NumericValueFilter",
+    "NumericRangeFilter",
+    "DateValueFilter",
+    "DateRangeFilter",
+    "DateFilter",
+    "LtreeFilter",
+    "NumericFilter",
+]

orchestrator/search/filters/base.py
@@ -0,0 +1,272 @@
+from __future__ import annotations
+
+from itertools import count
+from typing import Any, ClassVar, Literal
+
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from sqlalchemy import BinaryExpression, and_, cast, exists, literal, or_, select
+from sqlalchemy.dialects.postgresql import BOOLEAN
+from sqlalchemy.sql.elements import ColumnElement
+from sqlalchemy_utils.types.ltree import Ltree
+
+from orchestrator.db.models import AiSearchIndex
+from orchestrator.search.core.types import BooleanOperator, FieldType, FilterOp, SQLAColumn, UIType
+
+from .date_filters import DateFilter
+from .ltree_filters import LtreeFilter
+from .numeric_filter import NumericFilter
+
+
+class EqualityFilter(BaseModel):
+    op: Literal[FilterOp.EQ, FilterOp.NEQ]
+    value: Any
+
+    def to_expression(self, column: SQLAColumn, path: str) -> BinaryExpression[bool] | ColumnElement[bool]:
+        if isinstance(self.value, bool):
+            colb = cast(column, BOOLEAN)
+            return colb.is_(self.value) if self.op == FilterOp.EQ else ~colb.is_(self.value)
+        sv = str(self.value)
+        return (column == sv) if self.op == FilterOp.EQ else (column != sv)
+
+
+class StringFilter(BaseModel):
+    op: Literal[FilterOp.LIKE]
+    value: str
+
+    def to_expression(self, column: SQLAColumn, path: str) -> ColumnElement[bool]:
+        return column.like(self.value)
+
+    @model_validator(mode="after")
+    def validate_like_pattern(self) -> StringFilter:
+        """If the operation is 'like', the value must contain a wildcard."""
+        if self.op == FilterOp.LIKE:
+            if "%" not in self.value and "_" not in self.value:
+                raise ValueError("The value for a 'like' operation must contain a wildcard character ('%' or '_').")
+        return self
+
+
+FilterCondition = (
+    DateFilter  # DATETIME
+    | NumericFilter  # INT/FLOAT
+    | EqualityFilter  # BOOLEAN/UUID/BLOCK/RESOURCE_TYPE
+    | StringFilter  # STRING TODO: convert to hybrid search
+    | LtreeFilter  # Path
+)
+
+
+class PathFilter(BaseModel):
+
+    path: str = Field(description="The ltree path of the field to filter on, e.g., 'subscription.customer_id'.")
+    condition: FilterCondition = Field(description="The filter condition to apply.")
+
+    value_kind: UIType
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "examples": [
+                {
+                    "path": "subscription.status",
+                    "condition": {"op": "eq", "value": "active"},
+                },
+                {
+                    "path": "subscription.customer_id",
+                    "condition": {"op": "ne", "value": "acme"},
+                },
+                {
+                    "path": "subscription.start_date",
+                    "condition": {"op": "gt", "value": "2025-01-01"},
+                },
+                {
+                    "path": "subscription.end_date",
+                    "condition": {
+                        "op": "between",
+                        "value": {"from": "2025-06-01", "to": "2025-07-01"},
+                    },
+                },
+                {
+                    "path": "subscription.*.name",
+                    "condition": {"op": "matches_lquery", "value": "*.foo_*"},
+                },
+            ]
+        }
+    )
+
+    @model_validator(mode="before")
+    @classmethod
+    def _transfer_path_to_value_if_needed(cls, data: Any) -> Any:
+        """Transform for path-only filters.
+
+        If `op` is `has_component`, `not_has_component`, or `ends_with` and no `value` is
+        provided in the `condition`, this validator will automatically use the `path`
+        field as the `value` and set the `path` to a wildcard '*' for the query.
+        """
+        if isinstance(data, dict):
+            path = data.get("path")
+            condition = data.get("condition")
+
+            if path and isinstance(condition, dict):
+                op = condition.get("op")
+                value = condition.get("value")
+
+                path_only_ops = [FilterOp.HAS_COMPONENT, FilterOp.NOT_HAS_COMPONENT, FilterOp.ENDS_WITH]
+
+                if op in path_only_ops and value is None:
+                    condition["value"] = path
+                    data["path"] = "*"
+        return data
+
+    def to_expression(self, value_column: SQLAColumn, value_type_column: SQLAColumn) -> ColumnElement[bool]:
+        """Convert the path filter into a SQLAlchemy expression with type safety.
+
+        This method creates a type guard to ensure we only match compatible field types,
+        then delegates to the specific filter condition.
+
+        Parameters
+        ----------
+        value_column : ColumnElement
+            The SQLAlchemy column element representing the value to be filtered.
+        value_type_column : ColumnElement
+            The SQLAlchemy column element representing the field type.
+
+        Returns:
+        -------
+        ColumnElement[bool]
+            A SQLAlchemy boolean expression that can be used in a ``WHERE`` clause.
+        """
+        # Type guard - only match compatible field types
+        allowed_field_types = [ft.value for ft in FieldType if UIType.from_field_type(ft) == self.value_kind]
+        type_guard = value_type_column.in_(allowed_field_types) if allowed_field_types else literal(True)
+
+        return and_(type_guard, self.condition.to_expression(value_column, self.path))
+
+
+class FilterTree(BaseModel):
+    model_config = ConfigDict(
+        json_schema_extra={
+            "description": (
+                "Boolean filter tree. Operators must be UPPERCASE: AND / OR.\n"
+                "Node shapes:\n"
+                " • Group: {'op':'AND'|'OR', 'children': [<PathFilter|FilterTree>, ...]}\n"
+                " • Leaf (PathFilter): {'path':'<ltree>', 'condition': {...}}\n"
+                "Rules:\n"
+                " • Do NOT put 'op' or 'children' inside a leaf 'condition'.\n"
+                " • Max depth = 5.\n"
+                " • Use from_flat_and() for a flat list of leaves."
+            ),
+            "examples": [
+                {
+                    "op": "AND",
+                    "children": [
+                        {"path": "subscription.status", "condition": {"op": "eq", "value": "active"}},
+                        {"path": "subscription.start_date", "condition": {"op": "gt", "value": "2021-01-01"}},
+                    ],
+                },
+                {
+                    "op": "AND",
+                    "children": [
+                        {"path": "subscription.start_date", "condition": {"op": "gte", "value": "2024-01-01"}},
+                        {
+                            "op": "OR",
+                            "children": [
+                                {"path": "subscription.product_name", "condition": {"op": "like", "value": "%fiber%"}},
+                                {"path": "subscription.customer_id", "condition": {"op": "eq", "value": "Surf"}},
+                            ],
+                        },
+                    ],
+                },
+            ],
+        }
+    )
+
+    op: BooleanOperator = Field(
+        description="Operator for grouping conditions in uppercase.", default=BooleanOperator.AND
+    )
+
+    children: list[FilterTree | PathFilter] = Field(min_length=1, description="Path filters or nested groups.")
+
+    MAX_DEPTH: ClassVar[int] = 5
+
+    @model_validator(mode="after")
+    def _validate_depth(self) -> FilterTree:
+        def depth(node: "FilterTree | PathFilter") -> int:
+            return 1 + max(depth(c) for c in node.children) if isinstance(node, FilterTree) else 1
+
+        if depth(self) > self.MAX_DEPTH:
+            raise ValueError(f"FilterTree nesting exceeds MAX_DEPTH={self.MAX_DEPTH}")
+        return self
+
+    @classmethod
+    def from_flat_and(cls, filters: list[PathFilter]) -> FilterTree | None:
+        """Wrap a flat list of PathFilter into an AND group (or None)."""
+        return None if not filters else cls(op=BooleanOperator.AND, children=list(filters))
+
+    def get_all_paths(self) -> set[str]:
+        """Collects all unique paths from the PathFilter leaves in the tree."""
+        return {leaf.path for leaf in self.get_all_leaves()}
+
+    def get_all_leaves(self) -> list[PathFilter]:
+        """Collect all PathFilter leaves in the tree."""
+        leaves: list[PathFilter] = []
+        for child in self.children:
+            if isinstance(child, PathFilter):
+                leaves.append(child)
+            else:
+                leaves.extend(child.get_all_leaves())
+        return leaves
+
+    def to_expression(
+        self,
+        entity_id_col: SQLAColumn,
+        *,
+        entity_type_value: str | None = None,
+    ) -> ColumnElement[bool]:
+        """Compile this tree into a SQLAlchemy boolean expression.
+
+        Parameters
+        ----------
+        entity_id_col : SQLAColumn
+            Column in the outer query representing the entity ID.
+        entity_type_value : str, optional
+            If provided, each subquery is additionally constrained to this entity type.
+
+        Returns:
+        -------
+        ColumnElement[bool]
+            A SQLAlchemy expression suitable for use in a WHERE clause.
+        """
+        alias_idx = count(1)
+
+        def leaf_exists(pf: PathFilter) -> ColumnElement[bool]:
+            from sqlalchemy.orm import aliased
+
+            alias = aliased(AiSearchIndex, name=f"flt_{next(alias_idx)}")
+
+            correlates = [alias.entity_id == entity_id_col]
+            if entity_type_value is not None:
+                correlates.append(alias.entity_type == entity_type_value)
+
+            if isinstance(pf.condition, LtreeFilter):
+                # row-level predicate is always positive
+                positive = pf.condition.to_expression(alias.path, pf.path)
+                subq = select(1).select_from(alias).where(and_(*correlates, positive))
+                if pf.condition.op == FilterOp.NOT_HAS_COMPONENT:
+                    return ~exists(subq)  # NOT at the entity level
+                return exists(subq)
+
+            # value leaf: path predicate + typed value compare
+            if "." not in pf.path:
+                path_pred = LtreeFilter(op=FilterOp.ENDS_WITH, value=pf.path).to_expression(alias.path, "")
+            else:
+                path_pred = alias.path == Ltree(pf.path)
+
+            value_pred = pf.to_expression(alias.value, alias.value_type)
+            subq = select(1).select_from(alias).where(and_(*correlates, path_pred, value_pred))
+            return exists(subq)
+
+        def compile_node(node: FilterTree | PathFilter) -> ColumnElement[bool]:
+            if isinstance(node, FilterTree):
+                compiled = [compile_node(ch) for ch in node.children]
+                return and_(*compiled) if node.op == BooleanOperator.AND else or_(*compiled)
+            return leaf_exists(node)
+
+        return compile_node(self)
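To make the tree shape concrete, the sketch below builds a filter tree programmatically, mirroring the JSON examples embedded in the schema above, and compiles it to a SQL predicate. It is a minimal illustration under stated assumptions: the filter models accept enum members directly, and correlating on `AiSearchIndex.entity_id` stands in for whatever outer column the retrieval engine actually uses.

```python
# Sketch: build a FilterTree in code and compile it to an EXISTS-based predicate.
from orchestrator.db.models import AiSearchIndex
from orchestrator.search.core.types import BooleanOperator, FilterOp, UIType
from orchestrator.search.filters import EqualityFilter, FilterTree, PathFilter, StringFilter

tree = FilterTree(
    op=BooleanOperator.AND,
    children=[
        PathFilter(
            path="subscription.status",
            condition=EqualityFilter(op=FilterOp.EQ, value="active"),
            value_kind=UIType.STRING,
        ),
        PathFilter(
            path="subscription.product_name",
            condition=StringFilter(op=FilterOp.LIKE, value="%fiber%"),
            value_kind=UIType.STRING,
        ),
    ],
)

# Each leaf becomes an EXISTS subquery against the search index, correlated on the
# given entity id column; the placeholder outer column here is illustrative only.
predicate = tree.to_expression(AiSearchIndex.entity_id, entity_type_value="SUBSCRIPTION")
print(predicate)  # renders the generated SQL boolean expression
```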