orchestrator-core 4.4.1__py3-none-any.whl → 4.5.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. orchestrator/__init__.py +26 -2
  2. orchestrator/agentic_app.py +84 -0
  3. orchestrator/api/api_v1/api.py +10 -0
  4. orchestrator/api/api_v1/endpoints/search.py +277 -0
  5. orchestrator/app.py +32 -0
  6. orchestrator/cli/index_llm.py +73 -0
  7. orchestrator/cli/main.py +22 -1
  8. orchestrator/cli/resize_embedding.py +135 -0
  9. orchestrator/cli/search_explore.py +208 -0
  10. orchestrator/cli/speedtest.py +151 -0
  11. orchestrator/db/models.py +37 -1
  12. orchestrator/llm_settings.py +51 -0
  13. orchestrator/migrations/versions/schema/2025-08-12_52b37b5b2714_search_index_model_for_llm_integration.py +95 -0
  14. orchestrator/schemas/search.py +117 -0
  15. orchestrator/search/__init__.py +12 -0
  16. orchestrator/search/agent/__init__.py +8 -0
  17. orchestrator/search/agent/agent.py +47 -0
  18. orchestrator/search/agent/prompts.py +62 -0
  19. orchestrator/search/agent/state.py +8 -0
  20. orchestrator/search/agent/tools.py +121 -0
  21. orchestrator/search/core/__init__.py +0 -0
  22. orchestrator/search/core/embedding.py +64 -0
  23. orchestrator/search/core/exceptions.py +22 -0
  24. orchestrator/search/core/types.py +281 -0
  25. orchestrator/search/core/validators.py +27 -0
  26. orchestrator/search/docs/index.md +37 -0
  27. orchestrator/search/docs/running_local_text_embedding_inference.md +45 -0
  28. orchestrator/search/filters/__init__.py +27 -0
  29. orchestrator/search/filters/base.py +272 -0
  30. orchestrator/search/filters/date_filters.py +75 -0
  31. orchestrator/search/filters/definitions.py +93 -0
  32. orchestrator/search/filters/ltree_filters.py +43 -0
  33. orchestrator/search/filters/numeric_filter.py +60 -0
  34. orchestrator/search/indexing/__init__.py +3 -0
  35. orchestrator/search/indexing/indexer.py +323 -0
  36. orchestrator/search/indexing/registry.py +88 -0
  37. orchestrator/search/indexing/tasks.py +53 -0
  38. orchestrator/search/indexing/traverse.py +322 -0
  39. orchestrator/search/retrieval/__init__.py +3 -0
  40. orchestrator/search/retrieval/builder.py +108 -0
  41. orchestrator/search/retrieval/engine.py +152 -0
  42. orchestrator/search/retrieval/pagination.py +83 -0
  43. orchestrator/search/retrieval/retriever.py +447 -0
  44. orchestrator/search/retrieval/utils.py +106 -0
  45. orchestrator/search/retrieval/validation.py +174 -0
  46. orchestrator/search/schemas/__init__.py +0 -0
  47. orchestrator/search/schemas/parameters.py +116 -0
  48. orchestrator/search/schemas/results.py +63 -0
  49. orchestrator/services/settings_env_variables.py +2 -2
  50. orchestrator/settings.py +1 -1
  51. {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.1a1.dist-info}/METADATA +8 -3
  52. {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.1a1.dist-info}/RECORD +54 -11
  53. {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.1a1.dist-info}/WHEEL +0 -0
  54. {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.1a1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,281 @@
1
+ from dataclasses import dataclass
2
+ from datetime import date, datetime
3
+ from enum import Enum, IntEnum
4
+ from typing import Annotated, Any, Literal, NamedTuple, TypeAlias, TypedDict, get_args, get_origin
5
+ from uuid import UUID
6
+
7
+ from sqlalchemy.orm.attributes import InstrumentedAttribute
8
+ from sqlalchemy.sql.elements import ColumnElement
9
+ from sqlalchemy_utils.types.ltree import Ltree
10
+
11
+ from orchestrator.types import filter_nonetype, get_origin_and_args, is_optional_type, is_union_type
12
+
13
+ from .validators import is_bool_string, is_iso_date, is_uuid
14
+
15
# Anything accepted as a "column" by the filter builders: either a generic SQL
# column expression or an ORM-instrumented attribute.
SQLAColumn: TypeAlias = ColumnElement[Any] | InstrumentedAttribute[Any]
16
+
17
+
18
@dataclass
class SearchMetadata:
    """Metadata about the search operation performed."""

    # Short machine-readable label of the search flavour ("structured", "fuzzy", ...).
    search_type: str
    # Human-readable sentence explaining what that flavour does.
    description: str

    @classmethod
    def structured(cls) -> "SearchMetadata":
        """Return the metadata describing a filter-only (structured) search."""
        text = "This search performs a filter-based search using structured queries."
        return cls("structured", text)

    @classmethod
    def fuzzy(cls) -> "SearchMetadata":
        """Return the metadata describing a trigram-similarity search."""
        text = "This search performs a trigram similarity search."
        return cls("fuzzy", text)

    @classmethod
    def semantic(cls) -> "SearchMetadata":
        """Return the metadata describing a vector-similarity search."""
        text = "This search performs a vector similarity search, using L2 distance on embeddings with minimum distance scoring (normalized)."
        return cls("semantic", text)

    @classmethod
    def hybrid(cls) -> "SearchMetadata":
        """Return the metadata describing a rank-fusion (hybrid) search."""
        text = "This search performs reciprocal rank fusion combining trigram similarity, word_similarity, and L2 vector distance."
        return cls("hybrid", text)

    @classmethod
    def empty(cls) -> "SearchMetadata":
        """Return the metadata used when no search criteria were supplied."""
        text = "Empty search - no criteria provided"
        return cls("empty", text)
55
+
56
+
57
# Logical connectives for combining filter conditions. The values are the
# uppercase strings "AND" / "OR"; the str mixin lets members compare equal to
# those plain strings.
class BooleanOperator(str, Enum):
    AND = "AND"
    OR = "OR"
60
+
61
+
62
# All operator names a filter condition may carry. Members are str-valued, so
# they compare equal to the raw operator strings found in request payloads.
class FilterOp(str, Enum):
    # Value comparison operators.
    EQ = "eq"
    NEQ = "neq"
    LT = "lt"
    LIKE = "like"
    LTE = "lte"
    GT = "gt"
    GTE = "gte"
    BETWEEN = "between"

    # Operators that act on the ltree path itself.
    MATCHES_LQUERY = "matches_lquery"  # The ~ operator for wildcard matching
    IS_ANCESTOR = "is_ancestor"  # The @> operator
    IS_DESCENDANT = "is_descendant"  # The <@ operator
    PATH_MATCH = "path_match"  # NOTE(review): exact semantics are defined by the ltree filter, not visible here

    # Operators that test individual path segments.
    HAS_COMPONENT = "has_component"  # Path contains this segment
    NOT_HAS_COMPONENT = "not_has_component"  # Path doesn't contain segment
    ENDS_WITH = "ends_with"
80
+
81
+
82
# The kinds of entity a search can target; values are the uppercase names.
class EntityType(str, Enum):
    SUBSCRIPTION = "SUBSCRIPTION"
    PRODUCT = "PRODUCT"
    WORKFLOW = "WORKFLOW"
    PROCESS = "PROCESS"
87
+
88
+
89
class ActionType(str, Enum):
    """Defines the explicit, safe actions the agent can request."""

    # Currently the only supported action.
    SELECT = "select"  # Retrieve a list of matching records.
    # COUNT = "count" # For phase1; the agent will not support this yet.
94
+
95
+
96
# Coarse value categories that tell a client how a value must be rendered.
class UIType(str, Enum):
    STRING = "string"
    NUMBER = "number"
    BOOLEAN = "boolean"
    DATETIME = "datetime"
    COMPONENT = "component"

    @classmethod
    def from_field_type(cls, ft: "FieldType") -> "UIType":
        """Map a backend FieldType onto the UIType used to render its values.

        Integer and float fields render as numbers, booleans and datetimes map
        to their namesakes, and every other field type renders as a string.
        """
        if ft == FieldType.INTEGER or ft == FieldType.FLOAT:
            rendered = cls.NUMBER
        elif ft == FieldType.BOOLEAN:
            rendered = cls.BOOLEAN
        elif ft == FieldType.DATETIME:
            rendered = cls.DATETIME
        else:
            # Includes STRING, UUID, BLOCK and RESOURCE_TYPE.
            rendered = cls.STRING
        return rendered
113
+
114
+
115
+ class FieldType(str, Enum):
116
+ STRING = "string"
117
+ INTEGER = "integer"
118
+ FLOAT = "float"
119
+ BOOLEAN = "boolean"
120
+ DATETIME = "datetime"
121
+ UUID = "uuid"
122
+ BLOCK = "block"
123
+ RESOURCE_TYPE = "resource_type"
124
+
125
+ @classmethod
126
+ def infer(cls, val: Any) -> "FieldType":
127
+ if isinstance(val, TypedValue):
128
+ return cls._infer_typed_value(val)
129
+
130
+ if isinstance(val, bool):
131
+ return cls.BOOLEAN
132
+ if isinstance(val, int):
133
+ return cls.INTEGER
134
+ if isinstance(val, float):
135
+ return cls.FLOAT
136
+ if isinstance(val, UUID):
137
+ return cls.UUID
138
+ if isinstance(val, (datetime, date)):
139
+ return cls.DATETIME
140
+ if isinstance(val, str):
141
+ return cls._infer_from_str(val)
142
+
143
+ return cls.STRING
144
+
145
+ @classmethod
146
+ def _infer_typed_value(cls, val: "TypedValue") -> "FieldType":
147
+ if val.type == cls.BLOCK:
148
+ return cls.BLOCK
149
+ if val.type == cls.RESOURCE_TYPE:
150
+ return cls.RESOURCE_TYPE
151
+ return cls.STRING
152
+
153
+ @classmethod
154
+ def _infer_from_str(cls, val: str) -> "FieldType":
155
+ if is_uuid(val):
156
+ return cls.UUID
157
+ if is_iso_date(val):
158
+ return cls.DATETIME
159
+ if is_bool_string(val):
160
+ return cls.BOOLEAN
161
+ if val.isdigit():
162
+ return cls.INTEGER
163
+ try:
164
+ float(val)
165
+ return cls.FLOAT
166
+ except ValueError:
167
+ return cls.STRING
168
+
169
+ @classmethod
170
+ def from_type_hint(cls, type_hint: object) -> "FieldType":
171
+ """Convert type hint to FieldType."""
172
+ _type_mapping = {
173
+ int: cls.INTEGER,
174
+ float: cls.FLOAT,
175
+ bool: cls.BOOLEAN,
176
+ str: cls.STRING,
177
+ datetime: cls.DATETIME,
178
+ UUID: cls.UUID,
179
+ }
180
+
181
+ if type_hint in _type_mapping:
182
+ return _type_mapping[type_hint] # type: ignore[index]
183
+
184
+ if get_origin(type_hint) is Annotated:
185
+ inner_type = get_args(type_hint)[0]
186
+ return cls.from_type_hint(inner_type)
187
+
188
+ origin, args = get_origin_and_args(type_hint)
189
+
190
+ if origin is list:
191
+ return cls._handle_list_type(args)
192
+
193
+ if origin is Literal:
194
+ return cls._handle_literal_type(args)
195
+
196
+ if is_optional_type(type_hint) or is_union_type(type_hint):
197
+ return cls._handle_union_type(args)
198
+
199
+ if isinstance(type_hint, type):
200
+ return cls._handle_class_type(type_hint)
201
+
202
+ return cls.STRING
203
+
204
+ @classmethod
205
+ def _handle_list_type(cls, args: tuple) -> "FieldType":
206
+ if args:
207
+ element_type = args[0]
208
+ return cls.from_type_hint(element_type)
209
+ return cls.STRING
210
+
211
+ @classmethod
212
+ def _handle_literal_type(cls, args: tuple) -> "FieldType":
213
+ if not args:
214
+ return cls.STRING
215
+ first_value = args[0]
216
+ if isinstance(first_value, bool):
217
+ return cls.BOOLEAN
218
+ if isinstance(first_value, int):
219
+ return cls.INTEGER
220
+ if isinstance(first_value, str):
221
+ return cls.STRING
222
+ if isinstance(first_value, float):
223
+ return cls.FLOAT
224
+ return cls.STRING
225
+
226
+ @classmethod
227
+ def _handle_union_type(cls, args: tuple) -> "FieldType":
228
+ non_none_types = list(filter_nonetype(args))
229
+ if non_none_types:
230
+ return cls.from_type_hint(non_none_types[0])
231
+ return cls.STRING
232
+
233
+ @classmethod
234
+ def _handle_class_type(cls, type_hint: type) -> "FieldType":
235
+ if issubclass(type_hint, IntEnum):
236
+ return cls.INTEGER
237
+ if issubclass(type_hint, Enum):
238
+ return cls.STRING
239
+
240
+ from orchestrator.domain.base import ProductBlockModel
241
+
242
+ if issubclass(type_hint, ProductBlockModel):
243
+ return cls.BLOCK
244
+
245
+ return cls.STRING
246
+
247
+ def is_embeddable(self, value: str | None) -> bool:
248
+ """Check if a field should be embedded."""
249
+ if value is None:
250
+ return False
251
+
252
+ # If inference suggests it's not actually a string, don't embed it
253
+ return FieldType._infer_from_str(value) == FieldType.STRING
254
+
255
+
256
# Immutable pairing of a raw value with an explicitly declared FieldType, for
# callers that already know the type instead of relying on inference.
@dataclass(frozen=True)
class TypedValue:
    value: Any  # the wrapped raw value
    type: FieldType  # the declared field type of `value`
260
+
261
+
262
# One extracted (path, stringified value, inferred type) triple.
class ExtractedField(NamedTuple):
    path: str
    value: str
    value_type: FieldType

    @classmethod
    def from_raw(cls, path: str, raw_value: Any) -> "ExtractedField":
        """Build an ExtractedField from an arbitrary raw value.

        A ``TypedValue`` contributes its wrapped value to the string form,
        while the type is always inferred from ``raw_value`` as given.
        """
        if isinstance(raw_value, TypedValue):
            unwrapped = raw_value.value
        else:
            unwrapped = raw_value
        text = str(unwrapped)
        inferred = FieldType.infer(raw_value)
        return cls(path, text, inferred)
272
+
273
+
274
# Dictionary shape of a single record handed to the search index.
class IndexableRecord(TypedDict):
    entity_id: str
    entity_type: str
    path: Ltree  # ltree path of the field within the entity
    value: Any
    value_type: Any
    content_hash: str
    embedding: list[float] | None  # None when no embedding is stored for this record
@@ -0,0 +1,27 @@
1
+ import uuid
2
+
3
+ from dateutil.parser import isoparse
4
+
5
+
6
def is_uuid(value: str) -> bool:
    """Check if a string is a valid UUID.

    Returns False instead of raising for any input ``uuid.UUID`` rejects,
    including non-string values.
    """
    try:
        uuid.UUID(value)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed text. TypeError: None.
        # AttributeError: non-string input such as an int, which uuid.UUID
        # tries to call ``.replace`` on.
        return False
    return True
13
+
14
+
15
def is_iso_date(value: str) -> bool:
    """Check if a string is a valid ISO 8601 date.

    The value counts as valid exactly when ``isoparse`` accepts it.
    """
    try:
        isoparse(value)
    except (ValueError, TypeError):
        return False
    return True
22
+
23
+
24
def is_bool_string(value: str) -> bool:
    """Check if a string explicitly represents a boolean value with true/false.

    Surrounding whitespace and letter case are ignored.
    """
    normalized = value.strip().lower()
    return normalized == "true" or normalized == "false"
@@ -0,0 +1,37 @@
1
+ # Search Indexing CLI
2
+
3
+ Typer-based CLI for maintaining search indexes (subscriptions, products, processes, workflows).
4
+
5
+ ## Usage
6
+
7
+ Run from project root:
8
+
9
+ ```
10
+ dotenv run python main.py index [COMMAND] [OPTIONS]
11
+ ```
12
+
13
+ ### Commands
14
+
15
+ - `subscriptions` – index `subscription_search_index`
16
+ - `products` – index `product_search_index`
17
+ - `processes` – index `process_search_index`
18
+ - `workflows` – index `workflow_search_index`
19
+
20
+ ### Options
21
+
22
+ - `--<id>` – UUID of a specific entity, e.g. `--subscription-id=<UUID>` for `subscriptions` (default: all entities)
23
+ - `--dry-run` – no DB writes
24
+ - `--force-index` – re-index even if unchanged
25
+
26
+ ### Examples
27
+
28
+ ```
29
+ # Index all subscriptions
30
+ dotenv run python main.py index subscriptions
31
+
32
+ # Re-index all subscriptions
33
+ dotenv run python main.py index subscriptions --force-index
34
+
35
+ # Index a single subscription
36
+ dotenv run python main.py index subscriptions --subscription-id=<UUID>
37
+ ```
@@ -0,0 +1,45 @@
1
+ # Running a local MiniLM embedding server with Hugging Face TEI
2
+
3
+ Only **OpenAI-compatible endpoints** are supported locally.
4
+
5
+ You can spin up an embedding API based on **sentence-transformers/all-MiniLM-L6-v2** using [Hugging Face TEI](https://github.com/huggingface/text-embeddings-inference):
6
+
7
+ ```bash
8
+ docker run --rm -p 8080:80 ghcr.io/huggingface/text-embeddings-inference:cpu-1.8 \
9
+ --model-id sentence-transformers/all-MiniLM-L6-v2
10
+ ```
11
+
12
+ ---
13
+
14
+ ## Environment variables
15
+
16
+ Point your backend to the local endpoint and declare the new vector size:
17
+
18
+ ```env
19
+ OPENAI_BASE_URL=http://localhost:8080/v1
20
+ EMBEDDING_DIMENSION=384
21
+ ```
22
+
23
+ Depending on the model, you might want to change the `EMBEDDING_FALLBACK_MAX_TOKENS` and `EMBEDDING_MAX_BATCH_SIZE` settings, which are set conservatively and according to the requirements of the setup used in this example.
24
+
25
+ ---
26
+
27
+ ## Apply the schema change
28
+
29
+ With these new settings run:
30
+
31
+ ```bash
32
+ dotenv run python main.py embedding resize
33
+ ```
34
+
35
+ **Note** that this will delete all records and you will have to re-index.
36
+
37
+ ---
38
+
39
+ ## Re-index embeddings
40
+
41
+ ```bash
42
+ dotenv run python main.py index subscriptions
43
+ ```
44
+
45
+ The search index now uses **384-dimension MiniLM vectors** served from your local Docker container. That’s it! 🚀
@@ -0,0 +1,27 @@
1
+ from .base import (
2
+ EqualityFilter,
3
+ FilterCondition,
4
+ FilterTree,
5
+ PathFilter,
6
+ StringFilter,
7
+ )
8
+ from .date_filters import DateFilter, DateRangeFilter, DateValueFilter
9
+ from .ltree_filters import LtreeFilter
10
+ from .numeric_filter import NumericFilter, NumericRangeFilter, NumericValueFilter
11
+
12
# Public API of the filters package: every name re-exported from the
# submodules imported above.
__all__ = [
    # Base filter classes
    "PathFilter",
    "FilterTree",
    "FilterCondition",
    "StringFilter",
    "EqualityFilter",
    # Filters for specific value types
    "NumericValueFilter",
    "NumericRangeFilter",
    "DateValueFilter",
    "DateRangeFilter",
    "DateFilter",
    "LtreeFilter",
    "NumericFilter",
]
@@ -0,0 +1,272 @@
1
+ from __future__ import annotations
2
+
3
+ from itertools import count
4
+ from typing import Any, ClassVar, Literal
5
+
6
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
7
+ from sqlalchemy import BinaryExpression, and_, cast, exists, literal, or_, select
8
+ from sqlalchemy.dialects.postgresql import BOOLEAN
9
+ from sqlalchemy.sql.elements import ColumnElement
10
+ from sqlalchemy_utils.types.ltree import Ltree
11
+
12
+ from orchestrator.db.models import AiSearchIndex
13
+ from orchestrator.search.core.types import BooleanOperator, FieldType, FilterOp, SQLAColumn, UIType
14
+
15
+ from .date_filters import DateFilter
16
+ from .ltree_filters import LtreeFilter
17
+ from .numeric_filter import NumericFilter
18
+
19
+
20
# Equality / inequality test against a single value.
class EqualityFilter(BaseModel):
    op: Literal[FilterOp.EQ, FilterOp.NEQ]
    value: Any

    def to_expression(self, column: SQLAColumn, path: str) -> BinaryExpression[bool] | ColumnElement[bool]:
        """Build the SQL predicate comparing ``column`` with this filter's value.

        Boolean values are compared with ``IS`` on the column cast to BOOLEAN;
        every other value is compared in its string form. ``path`` is accepted
        for signature parity with the other filters and is not used here.
        """
        wants_equal = self.op == FilterOp.EQ

        if isinstance(self.value, bool):
            as_bool = cast(column, BOOLEAN)
            matches = as_bool.is_(self.value)
            return matches if wants_equal else ~matches

        text_value = str(self.value)
        if wants_equal:
            return column == text_value
        return column != text_value
30
+
31
+
32
# SQL LIKE pattern match on a string value.
class StringFilter(BaseModel):
    op: Literal[FilterOp.LIKE]
    value: str

    def to_expression(self, column: SQLAColumn, path: str) -> ColumnElement[bool]:
        """Return ``column LIKE value``; ``path`` is not used by this filter."""
        pattern = self.value
        return column.like(pattern)

    @model_validator(mode="after")
    def validate_like_pattern(self) -> StringFilter:
        """If the operation is 'like', the value must contain a wildcard."""
        if self.op != FilterOp.LIKE:
            return self
        has_wildcard = any(symbol in self.value for symbol in ("%", "_"))
        if not has_wildcard:
            raise ValueError("The value for a 'like' operation must contain a wildcard character ('%' or '_').")
        return self
46
+
47
+
48
# Union of every concrete condition a PathFilter may hold. The trailing comment
# on each member names the value kinds it is meant for. Member order is left
# exactly as written, since union order can influence how pydantic resolves
# ambiguous payloads.
FilterCondition = (
    DateFilter  # DATETIME
    | NumericFilter  # INT/FLOAT
    | EqualityFilter  # BOOLEAN/UUID/BLOCK/RESOURCE_TYPE
    | StringFilter  # STRING TODO: convert to hybrid search
    | LtreeFilter  # Path
)
55
+
56
+
57
# A single leaf filter: one ltree path plus the condition to apply at that path.
# NOTE(review): `value_kind` has no default, yet the schema examples below omit
# it - confirm whether the examples should carry it.
class PathFilter(BaseModel):

    path: str = Field(description="The ltree path of the field to filter on, e.g., 'subscription.customer_id'.")
    condition: FilterCondition = Field(description="The filter condition to apply.")

    # UI-level kind of the compared value; used to build the type guard in to_expression.
    value_kind: UIType

    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                {
                    "path": "subscription.status",
                    "condition": {"op": "eq", "value": "active"},
                },
                {
                    "path": "subscription.customer_id",
                    # The operator string must match FilterOp.NEQ ("neq").
                    "condition": {"op": "neq", "value": "acme"},
                },
                {
                    "path": "subscription.start_date",
                    "condition": {"op": "gt", "value": "2025-01-01"},
                },
                {
                    "path": "subscription.end_date",
                    "condition": {
                        "op": "between",
                        "value": {"from": "2025-06-01", "to": "2025-07-01"},
                    },
                },
                {
                    "path": "subscription.*.name",
                    "condition": {"op": "matches_lquery", "value": "*.foo_*"},
                },
            ]
        }
    )

    @model_validator(mode="before")
    @classmethod
    def _transfer_path_to_value_if_needed(cls, data: Any) -> Any:
        """Transform for path-only filters.

        If `op` is `has_component`, `not_has_component`, or `ends_with` and no `value` is
        provided in the `condition`, this validator will automatically use the `path`
        field as the `value` and set the `path` to a wildcard '*' for the query.

        The transformation is applied to copies; the caller's input dictionaries
        are never modified.
        """
        if not isinstance(data, dict):
            return data

        path = data.get("path")
        condition = data.get("condition")
        if not path or not isinstance(condition, dict):
            return data

        path_only_ops = (FilterOp.HAS_COMPONENT, FilterOp.NOT_HAS_COMPONENT, FilterOp.ENDS_WITH)
        if condition.get("op") in path_only_ops and condition.get("value") is None:
            # Return rewritten copies instead of mutating the input in place.
            return {**data, "path": "*", "condition": {**condition, "value": path}}
        return data

    def to_expression(self, value_column: SQLAColumn, value_type_column: SQLAColumn) -> ColumnElement[bool]:
        """Convert the path filter into a SQLAlchemy expression with type safety.

        This method creates a type guard to ensure we only match compatible field types,
        then delegates to the specific filter condition.

        Parameters
        ----------
        value_column : ColumnElement
            The SQLAlchemy column element representing the value to be filtered.
        value_type_column : ColumnElement
            The SQLAlchemy column element representing the field type.

        Returns:
        -------
        ColumnElement[bool]
            A SQLAlchemy boolean expression that can be used in a ``WHERE`` clause.
        """
        # Type guard - only match compatible field types
        allowed_field_types = [ft.value for ft in FieldType if UIType.from_field_type(ft) == self.value_kind]
        # No FieldType maps to this value_kind -> do not restrict on the type column.
        type_guard = value_type_column.in_(allowed_field_types) if allowed_field_types else literal(True)

        return and_(type_guard, self.condition.to_expression(value_column, self.path))
141
+
142
+
143
# Boolean tree of filters: inner nodes are AND/OR groups, leaves are PathFilters.
class FilterTree(BaseModel):
    model_config = ConfigDict(
        json_schema_extra={
            "description": (
                "Boolean filter tree. Operators must be UPPERCASE: AND / OR.\n"
                "Node shapes:\n"
                "  • Group: {'op':'AND'|'OR', 'children': [<PathFilter|FilterTree>, ...]}\n"
                "  • Leaf (PathFilter): {'path':'<ltree>', 'condition': {...}}\n"
                "Rules:\n"
                "  • Do NOT put 'op' or 'children' inside a leaf 'condition'.\n"
                "  • Max depth = 5.\n"
                "  • Use from_flat_and() for a flat list of leaves."
            ),
            "examples": [
                {
                    "op": "AND",
                    "children": [
                        {"path": "subscription.status", "condition": {"op": "eq", "value": "active"}},
                        {"path": "subscription.start_date", "condition": {"op": "gt", "value": "2021-01-01"}},
                    ],
                },
                {
                    "op": "AND",
                    "children": [
                        {"path": "subscription.start_date", "condition": {"op": "gte", "value": "2024-01-01"}},
                        {
                            "op": "OR",
                            "children": [
                                {"path": "subscription.product_name", "condition": {"op": "like", "value": "%fiber%"}},
                                {"path": "subscription.customer_id", "condition": {"op": "eq", "value": "Surf"}},
                            ],
                        },
                    ],
                },
            ],
        }
    )

    op: BooleanOperator = Field(
        description="Operator for grouping conditions in uppercase.", default=BooleanOperator.AND
    )

    children: list[FilterTree | PathFilter] = Field(min_length=1, description="Path filters or nested groups.")

    MAX_DEPTH: ClassVar[int] = 5

    @model_validator(mode="after")
    def _validate_depth(self) -> FilterTree:
        """Reject trees whose nesting (leaves count as one level) exceeds MAX_DEPTH."""

        def levels(node: "FilterTree | PathFilter") -> int:
            # A leaf is one level; a group is one level plus its deepest child.
            if not isinstance(node, FilterTree):
                return 1
            return 1 + max(levels(child) for child in node.children)

        if levels(self) > self.MAX_DEPTH:
            raise ValueError(f"FilterTree nesting exceeds MAX_DEPTH={self.MAX_DEPTH}")
        return self

    @classmethod
    def from_flat_and(cls, filters: list[PathFilter]) -> FilterTree | None:
        """Wrap a flat list of PathFilter into an AND group (or None)."""
        if not filters:
            return None
        return cls(op=BooleanOperator.AND, children=list(filters))

    def get_all_paths(self) -> set[str]:
        """Collects all unique paths from the PathFilter leaves in the tree."""
        return set(leaf.path for leaf in self.get_all_leaves())

    def get_all_leaves(self) -> list[PathFilter]:
        """Collect all PathFilter leaves in the tree."""
        collected: list[PathFilter] = []
        for child in self.children:
            if isinstance(child, PathFilter):
                collected.append(child)
                continue
            # Nested group: gather its leaves recursively, preserving order.
            collected.extend(child.get_all_leaves())
        return collected

    def to_expression(
        self,
        entity_id_col: SQLAColumn,
        *,
        entity_type_value: str | None = None,
    ) -> ColumnElement[bool]:
        """Compile this tree into a SQLAlchemy boolean expression.

        Each leaf becomes an ``EXISTS`` subquery over its own alias of the
        search index table, correlated to the outer entity id; groups combine
        their children with AND / OR.

        Parameters
        ----------
        entity_id_col : SQLAColumn
            Column in the outer query representing the entity ID.
        entity_type_value : str, optional
            If provided, each subquery is additionally constrained to this entity type.

        Returns:
        -------
        ColumnElement[bool]
            A SQLAlchemy expression suitable for use in a WHERE clause.
        """
        from sqlalchemy.orm import aliased

        # Every leaf gets a uniquely named alias: flt_1, flt_2, ...
        alias_numbers = count(1)

        def leaf_exists(pf: PathFilter) -> ColumnElement[bool]:
            alias = aliased(AiSearchIndex, name=f"flt_{next(alias_numbers)}")

            predicates = [alias.entity_id == entity_id_col]
            if entity_type_value is not None:
                predicates.append(alias.entity_type == entity_type_value)

            condition = pf.condition
            if isinstance(condition, LtreeFilter):
                # row-level predicate is always positive
                predicates.append(condition.to_expression(alias.path, pf.path))
                found = exists(select(1).select_from(alias).where(and_(*predicates)))
                # NOT at the entity level
                return ~found if condition.op == FilterOp.NOT_HAS_COMPONENT else found

            # value leaf: path predicate + typed value compare
            if "." in pf.path:
                path_pred = alias.path == Ltree(pf.path)
            else:
                # A single-segment path is matched as the last segment of any path.
                path_pred = LtreeFilter(op=FilterOp.ENDS_WITH, value=pf.path).to_expression(alias.path, "")
            predicates.append(path_pred)
            predicates.append(pf.to_expression(alias.value, alias.value_type))
            return exists(select(1).select_from(alias).where(and_(*predicates)))

        def compile_node(node: FilterTree | PathFilter) -> ColumnElement[bool]:
            if not isinstance(node, FilterTree):
                return leaf_exists(node)
            parts = [compile_node(child) for child in node.children]
            combine = and_ if node.op == BooleanOperator.AND else or_
            return combine(*parts)

        return compile_node(self)