orchestrator-core 4.4.1__py3-none-any.whl → 4.5.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. orchestrator/__init__.py +26 -2
  2. orchestrator/agentic_app.py +84 -0
  3. orchestrator/api/api_v1/api.py +10 -0
  4. orchestrator/api/api_v1/endpoints/search.py +277 -0
  5. orchestrator/app.py +32 -0
  6. orchestrator/cli/index_llm.py +73 -0
  7. orchestrator/cli/main.py +22 -1
  8. orchestrator/cli/resize_embedding.py +135 -0
  9. orchestrator/cli/search_explore.py +208 -0
  10. orchestrator/cli/speedtest.py +151 -0
  11. orchestrator/db/models.py +37 -1
  12. orchestrator/llm_settings.py +51 -0
  13. orchestrator/migrations/versions/schema/2025-08-12_52b37b5b2714_search_index_model_for_llm_integration.py +95 -0
  14. orchestrator/schemas/search.py +117 -0
  15. orchestrator/search/__init__.py +12 -0
  16. orchestrator/search/agent/__init__.py +8 -0
  17. orchestrator/search/agent/agent.py +47 -0
  18. orchestrator/search/agent/prompts.py +87 -0
  19. orchestrator/search/agent/state.py +8 -0
  20. orchestrator/search/agent/tools.py +236 -0
  21. orchestrator/search/core/__init__.py +0 -0
  22. orchestrator/search/core/embedding.py +64 -0
  23. orchestrator/search/core/exceptions.py +22 -0
  24. orchestrator/search/core/types.py +281 -0
  25. orchestrator/search/core/validators.py +27 -0
  26. orchestrator/search/docs/index.md +37 -0
  27. orchestrator/search/docs/running_local_text_embedding_inference.md +45 -0
  28. orchestrator/search/filters/__init__.py +27 -0
  29. orchestrator/search/filters/base.py +275 -0
  30. orchestrator/search/filters/date_filters.py +75 -0
  31. orchestrator/search/filters/definitions.py +93 -0
  32. orchestrator/search/filters/ltree_filters.py +43 -0
  33. orchestrator/search/filters/numeric_filter.py +60 -0
  34. orchestrator/search/indexing/__init__.py +3 -0
  35. orchestrator/search/indexing/indexer.py +323 -0
  36. orchestrator/search/indexing/registry.py +88 -0
  37. orchestrator/search/indexing/tasks.py +53 -0
  38. orchestrator/search/indexing/traverse.py +322 -0
  39. orchestrator/search/retrieval/__init__.py +3 -0
  40. orchestrator/search/retrieval/builder.py +113 -0
  41. orchestrator/search/retrieval/engine.py +152 -0
  42. orchestrator/search/retrieval/pagination.py +83 -0
  43. orchestrator/search/retrieval/retriever.py +447 -0
  44. orchestrator/search/retrieval/utils.py +106 -0
  45. orchestrator/search/retrieval/validation.py +174 -0
  46. orchestrator/search/schemas/__init__.py +0 -0
  47. orchestrator/search/schemas/parameters.py +116 -0
  48. orchestrator/search/schemas/results.py +64 -0
  49. orchestrator/services/settings_env_variables.py +2 -2
  50. orchestrator/settings.py +1 -1
  51. {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.0a2.dist-info}/METADATA +8 -3
  52. {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.0a2.dist-info}/RECORD +54 -11
  53. {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.0a2.dist-info}/WHEEL +0 -0
  54. {orchestrator_core-4.4.1.dist-info → orchestrator_core-4.5.0a2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,22 @@
1
class SearchUtilsError(Exception):
    """Base exception for this module."""


class ProductNotInRegistryError(SearchUtilsError):
    """Raised when a product is not found in the model registry."""


class ModelLoadError(SearchUtilsError):
    """Raised when a Pydantic model fails to load from a subscription."""


class InvalidCursorError(SearchUtilsError):
    """Raised when cursor cannot be decoded."""
@@ -0,0 +1,281 @@
1
+ from dataclasses import dataclass
2
+ from datetime import date, datetime
3
+ from enum import Enum, IntEnum
4
+ from typing import Annotated, Any, Literal, NamedTuple, TypeAlias, TypedDict, get_args, get_origin
5
+ from uuid import UUID
6
+
7
+ from sqlalchemy.orm.attributes import InstrumentedAttribute
8
+ from sqlalchemy.sql.elements import ColumnElement
9
+ from sqlalchemy_utils.types.ltree import Ltree
10
+
11
+ from orchestrator.types import filter_nonetype, get_origin_and_args, is_optional_type, is_union_type
12
+
13
+ from .validators import is_bool_string, is_iso_date, is_uuid
14
+
15
+ SQLAColumn: TypeAlias = ColumnElement[Any] | InstrumentedAttribute[Any]
16
+
17
+
18
@dataclass
class SearchMetadata:
    """Metadata about the search operation performed."""

    search_type: str
    description: str

    @classmethod
    def _make(cls, search_type: str, description: str) -> "SearchMetadata":
        """Shared internal factory used by the named constructors below."""
        return cls(search_type=search_type, description=description)

    @classmethod
    def structured(cls) -> "SearchMetadata":
        """Metadata describing a filter-based structured search."""
        return cls._make(
            "structured",
            "This search performs a filter-based search using structured queries.",
        )

    @classmethod
    def fuzzy(cls) -> "SearchMetadata":
        """Metadata describing a trigram-similarity search."""
        return cls._make(
            "fuzzy",
            "This search performs a trigram similarity search.",
        )

    @classmethod
    def semantic(cls) -> "SearchMetadata":
        """Metadata describing an embedding-based vector search."""
        return cls._make(
            "semantic",
            "This search performs a vector similarity search, using L2 distance on embeddings with minimum distance scoring (normalized).",
        )

    @classmethod
    def hybrid(cls) -> "SearchMetadata":
        """Metadata describing a rank-fusion search combining fuzzy and semantic signals."""
        return cls._make(
            "hybrid",
            "This search performs reciprocal rank fusion combining trigram similarity, word_similarity, and L2 vector distance.",
        )

    @classmethod
    def empty(cls) -> "SearchMetadata":
        """Metadata for a search that was given no criteria."""
        return cls._make("empty", "Empty search - no criteria provided")
55
+
56
+
57
class BooleanOperator(str, Enum):
    """Logical connective used to combine conditions in filter expressions."""

    AND = "AND"
    OR = "OR"
60
+
61
+
62
class FilterOp(str, Enum):
    """Comparison and path operators usable in filter conditions."""

    # Scalar comparison operators
    EQ = "eq"
    NEQ = "neq"
    LT = "lt"
    LIKE = "like"
    LTE = "lte"
    GT = "gt"
    GTE = "gte"
    BETWEEN = "between"

    # ltree-style path operators (see the <@ / @> / ~ comments below)
    MATCHES_LQUERY = "matches_lquery"  # The ~ operator for wildcard matching
    IS_ANCESTOR = "is_ancestor"  # The @> operator
    IS_DESCENDANT = "is_descendant"  # The <@ operator
    PATH_MATCH = "path_match"

    # Path-component helpers
    HAS_COMPONENT = "has_component"  # Path contains this segment
    NOT_HAS_COMPONENT = "not_has_component"  # Path doesn't contain segment
    ENDS_WITH = "ends_with"
80
+
81
+
82
class EntityType(str, Enum):
    """Kinds of entities addressable by search (matches the per-entity index commands)."""

    SUBSCRIPTION = "SUBSCRIPTION"
    PRODUCT = "PRODUCT"
    WORKFLOW = "WORKFLOW"
    PROCESS = "PROCESS"
87
+
88
+
89
class ActionType(str, Enum):
    """Defines the explicit, safe actions the agent can request."""

    # str mixin keeps member values serializable as plain strings.
    SELECT = "select"  # Retrieve a list of matching records.
    # COUNT = "count" # For phase1; the agent will not support this yet.
94
+
95
+
96
class UIType(str, Enum):
    """Frontend-facing value type telling a UI how a value must be rendered."""

    STRING = "string"
    NUMBER = "number"
    BOOLEAN = "boolean"
    DATETIME = "datetime"
    COMPONENT = "component"

    @classmethod
    def from_field_type(cls, ft: "FieldType") -> "UIType":
        """Create a UIType from a backend FieldType to indicate how a value must be rendered."""
        # Branches are mutually exclusive; anything unmapped renders as a string.
        if ft == FieldType.BOOLEAN:
            return cls.BOOLEAN
        if ft == FieldType.DATETIME:
            return cls.DATETIME
        if ft in (FieldType.INTEGER, FieldType.FLOAT):
            return cls.NUMBER
        return cls.STRING
113
+
114
+
115
class FieldType(str, Enum):
    """Backend classification of an indexed value's type.

    Members are inferred from runtime values (`infer`) or from static type
    hints (`from_type_hint`), and drive how values are filtered, rendered
    (via UIType) and embedded.
    """

    STRING = "string"
    INTEGER = "integer"
    FLOAT = "float"
    BOOLEAN = "boolean"
    DATETIME = "datetime"
    UUID = "uuid"
    BLOCK = "block"
    RESOURCE_TYPE = "resource_type"

    @classmethod
    def infer(cls, val: Any) -> "FieldType":
        """Infer the FieldType of a runtime value; falls back to STRING."""
        # TypedValue carries an explicit type and short-circuits inference.
        if isinstance(val, TypedValue):
            return cls._infer_typed_value(val)

        # bool must be tested before int (bool is a subclass of int).
        if isinstance(val, bool):
            return cls.BOOLEAN
        if isinstance(val, int):
            return cls.INTEGER
        if isinstance(val, float):
            return cls.FLOAT
        if isinstance(val, UUID):
            return cls.UUID
        if isinstance(val, (datetime, date)):
            return cls.DATETIME
        if isinstance(val, str):
            return cls._infer_from_str(val)

        return cls.STRING

    @classmethod
    def _infer_typed_value(cls, val: "TypedValue") -> "FieldType":
        """Map an explicitly typed value; only BLOCK/RESOURCE_TYPE pass through."""
        if val.type == cls.BLOCK:
            return cls.BLOCK
        if val.type == cls.RESOURCE_TYPE:
            return cls.RESOURCE_TYPE
        return cls.STRING

    @classmethod
    def _infer_from_str(cls, val: str) -> "FieldType":
        """Classify a string by what it parses as (UUID > date > bool > int > float)."""
        if is_uuid(val):
            return cls.UUID
        if is_iso_date(val):
            return cls.DATETIME
        if is_bool_string(val):
            return cls.BOOLEAN
        # NOTE(review): isdigit() only matches non-negative digit strings, so
        # e.g. "-5" falls through to the float branch and yields FLOAT — confirm intended.
        if val.isdigit():
            return cls.INTEGER
        try:
            float(val)
            return cls.FLOAT
        except ValueError:
            return cls.STRING

    @classmethod
    def from_type_hint(cls, type_hint: object) -> "FieldType":
        """Convert type hint to FieldType."""
        _type_mapping = {
            int: cls.INTEGER,
            float: cls.FLOAT,
            bool: cls.BOOLEAN,
            str: cls.STRING,
            datetime: cls.DATETIME,
            UUID: cls.UUID,
        }

        # Exact match on a plain type first.
        if type_hint in _type_mapping:
            return _type_mapping[type_hint]  # type: ignore[index]

        # Annotated[X, ...] -> recurse on the underlying type X.
        if get_origin(type_hint) is Annotated:
            inner_type = get_args(type_hint)[0]
            return cls.from_type_hint(inner_type)

        origin, args = get_origin_and_args(type_hint)

        if origin is list:
            return cls._handle_list_type(args)

        if origin is Literal:
            return cls._handle_literal_type(args)

        if is_optional_type(type_hint) or is_union_type(type_hint):
            return cls._handle_union_type(args)

        if isinstance(type_hint, type):
            return cls._handle_class_type(type_hint)

        return cls.STRING

    @classmethod
    def _handle_list_type(cls, args: tuple) -> "FieldType":
        """For list[X], classify by the element type X; bare list -> STRING."""
        if args:
            element_type = args[0]
            return cls.from_type_hint(element_type)
        return cls.STRING

    @classmethod
    def _handle_literal_type(cls, args: tuple) -> "FieldType":
        """Classify Literal[...] by the runtime type of its first value."""
        if not args:
            return cls.STRING
        first_value = args[0]
        # bool checked before int (bool is a subclass of int).
        if isinstance(first_value, bool):
            return cls.BOOLEAN
        if isinstance(first_value, int):
            return cls.INTEGER
        if isinstance(first_value, str):
            return cls.STRING
        if isinstance(first_value, float):
            return cls.FLOAT
        return cls.STRING

    @classmethod
    def _handle_union_type(cls, args: tuple) -> "FieldType":
        """Classify X | Y | None by the first non-None member."""
        non_none_types = list(filter_nonetype(args))
        if non_none_types:
            return cls.from_type_hint(non_none_types[0])
        return cls.STRING

    @classmethod
    def _handle_class_type(cls, type_hint: type) -> "FieldType":
        """Classify plain classes: IntEnum -> INTEGER, Enum -> STRING, product blocks -> BLOCK."""
        # IntEnum must be tested before Enum (IntEnum subclasses Enum).
        if issubclass(type_hint, IntEnum):
            return cls.INTEGER
        if issubclass(type_hint, Enum):
            return cls.STRING

        # Deferred import — presumably avoids a circular dependency with the domain package.
        from orchestrator.domain.base import ProductBlockModel

        if issubclass(type_hint, ProductBlockModel):
            return cls.BLOCK

        return cls.STRING

    def is_embeddable(self, value: str | None) -> bool:
        """Check if a field should be embedded."""
        if value is None:
            return False

        # If inference suggests it's not actually a string, don't embed it
        return FieldType._infer_from_str(value) == FieldType.STRING
254
+
255
+
256
@dataclass(frozen=True)
class TypedValue:
    """Immutable pairing of a raw value with an explicit FieldType.

    Used to bypass inference: FieldType.infer() short-circuits on TypedValue
    instances instead of inspecting the wrapped value.
    """

    # value: the raw payload; type: its explicit classification.
    value: Any
    type: FieldType
260
+
261
+
262
class ExtractedField(NamedTuple):
    """A (path, value) pair extracted for indexing, tagged with its inferred type."""

    path: str
    value: str
    value_type: FieldType

    @classmethod
    def from_raw(cls, path: str, raw_value: Any) -> "ExtractedField":
        """Build an ExtractedField from an arbitrary raw value found at *path*.

        TypedValue wrappers are unwrapped for the stored string, while type
        inference still sees the wrapper (so explicit types win).
        """
        if isinstance(raw_value, TypedValue):
            text = str(raw_value.value)
        else:
            text = str(raw_value)
        return cls(path=path, value=text, value_type=FieldType.infer(raw_value))
272
+
273
+
274
class IndexableRecord(TypedDict):
    """Shape of one row written to a search index table."""

    entity_id: str  # identifier of the indexed entity
    entity_type: str  # presumably an EntityType value — confirm against the indexer
    path: Ltree  # ltree path locating the value within the entity
    value: Any
    value_type: Any
    content_hash: str  # NOTE(review): looks like a change-detection hash — confirm
    embedding: list[float] | None  # embedding vector, or None when the value is not embeddable
@@ -0,0 +1,27 @@
1
+ import uuid
2
+
3
+ from dateutil.parser import isoparse
4
+
5
+
6
def is_uuid(value: str) -> bool:
    """Check if a string is a valid UUID."""
    try:
        uuid.UUID(value)
    except (ValueError, TypeError):
        return False
    return True
13
+
14
+
15
def is_iso_date(value: str) -> bool:
    """Check if a string is a valid ISO 8601 date."""
    try:
        isoparse(value)
    except (ValueError, TypeError):
        return False
    return True
22
+
23
+
24
def is_bool_string(value: str) -> bool:
    """Check if a string explicitly represents a boolean value with true/false."""
    normalized = value.strip().lower()
    return normalized == "true" or normalized == "false"
@@ -0,0 +1,37 @@
1
+ # Search Indexing CLI
2
+
3
+ Typer-based CLI for maintaining search indexes (subscriptions, products, processes, workflows).
4
+
5
+ ## Usage
6
+
7
+ Run from project root:
8
+
9
+ ```
10
+ dotenv run python main.py index [COMMAND] [OPTIONS]
11
+ ```
12
+
13
+ ### Commands
14
+
15
+ - `subscriptions` – index `subscription_search_index`
16
+ - `products` – index `product_search_index`
17
+ - `processes` – index `process_search_index`
18
+ - `workflows` – index `workflow_search_index`
19
+
20
+ ### Options
21
+
22
+ - `--<id>` – UUID of a specific entity (default: all)
23
+ - `--dry-run` – no DB writes
24
+ - `--force-index` – re-index even if unchanged
25
+
26
+ ### Examples
27
+
28
+ ```
29
+ # Index all subscriptions
30
+ dotenv run python main.py index subscriptions
31
+
32
+ # Re-index all subscriptions
33
+ dotenv run python main.py index subscriptions --force-index
34
+
35
+ # Index a single subscription
36
+ dotenv run python main.py index subscriptions --subscription-id=<UUID>
37
+ ```
@@ -0,0 +1,45 @@
1
+ # Running a local MiniLM embedding server with Hugging Face TEI
2
+
3
+ Only **OpenAI-compatible endpoints** are supported locally.
4
+
5
+ You can spin up an embedding API based on **sentence-transformers/all-MiniLM-L6-v2** using [Hugging Face TEI](https://github.com/huggingface/text-embeddings-inference):
6
+
7
+ ```bash
8
+ docker run --rm -p 8080:80 ghcr.io/huggingface/text-embeddings-inference:cpu-1.8 \
9
+ --model-id sentence-transformers/all-MiniLM-L6-v2
10
+ ```
11
+
12
+ ---
13
+
14
+ ## Environment variables
15
+
16
+ Point your backend to the local endpoint and declare the new vector size:
17
+
18
+ ```env
19
+ OPENAI_BASE_URL=http://localhost:8080/v1
20
+ EMBEDDING_DIMENSION=384
21
+ ```
22
+
23
+ Depending on the model, you might want to change the `EMBEDDING_FALLBACK_MAX_TOKENS` and `EMBEDDING_MAX_BATCH_SIZE` settings, which are set conservatively and according to the requirements of the setup used in this example.
24
+
25
+ ---
26
+
27
+ ## Apply the schema change
28
+
29
+ With these new settings run:
30
+
31
+ ```bash
32
+ dotenv run python main.py embedding resize
33
+ ```
34
+
35
+ **Note** that this will delete all records and you will have to re-index.
36
+
37
+ ---
38
+
39
+ ## Re-index embeddings
40
+
41
+ ```bash
42
+ dotenv run python main.py index subscriptions
43
+ ```
44
+
45
+ The search index now uses **384-dimension MiniLM vectors** served from your local Docker container. That’s it! 🚀
@@ -0,0 +1,27 @@
1
+ from .base import (
2
+ EqualityFilter,
3
+ FilterCondition,
4
+ FilterTree,
5
+ PathFilter,
6
+ StringFilter,
7
+ )
8
+ from .date_filters import DateFilter, DateRangeFilter, DateValueFilter
9
+ from .ltree_filters import LtreeFilter
10
+ from .numeric_filter import NumericFilter, NumericRangeFilter, NumericValueFilter
11
+
12
# Public API of the orchestrator.search.filters package.
__all__ = [
    # Base filter classes
    "PathFilter",
    "FilterTree",
    "FilterCondition",
    "StringFilter",
    "EqualityFilter",
    # Filters for specific value types
    "NumericValueFilter",
    "NumericRangeFilter",
    "DateValueFilter",
    "DateRangeFilter",
    "DateFilter",
    "LtreeFilter",
    "NumericFilter",
]