orchestrator-core 4.4.0rc2__py3-none-any.whl → 5.0.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. orchestrator/__init__.py +1 -1
  2. orchestrator/api/api_v1/api.py +7 -0
  3. orchestrator/api/api_v1/endpoints/agent.py +62 -0
  4. orchestrator/api/api_v1/endpoints/processes.py +6 -12
  5. orchestrator/api/api_v1/endpoints/search.py +197 -0
  6. orchestrator/api/api_v1/endpoints/subscriptions.py +0 -1
  7. orchestrator/app.py +4 -0
  8. orchestrator/cli/index_llm.py +73 -0
  9. orchestrator/cli/main.py +8 -1
  10. orchestrator/cli/resize_embedding.py +136 -0
  11. orchestrator/cli/scheduler.py +29 -40
  12. orchestrator/cli/search_explore.py +203 -0
  13. orchestrator/db/models.py +37 -1
  14. orchestrator/graphql/schema.py +0 -5
  15. orchestrator/graphql/schemas/process.py +2 -2
  16. orchestrator/graphql/utils/create_resolver_error_handler.py +1 -1
  17. orchestrator/migrations/versions/schema/2025-08-12_52b37b5b2714_search_index_model_for_llm_integration.py +95 -0
  18. orchestrator/schedules/__init__.py +2 -1
  19. orchestrator/schedules/resume_workflows.py +2 -2
  20. orchestrator/schedules/scheduling.py +24 -64
  21. orchestrator/schedules/task_vacuum.py +2 -2
  22. orchestrator/schedules/validate_products.py +2 -8
  23. orchestrator/schedules/validate_subscriptions.py +2 -2
  24. orchestrator/schemas/search.py +101 -0
  25. orchestrator/search/__init__.py +0 -0
  26. orchestrator/search/agent/__init__.py +1 -0
  27. orchestrator/search/agent/prompts.py +62 -0
  28. orchestrator/search/agent/state.py +8 -0
  29. orchestrator/search/agent/tools.py +122 -0
  30. orchestrator/search/core/__init__.py +0 -0
  31. orchestrator/search/core/embedding.py +64 -0
  32. orchestrator/search/core/exceptions.py +16 -0
  33. orchestrator/search/core/types.py +162 -0
  34. orchestrator/search/core/validators.py +27 -0
  35. orchestrator/search/docs/index.md +37 -0
  36. orchestrator/search/docs/running_local_text_embedding_inference.md +45 -0
  37. orchestrator/search/filters/__init__.py +27 -0
  38. orchestrator/search/filters/base.py +236 -0
  39. orchestrator/search/filters/date_filters.py +75 -0
  40. orchestrator/search/filters/definitions.py +76 -0
  41. orchestrator/search/filters/ltree_filters.py +31 -0
  42. orchestrator/search/filters/numeric_filter.py +60 -0
  43. orchestrator/search/indexing/__init__.py +3 -0
  44. orchestrator/search/indexing/indexer.py +316 -0
  45. orchestrator/search/indexing/registry.py +88 -0
  46. orchestrator/search/indexing/tasks.py +53 -0
  47. orchestrator/search/indexing/traverse.py +209 -0
  48. orchestrator/search/retrieval/__init__.py +3 -0
  49. orchestrator/search/retrieval/builder.py +64 -0
  50. orchestrator/search/retrieval/engine.py +96 -0
  51. orchestrator/search/retrieval/ranker.py +202 -0
  52. orchestrator/search/retrieval/utils.py +88 -0
  53. orchestrator/search/retrieval/validation.py +174 -0
  54. orchestrator/search/schemas/__init__.py +0 -0
  55. orchestrator/search/schemas/parameters.py +114 -0
  56. orchestrator/search/schemas/results.py +47 -0
  57. orchestrator/services/processes.py +11 -16
  58. orchestrator/services/subscriptions.py +0 -4
  59. orchestrator/settings.py +29 -1
  60. orchestrator/targets.py +0 -1
  61. orchestrator/workflow.py +1 -8
  62. orchestrator/workflows/utils.py +1 -48
  63. {orchestrator_core-4.4.0rc2.dist-info → orchestrator_core-5.0.0a1.dist-info}/METADATA +6 -3
  64. {orchestrator_core-4.4.0rc2.dist-info → orchestrator_core-5.0.0a1.dist-info}/RECORD +66 -30
  65. orchestrator/graphql/resolvers/scheduled_tasks.py +0 -36
  66. orchestrator/graphql/schemas/scheduled_task.py +0 -8
  67. orchestrator/schedules/scheduler.py +0 -163
  68. {orchestrator_core-4.4.0rc2.dist-info → orchestrator_core-5.0.0a1.dist-info}/WHEEL +0 -0
  69. {orchestrator_core-4.4.0rc2.dist-info → orchestrator_core-5.0.0a1.dist-info}/licenses/LICENSE +0 -0
orchestrator/schemas/search.py
@@ -0,0 +1,101 @@
+ from datetime import datetime
+ from typing import Any, Generic, TypeVar
+ from uuid import UUID
+
+ from pydantic import BaseModel, ConfigDict, Field
+
+ from orchestrator.search.schemas.results import Highlight, PathInfo
+
+ T = TypeVar("T")
+
+
+ class PageInfoSchema(BaseModel):
+     total_items: int = Field(default=0, alias="totalItems")
+     start_cursor: int = Field(default=0, alias="startCursor")
+     has_previous_page: bool = Field(default=False, alias="hasPreviousPage")
+     has_next_page: bool = Field(default=False, alias="hasNextPage")
+     end_cursor: int = Field(default=0, alias="endCursor")
+     sort_fields: list[str] = Field(default_factory=list, alias="sortFields")
+     filter_fields: list[str] = Field(default_factory=list, alias="filterFields")
+
+
+ class ProductSchema(BaseModel):
+     model_config = ConfigDict(from_attributes=True, populate_by_name=True)
+
+     name: str
+     tag: str
+     product_type: str = Field(alias="productType")
+
+
+ class SubscriptionSearchResult(BaseModel):
+     score: float
+     highlight: Highlight | None = None
+
+     subscription: dict[str, Any]
+
+
+ class ConnectionSchema(BaseModel, Generic[T]):
+     page: list[T]
+     page_info: PageInfoSchema = Field(alias="pageInfo")
+
+     model_config = ConfigDict(populate_by_name=True)
+
+
+ class WorkflowProductSchema(BaseModel):
+     """Product associated with a workflow."""
+
+     model_config = ConfigDict(from_attributes=True, populate_by_name=True)
+
+     product_type: str = Field(alias="productType")
+     product_id: UUID = Field(alias="productId")
+     name: str
+
+
+ class WorkflowSearchSchema(BaseModel):
+     """Schema for workflow search results."""
+
+     model_config = ConfigDict(from_attributes=True, populate_by_name=True)
+
+     name: str
+     products: list[WorkflowProductSchema]
+     description: str | None = None
+     created_at: datetime | None = Field(alias="createdAt", default=None)
+
+
+ class ProductSearchSchema(BaseModel):
+     """Schema for product search results."""
+
+     model_config = ConfigDict(from_attributes=True, populate_by_name=True)
+
+     product_id: UUID = Field(alias="productId")
+     name: str
+     product_type: str = Field(alias="productType")
+     tag: str | None = None
+     description: str | None = None
+     status: str | None = None
+     created_at: datetime | None = Field(alias="createdAt", default=None)
+
+
+ class ProcessSearchSchema(BaseModel):
+     """Schema for process search results."""
+
+     model_config = ConfigDict(from_attributes=True, populate_by_name=True)
+
+     process_id: UUID = Field(alias="processId")
+     workflow_name: str = Field(alias="workflowName")
+     workflow_id: UUID = Field(alias="workflowId")
+     status: str = Field(alias="last_status")
+     is_task: bool = Field(alias="isTask")
+     created_by: str | None = Field(alias="createdBy", default=None)
+     started_at: datetime = Field(alias="startedAt")
+     last_modified_at: datetime = Field(alias="lastModifiedAt")
+     last_step: str | None = Field(alias="lastStep", default=None)
+     failed_reason: str | None = Field(alias="failedReason", default=None)
+     subscription_ids: list[UUID] | None = Field(alias="subscriptionIds", default=None)
+
+
+ class PathsResponse(BaseModel):
+     prefix: str
+     paths: list[PathInfo]
+
+     model_config = ConfigDict(extra="forbid", use_enum_values=True)
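One subtlety worth noting: `ProductSchema` and the other result schemas set `populate_by_name=True`, so they accept either snake_case or camelCase input, while `PageInfoSchema` declares aliases without that flag and therefore only accepts the camelCase names. A minimal sketch with made-up values:

```python
# Sketch (illustrative values, not from the package): construct via either
# naming style where populate_by_name is set, dump via camelCase aliases.
from orchestrator.schemas.search import ConnectionSchema, PageInfoSchema, ProductSchema

product = ProductSchema(name="example-product", tag="demo", product_type="Demo")
page = ConnectionSchema[ProductSchema](
    page=[product],
    page_info=PageInfoSchema(totalItems=1),  # PageInfoSchema accepts aliases only
)
print(page.model_dump(by_alias=True))
# {'page': [{'name': 'example-product', 'tag': 'demo', 'productType': 'Demo'}],
#  'pageInfo': {'totalItems': 1, ...}}
```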
orchestrator/search/__init__.py (file without changes)
orchestrator/search/agent/__init__.py
@@ -0,0 +1 @@
+ # This module requires: pydantic-ai==0.7.0, ag-ui-protocol>=0.1.8
orchestrator/search/agent/prompts.py
@@ -0,0 +1,62 @@
+ import json
+ from textwrap import dedent
+
+ import structlog
+ from pydantic_ai import RunContext
+ from pydantic_ai.ag_ui import StateDeps
+
+ from orchestrator.search.retrieval.validation import get_structured_filter_schema
+
+ from .state import SearchState
+
+ logger = structlog.get_logger(__name__)
+
+
+ async def get_base_instructions() -> str:
+
+     try:
+         schema_dict = get_structured_filter_schema()
+         if schema_dict:
+             schema_info = "\n".join([f" {path}: {field_type}" for path, field_type in schema_dict.items()])
+         else:
+             schema_info = " No filterable fields available"
+     except Exception as e:
+         logger.warning(f"Failed to load schema for prompt: {e}")
+         schema_info = " Schema temporarily unavailable"
+     logger.error(f"Generated schema for agent prompt:\n{schema_info}")
+
+     return dedent(
+         f"""
+         You are a helpful assistant for building and running database queries.
+
+         **Available Data Schema:**
+         Use the following schema to understand the available fields.
+         When you build filters, each `path` MUST be a valid path from this schema,
+         and the operator/value MUST match that path's type.
+         ```
+         {schema_info}
+         ```
+         **Workflow (do in order):**
+         1) `set_search_parameters` to define the main entity being searched.
+         2) Build a complete `FilterTree` (AND at root unless the user asks for OR).
+         3) `set_filter_tree(filters=<FilterTree or null>)`.
+         4) `execute_search()`.
+         5) Summarize the results for the user.
+         """
+     )
+
+
+ async def get_dynamic_instructions(ctx: RunContext[StateDeps[SearchState]]) -> str:
+     """Dynamically generate the system prompt for the agent."""
+     param_state = json.dumps(ctx.deps.state.parameters, indent=2, default=str) if ctx.deps.state.parameters else "{}"
+
+     return dedent(
+         f"""
+         Current search parameters state:
+         {param_state}
+
+         Remember:
+         - If filters are missing or incomplete, construct a full FilterTree and call `set_filter_tree`.
+         - Then call `execute_search`.
+         """
+     )
orchestrator/search/agent/state.py
@@ -0,0 +1,8 @@
+ from typing import Any
+
+ from pydantic import BaseModel, Field
+
+
+ class SearchState(BaseModel):
+     parameters: dict[str, Any] | None = None
+     results: list[dict[str, Any]] = Field(default_factory=list)
orchestrator/search/agent/tools.py
@@ -0,0 +1,122 @@
+ import asyncio
+ from collections.abc import Awaitable, Callable
+ from typing import Any, TypeVar
+
+ import structlog
+ from ag_ui.core import EventType, StateSnapshotEvent
+ from pydantic_ai import RunContext
+ from pydantic_ai.ag_ui import StateDeps
+ from pydantic_ai.exceptions import ModelRetry
+ from pydantic_ai.messages import ModelRequest, UserPromptPart
+ from pydantic_ai.toolsets import FunctionToolset
+
+ from orchestrator.api.api_v1.endpoints.search import (
+     search_processes,
+     search_products,
+     search_subscriptions,
+     search_workflows,
+ )
+ from orchestrator.schemas.search import ConnectionSchema
+ from orchestrator.search.core.types import ActionType, EntityType
+ from orchestrator.search.filters import FilterTree
+ from orchestrator.search.retrieval.validation import validate_filter_tree
+ from orchestrator.search.schemas.parameters import PARAMETER_REGISTRY, BaseSearchParameters
+
+ from .state import SearchState
+
+ logger = structlog.get_logger(__name__)
+ P = TypeVar("P", bound=BaseSearchParameters)
+
+ SearchFn = Callable[[P], ConnectionSchema[Any]] | Callable[[P], Awaitable[ConnectionSchema[Any]]]
+
+ SEARCH_FN_MAP: dict[EntityType, SearchFn] = {
+     EntityType.SUBSCRIPTION: search_subscriptions,
+     EntityType.WORKFLOW: search_workflows,
+     EntityType.PRODUCT: search_products,
+     EntityType.PROCESS: search_processes,
+ }
+
+ search_toolset: FunctionToolset[StateDeps[SearchState]] = FunctionToolset(max_retries=1)
+
+
+ def last_user_message(ctx: RunContext[StateDeps[SearchState]]) -> str | None:
+     for msg in reversed(ctx.messages):
+         if isinstance(msg, ModelRequest):
+             for part in msg.parts:
+                 if isinstance(part, UserPromptPart) and isinstance(part.content, str):
+                     return part.content
+     return None
+
+
+ @search_toolset.tool  # type: ignore[misc]
+ async def set_search_parameters(
+     ctx: RunContext[StateDeps[SearchState]],
+     entity_type: EntityType,
+     action: str | ActionType = ActionType.SELECT,
+ ) -> StateSnapshotEvent:
+     params = ctx.deps.state.parameters or {}
+     is_new_search = params.get("entity_type") != entity_type.value
+     final_query = (last_user_message(ctx) or "") if is_new_search else params.get("query", "")
+
+     ctx.deps.state.parameters = {"action": action, "entity_type": entity_type, "filters": None, "query": final_query}
+     ctx.deps.state.results = []
+     logger.info(f"Set search parameters: entity_type={entity_type}, action={action}")
+
+     return StateSnapshotEvent(
+         type=EventType.STATE_SNAPSHOT,
+         snapshot=ctx.deps.state.model_dump(),
+     )
+
+
+ @search_toolset.tool(retries=2)  # type: ignore[misc]
+ async def set_filter_tree(
+     ctx: RunContext[StateDeps[SearchState]],
+     filters: FilterTree | None,
+ ) -> StateSnapshotEvent:
+     """Replace current filters atomically with a full FilterTree, or clear with None.
+
+     Requirements:
+     - Root/group operators must be 'AND' or 'OR' (uppercase).
+     - Provide either PathFilters or nested groups under `children`.
+     - See the FilterTree schema examples for the exact shape.
+     """
+     if ctx.deps.state.parameters is None:
+         raise ModelRetry("Search parameters are not initialized. Call set_search_parameters first.")
+
+     entity_type = EntityType(ctx.deps.state.parameters["entity_type"])
+
+     try:
+         await validate_filter_tree(filters, entity_type)
+     except Exception as e:
+         raise ModelRetry(str(e))
+
+     ctx.deps.state.parameters["filters"] = None if filters is None else filters.model_dump(mode="json", by_alias=True)
+     logger.info(
+         "Set filter tree",
+         filters=None if filters is None else filters.model_dump(mode="json", by_alias=True),
+     )
+     return StateSnapshotEvent(type=EventType.STATE_SNAPSHOT, snapshot=ctx.deps.state.model_dump())
+
+
+ @search_toolset.tool  # type: ignore[misc]
+ async def execute_search(
+     ctx: RunContext[StateDeps[SearchState]],
+     limit: int = 5,
+ ) -> StateSnapshotEvent:
+     """Execute the search with the current parameters."""
+     if not ctx.deps.state.parameters:
+         raise ValueError("No search parameters set")
+
+     entity_type = EntityType(ctx.deps.state.parameters["entity_type"])
+     param_class = PARAMETER_REGISTRY.get(entity_type)
+     if not param_class:
+         raise ValueError(f"Unknown entity type: {entity_type}")
+
+     params = param_class(**ctx.deps.state.parameters)
+     logger.info("Executing database search", **params.model_dump(mode="json"))
+
+     fn = SEARCH_FN_MAP[entity_type]
+     page_connection = await fn(params) if asyncio.iscoroutinefunction(fn) else fn(params)
+     ctx.deps.state.results = [item.model_dump(mode="json") for item in page_connection.page[:limit]]
+
+     return StateSnapshotEvent(type=EventType.STATE_SNAPSHOT, snapshot=ctx.deps.state.model_dump())
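This hunk defines the toolset but not the agent that consumes it. A minimal wiring sketch, assuming pydantic-ai 0.7.0 (the pin noted in `agent/__init__.py`) and its AG-UI integration; the model id is illustrative and not taken from the package:

```python
# Sketch (assumed wiring, not the package's own code): attach the toolset and
# prompt functions to a pydantic-ai Agent and expose it over the AG-UI protocol.
from pydantic_ai import Agent
from pydantic_ai.ag_ui import StateDeps

from orchestrator.search.agent.prompts import get_base_instructions, get_dynamic_instructions
from orchestrator.search.agent.state import SearchState
from orchestrator.search.agent.tools import search_toolset

agent = Agent(
    "openai:gpt-4o",                 # illustrative model id
    deps_type=StateDeps[SearchState],
    toolsets=[search_toolset],
)
agent.instructions(get_base_instructions)     # static part of the system prompt
agent.instructions(get_dynamic_instructions)  # re-rendered per run with current state

app = agent.to_ag_ui(deps=StateDeps(SearchState()))  # ASGI app for the AG-UI frontend
```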
orchestrator/search/core/__init__.py (file without changes)
orchestrator/search/core/embedding.py
@@ -0,0 +1,64 @@
+ import logging
+
+ import structlog
+ from litellm import aembedding as llm_aembedding
+ from litellm import embedding as llm_embedding
+ from litellm import exceptions as llm_exc
+
+ from orchestrator.settings import app_settings
+
+ logger = structlog.get_logger(__name__)
+
+ # LiteLLM logs a lot of noise, such as embedding vectors.
+ logging.getLogger("LiteLLM").setLevel(logging.WARNING)
+
+
+ class EmbeddingIndexer:
+
+     @classmethod
+     def get_embeddings_from_api_batch(cls, texts: list[str], dry_run: bool) -> list[list[float]]:
+         if not texts:
+             return []
+         if dry_run:
+             logger.debug("Dry Run: returning empty embeddings")
+             return [[] for _ in texts]
+
+         try:
+             resp = llm_embedding(
+                 model=app_settings.EMBEDDING_MODEL,
+                 input=[t.lower() for t in texts],
+                 api_key=app_settings.OPENAI_API_KEY,
+                 base_url=app_settings.OPENAI_BASE_URL,
+                 timeout=app_settings.LLM_TIMEOUT,
+                 max_retries=app_settings.LLM_MAX_RETRIES,
+             )
+             data = sorted(resp.data, key=lambda e: e["index"])
+             return [row["embedding"] for row in data]
+         except (llm_exc.APIError, llm_exc.APIConnectionError, llm_exc.RateLimitError, llm_exc.Timeout) as e:
+             logger.error("Embedding request failed", error=str(e))
+             return [[] for _ in texts]
+         except Exception as e:
+             logger.error("Unexpected embedding error", error=str(e))
+             return [[] for _ in texts]
+
+
+ class QueryEmbedder:
+     """A stateless, async utility for embedding real-time user queries."""
+
+     @classmethod
+     async def generate_for_text_async(cls, text: str) -> list[float]:
+         if not text:
+             return []
+         try:
+             resp = await llm_aembedding(
+                 model=app_settings.EMBEDDING_MODEL,
+                 input=[text.lower()],
+                 api_key=app_settings.OPENAI_API_KEY,
+                 base_url=app_settings.OPENAI_BASE_URL,
+                 timeout=app_settings.LLM_TIMEOUT,
+                 max_retries=app_settings.LLM_MAX_RETRIES,
+             )
+             return resp.data[0]["embedding"]
+         except Exception as e:
+             logger.error("Async embedding generation failed", error=str(e))
+             return []
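Note that `QueryEmbedder` never raises: it returns an empty list on failure, so callers must treat `[]` as "no embedding available". A minimal usage sketch, assuming the embedding settings (`EMBEDDING_MODEL`, `OPENAI_API_KEY`, `OPENAI_BASE_URL`) are configured:

```python
# Sketch: embed one query string; an empty result means empty input or a
# provider error, since the class swallows exceptions and logs them.
import asyncio

from orchestrator.search.core.embedding import QueryEmbedder

async def main() -> None:
    vector = await QueryEmbedder.generate_for_text_async("fiber subscriptions in Amsterdam")
    if vector:
        print(f"embedding dimension: {len(vector)}")
    else:
        print("embedding unavailable (empty text or provider error)")

asyncio.run(main())
```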
orchestrator/search/core/exceptions.py
@@ -0,0 +1,16 @@
+ class SearchUtilsError(Exception):
+     """Base exception for this module."""
+
+     pass
+
+
+ class ProductNotInRegistryError(SearchUtilsError):
+     """Raised when a product is not found in the model registry."""
+
+     pass
+
+
+ class ModelLoadError(SearchUtilsError):
+     """Raised when a Pydantic model fails to load from a subscription."""
+
+     pass
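Because both specific errors share the `SearchUtilsError` base, indexing code can catch them individually or together. A small sketch (the message string is illustrative, not from the package):

```python
# Sketch: both specific errors are caught by the shared base class.
from orchestrator.search.core.exceptions import ProductNotInRegistryError, SearchUtilsError

try:
    raise ProductNotInRegistryError("product 'demo' is not in the model registry")
except SearchUtilsError as err:
    print(f"skipping entity: {err}")  # a ModelLoadError would land here too
```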
orchestrator/search/core/types.py
@@ -0,0 +1,162 @@
+ from dataclasses import dataclass
+ from datetime import date, datetime
+ from enum import Enum
+ from typing import Any, NamedTuple, TypeAlias, TypedDict
+ from uuid import UUID
+
+ from sqlalchemy.orm.attributes import InstrumentedAttribute
+ from sqlalchemy.sql.elements import ColumnElement
+ from sqlalchemy_utils.types.ltree import Ltree
+
+ from .validators import is_bool_string, is_iso_date, is_uuid
+
+ SQLAColumn: TypeAlias = ColumnElement[Any] | InstrumentedAttribute[Any]
+
+
+ class BooleanOperator(str, Enum):
+     AND = "AND"
+     OR = "OR"
+
+
+ class FilterOp(str, Enum):
+     EQ = "eq"
+     NEQ = "neq"
+     LT = "lt"
+     LIKE = "like"
+     LTE = "lte"
+     GT = "gt"
+     GTE = "gte"
+     BETWEEN = "between"
+
+     MATCHES_LQUERY = "matches_lquery"  # The ~ operator for wildcard matching
+     IS_ANCESTOR = "is_ancestor"  # The @> operator
+     IS_DESCENDANT = "is_descendant"  # The <@ operator
+     PATH_MATCH = "path_match"
+
+
+ class EntityType(str, Enum):
+     SUBSCRIPTION = "SUBSCRIPTION"
+     PRODUCT = "PRODUCT"
+     WORKFLOW = "WORKFLOW"
+     PROCESS = "PROCESS"
+
+
+ class ActionType(str, Enum):
+     """Defines the explicit, safe actions the agent can request."""
+
+     SELECT = "select"  # Retrieve a list of matching records.
+     # COUNT = "count"  # For phase 1; the agent will not support this yet.
+
+
+ class UIType(str, Enum):
+     STRING = "string"
+     NUMBER = "number"
+     BOOLEAN = "boolean"
+     DATETIME = "datetime"
+
+     @classmethod
+     def from_field_type(cls, ft: "FieldType") -> "UIType":
+         """Create a UIType from a backend FieldType to indicate how a value must be rendered."""
+         if ft in (FieldType.INTEGER, FieldType.FLOAT):
+             return cls.NUMBER
+         if ft == FieldType.BOOLEAN:
+             return cls.BOOLEAN
+         if ft == FieldType.DATETIME:
+             return cls.DATETIME
+         return cls.STRING
+
+
+ class FieldType(str, Enum):
+     STRING = "string"
+     INTEGER = "integer"
+     FLOAT = "float"
+     BOOLEAN = "boolean"
+     DATETIME = "datetime"
+     UUID = "uuid"
+     BLOCK = "block"
+     RESOURCE_TYPE = "resource_type"
+
+     @classmethod
+     def infer(cls, val: Any) -> "FieldType":
+         if isinstance(val, TypedValue):
+             return cls._infer_typed_value(val)
+
+         if isinstance(val, bool):
+             return cls.BOOLEAN
+         if isinstance(val, int):
+             return cls.INTEGER
+         if isinstance(val, float):
+             return cls.FLOAT
+         if isinstance(val, UUID):
+             return cls.UUID
+         if isinstance(val, (datetime, date)):
+             return cls.DATETIME
+         if isinstance(val, str):
+             return cls._infer_from_str(val)
+
+         return cls.STRING
+
+     @classmethod
+     def _infer_typed_value(cls, val: "TypedValue") -> "FieldType":
+         if val.type == cls.BLOCK:
+             return cls.BLOCK
+         if val.type == cls.RESOURCE_TYPE:
+             return cls.RESOURCE_TYPE
+         return cls.STRING
+
+     @classmethod
+     def _infer_from_str(cls, val: str) -> "FieldType":
+         if is_uuid(val):
+             return cls.UUID
+         if is_iso_date(val):
+             return cls.DATETIME
+         if is_bool_string(val):
+             return cls.BOOLEAN
+         if val.isdigit():
+             return cls.INTEGER
+         try:
+             float(val)
+             return cls.FLOAT
+         except ValueError:
+             return cls.STRING
+
+     def pg_cast(self) -> str:
+         return {
+             FieldType.STRING: "::text",
+             FieldType.INTEGER: "::integer",
+             FieldType.FLOAT: "::double precision",
+             FieldType.BOOLEAN: "::boolean",
+             FieldType.DATETIME: "::timestamptz",
+             FieldType.UUID: "::uuid",
+         }.get(self, "::text")
+
+     def is_embeddable(self) -> bool:
+         return self == FieldType.STRING
+
+
+ @dataclass(frozen=True)
+ class TypedValue:
+     value: Any
+     type: FieldType
+
+
+ class ExtractedField(NamedTuple):
+     path: str
+     value: str
+     value_type: FieldType
+
+     @classmethod
+     def from_raw(cls, path: str, raw_value: Any) -> "ExtractedField":
+         value = str(raw_value.value if isinstance(raw_value, TypedValue) else raw_value)
+         value_type = FieldType.infer(raw_value)
+         return cls(path=path, value=value, value_type=value_type)
+
+
+ class IndexableRecord(TypedDict):
+     entity_id: str
+     entity_type: str
+     path: Ltree
+     value: Any
+     value_type: Any
+     content_hash: str
+     embedding: list[float] | None
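The string branch of `FieldType.infer` applies its checks in a fixed order (UUID, ISO date, boolean, digits, float, then string), and `bool` is tested before `int` because `bool` subclasses `int` in Python. A few illustrative checks derived directly from the rules above:

```python
# Sketch: expected inference results given the code above.
from uuid import uuid4

from orchestrator.search.core.types import FieldType

assert FieldType.infer(True) is FieldType.BOOLEAN           # bool checked before int
assert FieldType.infer(42) is FieldType.INTEGER
assert FieldType.infer(uuid4()) is FieldType.UUID
assert FieldType.infer("2025-08-12") is FieldType.DATETIME  # parses as ISO 8601
assert FieldType.infer("false") is FieldType.BOOLEAN        # only "true"/"false" count
assert FieldType.infer("3.14") is FieldType.FLOAT           # not isdigit, parses as float
assert FieldType.infer("hello") is FieldType.STRING
```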
orchestrator/search/core/validators.py
@@ -0,0 +1,27 @@
+ import uuid
+
+ from dateutil.parser import isoparse
+
+
+ def is_uuid(value: str) -> bool:
+     """Check if a string is a valid UUID."""
+     try:
+         uuid.UUID(value)
+         return True
+     except (ValueError, TypeError):
+         return False
+
+
+ def is_iso_date(value: str) -> bool:
+     """Check if a string is a valid ISO 8601 date."""
+     try:
+         isoparse(value)
+         return True
+     except (ValueError, TypeError):
+         return False
+
+
+ def is_bool_string(value: str) -> bool:
+     """Check if a string explicitly represents a boolean value with true/false."""
+
+     return value.strip().lower() in {"true", "false"}
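These helpers back `FieldType._infer_from_str` above. Quick checks of their behavior (a sketch; surrounding whitespace and case are ignored for booleans):

```python
# Sketch: expected validator results given the code above.
from orchestrator.search.core.validators import is_bool_string, is_iso_date, is_uuid

assert is_uuid("123e4567-e89b-12d3-a456-426614174000")
assert is_iso_date("2025-08-12T10:30:00+00:00")
assert is_bool_string("  TRUE ")
assert not is_bool_string("yes")  # only true/false are treated as booleans
```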
orchestrator/search/docs/index.md
@@ -0,0 +1,37 @@
+ # Search Indexing CLI
+
+ Typer-based CLI for maintaining search indexes (subscriptions, products, processes, workflows).
+
+ ## Usage
+
+ Run from the project root:
+
+ ```
+ dotenv run python main.py index [COMMAND] [OPTIONS]
+ ```
+
+ ### Commands
+
+ - `subscriptions` – index `subscription_search_index`
+ - `products` – index `product_search_index`
+ - `processes` – index `process_search_index`
+ - `workflows` – index `workflow_search_index`
+
+ ### Options
+
+ - `--<id>` – UUID of a specific entity (default: all)
+ - `--dry-run` – no DB writes
+ - `--force-index` – re-index even if unchanged
+
+ ### Examples
+
+ ```
+ # Index all subscriptions
+ dotenv run python main.py index subscriptions
+
+ # Re-index all subscriptions
+ dotenv run python main.py index subscriptions --force-index
+
+ # Index a single subscription
+ dotenv run python main.py index subscriptions --subscription-id=<UUID>
+ ```
orchestrator/search/docs/running_local_text_embedding_inference.md
@@ -0,0 +1,45 @@
+ # Running a local MiniLM embedding server with Hugging Face TEI
+
+ Only **OpenAI-compatible endpoints** are supported locally.
+
+ You can spin up an embedding API based on **sentence-transformers/all-MiniLM-L6-v2** using [Hugging Face TEI](https://github.com/huggingface/text-embeddings-inference):
+
+ ```bash
+ docker run --rm -p 8080:80 ghcr.io/huggingface/text-embeddings-inference:cpu-1.8 \
+   --model-id sentence-transformers/all-MiniLM-L6-v2
+ ```
+
+ ---
+
+ ## Environment variables
+
+ Point your backend to the local endpoint and declare the new vector size:
+
+ ```env
+ OPENAI_BASE_URL=http://localhost:8080/v1
+ EMBEDDING_DIMENSION=384
+ ```
+
+ Depending on the model, you may want to adjust the `EMBEDDING_FALLBACK_MAX_TOKENS` and `EMBEDDING_MAX_BATCH_SIZE` settings; their defaults are conservative and match the setup used in this example.
+
+ ---
+
+ ## Apply the schema change
+
+ With these new settings, run:
+
+ ```bash
+ dotenv run python main.py embedding resize
+ ```
+
+ **Note:** this deletes all indexed records, so you will have to re-index.
+
+ ---
+
+ ## Re-index embeddings
+
+ ```bash
+ dotenv run python main.py index subscriptions
+ ```
+
+ The search index now uses **384-dimension MiniLM vectors** served from your local Docker container. That’s it! 🚀
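To verify the container answers before wiring up the backend, you can call it the same way `core/embedding.py` does. A sketch using litellm; the `openai/` model prefix and the dummy API key are assumptions for a generic OpenAI-compatible server, so adjust them to match your `EMBEDDING_MODEL` setting:

```python
# Sanity-check sketch (not part of the package): hit the local TEI endpoint
# through litellm, mirroring the base_url-style call in core/embedding.py.
from litellm import embedding

resp = embedding(
    model="openai/sentence-transformers/all-MiniLM-L6-v2",
    input=["connectivity test"],
    api_key="unused",                      # TEI does not check the key
    base_url="http://localhost:8080/v1",
)
print(len(resp.data[0]["embedding"]))      # expect 384 for MiniLM-L6-v2
```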
orchestrator/search/filters/__init__.py
@@ -0,0 +1,27 @@
+ from .base import (
+     EqualityFilter,
+     FilterCondition,
+     FilterTree,
+     PathFilter,
+     StringFilter,
+ )
+ from .date_filters import DateFilter, DateRangeFilter, DateValueFilter
+ from .ltree_filters import LtreeFilter
+ from .numeric_filter import NumericFilter, NumericRangeFilter, NumericValueFilter
+
+ __all__ = [
+     # Base filter classes
+     "PathFilter",
+     "FilterTree",
+     "FilterCondition",
+     "StringFilter",
+     "EqualityFilter",
+     # Filters for specific value types
+     "NumericValueFilter",
+     "NumericRangeFilter",
+     "DateValueFilter",
+     "DateRangeFilter",
+     "DateFilter",
+     "LtreeFilter",
+     "NumericFilter",
+ ]
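`base.py` itself (+236 lines) is not shown here, so the exact `FilterTree` model is not visible in this diff. Purely as a hypothetical illustration of the shape described in the `set_filter_tree` docstring (an uppercase AND/OR group with path filters under `children`); the field names below are assumptions, not the package's confirmed schema:

```python
# HYPOTHETICAL shape only: field names are guesses based on the tool
# docstring; consult orchestrator/search/filters/base.py for the real model.
candidate_filter_tree = {
    "op": "AND",
    "children": [
        {"path": "subscription.status", "condition": {"op": "eq", "value": "active"}},
        {"path": "subscription.insync", "condition": {"op": "eq", "value": "true"}},
    ],
}
```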