orchestrator-core 4.4.0rc1-py3-none-any.whl → 5.0.0a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +1 -1
- orchestrator/api/api_v1/api.py +7 -0
- orchestrator/api/api_v1/endpoints/agent.py +62 -0
- orchestrator/api/api_v1/endpoints/processes.py +6 -12
- orchestrator/api/api_v1/endpoints/search.py +197 -0
- orchestrator/app.py +4 -0
- orchestrator/cli/index_llm.py +73 -0
- orchestrator/cli/main.py +8 -1
- orchestrator/cli/resize_embedding.py +136 -0
- orchestrator/cli/scheduler.py +29 -39
- orchestrator/cli/search_explore.py +203 -0
- orchestrator/db/models.py +37 -1
- orchestrator/graphql/schema.py +0 -5
- orchestrator/graphql/schemas/process.py +2 -2
- orchestrator/graphql/utils/create_resolver_error_handler.py +1 -1
- orchestrator/migrations/versions/schema/2025-08-12_52b37b5b2714_search_index_model_for_llm_integration.py +95 -0
- orchestrator/schedules/__init__.py +2 -1
- orchestrator/schedules/resume_workflows.py +2 -2
- orchestrator/schedules/scheduling.py +24 -64
- orchestrator/schedules/task_vacuum.py +2 -2
- orchestrator/schedules/validate_products.py +2 -8
- orchestrator/schedules/validate_subscriptions.py +2 -2
- orchestrator/schemas/search.py +101 -0
- orchestrator/search/__init__.py +0 -0
- orchestrator/search/agent/__init__.py +1 -0
- orchestrator/search/agent/prompts.py +62 -0
- orchestrator/search/agent/state.py +8 -0
- orchestrator/search/agent/tools.py +122 -0
- orchestrator/search/core/__init__.py +0 -0
- orchestrator/search/core/embedding.py +64 -0
- orchestrator/search/core/exceptions.py +16 -0
- orchestrator/search/core/types.py +162 -0
- orchestrator/search/core/validators.py +27 -0
- orchestrator/search/docs/index.md +37 -0
- orchestrator/search/docs/running_local_text_embedding_inference.md +45 -0
- orchestrator/search/filters/__init__.py +27 -0
- orchestrator/search/filters/base.py +236 -0
- orchestrator/search/filters/date_filters.py +75 -0
- orchestrator/search/filters/definitions.py +76 -0
- orchestrator/search/filters/ltree_filters.py +31 -0
- orchestrator/search/filters/numeric_filter.py +60 -0
- orchestrator/search/indexing/__init__.py +3 -0
- orchestrator/search/indexing/indexer.py +316 -0
- orchestrator/search/indexing/registry.py +88 -0
- orchestrator/search/indexing/tasks.py +53 -0
- orchestrator/search/indexing/traverse.py +209 -0
- orchestrator/search/retrieval/__init__.py +3 -0
- orchestrator/search/retrieval/builder.py +64 -0
- orchestrator/search/retrieval/engine.py +96 -0
- orchestrator/search/retrieval/ranker.py +202 -0
- orchestrator/search/retrieval/utils.py +88 -0
- orchestrator/search/retrieval/validation.py +174 -0
- orchestrator/search/schemas/__init__.py +0 -0
- orchestrator/search/schemas/parameters.py +114 -0
- orchestrator/search/schemas/results.py +47 -0
- orchestrator/services/processes.py +11 -16
- orchestrator/settings.py +29 -1
- orchestrator/workflow.py +1 -8
- {orchestrator_core-4.4.0rc1.dist-info → orchestrator_core-5.0.0a1.dist-info}/METADATA +6 -3
- {orchestrator_core-4.4.0rc1.dist-info → orchestrator_core-5.0.0a1.dist-info}/RECORD +62 -26
- orchestrator/graphql/resolvers/scheduled_tasks.py +0 -36
- orchestrator/graphql/schemas/scheduled_task.py +0 -8
- orchestrator/schedules/scheduler.py +0 -153
- {orchestrator_core-4.4.0rc1.dist-info → orchestrator_core-5.0.0a1.dist-info}/WHEEL +0 -0
- {orchestrator_core-4.4.0rc1.dist-info → orchestrator_core-5.0.0a1.dist-info}/licenses/LICENSE +0 -0

**orchestrator/schemas/search.py**

@@ -0,0 +1,101 @@
+from datetime import datetime
+from typing import Any, Generic, TypeVar
+from uuid import UUID
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from orchestrator.search.schemas.results import Highlight, PathInfo
+
+T = TypeVar("T")
+
+
+class PageInfoSchema(BaseModel):
+    total_items: int = Field(default=0, alias="totalItems")
+    start_cursor: int = Field(default=0, alias="startCursor")
+    has_previous_page: bool = Field(default=False, alias="hasPreviousPage")
+    has_next_page: bool = Field(default=False, alias="hasNextPage")
+    end_cursor: int = Field(default=0, alias="endCursor")
+    sort_fields: list[str] = Field(default_factory=list, alias="sortFields")
+    filter_fields: list[str] = Field(default_factory=list, alias="filterFields")
+
+
+class ProductSchema(BaseModel):
+    model_config = ConfigDict(from_attributes=True, populate_by_name=True)
+
+    name: str
+    tag: str
+    product_type: str = Field(alias="productType")
+
+
+class SubscriptionSearchResult(BaseModel):
+    score: float
+    highlight: Highlight | None = None
+
+    subscription: dict[str, Any]
+
+
+class ConnectionSchema(BaseModel, Generic[T]):
+    page: list[T]
+    page_info: PageInfoSchema = Field(alias="pageInfo")
+
+    model_config = ConfigDict(populate_by_name=True)
+
+
+class WorkflowProductSchema(BaseModel):
+    """Product associated with a workflow."""
+
+    model_config = ConfigDict(from_attributes=True, populate_by_name=True)
+
+    product_type: str = Field(alias="productType")
+    product_id: UUID = Field(alias="productId")
+    name: str
+
+
+class WorkflowSearchSchema(BaseModel):
+    """Schema for workflow search results."""
+
+    model_config = ConfigDict(from_attributes=True, populate_by_name=True)
+
+    name: str
+    products: list[WorkflowProductSchema]
+    description: str | None = None
+    created_at: datetime | None = Field(alias="createdAt", default=None)
+
+
+class ProductSearchSchema(BaseModel):
+    """Schema for product search results."""
+
+    model_config = ConfigDict(from_attributes=True, populate_by_name=True)
+
+    product_id: UUID = Field(alias="productId")
+    name: str
+    product_type: str = Field(alias="productType")
+    tag: str | None = None
+    description: str | None = None
+    status: str | None = None
+    created_at: datetime | None = Field(alias="createdAt", default=None)
+
+
+class ProcessSearchSchema(BaseModel):
+    """Schema for process search results."""
+
+    model_config = ConfigDict(from_attributes=True, populate_by_name=True)
+
+    process_id: UUID = Field(alias="processId")
+    workflow_name: str = Field(alias="workflowName")
+    workflow_id: UUID = Field(alias="workflowId")
+    status: str = Field(alias="last_status")
+    is_task: bool = Field(alias="isTask")
+    created_by: str | None = Field(alias="createdBy", default=None)
+    started_at: datetime = Field(alias="startedAt")
+    last_modified_at: datetime = Field(alias="lastModifiedAt")
+    last_step: str | None = Field(alias="lastStep", default=None)
+    failed_reason: str | None = Field(alias="failedReason", default=None)
+    subscription_ids: list[UUID] | None = Field(alias="subscriptionIds", default=None)
+
+
+class PathsResponse(BaseModel):
+    prefix: str
+    paths: list[PathInfo]
+
+    model_config = ConfigDict(extra="forbid", use_enum_values=True)

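These models lean on pydantic v2 aliases so the REST layer speaks camelCase while the Python side stays snake_case. A minimal sketch of that behavior, assuming orchestrator-core 5.0.0a1 is installed (the UUID is a placeholder):

```python
# Sketch: alias handling in the new search schemas.
from orchestrator.schemas.search import ConnectionSchema, PageInfoSchema, ProductSearchSchema

# Validation accepts the camelCase aliases...
row = ProductSearchSchema.model_validate(
    {"productId": "a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11", "name": "FW", "productType": "Firewall"}
)

# ...while populate_by_name=True on ConnectionSchema also allows snake_case keywords.
conn = ConnectionSchema[ProductSearchSchema](page=[row], page_info=PageInfoSchema(totalItems=1))

# Dumping with by_alias=True restores the camelCase wire format.
print(conn.model_dump(by_alias=True)["pageInfo"]["totalItems"])  # 1
```
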
**orchestrator/search/__init__.py**: file added without content.

**orchestrator/search/agent/__init__.py**

@@ -0,0 +1 @@
+# This module requires: pydantic-ai==0.7.0, ag-ui-protocol>=0.1.8

**orchestrator/search/agent/prompts.py**

@@ -0,0 +1,62 @@
+import json
+from textwrap import dedent
+
+import structlog
+from pydantic_ai import RunContext
+from pydantic_ai.ag_ui import StateDeps
+
+from orchestrator.search.retrieval.validation import get_structured_filter_schema
+
+from .state import SearchState
+
+logger = structlog.get_logger(__name__)
+
+
+async def get_base_instructions() -> str:
+
+    try:
+        schema_dict = get_structured_filter_schema()
+        if schema_dict:
+            schema_info = "\n".join([f" {path}: {field_type}" for path, field_type in schema_dict.items()])
+        else:
+            schema_info = " No filterable fields available"
+    except Exception as e:
+        logger.warning(f"Failed to load schema for prompt: {e}")
+        schema_info = " Schema temporarily unavailable"
+    logger.error(f"Generated schema for agent prompt:\n{schema_info}")
+
+    return dedent(
+        f"""
+        You are a helpful assistant for building and running database queries.
+
+        **Available Data Schema:**
+        Use the following schema to understand the available fields.
+        When you build filters, each `path` MUST be a valid path from this schema,
+        and the operator/value MUST match that path's type.
+        ```
+        {schema_info}
+        ```
+        **Workflow (do in order):**
+        1) `set_search_parameters` to define the main entity being searched.
+        2) Build a complete `FilterTree` (AND at root unless the user asks for OR).
+        3) `set_filter_tree(filters=<FilterTree or null>)`.
+        4) `execute_search()`.
+        5) Summarize the results for the user.
+        """
+    )
+
+
+async def get_dynamic_instructions(ctx: RunContext[StateDeps[SearchState]]) -> str:
+    """Dynamically generate the system prompt for the agent."""
+    param_state = json.dumps(ctx.deps.state.parameters, indent=2, default=str) if ctx.deps.state.parameters else "{}"
+
+    return dedent(
+        f"""
+        Current search parameters state:
+        {param_state}
+
+        Remember:
+        - If filters are missing or incomplete, construct a full FilterTree and call `set_filter_tree`.
+        - Then call `execute_search`.
+        """
+    )

**orchestrator/search/agent/tools.py**

@@ -0,0 +1,122 @@
+import asyncio
+from collections.abc import Awaitable, Callable
+from typing import Any, TypeVar
+
+import structlog
+from ag_ui.core import EventType, StateSnapshotEvent
+from pydantic_ai import RunContext
+from pydantic_ai.ag_ui import StateDeps
+from pydantic_ai.exceptions import ModelRetry
+from pydantic_ai.messages import ModelRequest, UserPromptPart
+from pydantic_ai.toolsets import FunctionToolset
+
+from orchestrator.api.api_v1.endpoints.search import (
+    search_processes,
+    search_products,
+    search_subscriptions,
+    search_workflows,
+)
+from orchestrator.schemas.search import ConnectionSchema
+from orchestrator.search.core.types import ActionType, EntityType
+from orchestrator.search.filters import FilterTree
+from orchestrator.search.retrieval.validation import validate_filter_tree
+from orchestrator.search.schemas.parameters import PARAMETER_REGISTRY, BaseSearchParameters
+
+from .state import SearchState
+
+logger = structlog.get_logger(__name__)
+P = TypeVar("P", bound=BaseSearchParameters)
+
+SearchFn = Callable[[P], ConnectionSchema[Any]] | Callable[[P], Awaitable[ConnectionSchema[Any]]]
+
+SEARCH_FN_MAP: dict[EntityType, SearchFn] = {
+    EntityType.SUBSCRIPTION: search_subscriptions,
+    EntityType.WORKFLOW: search_workflows,
+    EntityType.PRODUCT: search_products,
+    EntityType.PROCESS: search_processes,
+}
+
+search_toolset: FunctionToolset[StateDeps[SearchState]] = FunctionToolset(max_retries=1)
+
+
+def last_user_message(ctx: RunContext[StateDeps[SearchState]]) -> str | None:
+    for msg in reversed(ctx.messages):
+        if isinstance(msg, ModelRequest):
+            for part in msg.parts:
+                if isinstance(part, UserPromptPart) and isinstance(part.content, str):
+                    return part.content
+    return None
+
+
+@search_toolset.tool  # type: ignore[misc]
+async def set_search_parameters(
+    ctx: RunContext[StateDeps[SearchState]],
+    entity_type: EntityType,
+    action: str | ActionType = ActionType.SELECT,
+) -> StateSnapshotEvent:
+    params = ctx.deps.state.parameters or {}
+    is_new_search = params.get("entity_type") != entity_type.value
+    final_query = (last_user_message(ctx) or "") if is_new_search else params.get("query", "")
+
+    ctx.deps.state.parameters = {"action": action, "entity_type": entity_type, "filters": None, "query": final_query}
+    ctx.deps.state.results = []
+    logger.info(f"Set search parameters: entity_type={entity_type}, action={action}")
+
+    return StateSnapshotEvent(
+        type=EventType.STATE_SNAPSHOT,
+        snapshot=ctx.deps.state.model_dump(),
+    )
+
+
+@search_toolset.tool(retries=2)  # type: ignore[misc]
+async def set_filter_tree(
+    ctx: RunContext[StateDeps[SearchState]],
+    filters: FilterTree | None,
+) -> StateSnapshotEvent:
+    """Replace current filters atomically with a full FilterTree, or clear with None.
+
+    Requirements:
+    - Root/group operators must be 'AND' or 'OR' (uppercase).
+    - Provide either PathFilters or nested groups under `children`.
+    - See the FilterTree schema examples for the exact shape.
+    """
+    if ctx.deps.state.parameters is None:
+        raise ModelRetry("Search parameters are not initialized. Call set_search_parameters first.")
+
+    entity_type = EntityType(ctx.deps.state.parameters["entity_type"])
+
+    try:
+        await validate_filter_tree(filters, entity_type)
+    except Exception as e:
+        raise ModelRetry(str(e))
+
+    ctx.deps.state.parameters["filters"] = None if filters is None else filters.model_dump(mode="json", by_alias=True)
+    logger.info(
+        "Set filter tree",
+        filters=None if filters is None else filters.model_dump(mode="json", by_alias=True),
+    )
+    return StateSnapshotEvent(type=EventType.STATE_SNAPSHOT, snapshot=ctx.deps.state.model_dump())
+
+
+@search_toolset.tool  # type: ignore[misc]
+async def execute_search(
+    ctx: RunContext[StateDeps[SearchState]],
+    limit: int = 5,
+) -> StateSnapshotEvent:
+    """Execute the search with the current parameters."""
+    if not ctx.deps.state.parameters:
+        raise ValueError("No search parameters set")
+
+    entity_type = EntityType(ctx.deps.state.parameters["entity_type"])
+    param_class = PARAMETER_REGISTRY.get(entity_type)
+    if not param_class:
+        raise ValueError(f"Unknown entity type: {entity_type}")
+
+    params = param_class(**ctx.deps.state.parameters)
+    logger.info("Executing database search", **params.model_dump(mode="json"))
+
+    fn = SEARCH_FN_MAP[entity_type]
+    page_connection = await fn(params) if asyncio.iscoroutinefunction(fn) else fn(params)
+    ctx.deps.state.results = [item.model_dump(mode="json") for item in page_connection.page[:limit]]
+
+    return StateSnapshotEvent(type=EventType.STATE_SNAPSHOT, snapshot=ctx.deps.state.model_dump())

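The toolset is designed to be mounted on a pydantic-ai agent and streamed to a frontend over AG-UI. A minimal sketch of that wiring, assuming pydantic-ai 0.7's `Agent(toolsets=...)` and `Agent.to_ag_ui()` APIs; the model name is a placeholder, and the package's real endpoint lives in `orchestrator/api/api_v1/endpoints/agent.py`, which this section does not show:

```python
# Hypothetical wiring sketch; not the package's actual agent endpoint.
from pydantic_ai import Agent
from pydantic_ai.ag_ui import StateDeps

from orchestrator.search.agent.prompts import get_base_instructions, get_dynamic_instructions
from orchestrator.search.agent.state import SearchState
from orchestrator.search.agent.tools import search_toolset

agent = Agent(
    "openai:gpt-4o",  # placeholder model name
    deps_type=StateDeps[SearchState],
    toolsets=[search_toolset],
    instructions=[get_base_instructions, get_dynamic_instructions],
)

# to_ag_ui() returns an ASGI app that streams the StateSnapshotEvents the tools emit.
app = agent.to_ag_ui(deps=StateDeps(SearchState()))
```
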
**orchestrator/search/core/__init__.py**: file added without content.

**orchestrator/search/core/embedding.py**

@@ -0,0 +1,64 @@
+import logging
+
+import structlog
+from litellm import aembedding as llm_aembedding
+from litellm import embedding as llm_embedding
+from litellm import exceptions as llm_exc
+
+from orchestrator.settings import app_settings
+
+logger = structlog.get_logger(__name__)
+
+# It logs a lot of noise, such as embedding vectors.
+logging.getLogger("LiteLLM").setLevel(logging.WARNING)
+
+
+class EmbeddingIndexer:
+
+    @classmethod
+    def get_embeddings_from_api_batch(cls, texts: list[str], dry_run: bool) -> list[list[float]]:
+        if not texts:
+            return []
+        if dry_run:
+            logger.debug("Dry Run: returning empty embeddings")
+            return [[] for _ in texts]
+
+        try:
+            resp = llm_embedding(
+                model=app_settings.EMBEDDING_MODEL,
+                input=[t.lower() for t in texts],
+                api_key=app_settings.OPENAI_API_KEY,
+                base_url=app_settings.OPENAI_BASE_URL,
+                timeout=app_settings.LLM_TIMEOUT,
+                max_retries=app_settings.LLM_MAX_RETRIES,
+            )
+            data = sorted(resp.data, key=lambda e: e["index"])
+            return [row["embedding"] for row in data]
+        except (llm_exc.APIError, llm_exc.APIConnectionError, llm_exc.RateLimitError, llm_exc.Timeout) as e:
+            logger.error("Embedding request failed", error=str(e))
+            return [[] for _ in texts]
+        except Exception as e:
+            logger.error("Unexpected embedding error", error=str(e))
+            return [[] for _ in texts]
+
+
+class QueryEmbedder:
+    """A stateless, async utility for embedding real-time user queries."""
+
+    @classmethod
+    async def generate_for_text_async(cls, text: str) -> list[float]:
+        if not text:
+            return []
+        try:
+            resp = await llm_aembedding(
+                model=app_settings.EMBEDDING_MODEL,
+                input=[text.lower()],
+                api_key=app_settings.OPENAI_API_KEY,
+                base_url=app_settings.OPENAI_BASE_URL,
+                timeout=app_settings.LLM_TIMEOUT,
+                max_retries=app_settings.LLM_MAX_RETRIES,
+            )
+            return resp.data[0]["embedding"]
+        except Exception as e:
+            logger.error("Async embedding generation failed", error=str(e))
+            return []

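`QueryEmbedder` is the retrieval-side counterpart of `EmbeddingIndexer`: one query string in, one vector out, with failures swallowed into an empty list. A minimal usage sketch, assuming `EMBEDDING_MODEL` and the OpenAI-compatible endpoint settings are configured in `orchestrator.settings`:

```python
# Sketch: embedding a live search query.
import asyncio

from orchestrator.search.core.embedding import QueryEmbedder


async def main() -> None:
    vector = await QueryEmbedder.generate_for_text_async("firewall subscriptions in amsterdam")
    # EMBEDDING_DIMENSION floats on success; [] if the request failed (see the except branch above).
    print(len(vector))


asyncio.run(main())
```
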
**orchestrator/search/core/exceptions.py**

@@ -0,0 +1,16 @@
+class SearchUtilsError(Exception):
+    """Base exception for this module."""
+
+    pass
+
+
+class ProductNotInRegistryError(SearchUtilsError):
+    """Raised when a product is not found in the model registry."""
+
+    pass
+
+
+class ModelLoadError(SearchUtilsError):
+    """Raised when a Pydantic model fails to load from a subscription."""
+
+    pass

**orchestrator/search/core/types.py**

@@ -0,0 +1,162 @@
+from dataclasses import dataclass
+from datetime import date, datetime
+from enum import Enum
+from typing import Any, NamedTuple, TypeAlias, TypedDict
+from uuid import UUID
+
+from sqlalchemy.orm.attributes import InstrumentedAttribute
+from sqlalchemy.sql.elements import ColumnElement
+from sqlalchemy_utils.types.ltree import Ltree
+
+from .validators import is_bool_string, is_iso_date, is_uuid
+
+SQLAColumn: TypeAlias = ColumnElement[Any] | InstrumentedAttribute[Any]
+
+
+class BooleanOperator(str, Enum):
+    AND = "AND"
+    OR = "OR"
+
+
+class FilterOp(str, Enum):
+    EQ = "eq"
+    NEQ = "neq"
+    LT = "lt"
+    LIKE = "like"
+    LTE = "lte"
+    GT = "gt"
+    GTE = "gte"
+    BETWEEN = "between"
+
+    MATCHES_LQUERY = "matches_lquery"  # The ~ operator for wildcard matching
+    IS_ANCESTOR = "is_ancestor"  # The @> operator
+    IS_DESCENDANT = "is_descendant"  # The <@ operator
+    PATH_MATCH = "path_match"
+
+
+class EntityType(str, Enum):
+    SUBSCRIPTION = "SUBSCRIPTION"
+    PRODUCT = "PRODUCT"
+    WORKFLOW = "WORKFLOW"
+    PROCESS = "PROCESS"
+
+
+class ActionType(str, Enum):
+    """Defines the explicit, safe actions the agent can request."""
+
+    SELECT = "select"  # Retrieve a list of matching records.
+    # COUNT = "count"  # For phase1; the agent will not support this yet.
+
+
+class UIType(str, Enum):
+    STRING = "string"
+    NUMBER = "number"
+    BOOLEAN = "boolean"
+    DATETIME = "datetime"
+
+    @classmethod
+    def from_field_type(cls, ft: "FieldType") -> "UIType":
+        """Create a UIType from a backend FieldType to indicate how a value must be rendered."""
+        if ft in (FieldType.INTEGER, FieldType.FLOAT):
+            return cls.NUMBER
+        if ft == FieldType.BOOLEAN:
+            return cls.BOOLEAN
+        if ft == FieldType.DATETIME:
+            return cls.DATETIME
+        return cls.STRING
+
+
+class FieldType(str, Enum):
+    STRING = "string"
+    INTEGER = "integer"
+    FLOAT = "float"
+    BOOLEAN = "boolean"
+    DATETIME = "datetime"
+    UUID = "uuid"
+    BLOCK = "block"
+    RESOURCE_TYPE = "resource_type"
+
+    @classmethod
+    def infer(cls, val: Any) -> "FieldType":
+        if isinstance(val, TypedValue):
+            return cls._infer_typed_value(val)
+
+        if isinstance(val, bool):
+            return cls.BOOLEAN
+        if isinstance(val, int):
+            return cls.INTEGER
+        if isinstance(val, float):
+            return cls.FLOAT
+        if isinstance(val, UUID):
+            return cls.UUID
+        if isinstance(val, (datetime, date)):
+            return cls.DATETIME
+        if isinstance(val, str):
+            return cls._infer_from_str(val)
+
+        return cls.STRING
+
+    @classmethod
+    def _infer_typed_value(cls, val: "TypedValue") -> "FieldType":
+        if val.type == cls.BLOCK:
+            return cls.BLOCK
+        if val.type == cls.RESOURCE_TYPE:
+            return cls.RESOURCE_TYPE
+        return cls.STRING
+
+    @classmethod
+    def _infer_from_str(cls, val: str) -> "FieldType":
+        if is_uuid(val):
+            return cls.UUID
+        if is_iso_date(val):
+            return cls.DATETIME
+        if is_bool_string(val):
+            return cls.BOOLEAN
+        if val.isdigit():
+            return cls.INTEGER
+        try:
+            float(val)
+            return cls.FLOAT
+        except ValueError:
+            return cls.STRING
+
+    def pg_cast(self) -> str:
+        return {
+            FieldType.STRING: "::text",
+            FieldType.INTEGER: "::integer",
+            FieldType.FLOAT: "::double precision",
+            FieldType.BOOLEAN: "::boolean",
+            FieldType.DATETIME: "::timestamptz",
+            FieldType.UUID: "::uuid",
+        }.get(self, "::text")
+
+    def is_embeddable(self) -> bool:
+        return self == FieldType.STRING
+
+
+@dataclass(frozen=True)
+class TypedValue:
+    value: Any
+    type: FieldType
+
+
+class ExtractedField(NamedTuple):
+    path: str
+    value: str
+    value_type: FieldType
+
+    @classmethod
+    def from_raw(cls, path: str, raw_value: Any) -> "ExtractedField":
+        value = str(raw_value.value if isinstance(raw_value, TypedValue) else raw_value)
+        value_type = FieldType.infer(raw_value)
+        return cls(path=path, value=value, value_type=value_type)
+
+
+class IndexableRecord(TypedDict):
+    entity_id: str
+    entity_type: str
+    path: Ltree
+    value: Any
+    value_type: Any
+    content_hash: str
+    embedding: list[float] | None

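The checks in `FieldType.infer` are ordered deliberately: bool before int (bool subclasses int), and the ISO-date check before the digit check. A few concrete cases, runnable against the module above:

```python
from orchestrator.search.core.types import ExtractedField, FieldType

assert FieldType.infer(True) is FieldType.BOOLEAN           # bool is tested before int
assert FieldType.infer("true") is FieldType.BOOLEAN         # via is_bool_string
assert FieldType.infer("2025-08-12") is FieldType.DATETIME  # via is_iso_date
assert FieldType.infer("42") is FieldType.INTEGER           # isdigit(), after the date check fails
assert FieldType.infer("1.5") is FieldType.FLOAT            # float() succeeds

field = ExtractedField.from_raw("subscription.status", "active")
# -> path='subscription.status', value='active', value_type=FieldType.STRING
print(field.value_type.pg_cast())  # ::text
```
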
**orchestrator/search/core/validators.py**

@@ -0,0 +1,27 @@
+import uuid
+
+from dateutil.parser import isoparse
+
+
+def is_uuid(value: str) -> bool:
+    """Check if a string is a valid UUID."""
+    try:
+        uuid.UUID(value)
+        return True
+    except (ValueError, TypeError):
+        return False
+
+
+def is_iso_date(value: str) -> bool:
+    """Check if a string is a valid ISO 8601 date."""
+    try:
+        isoparse(value)
+        return True
+    except (ValueError, TypeError):
+        return False
+
+
+def is_bool_string(value: str) -> bool:
+    """Check if a string explicitly represents a boolean value with true/false."""
+
+    return value.strip().lower() in {"true", "false"}

**orchestrator/search/docs/index.md**

@@ -0,0 +1,37 @@
+# Search Indexing CLI
+
+Typer-based CLI for maintaining search indexes (subscriptions, products, processes, workflows).
+
+## Usage
+
+Run from project root:
+
+```
+dotenv run python main.py index [COMMAND] [OPTIONS]
+```
+
+### Commands
+
+- `subscriptions` – index `subscription_search_index`
+- `products` – index `product_search_index`
+- `processes` – index `process_search_index`
+- `workflows` – index `workflow_search_index`
+
+### Options
+
+- `--<id>` – UUID of a specific entity (default: all)
+- `--dry-run` – no DB writes
+- `--force-index` – re-index even if unchanged
+
+### Examples
+
+```
+# Index all subscriptions
+dotenv run python main.py index subscriptions
+
+# Re-index all subscriptions
+dotenv run python main.py index subscriptions --force-index
+
+# Index a single subscription
+dotenv run python main.py index subscriptions --subscription-id=<UUID>
+```

**orchestrator/search/docs/running_local_text_embedding_inference.md**

@@ -0,0 +1,45 @@
+# Running a local MiniLM embedding server with Hugging Face TEI
+
+Only **OpenAI-compatible endpoints** are supported locally.
+
+You can spin up an embedding API based on **sentence-transformers/all-MiniLM-L6-v2** using [Hugging Face TEI](https://github.com/huggingface/text-embeddings-inference):
+
+```bash
+docker run --rm -p 8080:80 ghcr.io/huggingface/text-embeddings-inference:cpu-1.8 \
+  --model-id sentence-transformers/all-MiniLM-L6-v2
+```
+
+---
+
+## Environment variables
+
+Point your backend to the local endpoint and declare the new vector size:
+
+```env
+OPENAI_BASE_URL=http://localhost:8080/v1
+EMBEDDING_DIMENSION=384
+```
+
+Depending on the model, you may also want to adjust the `EMBEDDING_FALLBACK_MAX_TOKENS` and `EMBEDDING_MAX_BATCH_SIZE` settings; their defaults are conservative and match the setup used in this example.
+
+---
+
+## Apply the schema change
+
+With these new settings, run:
+
+```bash
+dotenv run python main.py embedding resize
+```
+
+**Note** that this will delete all indexed records and you will have to re-index.
+
+---
+
+## Re-index embeddings
+
+```bash
+dotenv run python main.py index subscriptions
+```
+
+The search index now uses **384-dimension MiniLM vectors** served from your local Docker container. That’s it! 🚀

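Once the container is up, it can be worth verifying the endpoint before pointing the orchestrator at it. A small check in Python (a sketch; the URL and payload assume the docker command above and TEI's OpenAI-compatible `/v1/embeddings` route):

```python
# Sanity-check the local TEI endpoint; expects 384 floats for all-MiniLM-L6-v2.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8080/v1/embeddings",
    data=json.dumps({"input": ["hello world"], "model": "sentence-transformers/all-MiniLM-L6-v2"}).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    payload = json.load(resp)
print(len(payload["data"][0]["embedding"]))  # 384
```
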
**orchestrator/search/filters/__init__.py**

@@ -0,0 +1,27 @@
+from .base import (
+    EqualityFilter,
+    FilterCondition,
+    FilterTree,
+    PathFilter,
+    StringFilter,
+)
+from .date_filters import DateFilter, DateRangeFilter, DateValueFilter
+from .ltree_filters import LtreeFilter
+from .numeric_filter import NumericFilter, NumericRangeFilter, NumericValueFilter
+
+__all__ = [
+    # Base filter classes
+    "PathFilter",
+    "FilterTree",
+    "FilterCondition",
+    "StringFilter",
+    "EqualityFilter",
+    # Filters for specific value types
+    "NumericValueFilter",
+    "NumericRangeFilter",
+    "DateValueFilter",
+    "DateRangeFilter",
+    "DateFilter",
+    "LtreeFilter",
+    "NumericFilter",
+]

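`FilterTree` and `PathFilter` are the building blocks that the agent's `set_filter_tree` tool validates. Their exact field names live in `orchestrator/search/filters/base.py`, which this diff section does not show, so the payload below is purely illustrative, shaped only after the tool's docstring (an uppercase group operator, with path filters under `children`):

```python
# Hypothetical FilterTree payload; every key below is an assumption, not the package's schema.
illustrative_filter_tree = {
    "op": "AND",  # root/group operators must be uppercase 'AND' or 'OR'
    "children": [
        {"path": "subscription.status", "condition": {"op": "eq", "value": "active"}},
        {"path": "subscription.start_date", "condition": {"op": "gte", "value": "2024-01-01"}},
    ],
}
```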