orchestrator-core 4.4.2-py3-none-any.whl → 4.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. orchestrator/__init__.py +17 -2
  2. orchestrator/agentic_app.py +103 -0
  3. orchestrator/api/api_v1/api.py +14 -2
  4. orchestrator/api/api_v1/endpoints/search.py +296 -0
  5. orchestrator/app.py +32 -0
  6. orchestrator/cli/main.py +22 -1
  7. orchestrator/cli/search/__init__.py +32 -0
  8. orchestrator/cli/search/index_llm.py +73 -0
  9. orchestrator/cli/search/resize_embedding.py +135 -0
  10. orchestrator/cli/search/search_explore.py +208 -0
  11. orchestrator/cli/search/speedtest.py +151 -0
  12. orchestrator/db/models.py +37 -1
  13. orchestrator/devtools/populator.py +16 -0
  14. orchestrator/domain/base.py +2 -7
  15. orchestrator/domain/lifecycle.py +24 -7
  16. orchestrator/llm_settings.py +57 -0
  17. orchestrator/log_config.py +1 -0
  18. orchestrator/migrations/helpers.py +7 -1
  19. orchestrator/schemas/search.py +130 -0
  20. orchestrator/schemas/workflow.py +1 -0
  21. orchestrator/search/__init__.py +12 -0
  22. orchestrator/search/agent/__init__.py +21 -0
  23. orchestrator/search/agent/agent.py +62 -0
  24. orchestrator/search/agent/prompts.py +100 -0
  25. orchestrator/search/agent/state.py +21 -0
  26. orchestrator/search/agent/tools.py +258 -0
  27. orchestrator/search/core/__init__.py +12 -0
  28. orchestrator/search/core/embedding.py +73 -0
  29. orchestrator/search/core/exceptions.py +36 -0
  30. orchestrator/search/core/types.py +296 -0
  31. orchestrator/search/core/validators.py +40 -0
  32. orchestrator/search/docs/index.md +37 -0
  33. orchestrator/search/docs/running_local_text_embedding_inference.md +46 -0
  34. orchestrator/search/filters/__init__.py +40 -0
  35. orchestrator/search/filters/base.py +295 -0
  36. orchestrator/search/filters/date_filters.py +88 -0
  37. orchestrator/search/filters/definitions.py +107 -0
  38. orchestrator/search/filters/ltree_filters.py +56 -0
  39. orchestrator/search/filters/numeric_filter.py +73 -0
  40. orchestrator/search/indexing/__init__.py +16 -0
  41. orchestrator/search/indexing/indexer.py +334 -0
  42. orchestrator/search/indexing/registry.py +101 -0
  43. orchestrator/search/indexing/tasks.py +69 -0
  44. orchestrator/search/indexing/traverse.py +334 -0
  45. orchestrator/search/llm_migration.py +108 -0
  46. orchestrator/search/retrieval/__init__.py +16 -0
  47. orchestrator/search/retrieval/builder.py +123 -0
  48. orchestrator/search/retrieval/engine.py +154 -0
  49. orchestrator/search/retrieval/exceptions.py +90 -0
  50. orchestrator/search/retrieval/pagination.py +96 -0
  51. orchestrator/search/retrieval/retrievers/__init__.py +26 -0
  52. orchestrator/search/retrieval/retrievers/base.py +123 -0
  53. orchestrator/search/retrieval/retrievers/fuzzy.py +94 -0
  54. orchestrator/search/retrieval/retrievers/hybrid.py +277 -0
  55. orchestrator/search/retrieval/retrievers/semantic.py +94 -0
  56. orchestrator/search/retrieval/retrievers/structured.py +39 -0
  57. orchestrator/search/retrieval/utils.py +120 -0
  58. orchestrator/search/retrieval/validation.py +152 -0
  59. orchestrator/search/schemas/__init__.py +12 -0
  60. orchestrator/search/schemas/parameters.py +129 -0
  61. orchestrator/search/schemas/results.py +77 -0
  62. orchestrator/services/processes.py +1 -1
  63. orchestrator/services/settings_env_variables.py +2 -2
  64. orchestrator/settings.py +8 -1
  65. orchestrator/utils/state.py +6 -1
  66. orchestrator/workflows/steps.py +15 -1
  67. orchestrator/workflows/tasks/validate_products.py +1 -1
  68. {orchestrator_core-4.4.2.dist-info → orchestrator_core-4.5.0.dist-info}/METADATA +15 -8
  69. {orchestrator_core-4.4.2.dist-info → orchestrator_core-4.5.0.dist-info}/RECORD +71 -21
  70. {orchestrator_core-4.4.2.dist-info → orchestrator_core-4.5.0.dist-info}/WHEEL +0 -0
  71. {orchestrator_core-4.4.2.dist-info → orchestrator_core-4.5.0.dist-info}/licenses/LICENSE +0 -0
orchestrator/search/filters/ltree_filters.py
@@ -0,0 +1,56 @@
+# Copyright 2019-2025 SURF, GÉANT.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Literal
+
+from pydantic import BaseModel, Field
+from sqlalchemy import TEXT, bindparam
+from sqlalchemy.sql.elements import ColumnElement
+from sqlalchemy_utils.types.ltree import Ltree
+
+from orchestrator.search.core.types import LTREE_SEPARATOR, FilterOp, SQLAColumn
+
+
+class LtreeFilter(BaseModel):
+    """Filter for ltree path operations."""
+
+    op: Literal[
+        FilterOp.MATCHES_LQUERY,
+        FilterOp.IS_ANCESTOR,
+        FilterOp.IS_DESCENDANT,
+        FilterOp.PATH_MATCH,
+        FilterOp.HAS_COMPONENT,
+        FilterOp.NOT_HAS_COMPONENT,
+        FilterOp.ENDS_WITH,
+    ]
+    value: str = Field(description="The ltree path or lquery pattern to compare against.")
+
+    def to_expression(self, column: SQLAColumn, path: str) -> ColumnElement[bool]:
+        """Converts the filter condition into a SQLAlchemy expression."""
+        match self.op:
+            case FilterOp.IS_DESCENDANT:
+                ltree_value = Ltree(self.value)
+                return column.op("<@")(ltree_value)
+            case FilterOp.IS_ANCESTOR:
+                ltree_value = Ltree(self.value)
+                return column.op("@>")(ltree_value)
+            case FilterOp.MATCHES_LQUERY:
+                param = bindparam(None, self.value, type_=TEXT)
+                return column.op("~")(param)
+            case FilterOp.PATH_MATCH:
+                ltree_value = Ltree(path)
+                return column == ltree_value
+            case FilterOp.HAS_COMPONENT | FilterOp.NOT_HAS_COMPONENT:
+                return column.op("~")(bindparam(None, f"*{LTREE_SEPARATOR}{self.value}{LTREE_SEPARATOR}*", type_=TEXT))
+            case FilterOp.ENDS_WITH:
+                return column.op("~")(bindparam(None, f"*{LTREE_SEPARATOR}{self.value}", type_=TEXT))
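Each `LtreeFilter` turns a declared operator into a PostgreSQL ltree expression against an Ltree column. A minimal sketch of how such a filter might be applied to the `AiSearchIndex.path` column added elsewhere in this release — illustrative only; the exact `FilterOp` member semantics are taken from the `Literal` above, and the query usage at the end is an assumption, not code from the diff:

```python
# Illustrative sketch (not part of the diff).
from orchestrator.db.models import AiSearchIndex
from orchestrator.search.core.types import FilterOp
from orchestrator.search.filters.ltree_filters import LtreeFilter

# "Match every indexed field whose path lives under the subscription root."
descendant_filter = LtreeFilter(op=FilterOp.IS_DESCENDANT, value="subscription")
condition = descendant_filter.to_expression(AiSearchIndex.path, path="subscription")

# `condition` is a ColumnElement[bool] that can be attached to a query, e.g.:
# select(AiSearchIndex).where(condition)
```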
orchestrator/search/filters/numeric_filter.py
@@ -0,0 +1,73 @@
+# Copyright 2019-2025 SURF, GÉANT.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Annotated, Any, Literal
+
+from pydantic import BaseModel, Field, model_validator
+from sqlalchemy import DOUBLE_PRECISION, INTEGER, and_
+from sqlalchemy import cast as sa_cast
+from sqlalchemy.sql.elements import ColumnElement
+from typing_extensions import Self
+
+from orchestrator.search.core.types import FilterOp, SQLAColumn
+
+
+class NumericRange(BaseModel):
+    start: int | float
+    end: int | float
+
+    @model_validator(mode="after")
+    def validate_order(self) -> Self:
+        if self.end <= self.start:
+            raise ValueError("'end' must be greater than 'start'")
+        return self
+
+
+class NumericValueFilter(BaseModel):
+    """A filter for single numeric value comparisons (int or float)."""
+
+    op: Literal[FilterOp.EQ, FilterOp.NEQ, FilterOp.LT, FilterOp.LTE, FilterOp.GT, FilterOp.GTE]
+    value: int | float
+
+    def to_expression(self, column: SQLAColumn, path: str) -> ColumnElement[bool]:
+        cast_type = INTEGER if isinstance(self.value, int) else DOUBLE_PRECISION
+        numeric_column: ColumnElement[Any] = sa_cast(column, cast_type)
+        match self.op:
+
+            case FilterOp.EQ:
+                return numeric_column == self.value
+            case FilterOp.NEQ:
+                return numeric_column != self.value
+            case FilterOp.LT:
+                return numeric_column < self.value
+            case FilterOp.LTE:
+                return numeric_column <= self.value
+            case FilterOp.GT:
+                return numeric_column > self.value
+            case FilterOp.GTE:
+                return numeric_column >= self.value
+
+
+class NumericRangeFilter(BaseModel):
+    """A filter for a range of numeric values (int or float)."""
+
+    op: Literal[FilterOp.BETWEEN]
+    value: NumericRange
+
+    def to_expression(self, column: SQLAColumn, path: str) -> ColumnElement[bool]:
+        cast_type = INTEGER if isinstance(self.value.start, int) else DOUBLE_PRECISION
+        numeric_column: ColumnElement[Any] = sa_cast(column, cast_type)
+        return and_(numeric_column >= self.value.start, numeric_column <= self.value.end)
+
+
+NumericFilter = Annotated[NumericValueFilter | NumericRangeFilter, Field(discriminator="op")]
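`NumericFilter` is a discriminated union: pydantic selects the concrete model from the `op` field, so a single-value comparison and a range filter can be parsed from the same untyped input. A minimal sketch, assuming pydantic v2's `TypeAdapter` and the `FilterOp` members shown above:

```python
# Illustrative sketch (not part of the diff).
from pydantic import TypeAdapter

from orchestrator.search.core.types import FilterOp
from orchestrator.search.filters.numeric_filter import NumericFilter

adapter = TypeAdapter(NumericFilter)

# Parsed as NumericValueFilter because the discriminator is GTE.
gte = adapter.validate_python({"op": FilterOp.GTE, "value": 100})

# Parsed as NumericRangeFilter; NumericRange rejects end <= start.
between = adapter.validate_python({"op": FilterOp.BETWEEN, "value": {"start": 10, "end": 20}})
```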
orchestrator/search/indexing/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2019-2025 SURF, GÉANT.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .tasks import run_indexing_for_entity
+
+__all__ = ["run_indexing_for_entity"]
orchestrator/search/indexing/indexer.py
@@ -0,0 +1,334 @@
+# Copyright 2019-2025 SURF, GÉANT.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import hashlib
+from collections.abc import Generator, Iterable, Iterator
+from contextlib import contextmanager, nullcontext
+from functools import lru_cache
+from typing import Any
+
+import structlog
+from litellm.utils import encode, get_max_tokens
+from sqlalchemy import delete, tuple_
+from sqlalchemy.dialects.postgresql import insert
+from sqlalchemy.dialects.postgresql.dml import Insert
+from sqlalchemy.orm import Session
+from sqlalchemy_utils.types.ltree import Ltree
+
+from orchestrator.db import db
+from orchestrator.db.models import AiSearchIndex
+from orchestrator.llm_settings import llm_settings
+from orchestrator.search.core.embedding import EmbeddingIndexer
+from orchestrator.search.core.types import ExtractedField, IndexableRecord
+from orchestrator.search.indexing.registry import EntityConfig
+from orchestrator.search.indexing.traverse import DatabaseEntity
+
+logger = structlog.get_logger(__name__)
+
+
+@contextmanager
+def _maybe_begin(session: Session | None) -> Iterator[None]:
+    if session is None:
+        yield
+    else:
+        with session.begin():
+            yield
+
+
+class Indexer:
+    """Index entities into `AiSearchIndex` using streaming reads and batched writes.
+
+    Entities are read from a streaming iterator and accumulated into chunks of
+    size `chunk_size`. For each chunk, the indexer extracts fields, diffs via
+    content hashes, deletes stale paths, and prepares upserts using a two-list
+    buffer:
+    - Embeddable list (STRING fields): maintains a running token count against a
+      token budget (model context window minus a safety margin) and flushes when
+      adding the next item would exceed the budget.
+    - Non-embeddable list: accumulated in parallel and does not contribute to the
+      flush condition.
+    Each flush (or end-of-chunk) emits a single combined UPSERT batch from both
+    lists (wrapped in a per-chunk transaction in non-dry-runs).
+
+    Args:
+        config (EntityConfig): Registry config describing the entity kind,
+            ORM table, and traverser.
+        dry_run (bool): If True, skip DELETE/UPSERT statements and external
+            embedding calls.
+        force_index (bool): If True, ignore existing hashes and reindex all
+            fields for each entity.
+        chunk_size (int): Number of entities to process per batch. Defaults to 1000.
+
+    Notes:
+        - Non-dry-run runs open a write session and wrap each processed chunk in
+          a transaction (`Session.begin()`).
+        - Read queries use the passed session when available, otherwise the
+          generic `db.session`.
+
+    Workflow:
+        1) Stream entities (yield_per=chunk_size) and accumulate into a chunk.
+        2) Begin transaction for the chunk.
+        3) determine_changes() → fields_to_upsert, paths_to_delete.
+        4) Delete stale paths.
+        5) Build UPSERT batches with a two-list buffer:
+           - Embeddable list (STRING): track running token count; flush when next item
+             would exceed the token budget (model max context - safety margin).
+           - Non-embeddable list: accumulate in parallel; does not affect flushing.
+        6) Execute UPSERT for each batch (skip in dry_run).
+        7) Commit transaction (auto on context exit).
+        8) Repeat until the stream is exhausted.
+    """
+
+    def __init__(self, config: EntityConfig, dry_run: bool, force_index: bool, chunk_size: int = 1000) -> None:
+        self.config = config
+        self.dry_run = dry_run
+        self.force_index = force_index
+        self.chunk_size = chunk_size
+        self.embedding_model = llm_settings.EMBEDDING_MODEL
+        self.logger = logger.bind(entity_kind=config.entity_kind.value)
+
+    def run(self, entities: Iterable[DatabaseEntity]) -> int:
+        """Orchestrates the entire indexing process."""
+        chunk: list[DatabaseEntity] = []
+        total_records_processed = 0
+        total_identical_records = 0
+
+        write_scope = db.database_scope() if not self.dry_run else nullcontext()
+
+        def flush() -> None:
+            nonlocal total_records_processed, total_identical_records
+            with _maybe_begin(session):
+                processed_in_chunk, identical_in_chunk = self._process_chunk(chunk, session)
+                total_records_processed += processed_in_chunk
+                total_identical_records += identical_in_chunk
+                chunk.clear()
+
+        with write_scope as database:
+            session: Session | None = getattr(database, "session", None)
+            for entity in entities:
+                chunk.append(entity)
+                if len(chunk) >= self.chunk_size:
+                    flush()
+
+            if chunk:
+                flush()
+
+        final_log_message = (
+            f"processed {total_records_processed} records and skipped {total_identical_records} identical records."
+        )
+        self.logger.info(
+            f"Dry run, would have indexed {final_log_message}"
+            if self.dry_run
+            else f"Indexing done, {final_log_message}"
+        )
+        return total_records_processed
+
+    def _process_chunk(self, entity_chunk: list[DatabaseEntity], session: Session | None = None) -> tuple[int, int]:
+        """Process a chunk of entities."""
+        if not entity_chunk:
+            return 0, 0
+
+        fields_to_upsert, paths_to_delete, identical_count = self._determine_changes(entity_chunk, session)
+
+        if paths_to_delete and session is not None:
+            self.logger.debug(f"Deleting {len(paths_to_delete)} stale records in chunk.")
+            self._execute_batched_deletes(paths_to_delete, session)
+
+        if fields_to_upsert:
+            upsert_stmt = self._get_upsert_statement()
+            batch_generator = self._generate_upsert_batches(fields_to_upsert)
+
+            for batch in batch_generator:
+                if self.dry_run:
+                    self.logger.debug(f"Dry Run: Would upsert {len(batch)} records.")
+                elif batch and session:
+                    session.execute(upsert_stmt, batch)
+
+        return len(fields_to_upsert), identical_count
+
+    def _determine_changes(
+        self, entities: list[DatabaseEntity], session: Session | None = None
+    ) -> tuple[list[tuple[str, ExtractedField]], list[tuple[str, Ltree]], int]:
+        """Identifies all changes across all entities using pre-fetched data."""
+        entity_ids = [str(getattr(e, self.config.pk_name)) for e in entities]
+        read_session = session or db.session
+        existing_hashes = {} if self.force_index else self._get_all_existing_hashes(entity_ids, read_session)
+
+        fields_to_upsert: list[tuple[str, ExtractedField]] = []
+        paths_to_delete: list[tuple[str, Ltree]] = []
+        identical_records_count = 0
+
+        for entity in entities:
+            entity_id = str(getattr(entity, self.config.pk_name))
+            current_fields = self.config.traverser.get_fields(
+                entity, pk_name=self.config.pk_name, root_name=self.config.root_name
+            )
+
+            entity_hashes = existing_hashes.get(entity_id, {})
+            current_paths = set()
+
+            for field in current_fields:
+                current_paths.add(field.path)
+                current_hash = self._compute_content_hash(field.path, field.value, field.value_type)
+                if field.path not in entity_hashes or entity_hashes[field.path] != current_hash:
+                    fields_to_upsert.append((entity_id, field))
+                else:
+                    identical_records_count += 1
+
+            stale_paths = set(entity_hashes.keys()) - current_paths
+            paths_to_delete.extend([(entity_id, Ltree(p)) for p in stale_paths])
+
+        return fields_to_upsert, paths_to_delete, identical_records_count
+
+    def _execute_batched_deletes(self, paths_to_delete: list[tuple[str, Ltree]], session: Session) -> None:
+        """Execute delete operations in batches to avoid PostgreSQL stack depth limits."""
+        for i in range(0, len(paths_to_delete), self.chunk_size):
+            batch = paths_to_delete[i : i + self.chunk_size]
+            delete_stmt = delete(AiSearchIndex).where(tuple_(AiSearchIndex.entity_id, AiSearchIndex.path).in_(batch))
+            session.execute(delete_stmt)
+            self.logger.debug(f"Deleted batch of {len(batch)} records.")
+
+    def _get_all_existing_hashes(self, entity_ids: list[str], session: Session) -> dict[str, dict[str, str]]:
+        """Fetches all existing hashes for a list of entity IDs in a single query."""
+        if not entity_ids:
+            return {}
+
+        results = (
+            session.query(AiSearchIndex.entity_id, AiSearchIndex.path, AiSearchIndex.content_hash)
+            .filter(AiSearchIndex.entity_id.in_(entity_ids))
+            .all()
+        )
+
+        hashes_by_entity: dict[str, dict[str, str]] = {eid: {} for eid in entity_ids}
+        for entity_id, path, content_hash in results:
+            hashes_by_entity[str(entity_id)][str(path)] = content_hash
+        return hashes_by_entity
+
+    def _generate_upsert_batches(
+        self, fields_to_upsert: Iterable[tuple[str, ExtractedField]]
+    ) -> Generator[list[IndexableRecord], None, None]:
+        """Streams through fields, buffers them by token count, and yields batches."""
+        embeddable_buffer: list[tuple[str, ExtractedField]] = []
+        non_embeddable_records: list[IndexableRecord] = []
+        current_tokens = 0
+
+        max_ctx = self._get_max_tokens()
+        safe_margin = int(max_ctx * llm_settings.EMBEDDING_SAFE_MARGIN_PERCENT)
+        token_budget = max(1, max_ctx - safe_margin)
+
+        max_batch_size = llm_settings.EMBEDDING_MAX_BATCH_SIZE
+
+        for entity_id, field in fields_to_upsert:
+            if field.value_type.is_embeddable(field.value):
+                text = self._prepare_text_for_embedding(field)
+                try:
+                    item_tokens = len(encode(model=self.embedding_model, text=text))
+                except Exception as e:
+                    self.logger.warning("Tokenization failed; skipping.", path=field.path, err=str(e))
+                    continue
+
+                if item_tokens > max_ctx:
+                    self.logger.warning(
+                        "Field exceeds context; skipping.", path=field.path, tokens=item_tokens, max_ctx=max_ctx
+                    )
+                    continue
+
+                should_flush = embeddable_buffer and (
+                    current_tokens + item_tokens > token_budget
+                    or (max_batch_size and len(embeddable_buffer) >= max_batch_size)
+                )
+
+                if should_flush:
+                    yield self._flush_buffer(embeddable_buffer, non_embeddable_records)
+                    embeddable_buffer.clear()
+                    non_embeddable_records.clear()
+                    current_tokens = 0
+
+                embeddable_buffer.append((entity_id, field))
+                current_tokens += item_tokens
+            else:
+                record = self._make_indexable_record(field, entity_id, embedding=None)
+                non_embeddable_records.append(record)
+
+        if embeddable_buffer or non_embeddable_records:
+            yield self._flush_buffer(embeddable_buffer, non_embeddable_records)
+
+    def _flush_buffer(self, embeddable_buffer: list, non_embeddable_records: list) -> list[IndexableRecord]:
+        """Processes and combines buffers into a single batch."""
+        if not embeddable_buffer:
+            return non_embeddable_records
+
+        texts_to_embed = [self._prepare_text_for_embedding(f) for _, f in embeddable_buffer]
+        embeddings = EmbeddingIndexer.get_embeddings_from_api_batch(texts_to_embed, self.dry_run)
+
+        if len(embeddable_buffer) != len(embeddings):
+            raise ValueError(f"Embedding mismatch: sent {len(embeddable_buffer)}, received {len(embeddings)}")
+
+        with_embeddings = [
+            self._make_indexable_record(field, entity_id, embedding)
+            for (entity_id, field), embedding in zip(embeddable_buffer, embeddings)
+        ]
+        return non_embeddable_records + with_embeddings
+
+    def _get_max_tokens(self) -> int:
+        """Gets max tokens, using a fallback from settings if necessary."""
+        try:
+            max_ctx = get_max_tokens(self.embedding_model)
+            if isinstance(max_ctx, int):
+                return max_ctx
+        except Exception:
+            # Allow local (unknown) models to fall back.
+            self.logger.warning("Could not auto-detect max tokens.", model=self.embedding_model)
+
+        max_ctx = llm_settings.EMBEDDING_FALLBACK_MAX_TOKENS
+        if not isinstance(max_ctx, int):
+            raise RuntimeError("Model not recognized and EMBEDDING_FALLBACK_MAX_TOKENS not set.")
+        self.logger.warning("Using configured fallback token limit.", fallback=max_ctx)
+        return max_ctx
+
+    @staticmethod
+    def _prepare_text_for_embedding(field: ExtractedField) -> str:
+        return f"{field.path}: {str(field.value)}"
+
+    @staticmethod
+    def _compute_content_hash(path: str, value: Any, value_type: Any) -> str:
+        v = "" if value is None else str(value)
+        content = f"{path}:{v}:{value_type}"
+        return hashlib.sha256(content.encode("utf-8")).hexdigest()
+
+    def _make_indexable_record(
+        self, field: ExtractedField, entity_id: str, embedding: list[float] | None
+    ) -> IndexableRecord:
+        return IndexableRecord(
+            entity_id=entity_id,
+            entity_type=self.config.entity_kind.value,
+            path=Ltree(field.path),
+            value=field.value,
+            value_type=field.value_type,
+            content_hash=self._compute_content_hash(field.path, field.value, field.value_type),
+            embedding=embedding if embedding else None,
+        )

+    @staticmethod
+    @lru_cache(maxsize=1)
+    def _get_upsert_statement() -> Insert:
+        stmt = insert(AiSearchIndex)
+        return stmt.on_conflict_do_update(
+            index_elements=[AiSearchIndex.entity_id, AiSearchIndex.path],
+            set_={
+                AiSearchIndex.value: stmt.excluded.value,
+                AiSearchIndex.value_type: stmt.excluded.value_type,
+                AiSearchIndex.content_hash: stmt.excluded.content_hash,
+                AiSearchIndex.embedding: stmt.excluded.embedding,
+            },
+        )
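The docstring's flush rule boils down to a small arithmetic check: the usable token budget is the model's context window minus a safety margin, and a batch is flushed before adding any item that would overflow it (or exceed the configured batch size). A standalone sketch of that decision, with made-up numbers standing in for the `llm_settings` values:

```python
# Minimal sketch of the flush rule described in the Indexer docstring.
# The constants below are illustrative, not values from llm_settings.
max_ctx = 8192                                 # model context window (tokens)
safe_margin = int(max_ctx * 0.10)              # e.g. EMBEDDING_SAFE_MARGIN_PERCENT = 0.10
token_budget = max(1, max_ctx - safe_margin)   # 7372 usable tokens per embedding batch
max_batch_size = 96                            # e.g. EMBEDDING_MAX_BATCH_SIZE


def should_flush(buffer_len: int, current_tokens: int, item_tokens: int) -> bool:
    """Flush the embeddable buffer before adding the next item if it would overflow."""
    return bool(
        buffer_len
        and (
            current_tokens + item_tokens > token_budget
            or (max_batch_size and buffer_len >= max_batch_size)
        )
    )


# should_flush(buffer_len=10, current_tokens=7000, item_tokens=500) -> True (budget overflow)
# should_flush(buffer_len=96, current_tokens=1000, item_tokens=10)  -> True (batch size cap)
```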
orchestrator/search/indexing/registry.py
@@ -0,0 +1,101 @@
+# Copyright 2019-2025 SURF, GÉANT.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Generic, TypeVar
+from uuid import UUID
+
+from sqlalchemy.orm import Query
+from sqlalchemy.sql import Select
+
+from orchestrator.db import (
+    ProcessTable,
+    ProductTable,
+    SubscriptionTable,
+    WorkflowTable,
+)
+from orchestrator.db.database import BaseModel
+from orchestrator.search.core.types import EntityType
+
+from .traverse import (
+    BaseTraverser,
+    ProcessTraverser,
+    ProductTraverser,
+    SubscriptionTraverser,
+    WorkflowTraverser,
+)
+
+ModelT = TypeVar("ModelT", bound=BaseModel)
+
+
+@dataclass(frozen=True)
+class EntityConfig(Generic[ModelT]):
+    """A container for all configuration related to a specific entity type."""
+
+    entity_kind: EntityType
+    table: type[ModelT]
+
+    traverser: "type[BaseTraverser]"
+    pk_name: str
+    root_name: str
+
+    def get_all_query(self, entity_id: str | None = None) -> Query | Select:
+        query = self.table.query
+        if entity_id:
+            pk_column = getattr(self.table, self.pk_name)
+            query = query.filter(pk_column == UUID(entity_id))
+        return query
+
+
+@dataclass(frozen=True)
+class WorkflowConfig(EntityConfig[WorkflowTable]):
+    """Workflows have a custom select() function that filters out deleted workflows."""
+
+    def get_all_query(self, entity_id: str | None = None) -> Select:
+        query = self.table.select()
+        if entity_id:
+            pk_column = getattr(self.table, self.pk_name)
+            query = query.where(pk_column == UUID(entity_id))
+        return query
+
+
+ENTITY_CONFIG_REGISTRY: dict[EntityType, EntityConfig] = {
+    EntityType.SUBSCRIPTION: EntityConfig(
+        entity_kind=EntityType.SUBSCRIPTION,
+        table=SubscriptionTable,
+        traverser=SubscriptionTraverser,
+        pk_name="subscription_id",
+        root_name="subscription",
+    ),
+    EntityType.PRODUCT: EntityConfig(
+        entity_kind=EntityType.PRODUCT,
+        table=ProductTable,
+        traverser=ProductTraverser,
+        pk_name="product_id",
+        root_name="product",
+    ),
+    EntityType.PROCESS: EntityConfig(
+        entity_kind=EntityType.PROCESS,
+        table=ProcessTable,
+        traverser=ProcessTraverser,
+        pk_name="process_id",
+        root_name="process",
+    ),
+    EntityType.WORKFLOW: WorkflowConfig(
+        entity_kind=EntityType.WORKFLOW,
+        table=WorkflowTable,
+        traverser=WorkflowTraverser,
+        pk_name="workflow_id",
+        root_name="workflow",
+    ),
+}
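The registry is the single lookup point that ties an `EntityType` to its ORM table, traverser, and primary-key/root names. A minimal sketch of resolving a config and building its query, as the indexing task below does — the UUID string is a hypothetical placeholder:

```python
# Illustrative sketch (not part of the diff).
from orchestrator.search.core.types import EntityType
from orchestrator.search.indexing.registry import ENTITY_CONFIG_REGISTRY

config = ENTITY_CONFIG_REGISTRY[EntityType.SUBSCRIPTION]

print(config.pk_name)    # "subscription_id"
print(config.root_name)  # "subscription"

# Base query for all subscriptions, or narrowed to one entity (hypothetical UUID).
query_all = config.get_all_query()
query_one = config.get_all_query("a8f9c1d2-0000-0000-0000-000000000000")
```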
orchestrator/search/indexing/tasks.py
@@ -0,0 +1,69 @@
+# Copyright 2019-2025 SURF, GÉANT.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import structlog
+from sqlalchemy.orm import Query
+
+from orchestrator.db import db
+from orchestrator.domain.context_cache import cache_subscription_models
+from orchestrator.search.core.types import EntityType
+from orchestrator.search.indexing.indexer import Indexer
+from orchestrator.search.indexing.registry import ENTITY_CONFIG_REGISTRY
+
+logger = structlog.get_logger(__name__)
+
+
+def run_indexing_for_entity(
+    entity_kind: EntityType,
+    entity_id: str | None = None,
+    dry_run: bool = False,
+    force_index: bool = False,
+    chunk_size: int = 1000,
+) -> None:
+    """Stream and index entities for the given kind.
+
+    Builds a streaming query via the entity's registry config, disables ORM eager
+    loads when applicable and delegates processing to `Indexer`.
+
+    Args:
+        entity_kind (EntityType): The entity type to index (must exist in
+            `ENTITY_CONFIG_REGISTRY`).
+        entity_id (Optional[str]): If provided, restricts indexing to a single
+            entity (UUID string).
+        dry_run (bool): When True, runs the full pipeline without performing
+            writes or external embedding calls.
+        force_index (bool): When True, re-indexes all fields regardless of
+            existing hashes.
+        chunk_size (int): Number of rows fetched per round-trip and passed to
+            the indexer per batch.
+
+    Returns:
+        None
+    """
+    config = ENTITY_CONFIG_REGISTRY[entity_kind]
+
+    q = config.get_all_query(entity_id)
+
+    if isinstance(q, Query):
+        q = q.enable_eagerloads(False)
+        stmt = q.statement
+    else:
+        stmt = q
+
+    stmt = stmt.execution_options(stream_results=True, yield_per=chunk_size)
+    entities = db.session.execute(stmt).scalars()
+
+    indexer = Indexer(config=config, dry_run=dry_run, force_index=force_index, chunk_size=chunk_size)
+
+    with cache_subscription_models():
+        indexer.run(entities)
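For completeness, a minimal sketch of calling the task entry point directly, assuming an initialized orchestrator application with a configured database session (this release also adds a CLI wrapper under orchestrator/cli/search/, which is the more typical entry point); the UUID below is a hypothetical placeholder:

```python
# Illustrative sketch (not part of the diff).
from orchestrator.search.core.types import EntityType
from orchestrator.search.indexing.tasks import run_indexing_for_entity

# Full pipeline without writes or embedding calls; useful for sizing a run.
run_indexing_for_entity(EntityType.SUBSCRIPTION, dry_run=True)

# Re-index one subscription unconditionally, with a smaller streaming chunk.
run_indexing_for_entity(
    EntityType.SUBSCRIPTION,
    entity_id="a8f9c1d2-0000-0000-0000-000000000000",
    force_index=True,
    chunk_size=500,
)
```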