gnosisllm-knowledge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,195 @@
1
+ """OpenSearch backend configuration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from dataclasses import dataclass, field
7
+
8
+
9
@dataclass(frozen=True)
class OpenSearchConfig:
    """Immutable OpenSearch backend configuration.

    Groups connection, authentication, cluster, embedding, index, k-NN,
    neural/agentic search, timeout, and bulk-indexing settings. Instances
    are frozen so a config can be shared safely across components.

    Example:
        ```python
        # From environment variables
        config = OpenSearchConfig.from_env()

        # Explicit configuration
        config = OpenSearchConfig(
            host="localhost",
            port=9200,
            embedding_model="text-embedding-3-small",
        )

        # AWS OpenSearch Service
        config = OpenSearchConfig(
            host="search-domain.us-east-1.es.amazonaws.com",
            port=443,
            use_ssl=True,
            use_aws_sigv4=True,
            aws_region="us-east-1",
        )
        ```
    """

    # === Connection ===
    host: str = "localhost"
    port: int = 9200
    use_ssl: bool = False
    verify_certs: bool = True
    ca_certs: str | None = None

    # Authentication (HTTP basic auth)
    username: str | None = None
    password: str | None = None

    # AWS OpenSearch Service (SigV4 request signing)
    use_aws_sigv4: bool = False
    aws_region: str | None = None
    aws_service: str = "es"  # "es" for OpenSearch, "aoss" for Serverless

    # === Cluster (High Availability) ===
    nodes: tuple[str, ...] | None = None  # Multiple nodes for HA
    sniff_on_start: bool = False
    sniff_on_node_failure: bool = True
    sniff_timeout: float = 10.0
    sniffer_timeout: float = 60.0

    # === Embedding ===
    embedding_model: str = "text-embedding-3-small"
    embedding_dimension: int = 1536
    openai_api_key: str | None = None
    embedding_batch_size: int = 100

    # === Index Settings ===
    index_prefix: str = "gnosisllm"
    number_of_shards: int = 5
    number_of_replicas: int = 1
    refresh_interval: str = "1s"

    # Pipeline names (auto-generated if not set)
    ingest_pipeline_name: str | None = None
    search_pipeline_name: str | None = None

    # === k-NN Settings ===
    knn_engine: str = "lucene"  # lucene (recommended for OpenSearch 2.9+), faiss
    knn_space_type: str = "l2"  # l2, cosinesimil, innerproduct
    knn_algo_param_ef_search: int = 512
    knn_algo_param_ef_construction: int = 512
    knn_algo_param_m: int = 16

    # === Neural Search (OpenSearch 2.9+) ===
    # model_id is the deployed ML model in OpenSearch for embeddings
    model_id: str | None = None
    model_group_id: str | None = None
    embedding_field: str = "content_embedding"  # Field name for embeddings

    # === Agentic Search ===
    # Agent IDs from 'gnosisllm-knowledge agentic setup'
    flow_agent_id: str | None = None
    conversational_agent_id: str | None = None
    # LLM for agent reasoning (OpenAI model ID)
    agentic_llm_model: str = "gpt-4o"
    # Agent execution limits
    agentic_max_iterations: int = 5
    agentic_timeout_seconds: int = 60
    # Conversation memory settings
    memory_window_size: int = 10  # Messages to keep in context

    # === Timeouts (seconds) ===
    connect_timeout: float = 5.0
    read_timeout: float = 30.0
    bulk_timeout: float = 120.0

    # === Bulk Indexing ===
    bulk_batch_size: int = 500
    bulk_max_concurrent: int = 3

    @property
    def url(self) -> str:
        """Full OpenSearch URL; scheme is derived from ``use_ssl``."""
        scheme = "https" if self.use_ssl else "http"
        return f"{scheme}://{self.host}:{self.port}"

    @property
    def knowledge_index_name(self) -> str:
        """Default knowledge index name (``<prefix>-knowledge``)."""
        return f"{self.index_prefix}-knowledge"

    @property
    def agentic_memory_index_name(self) -> str:
        """Agentic conversation-memory index name (``<prefix>-memory``)."""
        return f"{self.index_prefix}-memory"

    def get_index_name(self, collection_id: str | None = None) -> str:
        """Get index name for a collection.

        Args:
            collection_id: Optional collection ID for multi-tenant indexing.

        Returns:
            ``<prefix>-<collection_id>`` when a collection is given,
            otherwise the default knowledge index name.
        """
        if collection_id:
            return f"{self.index_prefix}-{collection_id}"
        return self.knowledge_index_name

    @staticmethod
    def _env_bool(name: str, default: str = "") -> bool:
        """Parse a boolean environment variable.

        Accepts the common truthy spellings ("true", "1", "yes", "on",
        case-insensitive, surrounding whitespace ignored); anything else,
        including an unset variable with an empty default, is False.
        """
        return os.getenv(name, default).strip().lower() in {"true", "1", "yes", "on"}

    @classmethod
    def from_env(cls) -> "OpenSearchConfig":
        """Create config from environment variables.

        Environment variables:
            OPENSEARCH_HOST: Host (default: localhost)
            OPENSEARCH_PORT: Port (default: 9200)
            OPENSEARCH_USE_SSL: Use SSL (default: false)
            OPENSEARCH_VERIFY_CERTS: Verify certificates (default: true)
            OPENSEARCH_USERNAME: Username
            OPENSEARCH_PASSWORD: Password
            OPENSEARCH_USE_AWS_SIGV4: Use AWS Sig v4 auth (default: false)
            AWS_REGION: AWS region for Sig v4
            OPENSEARCH_NODES: Comma-separated list of nodes
            EMBEDDING_MODEL: OpenAI embedding model
            EMBEDDING_DIMENSION: Embedding vector dimension
            OPENAI_API_KEY: OpenAI API key
            OPENSEARCH_INDEX_PREFIX: Index name prefix
            OPENSEARCH_SHARDS: Number of shards
            OPENSEARCH_REPLICAS: Number of replicas
            OPENSEARCH_FLOW_AGENT_ID: Flow agent ID for agentic search
            OPENSEARCH_CONVERSATIONAL_AGENT_ID: Conversational agent ID
            AGENTIC_LLM_MODEL: LLM model for agent reasoning (default: gpt-4o)
            AGENTIC_MAX_ITERATIONS: Maximum agent iterations (default: 5)
            AGENTIC_TIMEOUT_SECONDS: Agent execution timeout (default: 60)

        Returns:
            Configuration from environment.
        """
        # Split on commas, trim whitespace, and drop empty entries so a
        # value such as " node1:9200 , node2:9200 ," parses cleanly.
        nodes_str = os.getenv("OPENSEARCH_NODES", "")
        parsed_nodes = tuple(n.strip() for n in nodes_str.split(",") if n.strip())

        return cls(
            host=os.getenv("OPENSEARCH_HOST", "localhost"),
            port=int(os.getenv("OPENSEARCH_PORT", "9200")),
            use_ssl=cls._env_bool("OPENSEARCH_USE_SSL"),
            verify_certs=cls._env_bool("OPENSEARCH_VERIFY_CERTS", "true"),
            username=os.getenv("OPENSEARCH_USERNAME"),
            password=os.getenv("OPENSEARCH_PASSWORD"),
            use_aws_sigv4=cls._env_bool("OPENSEARCH_USE_AWS_SIGV4"),
            aws_region=os.getenv("AWS_REGION"),
            nodes=parsed_nodes or None,
            embedding_model=os.getenv("EMBEDDING_MODEL", "text-embedding-3-small"),
            embedding_dimension=int(os.getenv("EMBEDDING_DIMENSION", "1536")),
            openai_api_key=os.getenv("OPENAI_API_KEY"),
            index_prefix=os.getenv("OPENSEARCH_INDEX_PREFIX", "gnosisllm"),
            number_of_shards=int(os.getenv("OPENSEARCH_SHARDS", "5")),
            number_of_replicas=int(os.getenv("OPENSEARCH_REPLICAS", "1")),
            model_id=os.getenv("OPENSEARCH_MODEL_ID"),
            ingest_pipeline_name=os.getenv("OPENSEARCH_INGEST_PIPELINE"),
            search_pipeline_name=os.getenv("OPENSEARCH_SEARCH_PIPELINE"),
            # Agentic search configuration
            flow_agent_id=os.getenv("OPENSEARCH_FLOW_AGENT_ID"),
            conversational_agent_id=os.getenv("OPENSEARCH_CONVERSATIONAL_AGENT_ID"),
            agentic_llm_model=os.getenv("AGENTIC_LLM_MODEL", "gpt-4o"),
            agentic_max_iterations=int(os.getenv("AGENTIC_MAX_ITERATIONS", "5")),
            agentic_timeout_seconds=int(os.getenv("AGENTIC_TIMEOUT_SECONDS", "60")),
        )
@@ -0,0 +1,499 @@
1
+ """OpenSearch document indexer implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import hashlib
7
+ import logging
8
+ from datetime import datetime, timezone
9
+ from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Sequence
10
+
11
+ # Note: Callable is still needed for on_batch_complete callback type
12
+
13
+ from gnosisllm_knowledge.backends.opensearch.config import OpenSearchConfig
14
+ from gnosisllm_knowledge.backends.opensearch.mappings import (
15
+ get_knowledge_index_mappings,
16
+ get_knowledge_index_settings,
17
+ )
18
+ from gnosisllm_knowledge.core.domain.document import Document
19
+ from gnosisllm_knowledge.core.domain.result import BatchResult, IndexResult
20
+ from gnosisllm_knowledge.core.exceptions import IndexError
21
+
22
+ if TYPE_CHECKING:
23
+ from opensearchpy import AsyncOpenSearch
24
+
25
logger = logging.getLogger(__name__)


class OpenSearchIndexer:
    """OpenSearch document indexer.

    Implements the IDocumentIndexer protocol for indexing documents
    into OpenSearch with k-NN vector embeddings. Documents are shipped
    without vectors: embeddings are attached server-side by the
    OpenSearch ingest pipeline configured via 'gnosisllm-knowledge setup'.

    Example:
        ```python
        config = OpenSearchConfig.from_env()
        client = AsyncOpenSearch(hosts=[config.url])
        indexer = OpenSearchIndexer(client, config)

        # Index a single document
        result = await indexer.index(document, "my-index")

        # Bulk index documents
        result = await indexer.bulk_index(documents, "my-index")
        ```
    """

    def __init__(
        self,
        client: AsyncOpenSearch,
        config: OpenSearchConfig,
    ) -> None:
        """Initialize the indexer.

        Args:
            client: OpenSearch async client.
            config: OpenSearch configuration.

        Note:
            Embeddings are generated automatically by OpenSearch ingest pipeline.
            Run 'gnosisllm-knowledge setup' to configure the ML model and pipeline.
        """
        self._client = client
        self._config = config

    async def index(
        self,
        document: Document,
        index_name: str,
        **options: Any,
    ) -> IndexResult:
        """Index a single document.

        Args:
            document: Document to index.
            index_name: Target index name.
            **options: Additional options (e.g., refresh).

        Returns:
            Index result with success/failure info.

        Raises:
            IndexError: If indexing fails.
        """
        try:
            # Embeddings are generated by the OpenSearch ingest pipeline,
            # so the body contains only document fields.
            doc_body = self._prepare_document(document)

            refresh = options.get("refresh", False)
            await self._client.index(
                index=index_name,
                id=document.doc_id,
                body=doc_body,
                refresh=refresh,
            )

            return IndexResult(
                success=True,
                document_id=document.doc_id,
                index_name=index_name,
                indexed_count=1,
                failed_count=0,
            )
        except Exception as e:
            # Lazy %-style args avoid formatting when the level is disabled.
            logger.error("Failed to index document %s: %s", document.doc_id, e)
            raise IndexError(
                message=f"Failed to index document: {e}",
                details={"document_id": document.doc_id},
                cause=e,
            ) from e

    async def bulk_index(
        self,
        documents: Sequence[Document],
        index_name: str,
        **options: Any,
    ) -> IndexResult:
        """Index multiple documents efficiently.

        Batches are sent sequentially; use ``bulk_index_streaming`` for
        concurrent batch dispatch.

        Args:
            documents: Documents to index.
            index_name: Target index name.
            **options: Additional options (batch_size, refresh).

        Returns:
            Index result with counts.
        """
        if not documents:
            return IndexResult(success=True, index_name=index_name, indexed_count=0, failed_count=0)

        # Guard against zero/negative sizes, which would otherwise make
        # range() raise (step 0) or silently skip all documents.
        batch_size = max(1, options.get("batch_size", self._config.bulk_batch_size))
        refresh = options.get("refresh", False)

        total_indexed = 0
        total_failed = 0
        errors: list[str] = []

        for start in range(0, len(documents), batch_size):
            batch_result = await self._index_batch(documents[start : start + batch_size], index_name)
            total_indexed += batch_result.indexed_count
            total_failed += batch_result.failed_count
            if batch_result.errors:
                errors.extend(batch_result.errors)

        # One refresh at the end is cheaper than per-batch refreshes.
        if refresh:
            await self._client.indices.refresh(index=index_name)

        return IndexResult(
            success=total_failed == 0,
            index_name=index_name,
            indexed_count=total_indexed,
            failed_count=total_failed,
            errors=errors if errors else None,
        )

    async def bulk_index_streaming(
        self,
        documents: AsyncIterator[Document],
        index_name: str,
        *,
        batch_size: int = 500,
        max_concurrent_batches: int = 3,
        on_batch_complete: Callable[[BatchResult], None] | None = None,
    ) -> IndexResult:
        """Stream-index documents with backpressure handling.

        Batches are dispatched as tasks while the iterator is still being
        consumed; a semaphore caps how many bulk requests are in flight.

        Args:
            documents: Async iterator of documents.
            index_name: Target index name.
            batch_size: Documents per batch.
            max_concurrent_batches: Max concurrent batch operations.
            on_batch_complete: Callback for batch completion.

        Returns:
            Index result with totals.
        """
        semaphore = asyncio.Semaphore(max_concurrent_batches)
        batch_size = max(1, batch_size)  # guard against zero/negative sizes

        async def process_batch(batch: list[Document], batch_num: int) -> IndexResult:
            # The semaphore provides backpressure: at most
            # max_concurrent_batches bulk requests run at once.
            async with semaphore:
                result = await self._index_batch(batch, index_name)
            logger.debug(
                "Batch %d: indexed=%d failed=%d",
                batch_num,
                result.indexed_count,
                result.failed_count,
            )
            if on_batch_complete:
                on_batch_complete(
                    BatchResult(
                        total=len(batch),
                        succeeded=result.indexed_count,
                        failed=result.failed_count,
                        errors=result.errors or [],
                    )
                )
            return result

        tasks: list[asyncio.Task[IndexResult]] = []
        current_batch: list[Document] = []
        batch_count = 0

        async for doc in documents:
            current_batch.append(doc)
            if len(current_batch) >= batch_size:
                batch_count += 1
                # Rebinding (not clearing) current_batch means the task
                # keeps its own list; no copy is needed.
                tasks.append(asyncio.create_task(process_batch(current_batch, batch_count)))
                current_batch = []

        # Flush the final partial batch.
        if current_batch:
            batch_count += 1
            tasks.append(asyncio.create_task(process_batch(current_batch, batch_count)))

        total_indexed = 0
        total_failed = 0
        errors: list[str] = []
        for result in await asyncio.gather(*tasks):
            total_indexed += result.indexed_count
            total_failed += result.failed_count
            if result.errors:
                errors.extend(result.errors)

        return IndexResult(
            success=total_failed == 0,
            index_name=index_name,
            indexed_count=total_indexed,
            failed_count=total_failed,
            errors=errors if errors else None,
        )

    async def upsert(
        self,
        document: Document,
        index_name: str,
    ) -> IndexResult:
        """Upsert (update or insert) a document.

        Indexing with an explicit document ID already has upsert semantics
        in OpenSearch, so this is a plain index call without a refresh.

        Args:
            document: Document to upsert.
            index_name: Target index name.

        Returns:
            Index result.

        Raises:
            IndexError: If the upsert fails.
        """
        try:
            # Embeddings are generated by the OpenSearch ingest pipeline.
            doc_body = self._prepare_document(document)
            await self._client.index(
                index=index_name,
                id=document.doc_id,
                body=doc_body,
                refresh=False,
            )
        except Exception as e:
            # Consistent with index(): wrap client failures in IndexError.
            logger.error("Failed to upsert document %s: %s", document.doc_id, e)
            raise IndexError(
                message=f"Failed to upsert document: {e}",
                details={"document_id": document.doc_id},
                cause=e,
            ) from e

        return IndexResult(
            success=True,
            document_id=document.doc_id,
            index_name=index_name,
            indexed_count=1,
            failed_count=0,
        )

    async def delete(
        self,
        doc_id: str,
        index_name: str,
    ) -> bool:
        """Delete a document by ID.

        Args:
            doc_id: Document ID to delete.
            index_name: Index name.

        Returns:
            True if deleted, False if not found.

        Raises:
            IndexError: If the delete fails for any reason other than
                the document being absent.
        """
        try:
            await self._client.delete(index=index_name, id=doc_id)
            return True
        except Exception as e:
            # The client's not-found error message contains "not_found";
            # a missing document is reported as False, not an error.
            if "not_found" in str(e).lower():
                return False
            raise IndexError(
                message=f"Failed to delete document: {e}",
                details={"document_id": doc_id},
                cause=e,
            ) from e

    async def bulk_delete(
        self,
        doc_ids: Sequence[str],
        index_name: str,
    ) -> int:
        """Delete multiple documents.

        Args:
            doc_ids: Document IDs to delete.
            index_name: Index name.

        Returns:
            Count of deleted documents.
        """
        if not doc_ids:
            return 0

        actions = [
            {"delete": {"_index": index_name, "_id": doc_id}} for doc_id in doc_ids
        ]

        response = await self._client.bulk(body=actions)
        # Count only items the cluster actually removed; missing IDs
        # come back with result "not_found".
        return sum(
            1
            for item in response.get("items", [])
            if item.get("delete", {}).get("result") == "deleted"
        )

    async def delete_by_query(
        self,
        query: dict[str, Any],
        index_name: str,
    ) -> int:
        """Delete documents matching query.

        Args:
            query: OpenSearch query dictionary.
            index_name: Index name.

        Returns:
            Count of deleted documents.
        """
        # refresh=True so the deletions are immediately visible to search.
        response = await self._client.delete_by_query(
            index=index_name,
            body=query,
            refresh=True,
        )
        return response.get("deleted", 0)

    async def ensure_index(
        self,
        index_name: str,
        **options: Any,
    ) -> bool:
        """Ensure index exists with correct mapping.

        Args:
            index_name: Index name to create.
            **options: Override settings/mappings.

        Returns:
            True if created, False if already exists.
        """
        if await self._client.indices.exists(index=index_name):
            return False

        settings = options.get("settings", get_knowledge_index_settings(self._config))
        mappings = options.get("mappings", get_knowledge_index_mappings(self._config))

        await self._client.indices.create(
            index=index_name,
            body={
                "settings": settings,
                "mappings": mappings,
            },
        )

        logger.info("Created index: %s", index_name)
        return True

    async def delete_index(self, index_name: str) -> bool:
        """Delete an index.

        Args:
            index_name: Index to delete.

        Returns:
            True if deleted, False if not found.
        """
        try:
            await self._client.indices.delete(index=index_name)
            return True
        except Exception as e:
            # A missing index is not an error; anything else propagates.
            if "index_not_found" in str(e).lower():
                return False
            raise

    async def refresh_index(self, index_name: str) -> bool:
        """Refresh index to make documents searchable.

        Args:
            index_name: Index to refresh.

        Returns:
            True on success.
        """
        await self._client.indices.refresh(index=index_name)
        return True

    async def _index_batch(
        self,
        documents: Sequence[Document],
        index_name: str,
    ) -> IndexResult:
        """Index a batch of documents via the bulk API.

        Args:
            documents: Documents to index.
            index_name: Target index.

        Returns:
            Batch index result.
        """
        # Bulk body alternates action metadata and document source lines.
        # Embeddings are generated by the OpenSearch ingest pipeline.
        actions: list[dict[str, Any]] = []
        for doc in documents:
            actions.append({"index": {"_index": index_name, "_id": doc.doc_id}})
            actions.append(self._prepare_document(doc))

        if not actions:
            return IndexResult(success=True, index_name=index_name, indexed_count=0, failed_count=0)

        response = await self._client.bulk(body=actions)

        indexed = 0
        failed = 0
        errors: list[str] = []

        # Per-item statuses: 200 (updated) and 201 (created) are successes.
        for item in response.get("items", []):
            index_result = item.get("index", {})
            if index_result.get("status") in (200, 201):
                indexed += 1
            else:
                failed += 1
                error = index_result.get("error", {})
                errors.append(f"{index_result.get('_id')}: {error.get('reason', 'Unknown error')}")

        return IndexResult(
            success=failed == 0,
            index_name=index_name,
            indexed_count=indexed,
            failed_count=failed,
            errors=errors if errors else None,
        )

    def _prepare_document(self, document: Document) -> dict[str, Any]:
        """Prepare document for indexing.

        Flattens the Document into the field layout expected by the
        knowledge index mappings. The embedding field is intentionally
        absent: the ingest pipeline adds it server-side.

        Args:
            document: Document to prepare.

        Returns:
            Dictionary for indexing.
        """
        # Derive the content hash and word count when missing so dedup
        # and stats fields are always populated.
        content_hash = document.content_hash
        if not content_hash:
            content_hash = hashlib.sha256(document.content.encode()).hexdigest()

        now = datetime.now(timezone.utc)

        return {
            "id": document.doc_id,
            "content": document.content,
            "url": document.url,
            "title": document.title,
            "source": document.source,
            "account_id": document.account_id,
            "collection_id": document.collection_id,
            "source_id": document.source_id,
            "chunk_index": document.chunk_index,
            "total_chunks": document.total_chunks,
            "parent_doc_id": document.parent_doc_id,
            "quality_score": document.quality_score,
            "language": document.language,
            "content_hash": content_hash,
            "word_count": document.word_count or len(document.content.split()),
            "status": document.status.value,
            "pii_detected": document.pii_detected,
            "pii_redacted": document.pii_redacted,
            "metadata": document.metadata,
            "created_at": document.created_at.isoformat() if document.created_at else now.isoformat(),
            "updated_at": document.updated_at.isoformat() if document.updated_at else None,
            "indexed_at": now.isoformat(),
        }