gnosisllm_knowledge-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries; it is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- gnosisllm_knowledge/__init__.py +152 -0
- gnosisllm_knowledge/api/__init__.py +5 -0
- gnosisllm_knowledge/api/knowledge.py +548 -0
- gnosisllm_knowledge/backends/__init__.py +26 -0
- gnosisllm_knowledge/backends/memory/__init__.py +9 -0
- gnosisllm_knowledge/backends/memory/indexer.py +384 -0
- gnosisllm_knowledge/backends/memory/searcher.py +516 -0
- gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
- gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
- gnosisllm_knowledge/backends/opensearch/config.py +195 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
- gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
- gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
- gnosisllm_knowledge/chunking/__init__.py +9 -0
- gnosisllm_knowledge/chunking/fixed.py +138 -0
- gnosisllm_knowledge/chunking/sentence.py +239 -0
- gnosisllm_knowledge/cli/__init__.py +18 -0
- gnosisllm_knowledge/cli/app.py +509 -0
- gnosisllm_knowledge/cli/commands/__init__.py +7 -0
- gnosisllm_knowledge/cli/commands/agentic.py +529 -0
- gnosisllm_knowledge/cli/commands/load.py +369 -0
- gnosisllm_knowledge/cli/commands/search.py +440 -0
- gnosisllm_knowledge/cli/commands/setup.py +228 -0
- gnosisllm_knowledge/cli/display/__init__.py +5 -0
- gnosisllm_knowledge/cli/display/service.py +555 -0
- gnosisllm_knowledge/cli/utils/__init__.py +5 -0
- gnosisllm_knowledge/cli/utils/config.py +207 -0
- gnosisllm_knowledge/core/__init__.py +87 -0
- gnosisllm_knowledge/core/domain/__init__.py +43 -0
- gnosisllm_knowledge/core/domain/document.py +240 -0
- gnosisllm_knowledge/core/domain/result.py +176 -0
- gnosisllm_knowledge/core/domain/search.py +327 -0
- gnosisllm_knowledge/core/domain/source.py +139 -0
- gnosisllm_knowledge/core/events/__init__.py +23 -0
- gnosisllm_knowledge/core/events/emitter.py +216 -0
- gnosisllm_knowledge/core/events/types.py +226 -0
- gnosisllm_knowledge/core/exceptions.py +407 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
- gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
- gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
- gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
- gnosisllm_knowledge/core/interfaces/loader.py +102 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
- gnosisllm_knowledge/core/interfaces/setup.py +164 -0
- gnosisllm_knowledge/fetchers/__init__.py +12 -0
- gnosisllm_knowledge/fetchers/config.py +77 -0
- gnosisllm_knowledge/fetchers/http.py +167 -0
- gnosisllm_knowledge/fetchers/neoreader.py +204 -0
- gnosisllm_knowledge/loaders/__init__.py +13 -0
- gnosisllm_knowledge/loaders/base.py +399 -0
- gnosisllm_knowledge/loaders/factory.py +202 -0
- gnosisllm_knowledge/loaders/sitemap.py +285 -0
- gnosisllm_knowledge/loaders/website.py +57 -0
- gnosisllm_knowledge/py.typed +0 -0
- gnosisllm_knowledge/services/__init__.py +9 -0
- gnosisllm_knowledge/services/indexing.py +387 -0
- gnosisllm_knowledge/services/search.py +349 -0
- gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
- gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
- gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0

gnosisllm_knowledge/backends/opensearch/config.py
@@ -0,0 +1,195 @@
"""OpenSearch backend configuration."""

from __future__ import annotations

import os
from dataclasses import dataclass, field


@dataclass(frozen=True)
class OpenSearchConfig:
    """OpenSearch backend configuration.

    Example:
        ```python
        # From environment variables
        config = OpenSearchConfig.from_env()

        # Explicit configuration
        config = OpenSearchConfig(
            host="localhost",
            port=9200,
            embedding_model="text-embedding-3-small",
        )

        # AWS OpenSearch Service
        config = OpenSearchConfig(
            host="search-domain.us-east-1.es.amazonaws.com",
            port=443,
            use_ssl=True,
            use_aws_sigv4=True,
            aws_region="us-east-1",
        )
        ```
    """

    # === Connection ===
    host: str = "localhost"
    port: int = 9200
    use_ssl: bool = False
    verify_certs: bool = True
    ca_certs: str | None = None

    # Authentication
    username: str | None = None
    password: str | None = None

    # AWS OpenSearch Service
    use_aws_sigv4: bool = False
    aws_region: str | None = None
    aws_service: str = "es"  # "es" for OpenSearch, "aoss" for Serverless

    # === Cluster (High Availability) ===
    nodes: tuple[str, ...] | None = None  # Multiple nodes for HA
    sniff_on_start: bool = False
    sniff_on_node_failure: bool = True
    sniff_timeout: float = 10.0
    sniffer_timeout: float = 60.0

    # === Embedding ===
    embedding_model: str = "text-embedding-3-small"
    embedding_dimension: int = 1536
    openai_api_key: str | None = None
    embedding_batch_size: int = 100

    # === Index Settings ===
    index_prefix: str = "gnosisllm"
    number_of_shards: int = 5
    number_of_replicas: int = 1
    refresh_interval: str = "1s"

    # Pipeline names (auto-generated if not set)
    ingest_pipeline_name: str | None = None
    search_pipeline_name: str | None = None

    # === k-NN Settings ===
    knn_engine: str = "lucene"  # lucene (recommended for OpenSearch 2.9+), faiss
    knn_space_type: str = "l2"  # l2, cosinesimil, innerproduct
    knn_algo_param_ef_search: int = 512
    knn_algo_param_ef_construction: int = 512
    knn_algo_param_m: int = 16

    # === Neural Search (OpenSearch 2.9+) ===
    # model_id is the deployed ML model in OpenSearch for embeddings
    model_id: str | None = None
    model_group_id: str | None = None
    embedding_field: str = "content_embedding"  # Field name for embeddings

    # === Agentic Search ===
    # Agent IDs from 'gnosisllm-knowledge agentic setup'
    flow_agent_id: str | None = None
    conversational_agent_id: str | None = None
    # LLM for agent reasoning (OpenAI model ID)
    agentic_llm_model: str = "gpt-4o"
    # Agent execution limits
    agentic_max_iterations: int = 5
    agentic_timeout_seconds: int = 60
    # Conversation memory settings
    memory_window_size: int = 10  # Messages to keep in context

    # === Timeouts ===
    connect_timeout: float = 5.0
    read_timeout: float = 30.0
    bulk_timeout: float = 120.0

    # === Bulk Indexing ===
    bulk_batch_size: int = 500
    bulk_max_concurrent: int = 3

    @property
    def url(self) -> str:
        """Get the full OpenSearch URL."""
        scheme = "https" if self.use_ssl else "http"
        return f"{scheme}://{self.host}:{self.port}"

    @property
    def knowledge_index_name(self) -> str:
        """Get the default knowledge index name."""
        return f"{self.index_prefix}-knowledge"

    @property
    def agentic_memory_index_name(self) -> str:
        """Get the agentic memory index name."""
        return f"{self.index_prefix}-memory"

    def get_index_name(self, collection_id: str | None = None) -> str:
        """Get index name for a collection.

        Args:
            collection_id: Optional collection ID for multi-tenant indexing.

        Returns:
            Index name pattern.
        """
        if collection_id:
            return f"{self.index_prefix}-{collection_id}"
        return self.knowledge_index_name

    @classmethod
    def from_env(cls) -> OpenSearchConfig:
        """Create config from environment variables.

        Environment variables:
            OPENSEARCH_HOST: Host (default: localhost)
            OPENSEARCH_PORT: Port (default: 9200)
            OPENSEARCH_USE_SSL: Use SSL (default: false)
            OPENSEARCH_VERIFY_CERTS: Verify certificates (default: true)
            OPENSEARCH_USERNAME: Username
            OPENSEARCH_PASSWORD: Password
            OPENSEARCH_USE_AWS_SIGV4: Use AWS SigV4 auth (default: false)
            AWS_REGION: AWS region for SigV4
            OPENSEARCH_NODES: Comma-separated list of nodes
            EMBEDDING_MODEL: OpenAI embedding model
            EMBEDDING_DIMENSION: Embedding vector dimension
            OPENAI_API_KEY: OpenAI API key
            OPENSEARCH_INDEX_PREFIX: Index name prefix
            OPENSEARCH_SHARDS: Number of shards
            OPENSEARCH_REPLICAS: Number of replicas
            OPENSEARCH_FLOW_AGENT_ID: Flow agent ID for agentic search
            OPENSEARCH_CONVERSATIONAL_AGENT_ID: Conversational agent ID
            AGENTIC_LLM_MODEL: LLM model for agent reasoning (default: gpt-4o)
            AGENTIC_MAX_ITERATIONS: Maximum agent iterations (default: 5)
            AGENTIC_TIMEOUT_SECONDS: Agent execution timeout (default: 60)

        Returns:
            Configuration from environment.
        """
        nodes_str = os.getenv("OPENSEARCH_NODES", "")
        nodes = tuple(nodes_str.split(",")) if nodes_str else None

        return cls(
            host=os.getenv("OPENSEARCH_HOST", "localhost"),
            port=int(os.getenv("OPENSEARCH_PORT", "9200")),
            use_ssl=os.getenv("OPENSEARCH_USE_SSL", "").lower() == "true",
            verify_certs=os.getenv("OPENSEARCH_VERIFY_CERTS", "true").lower() == "true",
            username=os.getenv("OPENSEARCH_USERNAME"),
            password=os.getenv("OPENSEARCH_PASSWORD"),
            use_aws_sigv4=os.getenv("OPENSEARCH_USE_AWS_SIGV4", "").lower() == "true",
            aws_region=os.getenv("AWS_REGION"),
            nodes=nodes,
            embedding_model=os.getenv("EMBEDDING_MODEL", "text-embedding-3-small"),
            embedding_dimension=int(os.getenv("EMBEDDING_DIMENSION", "1536")),
            openai_api_key=os.getenv("OPENAI_API_KEY"),
            index_prefix=os.getenv("OPENSEARCH_INDEX_PREFIX", "gnosisllm"),
            number_of_shards=int(os.getenv("OPENSEARCH_SHARDS", "5")),
            number_of_replicas=int(os.getenv("OPENSEARCH_REPLICAS", "1")),
            model_id=os.getenv("OPENSEARCH_MODEL_ID"),
            ingest_pipeline_name=os.getenv("OPENSEARCH_INGEST_PIPELINE"),
            search_pipeline_name=os.getenv("OPENSEARCH_SEARCH_PIPELINE"),
            # Agentic search configuration
            flow_agent_id=os.getenv("OPENSEARCH_FLOW_AGENT_ID"),
            conversational_agent_id=os.getenv("OPENSEARCH_CONVERSATIONAL_AGENT_ID"),
            agentic_llm_model=os.getenv("AGENTIC_LLM_MODEL", "gpt-4o"),
            agentic_max_iterations=int(os.getenv("AGENTIC_MAX_ITERATIONS", "5")),
            agentic_timeout_seconds=int(os.getenv("AGENTIC_TIMEOUT_SECONDS", "60")),
        )
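For orientation, here is how a config like this typically feeds an opensearch-py client. This is a minimal sketch under assumptions, not code from the package: the `AsyncOpenSearch` keyword arguments are standard opensearch-py client parameters, and the basic-auth wiring for `username`/`password` is a guess at how the library uses those fields.

```python
# Hypothetical wiring of OpenSearchConfig into an opensearch-py client.
# The kwargs below are standard opensearch-py parameters; how
# gnosisllm-knowledge actually constructs its client may differ.
from opensearchpy import AsyncOpenSearch

from gnosisllm_knowledge.backends.opensearch.config import OpenSearchConfig

config = OpenSearchConfig.from_env()

client = AsyncOpenSearch(
    hosts=list(config.nodes) if config.nodes else [config.url],  # HA node list wins if set
    http_auth=(config.username, config.password) if config.username else None,
    use_ssl=config.use_ssl,
    verify_certs=config.verify_certs,
    ca_certs=config.ca_certs,
    timeout=config.read_timeout,
)
```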

gnosisllm_knowledge/backends/opensearch/indexer.py
@@ -0,0 +1,499 @@
"""OpenSearch document indexer implementation."""

from __future__ import annotations

import asyncio
import hashlib
import logging
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Sequence

# Note: Callable is still needed for on_batch_complete callback type

from gnosisllm_knowledge.backends.opensearch.config import OpenSearchConfig
from gnosisllm_knowledge.backends.opensearch.mappings import (
    get_knowledge_index_mappings,
    get_knowledge_index_settings,
)
from gnosisllm_knowledge.core.domain.document import Document
from gnosisllm_knowledge.core.domain.result import BatchResult, IndexResult
from gnosisllm_knowledge.core.exceptions import IndexError

if TYPE_CHECKING:
    from opensearchpy import AsyncOpenSearch

logger = logging.getLogger(__name__)


class OpenSearchIndexer:
    """OpenSearch document indexer.

    Implements the IDocumentIndexer protocol for indexing documents
    into OpenSearch with k-NN vector embeddings.

    Example:
        ```python
        config = OpenSearchConfig.from_env()
        client = AsyncOpenSearch(hosts=[config.url])
        indexer = OpenSearchIndexer(client, config)

        # Index a single document
        result = await indexer.index(document, "my-index")

        # Bulk index documents
        result = await indexer.bulk_index(documents, "my-index")
        ```
    """

    def __init__(
        self,
        client: AsyncOpenSearch,
        config: OpenSearchConfig,
    ) -> None:
        """Initialize the indexer.

        Args:
            client: OpenSearch async client.
            config: OpenSearch configuration.

        Note:
            Embeddings are generated automatically by OpenSearch ingest pipeline.
            Run 'gnosisllm-knowledge setup' to configure the ML model and pipeline.
        """
        self._client = client
        self._config = config

    async def index(
        self,
        document: Document,
        index_name: str,
        **options: Any,
    ) -> IndexResult:
        """Index a single document.

        Args:
            document: Document to index.
            index_name: Target index name.
            **options: Additional options (e.g., refresh).

        Returns:
            Index result with success/failure info.

        Raises:
            IndexError: If indexing fails.
        """
        try:
            # Prepare document for indexing
            # Embeddings are generated by OpenSearch ingest pipeline
            doc_body = self._prepare_document(document)

            # Index the document
            refresh = options.get("refresh", False)
            await self._client.index(
                index=index_name,
                id=document.doc_id,
                body=doc_body,
                refresh=refresh,
            )

            return IndexResult(
                success=True,
                document_id=document.doc_id,
                index_name=index_name,
                indexed_count=1,
                failed_count=0,
            )
        except Exception as e:
            logger.error(f"Failed to index document {document.doc_id}: {e}")
            raise IndexError(
                message=f"Failed to index document: {e}",
                details={"document_id": document.doc_id},
                cause=e,
            ) from e

    async def bulk_index(
        self,
        documents: Sequence[Document],
        index_name: str,
        **options: Any,
    ) -> IndexResult:
        """Index multiple documents efficiently.

        Args:
            documents: Documents to index.
            index_name: Target index name.
            **options: Additional options (batch_size, refresh).

        Returns:
            Index result with counts.
        """
        if not documents:
            return IndexResult(success=True, index_name=index_name, indexed_count=0, failed_count=0)

        batch_size = options.get("batch_size", self._config.bulk_batch_size)
        refresh = options.get("refresh", False)

        total_indexed = 0
        total_failed = 0
        errors: list[str] = []

        # Process in batches
        for i in range(0, len(documents), batch_size):
            batch = documents[i : i + batch_size]
            batch_result = await self._index_batch(batch, index_name)

            total_indexed += batch_result.indexed_count
            total_failed += batch_result.failed_count
            if batch_result.errors:
                errors.extend(batch_result.errors)

        # Final refresh if requested
        if refresh:
            await self._client.indices.refresh(index=index_name)

        return IndexResult(
            success=total_failed == 0,
            index_name=index_name,
            indexed_count=total_indexed,
            failed_count=total_failed,
            errors=errors if errors else None,
        )

    async def bulk_index_streaming(
        self,
        documents: AsyncIterator[Document],
        index_name: str,
        *,
        batch_size: int = 500,
        max_concurrent_batches: int = 3,
        on_batch_complete: Callable[[BatchResult], None] | None = None,
    ) -> IndexResult:
        """Stream-index documents with backpressure handling.

        Args:
            documents: Async iterator of documents.
            index_name: Target index name.
            batch_size: Documents per batch.
            max_concurrent_batches: Max concurrent batch operations.
            on_batch_complete: Callback for batch completion.

        Returns:
            Index result with totals.
        """
        semaphore = asyncio.Semaphore(max_concurrent_batches)
        total_indexed = 0
        total_failed = 0
        batch_count = 0
        errors: list[str] = []

        current_batch: list[Document] = []

        async def process_batch(batch: list[Document], batch_num: int) -> IndexResult:
            async with semaphore:
                result = await self._index_batch(batch, index_name)

                if on_batch_complete:
                    batch_result = BatchResult(
                        total=len(batch),
                        succeeded=result.indexed_count,
                        failed=result.failed_count,
                        errors=result.errors or [],
                    )
                    on_batch_complete(batch_result)

                return result

        tasks: list[asyncio.Task[IndexResult]] = []

        async for doc in documents:
            current_batch.append(doc)

            if len(current_batch) >= batch_size:
                batch_count += 1
                task = asyncio.create_task(
                    process_batch(current_batch.copy(), batch_count)
                )
                tasks.append(task)
                current_batch = []

        # Process remaining documents
        if current_batch:
            batch_count += 1
            task = asyncio.create_task(
                process_batch(current_batch.copy(), batch_count)
            )
            tasks.append(task)

        # Wait for all tasks
        results = await asyncio.gather(*tasks)
        for result in results:
            total_indexed += result.indexed_count
            total_failed += result.failed_count
            if result.errors:
                errors.extend(result.errors)

        return IndexResult(
            success=total_failed == 0,
            index_name=index_name,
            indexed_count=total_indexed,
            failed_count=total_failed,
            errors=errors if errors else None,
        )

    async def upsert(
        self,
        document: Document,
        index_name: str,
    ) -> IndexResult:
        """Upsert (update or insert) a document.

        Args:
            document: Document to upsert.
            index_name: Target index name.

        Returns:
            Index result.
        """
        # Embeddings are generated by OpenSearch ingest pipeline
        doc_body = self._prepare_document(document)

        await self._client.index(
            index=index_name,
            id=document.doc_id,
            body=doc_body,
            refresh=False,
        )

        return IndexResult(
            success=True,
            document_id=document.doc_id,
            index_name=index_name,
            indexed_count=1,
            failed_count=0,
        )

    async def delete(
        self,
        doc_id: str,
        index_name: str,
    ) -> bool:
        """Delete a document by ID.

        Args:
            doc_id: Document ID to delete.
            index_name: Index name.

        Returns:
            True if deleted, False if not found.
        """
        try:
            await self._client.delete(index=index_name, id=doc_id)
            return True
        except Exception as e:
            if "not_found" in str(e).lower():
                return False
            raise IndexError(
                message=f"Failed to delete document: {e}",
                details={"document_id": doc_id},
                cause=e,
            ) from e

    async def bulk_delete(
        self,
        doc_ids: Sequence[str],
        index_name: str,
    ) -> int:
        """Delete multiple documents.

        Args:
            doc_ids: Document IDs to delete.
            index_name: Index name.

        Returns:
            Count of deleted documents.
        """
        if not doc_ids:
            return 0

        actions = [
            {"delete": {"_index": index_name, "_id": doc_id}} for doc_id in doc_ids
        ]

        response = await self._client.bulk(body=actions)
        deleted = sum(
            1
            for item in response.get("items", [])
            if item.get("delete", {}).get("result") == "deleted"
        )

        return deleted

    async def delete_by_query(
        self,
        query: dict[str, Any],
        index_name: str,
    ) -> int:
        """Delete documents matching query.

        Args:
            query: OpenSearch query dictionary.
            index_name: Index name.

        Returns:
            Count of deleted documents.
        """
        response = await self._client.delete_by_query(
            index=index_name,
            body=query,
            refresh=True,
        )
        return response.get("deleted", 0)

    async def ensure_index(
        self,
        index_name: str,
        **options: Any,
    ) -> bool:
        """Ensure index exists with correct mapping.

        Args:
            index_name: Index name to create.
            **options: Override settings/mappings.

        Returns:
            True if created, False if already exists.
        """
        exists = await self._client.indices.exists(index=index_name)
        if exists:
            return False

        settings = options.get("settings", get_knowledge_index_settings(self._config))
        mappings = options.get("mappings", get_knowledge_index_mappings(self._config))

        await self._client.indices.create(
            index=index_name,
            body={
                "settings": settings,
                "mappings": mappings,
            },
        )

        logger.info(f"Created index: {index_name}")
        return True

    async def delete_index(self, index_name: str) -> bool:
        """Delete an index.

        Args:
            index_name: Index to delete.

        Returns:
            True if deleted, False if not found.
        """
        try:
            await self._client.indices.delete(index=index_name)
            return True
        except Exception as e:
            if "index_not_found" in str(e).lower():
                return False
            raise

    async def refresh_index(self, index_name: str) -> bool:
        """Refresh index to make documents searchable.

        Args:
            index_name: Index to refresh.

        Returns:
            True on success.
        """
        await self._client.indices.refresh(index=index_name)
        return True

    async def _index_batch(
        self,
        documents: Sequence[Document],
        index_name: str,
    ) -> IndexResult:
        """Index a batch of documents.

        Args:
            documents: Documents to index.
            index_name: Target index.

        Returns:
            Batch index result.
        """
        # Embeddings are generated by OpenSearch ingest pipeline
        actions = []
        for doc in documents:
            doc_body = self._prepare_document(doc)
            actions.append({"index": {"_index": index_name, "_id": doc.doc_id}})
            actions.append(doc_body)

        if not actions:
            return IndexResult(success=True, index_name=index_name, indexed_count=0, failed_count=0)

        response = await self._client.bulk(body=actions)

        indexed = 0
        failed = 0
        errors: list[str] = []

        for item in response.get("items", []):
            index_result = item.get("index", {})
            if index_result.get("status") in (200, 201):
                indexed += 1
            else:
                failed += 1
                error = index_result.get("error", {})
                errors.append(f"{index_result.get('_id')}: {error.get('reason', 'Unknown error')}")

        return IndexResult(
            success=failed == 0,
            index_name=index_name,
            indexed_count=indexed,
            failed_count=failed,
            errors=errors if errors else None,
        )

    def _prepare_document(self, document: Document) -> dict[str, Any]:
        """Prepare document for indexing.

        Args:
            document: Document to prepare.

        Returns:
            Dictionary for indexing.
        """
        # Calculate content hash if not present
        content_hash = document.content_hash
        if not content_hash:
            content_hash = hashlib.sha256(document.content.encode()).hexdigest()

        now = datetime.now(timezone.utc)

        return {
            "id": document.doc_id,
            "content": document.content,
            "url": document.url,
            "title": document.title,
            "source": document.source,
            "account_id": document.account_id,
            "collection_id": document.collection_id,
            "source_id": document.source_id,
            "chunk_index": document.chunk_index,
            "total_chunks": document.total_chunks,
            "parent_doc_id": document.parent_doc_id,
            "quality_score": document.quality_score,
            "language": document.language,
            "content_hash": content_hash,
            "word_count": document.word_count or len(document.content.split()),
            "status": document.status.value,
            "pii_detected": document.pii_detected,
            "pii_redacted": document.pii_redacted,
            "metadata": document.metadata,
            "created_at": document.created_at.isoformat() if document.created_at else now.isoformat(),
            "updated_at": document.updated_at.isoformat() if document.updated_at else None,
            "indexed_at": now.isoformat(),
        }
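And a usage sketch for the streaming path above: feed an async generator into `bulk_index_streaming` and track progress through the `on_batch_complete` callback. The indexer, `BatchResult`, and `IndexResult` APIs are as defined in this diff; the `Document` constructor fields are an assumption inferred from `_prepare_document`, and the toy generator stands in for one of the package's real loaders.

```python
# Sketch only: Document's constructor fields (doc_id, content) are inferred
# from _prepare_document above; the real dataclass may require more arguments.
import asyncio
from typing import AsyncIterator

from opensearchpy import AsyncOpenSearch

from gnosisllm_knowledge.backends.opensearch.config import OpenSearchConfig
from gnosisllm_knowledge.backends.opensearch.indexer import OpenSearchIndexer
from gnosisllm_knowledge.core.domain.document import Document
from gnosisllm_knowledge.core.domain.result import BatchResult


async def generate_docs() -> AsyncIterator[Document]:
    # In practice this would stream from a loader; one toy document here.
    yield Document(doc_id="doc-1", content="hello world")  # assumed fields


def report(batch: BatchResult) -> None:
    # BatchResult fields (total/succeeded/failed/errors) come from this diff.
    print(f"batch: {batch.succeeded}/{batch.total} ok, {batch.failed} failed")


async def main() -> None:
    config = OpenSearchConfig.from_env()
    client = AsyncOpenSearch(hosts=[config.url])
    indexer = OpenSearchIndexer(client, config)

    result = await indexer.bulk_index_streaming(
        generate_docs(),
        config.knowledge_index_name,
        batch_size=500,
        max_concurrent_batches=3,
        on_batch_complete=report,
    )
    print(f"indexed={result.indexed_count} failed={result.failed_count}")
    await client.close()


asyncio.run(main())
```

Note that batches are dispatched as they fill, so at most `max_concurrent_batches` bulk requests are in flight while the generator keeps producing.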