gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
- gnosisllm_knowledge/__init__.py +91 -39
- gnosisllm_knowledge/api/__init__.py +3 -2
- gnosisllm_knowledge/api/knowledge.py +502 -32
- gnosisllm_knowledge/api/memory.py +966 -0
- gnosisllm_knowledge/backends/__init__.py +14 -5
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +111 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
- gnosisllm_knowledge/backends/opensearch/config.py +49 -28
- gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
- gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
- gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
- gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
- gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
- gnosisllm_knowledge/cli/app.py +436 -31
- gnosisllm_knowledge/cli/commands/agentic.py +26 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +733 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +49 -23
- gnosisllm_knowledge/cli/display/service.py +43 -0
- gnosisllm_knowledge/cli/utils/config.py +62 -4
- gnosisllm_knowledge/core/domain/__init__.py +54 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +19 -19
- gnosisllm_knowledge/core/domain/memory.py +440 -0
- gnosisllm_knowledge/core/domain/result.py +11 -3
- gnosisllm_knowledge/core/domain/search.py +12 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +198 -5
- gnosisllm_knowledge/core/exceptions.py +227 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/memory.py +524 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
- gnosisllm_knowledge/core/streaming/__init__.py +36 -0
- gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/base.py +3 -4
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/loaders/sitemap.py +129 -1
- gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
- gnosisllm_knowledge/services/indexing.py +100 -93
- gnosisllm_knowledge/services/search.py +84 -31
- gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
- gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
|
@@ -74,7 +74,7 @@ class OpenSearchConfig:
|
|
|
74
74
|
|
|
75
75
|
# === k-NN Settings ===
|
|
76
76
|
knn_engine: str = "lucene" # lucene (recommended for OpenSearch 2.9+), faiss
|
|
77
|
-
knn_space_type: str = "
|
|
77
|
+
knn_space_type: str = "cosinesimil" # cosinesimil (recommended), l2, innerproduct
|
|
78
78
|
knn_algo_param_ef_search: int = 512
|
|
79
79
|
knn_algo_param_ef_construction: int = 512
|
|
80
80
|
knn_algo_param_m: int = 16
|
|
@@ -85,10 +85,13 @@ class OpenSearchConfig:
|
|
|
85
85
|
model_group_id: str | None = None
|
|
86
86
|
embedding_field: str = "content_embedding" # Field name for embeddings
|
|
87
87
|
|
|
88
|
-
# === Agentic Search ===
|
|
88
|
+
# === Agentic Search (OpenSearch 3.2+) ===
|
|
89
|
+
# Uses QueryPlanningTool for LLM-generated DSL queries
|
|
89
90
|
# Agent IDs from 'gnosisllm-knowledge agentic setup'
|
|
90
91
|
flow_agent_id: str | None = None
|
|
91
92
|
conversational_agent_id: str | None = None
|
|
93
|
+
# Agentic search pipeline (created during agentic setup)
|
|
94
|
+
agentic_pipeline_name: str | None = None
|
|
92
95
|
# LLM for agent reasoning (OpenAI model ID)
|
|
93
96
|
agentic_llm_model: str = "gpt-4o"
|
|
94
97
|
# Agent execution limits
|
|
@@ -139,57 +142,75 @@ class OpenSearchConfig:
|
|
|
139
142
|
def from_env(cls) -> OpenSearchConfig:
|
|
140
143
|
"""Create config from environment variables.
|
|
141
144
|
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
OPENSEARCH_PORT: Port (default: 9200)
|
|
145
|
-
OPENSEARCH_USE_SSL: Use SSL (default: false)
|
|
146
|
-
OPENSEARCH_VERIFY_CERTS: Verify certificates (default: true)
|
|
147
|
-
OPENSEARCH_USERNAME: Username
|
|
148
|
-
OPENSEARCH_PASSWORD: Password
|
|
149
|
-
OPENSEARCH_USE_AWS_SIGV4: Use AWS Sig v4 auth (default: false)
|
|
150
|
-
AWS_REGION: AWS region for Sig v4
|
|
151
|
-
OPENSEARCH_NODES: Comma-separated list of nodes
|
|
152
|
-
EMBEDDING_MODEL: OpenAI embedding model
|
|
153
|
-
EMBEDDING_DIMENSION: Embedding vector dimension
|
|
154
|
-
OPENAI_API_KEY: OpenAI API key
|
|
155
|
-
OPENSEARCH_INDEX_PREFIX: Index name prefix
|
|
156
|
-
OPENSEARCH_SHARDS: Number of shards
|
|
157
|
-
OPENSEARCH_REPLICAS: Number of replicas
|
|
158
|
-
OPENSEARCH_FLOW_AGENT_ID: Flow agent ID for agentic search
|
|
159
|
-
OPENSEARCH_CONVERSATIONAL_AGENT_ID: Conversational agent ID
|
|
160
|
-
AGENTIC_LLM_MODEL: LLM model for agent reasoning (default: gpt-4o)
|
|
161
|
-
AGENTIC_MAX_ITERATIONS: Maximum agent iterations (default: 5)
|
|
162
|
-
AGENTIC_TIMEOUT_SECONDS: Agent execution timeout (default: 60)
|
|
145
|
+
All configuration options can be set via environment variables.
|
|
146
|
+
See .env.example for a complete list with descriptions.
|
|
163
147
|
|
|
164
148
|
Returns:
|
|
165
149
|
Configuration from environment.
|
|
166
150
|
"""
|
|
151
|
+
# Parse nodes list
|
|
167
152
|
nodes_str = os.getenv("OPENSEARCH_NODES", "")
|
|
168
|
-
nodes = tuple(nodes_str.split(",")
|
|
153
|
+
nodes = tuple(n.strip() for n in nodes_str.split(",") if n.strip()) or None
|
|
169
154
|
|
|
170
155
|
return cls(
|
|
156
|
+
# === Connection ===
|
|
171
157
|
host=os.getenv("OPENSEARCH_HOST", "localhost"),
|
|
172
158
|
port=int(os.getenv("OPENSEARCH_PORT", "9200")),
|
|
173
|
-
use_ssl=os.getenv("OPENSEARCH_USE_SSL", "").lower() == "true",
|
|
159
|
+
use_ssl=os.getenv("OPENSEARCH_USE_SSL", "false").lower() == "true",
|
|
174
160
|
verify_certs=os.getenv("OPENSEARCH_VERIFY_CERTS", "true").lower() == "true",
|
|
161
|
+
ca_certs=os.getenv("OPENSEARCH_CA_CERTS"),
|
|
162
|
+
# Authentication
|
|
175
163
|
username=os.getenv("OPENSEARCH_USERNAME"),
|
|
176
164
|
password=os.getenv("OPENSEARCH_PASSWORD"),
|
|
177
|
-
|
|
165
|
+
# AWS OpenSearch Service
|
|
166
|
+
use_aws_sigv4=os.getenv("OPENSEARCH_USE_AWS_SIGV4", "false").lower() == "true",
|
|
178
167
|
aws_region=os.getenv("AWS_REGION"),
|
|
168
|
+
aws_service=os.getenv("OPENSEARCH_AWS_SERVICE", "es"),
|
|
169
|
+
# === Cluster (High Availability) ===
|
|
179
170
|
nodes=nodes,
|
|
171
|
+
sniff_on_start=os.getenv("OPENSEARCH_SNIFF_ON_START", "false").lower() == "true",
|
|
172
|
+
sniff_on_node_failure=os.getenv("OPENSEARCH_SNIFF_ON_NODE_FAILURE", "true").lower()
|
|
173
|
+
== "true",
|
|
174
|
+
sniff_timeout=float(os.getenv("OPENSEARCH_SNIFF_TIMEOUT", "10.0")),
|
|
175
|
+
sniffer_timeout=float(os.getenv("OPENSEARCH_SNIFFER_TIMEOUT", "60.0")),
|
|
176
|
+
# === Embedding ===
|
|
180
177
|
embedding_model=os.getenv("EMBEDDING_MODEL", "text-embedding-3-small"),
|
|
181
178
|
embedding_dimension=int(os.getenv("EMBEDDING_DIMENSION", "1536")),
|
|
182
179
|
openai_api_key=os.getenv("OPENAI_API_KEY"),
|
|
180
|
+
embedding_batch_size=int(os.getenv("EMBEDDING_BATCH_SIZE", "100")),
|
|
181
|
+
# === Index Settings ===
|
|
183
182
|
index_prefix=os.getenv("OPENSEARCH_INDEX_PREFIX", "gnosisllm"),
|
|
184
183
|
number_of_shards=int(os.getenv("OPENSEARCH_SHARDS", "5")),
|
|
185
184
|
number_of_replicas=int(os.getenv("OPENSEARCH_REPLICAS", "1")),
|
|
186
|
-
|
|
185
|
+
refresh_interval=os.getenv("OPENSEARCH_REFRESH_INTERVAL", "1s"),
|
|
186
|
+
# Pipeline names
|
|
187
187
|
ingest_pipeline_name=os.getenv("OPENSEARCH_INGEST_PIPELINE"),
|
|
188
188
|
search_pipeline_name=os.getenv("OPENSEARCH_SEARCH_PIPELINE"),
|
|
189
|
-
#
|
|
189
|
+
# === k-NN Settings ===
|
|
190
|
+
knn_engine=os.getenv("OPENSEARCH_KNN_ENGINE", "lucene"),
|
|
191
|
+
knn_space_type=os.getenv("OPENSEARCH_KNN_SPACE_TYPE", "cosinesimil"),
|
|
192
|
+
knn_algo_param_ef_search=int(os.getenv("OPENSEARCH_KNN_EF_SEARCH", "512")),
|
|
193
|
+
knn_algo_param_ef_construction=int(
|
|
194
|
+
os.getenv("OPENSEARCH_KNN_EF_CONSTRUCTION", "512")
|
|
195
|
+
),
|
|
196
|
+
knn_algo_param_m=int(os.getenv("OPENSEARCH_KNN_M", "16")),
|
|
197
|
+
# === Neural Search ===
|
|
198
|
+
model_id=os.getenv("OPENSEARCH_MODEL_ID"),
|
|
199
|
+
model_group_id=os.getenv("OPENSEARCH_MODEL_GROUP_ID"),
|
|
200
|
+
embedding_field=os.getenv("OPENSEARCH_EMBEDDING_FIELD", "content_embedding"),
|
|
201
|
+
# === Agentic Search ===
|
|
190
202
|
flow_agent_id=os.getenv("OPENSEARCH_FLOW_AGENT_ID"),
|
|
191
203
|
conversational_agent_id=os.getenv("OPENSEARCH_CONVERSATIONAL_AGENT_ID"),
|
|
204
|
+
agentic_pipeline_name=os.getenv("OPENSEARCH_AGENTIC_PIPELINE"),
|
|
192
205
|
agentic_llm_model=os.getenv("AGENTIC_LLM_MODEL", "gpt-4o"),
|
|
193
206
|
agentic_max_iterations=int(os.getenv("AGENTIC_MAX_ITERATIONS", "5")),
|
|
194
207
|
agentic_timeout_seconds=int(os.getenv("AGENTIC_TIMEOUT_SECONDS", "60")),
|
|
208
|
+
memory_window_size=int(os.getenv("AGENTIC_MEMORY_WINDOW_SIZE", "10")),
|
|
209
|
+
# === Timeouts ===
|
|
210
|
+
connect_timeout=float(os.getenv("OPENSEARCH_CONNECT_TIMEOUT", "5.0")),
|
|
211
|
+
read_timeout=float(os.getenv("OPENSEARCH_READ_TIMEOUT", "30.0")),
|
|
212
|
+
bulk_timeout=float(os.getenv("OPENSEARCH_BULK_TIMEOUT", "120.0")),
|
|
213
|
+
# === Bulk Indexing ===
|
|
214
|
+
bulk_batch_size=int(os.getenv("OPENSEARCH_BULK_BATCH_SIZE", "500")),
|
|
215
|
+
bulk_max_concurrent=int(os.getenv("OPENSEARCH_BULK_MAX_CONCURRENT", "3")),
|
|
195
216
|
)
|
|
@@ -87,13 +87,15 @@ class OpenSearchIndexer:
|
|
|
87
87
|
# Embeddings are generated by OpenSearch ingest pipeline
|
|
88
88
|
doc_body = self._prepare_document(document)
|
|
89
89
|
|
|
90
|
-
# Index the document
|
|
90
|
+
# Index the document with ingest pipeline for embedding generation
|
|
91
91
|
refresh = options.get("refresh", False)
|
|
92
|
+
pipeline = self._config.ingest_pipeline_name
|
|
92
93
|
await self._client.index(
|
|
93
94
|
index=index_name,
|
|
94
95
|
id=document.doc_id,
|
|
95
96
|
body=doc_body,
|
|
96
97
|
refresh=refresh,
|
|
98
|
+
pipeline=pipeline,
|
|
97
99
|
)
|
|
98
100
|
|
|
99
101
|
return IndexResult(
|
|
@@ -272,6 +274,43 @@ class OpenSearchIndexer:
|
|
|
272
274
|
failed_count=0,
|
|
273
275
|
)
|
|
274
276
|
|
|
277
|
+
async def get(
|
|
278
|
+
self,
|
|
279
|
+
doc_id: str,
|
|
280
|
+
index_name: str,
|
|
281
|
+
) -> dict[str, Any] | None:
|
|
282
|
+
"""Get a document by ID.
|
|
283
|
+
|
|
284
|
+
Uses OpenSearch client's direct get() API (CRUD operation, not search).
|
|
285
|
+
|
|
286
|
+
Args:
|
|
287
|
+
doc_id: Document ID to retrieve.
|
|
288
|
+
index_name: Index name.
|
|
289
|
+
|
|
290
|
+
Returns:
|
|
291
|
+
Document dict (source fields) or None if not found.
|
|
292
|
+
Excludes embeddings from response for efficiency.
|
|
293
|
+
"""
|
|
294
|
+
try:
|
|
295
|
+
response = await self._client.get(
|
|
296
|
+
index=index_name,
|
|
297
|
+
id=doc_id,
|
|
298
|
+
_source_excludes=["content_embedding"],
|
|
299
|
+
)
|
|
300
|
+
source = response.get("_source", {})
|
|
301
|
+
# Include the document ID in the response
|
|
302
|
+
source["id"] = response.get("_id", doc_id)
|
|
303
|
+
return source
|
|
304
|
+
except Exception as e:
|
|
305
|
+
if "not_found" in str(e).lower():
|
|
306
|
+
return None
|
|
307
|
+
logger.error(f"Failed to get document {doc_id}: {e}")
|
|
308
|
+
raise IndexError(
|
|
309
|
+
message=f"Failed to get document: {e}",
|
|
310
|
+
details={"document_id": doc_id},
|
|
311
|
+
cause=e,
|
|
312
|
+
) from e
|
|
313
|
+
|
|
275
314
|
async def delete(
|
|
276
315
|
self,
|
|
277
316
|
doc_id: str,
|
|
@@ -434,7 +473,9 @@ class OpenSearchIndexer:
|
|
|
434
473
|
if not actions:
|
|
435
474
|
return IndexResult(success=True, index_name=index_name, indexed_count=0, failed_count=0)
|
|
436
475
|
|
|
437
|
-
|
|
476
|
+
# Use ingest pipeline for embedding generation
|
|
477
|
+
pipeline = self._config.ingest_pipeline_name
|
|
478
|
+
response = await self._client.bulk(body=actions, pipeline=pipeline)
|
|
438
479
|
|
|
439
480
|
indexed = 0
|
|
440
481
|
failed = 0
|
|
@@ -460,6 +501,11 @@ class OpenSearchIndexer:
|
|
|
460
501
|
def _prepare_document(self, document: Document) -> dict[str, Any]:
|
|
461
502
|
"""Prepare document for indexing.
|
|
462
503
|
|
|
504
|
+
Note:
|
|
505
|
+
This library is tenant-agnostic. Multi-tenancy is achieved through index
|
|
506
|
+
isolation. Tenant information should be passed in document.metadata if
|
|
507
|
+
needed for audit purposes.
|
|
508
|
+
|
|
463
509
|
Args:
|
|
464
510
|
document: Document to prepare.
|
|
465
511
|
|
|
@@ -479,8 +525,8 @@ class OpenSearchIndexer:
|
|
|
479
525
|
"url": document.url,
|
|
480
526
|
"title": document.title,
|
|
481
527
|
"source": document.source,
|
|
482
|
-
"account_id": document.account_id,
|
|
483
528
|
"collection_id": document.collection_id,
|
|
529
|
+
"collection_name": document.collection_name,
|
|
484
530
|
"source_id": document.source_id,
|
|
485
531
|
"chunk_index": document.chunk_index,
|
|
486
532
|
"total_chunks": document.total_chunks,
|
|
@@ -1,4 +1,10 @@
|
|
|
1
|
-
"""OpenSearch index mappings for knowledge documents.
|
|
1
|
+
"""OpenSearch index mappings for knowledge documents.
|
|
2
|
+
|
|
3
|
+
Note:
|
|
4
|
+
This library is tenant-agnostic. Multi-tenancy is achieved through index
|
|
5
|
+
isolation (e.g., `knowledge-{account_id}`). Index mappings do not include
|
|
6
|
+
tenant-specific fields like account_id.
|
|
7
|
+
"""
|
|
2
8
|
|
|
3
9
|
from __future__ import annotations
|
|
4
10
|
|
|
@@ -56,9 +62,9 @@ def get_knowledge_index_mappings(config: OpenSearchConfig) -> dict[str, Any]:
|
|
|
56
62
|
"fields": {"keyword": {"type": "keyword", "ignore_above": 512}},
|
|
57
63
|
},
|
|
58
64
|
"source": {"type": "keyword"},
|
|
59
|
-
# ===
|
|
60
|
-
"account_id": {"type": "keyword"},
|
|
65
|
+
# === Collection Fields ===
|
|
61
66
|
"collection_id": {"type": "keyword"},
|
|
67
|
+
"collection_name": {"type": "keyword"}, # For aggregation display
|
|
62
68
|
"source_id": {"type": "keyword"},
|
|
63
69
|
# === Content ===
|
|
64
70
|
"content": {
|
|
@@ -128,13 +134,16 @@ def get_memory_index_settings(config: OpenSearchConfig) -> dict[str, Any]:
|
|
|
128
134
|
def get_memory_index_mappings() -> dict[str, Any]:
|
|
129
135
|
"""Get index mappings for conversation memory.
|
|
130
136
|
|
|
137
|
+
Note:
|
|
138
|
+
This library is tenant-agnostic. Multi-tenancy is achieved through index
|
|
139
|
+
isolation. Use tenant-specific index names for conversation memory.
|
|
140
|
+
|
|
131
141
|
Returns:
|
|
132
142
|
Index mappings dictionary.
|
|
133
143
|
"""
|
|
134
144
|
return {
|
|
135
145
|
"properties": {
|
|
136
146
|
"conversation_id": {"type": "keyword"},
|
|
137
|
-
"account_id": {"type": "keyword"},
|
|
138
147
|
"user_id": {"type": "keyword"},
|
|
139
148
|
"message_index": {"type": "integer"},
|
|
140
149
|
"role": {"type": "keyword"}, # user, assistant, system
|
|
@@ -168,7 +177,7 @@ def get_index_template(
|
|
|
168
177
|
"settings": get_knowledge_index_settings(config),
|
|
169
178
|
"mappings": get_knowledge_index_mappings(config),
|
|
170
179
|
},
|
|
171
|
-
"priority": 100
|
|
180
|
+
"priority": 200, # Higher than default gnosisllm-template (100)
|
|
172
181
|
"version": 1,
|
|
173
182
|
}
|
|
174
183
|
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""OpenSearch Agentic Memory backend."""
|
|
2
|
+
|
|
3
|
+
from gnosisllm_knowledge.backends.opensearch.memory.client import OpenSearchMemoryClient
|
|
4
|
+
from gnosisllm_knowledge.backends.opensearch.memory.config import MemoryConfig
|
|
5
|
+
from gnosisllm_knowledge.backends.opensearch.memory.setup import MemorySetup, SetupStatus
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"MemoryConfig",
|
|
9
|
+
"MemorySetup",
|
|
10
|
+
"OpenSearchMemoryClient",
|
|
11
|
+
"SetupStatus",
|
|
12
|
+
]
|