gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +502 -32
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  7. gnosisllm_knowledge/backends/memory/searcher.py +111 -10
  8. gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
  9. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  10. gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
  11. gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
  12. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  14. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  15. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  16. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  17. gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
  18. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  19. gnosisllm_knowledge/cli/app.py +436 -31
  20. gnosisllm_knowledge/cli/commands/agentic.py +26 -9
  21. gnosisllm_knowledge/cli/commands/load.py +169 -19
  22. gnosisllm_knowledge/cli/commands/memory.py +733 -0
  23. gnosisllm_knowledge/cli/commands/search.py +9 -10
  24. gnosisllm_knowledge/cli/commands/setup.py +49 -23
  25. gnosisllm_knowledge/cli/display/service.py +43 -0
  26. gnosisllm_knowledge/cli/utils/config.py +62 -4
  27. gnosisllm_knowledge/core/domain/__init__.py +54 -0
  28. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  29. gnosisllm_knowledge/core/domain/document.py +19 -19
  30. gnosisllm_knowledge/core/domain/memory.py +440 -0
  31. gnosisllm_knowledge/core/domain/result.py +11 -3
  32. gnosisllm_knowledge/core/domain/search.py +12 -25
  33. gnosisllm_knowledge/core/domain/source.py +11 -12
  34. gnosisllm_knowledge/core/events/__init__.py +8 -0
  35. gnosisllm_knowledge/core/events/types.py +198 -5
  36. gnosisllm_knowledge/core/exceptions.py +227 -0
  37. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  38. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  39. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  40. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  41. gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
  42. gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
  43. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  44. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  45. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  46. gnosisllm_knowledge/fetchers/config.py +27 -0
  47. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  48. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  49. gnosisllm_knowledge/loaders/__init__.py +5 -1
  50. gnosisllm_knowledge/loaders/base.py +3 -4
  51. gnosisllm_knowledge/loaders/discovery.py +338 -0
  52. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  53. gnosisllm_knowledge/loaders/factory.py +46 -0
  54. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  55. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  56. gnosisllm_knowledge/services/indexing.py +100 -93
  57. gnosisllm_knowledge/services/search.py +84 -31
  58. gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
  59. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
  60. gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  62. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
  63. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -74,7 +74,7 @@ class OpenSearchConfig:
74
74
 
75
75
  # === k-NN Settings ===
76
76
  knn_engine: str = "lucene" # lucene (recommended for OpenSearch 2.9+), faiss
77
- knn_space_type: str = "l2" # l2, cosinesimil, innerproduct
77
+ knn_space_type: str = "cosinesimil" # cosinesimil (recommended), l2, innerproduct
78
78
  knn_algo_param_ef_search: int = 512
79
79
  knn_algo_param_ef_construction: int = 512
80
80
  knn_algo_param_m: int = 16
@@ -85,10 +85,13 @@ class OpenSearchConfig:
85
85
  model_group_id: str | None = None
86
86
  embedding_field: str = "content_embedding" # Field name for embeddings
87
87
 
88
- # === Agentic Search ===
88
+ # === Agentic Search (OpenSearch 3.2+) ===
89
+ # Uses QueryPlanningTool for LLM-generated DSL queries
89
90
  # Agent IDs from 'gnosisllm-knowledge agentic setup'
90
91
  flow_agent_id: str | None = None
91
92
  conversational_agent_id: str | None = None
93
+ # Agentic search pipeline (created during agentic setup)
94
+ agentic_pipeline_name: str | None = None
92
95
  # LLM for agent reasoning (OpenAI model ID)
93
96
  agentic_llm_model: str = "gpt-4o"
94
97
  # Agent execution limits
@@ -139,57 +142,75 @@ class OpenSearchConfig:
139
142
  def from_env(cls) -> OpenSearchConfig:
140
143
  """Create config from environment variables.
141
144
 
142
- Environment variables:
143
- OPENSEARCH_HOST: Host (default: localhost)
144
- OPENSEARCH_PORT: Port (default: 9200)
145
- OPENSEARCH_USE_SSL: Use SSL (default: false)
146
- OPENSEARCH_VERIFY_CERTS: Verify certificates (default: true)
147
- OPENSEARCH_USERNAME: Username
148
- OPENSEARCH_PASSWORD: Password
149
- OPENSEARCH_USE_AWS_SIGV4: Use AWS Sig v4 auth (default: false)
150
- AWS_REGION: AWS region for Sig v4
151
- OPENSEARCH_NODES: Comma-separated list of nodes
152
- EMBEDDING_MODEL: OpenAI embedding model
153
- EMBEDDING_DIMENSION: Embedding vector dimension
154
- OPENAI_API_KEY: OpenAI API key
155
- OPENSEARCH_INDEX_PREFIX: Index name prefix
156
- OPENSEARCH_SHARDS: Number of shards
157
- OPENSEARCH_REPLICAS: Number of replicas
158
- OPENSEARCH_FLOW_AGENT_ID: Flow agent ID for agentic search
159
- OPENSEARCH_CONVERSATIONAL_AGENT_ID: Conversational agent ID
160
- AGENTIC_LLM_MODEL: LLM model for agent reasoning (default: gpt-4o)
161
- AGENTIC_MAX_ITERATIONS: Maximum agent iterations (default: 5)
162
- AGENTIC_TIMEOUT_SECONDS: Agent execution timeout (default: 60)
145
+ All configuration options can be set via environment variables.
146
+ See .env.example for a complete list with descriptions.
163
147
 
164
148
  Returns:
165
149
  Configuration from environment.
166
150
  """
151
+ # Parse nodes list
167
152
  nodes_str = os.getenv("OPENSEARCH_NODES", "")
168
- nodes = tuple(nodes_str.split(",")) if nodes_str else None
153
+ nodes = tuple(n.strip() for n in nodes_str.split(",") if n.strip()) or None
169
154
 
170
155
  return cls(
156
+ # === Connection ===
171
157
  host=os.getenv("OPENSEARCH_HOST", "localhost"),
172
158
  port=int(os.getenv("OPENSEARCH_PORT", "9200")),
173
- use_ssl=os.getenv("OPENSEARCH_USE_SSL", "").lower() == "true",
159
+ use_ssl=os.getenv("OPENSEARCH_USE_SSL", "false").lower() == "true",
174
160
  verify_certs=os.getenv("OPENSEARCH_VERIFY_CERTS", "true").lower() == "true",
161
+ ca_certs=os.getenv("OPENSEARCH_CA_CERTS"),
162
+ # Authentication
175
163
  username=os.getenv("OPENSEARCH_USERNAME"),
176
164
  password=os.getenv("OPENSEARCH_PASSWORD"),
177
- use_aws_sigv4=os.getenv("OPENSEARCH_USE_AWS_SIGV4", "").lower() == "true",
165
+ # AWS OpenSearch Service
166
+ use_aws_sigv4=os.getenv("OPENSEARCH_USE_AWS_SIGV4", "false").lower() == "true",
178
167
  aws_region=os.getenv("AWS_REGION"),
168
+ aws_service=os.getenv("OPENSEARCH_AWS_SERVICE", "es"),
169
+ # === Cluster (High Availability) ===
179
170
  nodes=nodes,
171
+ sniff_on_start=os.getenv("OPENSEARCH_SNIFF_ON_START", "false").lower() == "true",
172
+ sniff_on_node_failure=os.getenv("OPENSEARCH_SNIFF_ON_NODE_FAILURE", "true").lower()
173
+ == "true",
174
+ sniff_timeout=float(os.getenv("OPENSEARCH_SNIFF_TIMEOUT", "10.0")),
175
+ sniffer_timeout=float(os.getenv("OPENSEARCH_SNIFFER_TIMEOUT", "60.0")),
176
+ # === Embedding ===
180
177
  embedding_model=os.getenv("EMBEDDING_MODEL", "text-embedding-3-small"),
181
178
  embedding_dimension=int(os.getenv("EMBEDDING_DIMENSION", "1536")),
182
179
  openai_api_key=os.getenv("OPENAI_API_KEY"),
180
+ embedding_batch_size=int(os.getenv("EMBEDDING_BATCH_SIZE", "100")),
181
+ # === Index Settings ===
183
182
  index_prefix=os.getenv("OPENSEARCH_INDEX_PREFIX", "gnosisllm"),
184
183
  number_of_shards=int(os.getenv("OPENSEARCH_SHARDS", "5")),
185
184
  number_of_replicas=int(os.getenv("OPENSEARCH_REPLICAS", "1")),
186
- model_id=os.getenv("OPENSEARCH_MODEL_ID"),
185
+ refresh_interval=os.getenv("OPENSEARCH_REFRESH_INTERVAL", "1s"),
186
+ # Pipeline names
187
187
  ingest_pipeline_name=os.getenv("OPENSEARCH_INGEST_PIPELINE"),
188
188
  search_pipeline_name=os.getenv("OPENSEARCH_SEARCH_PIPELINE"),
189
- # Agentic search configuration
189
+ # === k-NN Settings ===
190
+ knn_engine=os.getenv("OPENSEARCH_KNN_ENGINE", "lucene"),
191
+ knn_space_type=os.getenv("OPENSEARCH_KNN_SPACE_TYPE", "cosinesimil"),
192
+ knn_algo_param_ef_search=int(os.getenv("OPENSEARCH_KNN_EF_SEARCH", "512")),
193
+ knn_algo_param_ef_construction=int(
194
+ os.getenv("OPENSEARCH_KNN_EF_CONSTRUCTION", "512")
195
+ ),
196
+ knn_algo_param_m=int(os.getenv("OPENSEARCH_KNN_M", "16")),
197
+ # === Neural Search ===
198
+ model_id=os.getenv("OPENSEARCH_MODEL_ID"),
199
+ model_group_id=os.getenv("OPENSEARCH_MODEL_GROUP_ID"),
200
+ embedding_field=os.getenv("OPENSEARCH_EMBEDDING_FIELD", "content_embedding"),
201
+ # === Agentic Search ===
190
202
  flow_agent_id=os.getenv("OPENSEARCH_FLOW_AGENT_ID"),
191
203
  conversational_agent_id=os.getenv("OPENSEARCH_CONVERSATIONAL_AGENT_ID"),
204
+ agentic_pipeline_name=os.getenv("OPENSEARCH_AGENTIC_PIPELINE"),
192
205
  agentic_llm_model=os.getenv("AGENTIC_LLM_MODEL", "gpt-4o"),
193
206
  agentic_max_iterations=int(os.getenv("AGENTIC_MAX_ITERATIONS", "5")),
194
207
  agentic_timeout_seconds=int(os.getenv("AGENTIC_TIMEOUT_SECONDS", "60")),
208
+ memory_window_size=int(os.getenv("AGENTIC_MEMORY_WINDOW_SIZE", "10")),
209
+ # === Timeouts ===
210
+ connect_timeout=float(os.getenv("OPENSEARCH_CONNECT_TIMEOUT", "5.0")),
211
+ read_timeout=float(os.getenv("OPENSEARCH_READ_TIMEOUT", "30.0")),
212
+ bulk_timeout=float(os.getenv("OPENSEARCH_BULK_TIMEOUT", "120.0")),
213
+ # === Bulk Indexing ===
214
+ bulk_batch_size=int(os.getenv("OPENSEARCH_BULK_BATCH_SIZE", "500")),
215
+ bulk_max_concurrent=int(os.getenv("OPENSEARCH_BULK_MAX_CONCURRENT", "3")),
195
216
  )
@@ -87,13 +87,15 @@ class OpenSearchIndexer:
87
87
  # Embeddings are generated by OpenSearch ingest pipeline
88
88
  doc_body = self._prepare_document(document)
89
89
 
90
- # Index the document
90
+ # Index the document with ingest pipeline for embedding generation
91
91
  refresh = options.get("refresh", False)
92
+ pipeline = self._config.ingest_pipeline_name
92
93
  await self._client.index(
93
94
  index=index_name,
94
95
  id=document.doc_id,
95
96
  body=doc_body,
96
97
  refresh=refresh,
98
+ pipeline=pipeline,
97
99
  )
98
100
 
99
101
  return IndexResult(
@@ -272,6 +274,43 @@ class OpenSearchIndexer:
272
274
  failed_count=0,
273
275
  )
274
276
 
277
+ async def get(
278
+ self,
279
+ doc_id: str,
280
+ index_name: str,
281
+ ) -> dict[str, Any] | None:
282
+ """Get a document by ID.
283
+
284
+ Uses OpenSearch client's direct get() API (CRUD operation, not search).
285
+
286
+ Args:
287
+ doc_id: Document ID to retrieve.
288
+ index_name: Index name.
289
+
290
+ Returns:
291
+ Document dict (source fields) or None if not found.
292
+ Excludes embeddings from response for efficiency.
293
+ """
294
+ try:
295
+ response = await self._client.get(
296
+ index=index_name,
297
+ id=doc_id,
298
+ _source_excludes=["content_embedding"],
299
+ )
300
+ source = response.get("_source", {})
301
+ # Include the document ID in the response
302
+ source["id"] = response.get("_id", doc_id)
303
+ return source
304
+ except Exception as e:
305
+ if "not_found" in str(e).lower():
306
+ return None
307
+ logger.error(f"Failed to get document {doc_id}: {e}")
308
+ raise IndexError(
309
+ message=f"Failed to get document: {e}",
310
+ details={"document_id": doc_id},
311
+ cause=e,
312
+ ) from e
313
+
275
314
  async def delete(
276
315
  self,
277
316
  doc_id: str,
@@ -434,7 +473,9 @@ class OpenSearchIndexer:
434
473
  if not actions:
435
474
  return IndexResult(success=True, index_name=index_name, indexed_count=0, failed_count=0)
436
475
 
437
- response = await self._client.bulk(body=actions)
476
+ # Use ingest pipeline for embedding generation
477
+ pipeline = self._config.ingest_pipeline_name
478
+ response = await self._client.bulk(body=actions, pipeline=pipeline)
438
479
 
439
480
  indexed = 0
440
481
  failed = 0
@@ -460,6 +501,11 @@ class OpenSearchIndexer:
460
501
  def _prepare_document(self, document: Document) -> dict[str, Any]:
461
502
  """Prepare document for indexing.
462
503
 
504
+ Note:
505
+ This library is tenant-agnostic. Multi-tenancy is achieved through index
506
+ isolation. Tenant information should be passed in document.metadata if
507
+ needed for audit purposes.
508
+
463
509
  Args:
464
510
  document: Document to prepare.
465
511
 
@@ -479,8 +525,8 @@ class OpenSearchIndexer:
479
525
  "url": document.url,
480
526
  "title": document.title,
481
527
  "source": document.source,
482
- "account_id": document.account_id,
483
528
  "collection_id": document.collection_id,
529
+ "collection_name": document.collection_name,
484
530
  "source_id": document.source_id,
485
531
  "chunk_index": document.chunk_index,
486
532
  "total_chunks": document.total_chunks,
@@ -1,4 +1,10 @@
1
- """OpenSearch index mappings for knowledge documents."""
1
+ """OpenSearch index mappings for knowledge documents.
2
+
3
+ Note:
4
+ This library is tenant-agnostic. Multi-tenancy is achieved through index
5
+ isolation (e.g., `knowledge-{account_id}`). Index mappings do not include
6
+ tenant-specific fields like account_id.
7
+ """
2
8
 
3
9
  from __future__ import annotations
4
10
 
@@ -56,9 +62,9 @@ def get_knowledge_index_mappings(config: OpenSearchConfig) -> dict[str, Any]:
56
62
  "fields": {"keyword": {"type": "keyword", "ignore_above": 512}},
57
63
  },
58
64
  "source": {"type": "keyword"},
59
- # === Multi-tenant Fields ===
60
- "account_id": {"type": "keyword"},
65
+ # === Collection Fields ===
61
66
  "collection_id": {"type": "keyword"},
67
+ "collection_name": {"type": "keyword"}, # For aggregation display
62
68
  "source_id": {"type": "keyword"},
63
69
  # === Content ===
64
70
  "content": {
@@ -128,13 +134,16 @@ def get_memory_index_settings(config: OpenSearchConfig) -> dict[str, Any]:
128
134
  def get_memory_index_mappings() -> dict[str, Any]:
129
135
  """Get index mappings for conversation memory.
130
136
 
137
+ Note:
138
+ This library is tenant-agnostic. Multi-tenancy is achieved through index
139
+ isolation. Use tenant-specific index names for conversation memory.
140
+
131
141
  Returns:
132
142
  Index mappings dictionary.
133
143
  """
134
144
  return {
135
145
  "properties": {
136
146
  "conversation_id": {"type": "keyword"},
137
- "account_id": {"type": "keyword"},
138
147
  "user_id": {"type": "keyword"},
139
148
  "message_index": {"type": "integer"},
140
149
  "role": {"type": "keyword"}, # user, assistant, system
@@ -168,7 +177,7 @@ def get_index_template(
168
177
  "settings": get_knowledge_index_settings(config),
169
178
  "mappings": get_knowledge_index_mappings(config),
170
179
  },
171
- "priority": 100,
180
+ "priority": 200, # Higher than default gnosisllm-template (100)
172
181
  "version": 1,
173
182
  }
174
183
 
@@ -0,0 +1,12 @@
1
+ """OpenSearch Agentic Memory backend."""
2
+
3
+ from gnosisllm_knowledge.backends.opensearch.memory.client import OpenSearchMemoryClient
4
+ from gnosisllm_knowledge.backends.opensearch.memory.config import MemoryConfig
5
+ from gnosisllm_knowledge.backends.opensearch.memory.setup import MemorySetup, SetupStatus
6
+
7
+ __all__ = [
8
+ "MemoryConfig",
9
+ "MemorySetup",
10
+ "OpenSearchMemoryClient",
11
+ "SetupStatus",
12
+ ]