gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/__init__.py +91 -39
- gnosisllm_knowledge/api/__init__.py +3 -2
- gnosisllm_knowledge/api/knowledge.py +502 -32
- gnosisllm_knowledge/api/memory.py +966 -0
- gnosisllm_knowledge/backends/__init__.py +14 -5
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +111 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
- gnosisllm_knowledge/backends/opensearch/config.py +49 -28
- gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
- gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
- gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
- gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
- gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
- gnosisllm_knowledge/cli/app.py +436 -31
- gnosisllm_knowledge/cli/commands/agentic.py +26 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +733 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +49 -23
- gnosisllm_knowledge/cli/display/service.py +43 -0
- gnosisllm_knowledge/cli/utils/config.py +62 -4
- gnosisllm_knowledge/core/domain/__init__.py +54 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +19 -19
- gnosisllm_knowledge/core/domain/memory.py +440 -0
- gnosisllm_knowledge/core/domain/result.py +11 -3
- gnosisllm_knowledge/core/domain/search.py +12 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +198 -5
- gnosisllm_knowledge/core/exceptions.py +227 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/memory.py +524 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
- gnosisllm_knowledge/core/streaming/__init__.py +36 -0
- gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/base.py +3 -4
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/loaders/sitemap.py +129 -1
- gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
- gnosisllm_knowledge/services/indexing.py +100 -93
- gnosisllm_knowledge/services/search.py +84 -31
- gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
- gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,8 +1,44 @@
|
|
|
1
|
-
"""High-level Knowledge API facade.
|
|
1
|
+
"""High-level Knowledge API facade.
|
|
2
|
+
|
|
3
|
+
This module provides the main entry point for the gnosisllm-knowledge library.
|
|
4
|
+
The Knowledge class is a high-level facade that abstracts the complexity of
|
|
5
|
+
loading, indexing, and searching knowledge documents.
|
|
6
|
+
|
|
7
|
+
Note:
|
|
8
|
+
This library is tenant-agnostic. Multi-tenancy should be handled at the
|
|
9
|
+
API layer by using separate indices per account (e.g.,
|
|
10
|
+
`knowledge-{account_id}`) rather than filtering by account_id.
|
|
11
|
+
|
|
12
|
+
Example:
|
|
13
|
+
```python
|
|
14
|
+
# Create Knowledge instance for a specific tenant
|
|
15
|
+
knowledge = Knowledge.from_opensearch(
|
|
16
|
+
host="localhost",
|
|
17
|
+
port=9200,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Use a tenant-specific index
|
|
21
|
+
tenant_index = f"knowledge-{account_id}"
|
|
22
|
+
|
|
23
|
+
# Load content
|
|
24
|
+
await knowledge.load(
|
|
25
|
+
"https://docs.example.com/sitemap.xml",
|
|
26
|
+
index_name=tenant_index,
|
|
27
|
+
collection_id="docs",
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
# Search (tenant isolation via index name)
|
|
31
|
+
results = await knowledge.search(
|
|
32
|
+
"how to configure",
|
|
33
|
+
index_name=tenant_index,
|
|
34
|
+
)
|
|
35
|
+
```
|
|
36
|
+
"""
|
|
2
37
|
|
|
3
38
|
from __future__ import annotations
|
|
4
39
|
|
|
5
40
|
import logging
|
|
41
|
+
from collections.abc import Callable
|
|
6
42
|
from typing import TYPE_CHECKING, Any
|
|
7
43
|
|
|
8
44
|
from gnosisllm_knowledge.backends.opensearch import (
|
|
@@ -11,15 +47,24 @@ from gnosisllm_knowledge.backends.opensearch import (
|
|
|
11
47
|
OpenSearchKnowledgeSearcher,
|
|
12
48
|
OpenSearchSetupAdapter,
|
|
13
49
|
)
|
|
50
|
+
from gnosisllm_knowledge.backends.opensearch.agentic import OpenSearchAgenticSearcher
|
|
14
51
|
from gnosisllm_knowledge.chunking import SentenceChunker
|
|
15
52
|
from gnosisllm_knowledge.core.domain.result import IndexResult
|
|
16
|
-
from gnosisllm_knowledge.core.domain.search import
|
|
53
|
+
from gnosisllm_knowledge.core.domain.search import (
|
|
54
|
+
AgentType,
|
|
55
|
+
AgenticSearchQuery,
|
|
56
|
+
AgenticSearchResult,
|
|
57
|
+
SearchMode,
|
|
58
|
+
SearchResult,
|
|
59
|
+
)
|
|
17
60
|
from gnosisllm_knowledge.core.events.emitter import EventEmitter
|
|
18
61
|
from gnosisllm_knowledge.core.interfaces.setup import DiagnosticReport, HealthReport
|
|
62
|
+
from gnosisllm_knowledge.core.streaming.pipeline import PipelineConfig
|
|
19
63
|
from gnosisllm_knowledge.fetchers import NeoreaderContentFetcher
|
|
20
64
|
from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
|
|
21
65
|
from gnosisllm_knowledge.loaders import LoaderFactory
|
|
22
66
|
from gnosisllm_knowledge.services import KnowledgeIndexingService, KnowledgeSearchService
|
|
67
|
+
from gnosisllm_knowledge.services.streaming_pipeline import StreamingIndexingPipeline
|
|
23
68
|
|
|
24
69
|
if TYPE_CHECKING:
|
|
25
70
|
from opensearchpy import AsyncOpenSearch
|
|
@@ -120,6 +165,10 @@ class Knowledge:
|
|
|
120
165
|
) -> Knowledge:
|
|
121
166
|
"""Create Knowledge instance with OpenSearch backend.
|
|
122
167
|
|
|
168
|
+
This factory creates a Knowledge instance configured for OpenSearch.
|
|
169
|
+
The returned instance is tenant-agnostic - multi-tenancy should be
|
|
170
|
+
handled by using separate indices per account.
|
|
171
|
+
|
|
123
172
|
Args:
|
|
124
173
|
host: OpenSearch host.
|
|
125
174
|
port: OpenSearch port.
|
|
@@ -137,6 +186,19 @@ class Knowledge:
|
|
|
137
186
|
Note:
|
|
138
187
|
Embeddings are generated automatically by OpenSearch ingest pipeline.
|
|
139
188
|
Run 'gnosisllm-knowledge setup' to configure the ML model.
|
|
189
|
+
|
|
190
|
+
Example:
|
|
191
|
+
```python
|
|
192
|
+
# Create a Knowledge instance
|
|
193
|
+
knowledge = Knowledge.from_opensearch(
|
|
194
|
+
host="localhost",
|
|
195
|
+
port=9200,
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
# Use tenant-specific index for isolation
|
|
199
|
+
tenant_index = f"gnosisllm-{account_id}-knowledge"
|
|
200
|
+
await knowledge.load(source, index_name=tenant_index)
|
|
201
|
+
```
|
|
140
202
|
"""
|
|
141
203
|
# Import OpenSearch client
|
|
142
204
|
try:
|
|
@@ -159,11 +221,12 @@ class Knowledge:
|
|
|
159
221
|
**kwargs,
|
|
160
222
|
)
|
|
161
223
|
|
|
162
|
-
# Create client
|
|
224
|
+
# Create client with proper timeout settings
|
|
163
225
|
client_kwargs: dict[str, Any] = {
|
|
164
226
|
"hosts": [{"host": config.host, "port": config.port}],
|
|
165
227
|
"use_ssl": config.use_ssl,
|
|
166
228
|
"verify_certs": config.verify_certs,
|
|
229
|
+
"timeout": max(config.read_timeout, config.agentic_timeout_seconds),
|
|
167
230
|
}
|
|
168
231
|
|
|
169
232
|
if config.username and config.password:
|
|
@@ -181,11 +244,16 @@ class Knowledge:
|
|
|
181
244
|
# Create fetcher
|
|
182
245
|
fetcher = None
|
|
183
246
|
if neoreader_url:
|
|
184
|
-
neoreader_config = NeoreaderConfig(
|
|
247
|
+
neoreader_config = NeoreaderConfig(host=neoreader_url)
|
|
185
248
|
fetcher = NeoreaderContentFetcher(neoreader_config)
|
|
186
249
|
|
|
187
|
-
# Create
|
|
188
|
-
|
|
250
|
+
# Create chunker
|
|
251
|
+
chunker = SentenceChunker()
|
|
252
|
+
|
|
253
|
+
# Create loader factory (fetcher is optional, defaults will be used if None)
|
|
254
|
+
loader_factory = None
|
|
255
|
+
if fetcher:
|
|
256
|
+
loader_factory = LoaderFactory(fetcher=fetcher, chunker=chunker)
|
|
189
257
|
|
|
190
258
|
return cls(
|
|
191
259
|
indexer=indexer,
|
|
@@ -200,15 +268,29 @@ class Knowledge:
|
|
|
200
268
|
def from_env(cls) -> Knowledge:
|
|
201
269
|
"""Create Knowledge instance from environment variables.
|
|
202
270
|
|
|
271
|
+
This factory creates a Knowledge instance using configuration from
|
|
272
|
+
environment variables. The returned instance is tenant-agnostic -
|
|
273
|
+
multi-tenancy should be handled by using separate indices per account.
|
|
274
|
+
|
|
203
275
|
Returns:
|
|
204
276
|
Configured Knowledge instance.
|
|
277
|
+
|
|
278
|
+
Example:
|
|
279
|
+
```python
|
|
280
|
+
# Create from environment
|
|
281
|
+
knowledge = Knowledge.from_env()
|
|
282
|
+
|
|
283
|
+
# Use tenant-specific index for isolation
|
|
284
|
+
tenant_index = f"gnosisllm-{account_id}-knowledge"
|
|
285
|
+
await knowledge.search("query", index_name=tenant_index)
|
|
286
|
+
```
|
|
205
287
|
"""
|
|
206
288
|
config = OpenSearchConfig.from_env()
|
|
207
289
|
neoreader_config = NeoreaderConfig.from_env()
|
|
208
290
|
|
|
209
291
|
return cls.from_opensearch(
|
|
210
292
|
config=config,
|
|
211
|
-
neoreader_url=neoreader_config.
|
|
293
|
+
neoreader_url=neoreader_config.host if neoreader_config.host else None,
|
|
212
294
|
)
|
|
213
295
|
|
|
214
296
|
@property
|
|
@@ -302,7 +384,6 @@ class Knowledge:
|
|
|
302
384
|
source: str,
|
|
303
385
|
*,
|
|
304
386
|
index_name: str | None = None,
|
|
305
|
-
account_id: str | None = None,
|
|
306
387
|
collection_id: str | None = None,
|
|
307
388
|
source_id: str | None = None,
|
|
308
389
|
source_type: str | None = None,
|
|
@@ -313,10 +394,13 @@ class Knowledge:
|
|
|
313
394
|
|
|
314
395
|
Automatically detects source type (sitemap, website, etc.).
|
|
315
396
|
|
|
397
|
+
Note:
|
|
398
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
399
|
+
by using separate indices per account.
|
|
400
|
+
|
|
316
401
|
Args:
|
|
317
402
|
source: Source URL or path.
|
|
318
|
-
index_name: Target index (
|
|
319
|
-
account_id: Account ID for multi-tenancy.
|
|
403
|
+
index_name: Target index (use tenant-specific name for isolation).
|
|
320
404
|
collection_id: Collection ID.
|
|
321
405
|
source_id: Source ID (auto-generated if not provided).
|
|
322
406
|
source_type: Explicit source type (auto-detected if not provided).
|
|
@@ -335,9 +419,9 @@ class Knowledge:
|
|
|
335
419
|
|
|
336
420
|
# Auto-detect or use explicit source type
|
|
337
421
|
if source_type:
|
|
338
|
-
loader = self._loader_factory.create(source_type
|
|
422
|
+
loader = self._loader_factory.create(source_type)
|
|
339
423
|
else:
|
|
340
|
-
loader = self._loader_factory.create_for_source(source
|
|
424
|
+
loader = self._loader_factory.create_for_source(source)
|
|
341
425
|
|
|
342
426
|
# Create service for this load operation
|
|
343
427
|
service = KnowledgeIndexingService(
|
|
@@ -350,12 +434,102 @@ class Knowledge:
|
|
|
350
434
|
return await service.load_and_index(
|
|
351
435
|
source=source,
|
|
352
436
|
index_name=index,
|
|
353
|
-
account_id=account_id,
|
|
354
437
|
collection_id=collection_id,
|
|
355
438
|
source_id=source_id,
|
|
356
439
|
**options,
|
|
357
440
|
)
|
|
358
441
|
|
|
442
|
+
async def load_streaming(
|
|
443
|
+
self,
|
|
444
|
+
source: str,
|
|
445
|
+
*,
|
|
446
|
+
index_name: str | None = None,
|
|
447
|
+
collection_id: str | None = None,
|
|
448
|
+
collection_name: str | None = None,
|
|
449
|
+
source_id: str | None = None,
|
|
450
|
+
url_batch_size: int = 50,
|
|
451
|
+
fetch_concurrency: int = 10,
|
|
452
|
+
index_batch_size: int = 100,
|
|
453
|
+
on_progress: Callable[[int, int], None] | None = None,
|
|
454
|
+
**options: Any,
|
|
455
|
+
) -> IndexResult:
|
|
456
|
+
"""Load and index content using streaming pipeline with bounded memory.
|
|
457
|
+
|
|
458
|
+
This method is optimized for large sitemaps (10,000+ URLs) that would
|
|
459
|
+
otherwise exhaust memory. It processes URLs in batches, indexing
|
|
460
|
+
documents immediately rather than loading all content first.
|
|
461
|
+
|
|
462
|
+
Memory usage is bounded and independent of sitemap size:
|
|
463
|
+
- URL storage: O(url_batch_size)
|
|
464
|
+
- Document storage: O(index_batch_size)
|
|
465
|
+
- In-flight fetches: O(fetch_concurrency * avg_page_size)
|
|
466
|
+
|
|
467
|
+
Note:
|
|
468
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
469
|
+
by using separate indices per account.
|
|
470
|
+
|
|
471
|
+
Args:
|
|
472
|
+
source: Sitemap URL.
|
|
473
|
+
index_name: Target index (use tenant-specific name for isolation).
|
|
474
|
+
collection_id: Collection ID.
|
|
475
|
+
collection_name: Collection name for display.
|
|
476
|
+
source_id: Source ID (auto-generated if not provided).
|
|
477
|
+
url_batch_size: URLs to discover per batch (default 50).
|
|
478
|
+
fetch_concurrency: Parallel URL fetches (default 10).
|
|
479
|
+
index_batch_size: Documents per index batch (default 100).
|
|
480
|
+
on_progress: Optional progress callback (urls_processed, docs_indexed).
|
|
481
|
+
**options: Additional loading options (max_urls, patterns, etc.).
|
|
482
|
+
|
|
483
|
+
Returns:
|
|
484
|
+
Index result with counts.
|
|
485
|
+
|
|
486
|
+
Example:
|
|
487
|
+
```python
|
|
488
|
+
# Efficiently load 100k+ URL sitemap
|
|
489
|
+
result = await knowledge.load_streaming(
|
|
490
|
+
"https://large-site.com/sitemap.xml",
|
|
491
|
+
index_name="knowledge-account123", # Tenant-specific
|
|
492
|
+
url_batch_size=100,
|
|
493
|
+
fetch_concurrency=20,
|
|
494
|
+
max_urls=50000,
|
|
495
|
+
)
|
|
496
|
+
print(f"Indexed {result.indexed_count} documents")
|
|
497
|
+
```
|
|
498
|
+
"""
|
|
499
|
+
if self._loader_factory is None:
|
|
500
|
+
raise ValueError("Loader factory not configured")
|
|
501
|
+
|
|
502
|
+
index = index_name or self._default_index
|
|
503
|
+
if not index:
|
|
504
|
+
raise ValueError("No index specified and no default index configured")
|
|
505
|
+
|
|
506
|
+
# Create sitemap loader specifically for streaming
|
|
507
|
+
loader = self._loader_factory.create("sitemap")
|
|
508
|
+
|
|
509
|
+
# Configure pipeline
|
|
510
|
+
config = PipelineConfig(
|
|
511
|
+
url_batch_size=url_batch_size,
|
|
512
|
+
fetch_concurrency=fetch_concurrency,
|
|
513
|
+
index_batch_size=index_batch_size,
|
|
514
|
+
)
|
|
515
|
+
|
|
516
|
+
# Create streaming pipeline
|
|
517
|
+
pipeline = StreamingIndexingPipeline(
|
|
518
|
+
loader=loader,
|
|
519
|
+
indexer=self._indexer,
|
|
520
|
+
config=config,
|
|
521
|
+
events=self._events,
|
|
522
|
+
)
|
|
523
|
+
|
|
524
|
+
return await pipeline.execute(
|
|
525
|
+
source=source,
|
|
526
|
+
index_name=index,
|
|
527
|
+
collection_id=collection_id,
|
|
528
|
+
collection_name=collection_name,
|
|
529
|
+
source_id=source_id,
|
|
530
|
+
**options,
|
|
531
|
+
)
|
|
532
|
+
|
|
359
533
|
# === Search Methods ===
|
|
360
534
|
|
|
361
535
|
async def search(
|
|
@@ -366,7 +540,6 @@ class Knowledge:
|
|
|
366
540
|
mode: SearchMode = SearchMode.HYBRID,
|
|
367
541
|
limit: int = 10,
|
|
368
542
|
offset: int = 0,
|
|
369
|
-
account_id: str | None = None,
|
|
370
543
|
collection_ids: list[str] | None = None,
|
|
371
544
|
source_ids: list[str] | None = None,
|
|
372
545
|
min_score: float | None = None,
|
|
@@ -374,13 +547,16 @@ class Knowledge:
|
|
|
374
547
|
) -> SearchResult:
|
|
375
548
|
"""Search for knowledge documents.
|
|
376
549
|
|
|
550
|
+
Note:
|
|
551
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
552
|
+
by using separate indices per account.
|
|
553
|
+
|
|
377
554
|
Args:
|
|
378
555
|
query: Search query text.
|
|
379
|
-
index_name: Index to search (
|
|
556
|
+
index_name: Index to search (use tenant-specific name for isolation).
|
|
380
557
|
mode: Search mode (semantic, keyword, hybrid).
|
|
381
558
|
limit: Maximum results.
|
|
382
559
|
offset: Result offset for pagination.
|
|
383
|
-
account_id: Account ID for multi-tenancy.
|
|
384
560
|
collection_ids: Filter by collection IDs.
|
|
385
561
|
source_ids: Filter by source IDs.
|
|
386
562
|
min_score: Minimum score threshold.
|
|
@@ -395,7 +571,6 @@ class Knowledge:
|
|
|
395
571
|
mode=mode,
|
|
396
572
|
limit=limit,
|
|
397
573
|
offset=offset,
|
|
398
|
-
account_id=account_id,
|
|
399
574
|
collection_ids=collection_ids,
|
|
400
575
|
source_ids=source_ids,
|
|
401
576
|
min_score=min_score,
|
|
@@ -473,19 +648,73 @@ class Knowledge:
|
|
|
473
648
|
|
|
474
649
|
# === Management Methods ===
|
|
475
650
|
|
|
651
|
+
async def get_document(
|
|
652
|
+
self,
|
|
653
|
+
document_id: str,
|
|
654
|
+
*,
|
|
655
|
+
index_name: str | None = None,
|
|
656
|
+
) -> dict[str, Any] | None:
|
|
657
|
+
"""Get a single document by ID.
|
|
658
|
+
|
|
659
|
+
Note:
|
|
660
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
661
|
+
by using separate indices per account.
|
|
662
|
+
|
|
663
|
+
Args:
|
|
664
|
+
document_id: Document ID to retrieve.
|
|
665
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
666
|
+
Uses default index if not provided.
|
|
667
|
+
|
|
668
|
+
Returns:
|
|
669
|
+
Document dict with all fields (excluding embeddings) or None if not found.
|
|
670
|
+
"""
|
|
671
|
+
index = index_name or self._default_index
|
|
672
|
+
if not index:
|
|
673
|
+
raise ValueError("No index specified and no default index configured")
|
|
674
|
+
|
|
675
|
+
return await self._indexer.get(document_id, index)
|
|
676
|
+
|
|
677
|
+
async def delete_document(
|
|
678
|
+
self,
|
|
679
|
+
document_id: str,
|
|
680
|
+
*,
|
|
681
|
+
index_name: str | None = None,
|
|
682
|
+
) -> bool:
|
|
683
|
+
"""Delete a single document by ID.
|
|
684
|
+
|
|
685
|
+
Note:
|
|
686
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
687
|
+
by using separate indices per account.
|
|
688
|
+
|
|
689
|
+
Args:
|
|
690
|
+
document_id: Document ID to delete.
|
|
691
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
692
|
+
Uses default index if not provided.
|
|
693
|
+
|
|
694
|
+
Returns:
|
|
695
|
+
True if deleted, False if not found.
|
|
696
|
+
"""
|
|
697
|
+
index = index_name or self._default_index
|
|
698
|
+
if not index:
|
|
699
|
+
raise ValueError("No index specified and no default index configured")
|
|
700
|
+
|
|
701
|
+
return await self._indexer.delete(document_id, index)
|
|
702
|
+
|
|
476
703
|
async def delete_source(
|
|
477
704
|
self,
|
|
478
705
|
source_id: str,
|
|
479
706
|
*,
|
|
480
707
|
index_name: str | None = None,
|
|
481
|
-
account_id: str | None = None,
|
|
482
708
|
) -> int:
|
|
483
709
|
"""Delete all documents from a source.
|
|
484
710
|
|
|
711
|
+
Note:
|
|
712
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
713
|
+
by using separate indices per account.
|
|
714
|
+
|
|
485
715
|
Args:
|
|
486
716
|
source_id: Source ID to delete.
|
|
487
|
-
index_name: Index name.
|
|
488
|
-
account_id: Account ID for multi-tenancy.
|
|
717
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
489
718
|
|
|
490
719
|
Returns:
|
|
491
720
|
Count of deleted documents.
|
|
@@ -494,21 +723,23 @@ class Knowledge:
|
|
|
494
723
|
if not index:
|
|
495
724
|
raise ValueError("No index specified")
|
|
496
725
|
|
|
497
|
-
return await self.indexing.delete_source(source_id, index
|
|
726
|
+
return await self.indexing.delete_source(source_id, index)
|
|
498
727
|
|
|
499
728
|
async def delete_collection(
|
|
500
729
|
self,
|
|
501
730
|
collection_id: str,
|
|
502
731
|
*,
|
|
503
732
|
index_name: str | None = None,
|
|
504
|
-
account_id: str | None = None,
|
|
505
733
|
) -> int:
|
|
506
734
|
"""Delete all documents from a collection.
|
|
507
735
|
|
|
736
|
+
Note:
|
|
737
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
738
|
+
by using separate indices per account.
|
|
739
|
+
|
|
508
740
|
Args:
|
|
509
741
|
collection_id: Collection ID to delete.
|
|
510
|
-
index_name: Index name.
|
|
511
|
-
account_id: Account ID for multi-tenancy.
|
|
742
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
512
743
|
|
|
513
744
|
Returns:
|
|
514
745
|
Count of deleted documents.
|
|
@@ -517,32 +748,271 @@ class Knowledge:
|
|
|
517
748
|
if not index:
|
|
518
749
|
raise ValueError("No index specified")
|
|
519
750
|
|
|
520
|
-
return await self.indexing.delete_collection(collection_id, index
|
|
751
|
+
return await self.indexing.delete_collection(collection_id, index)
|
|
521
752
|
|
|
522
753
|
async def count(
|
|
523
754
|
self,
|
|
524
755
|
*,
|
|
525
756
|
index_name: str | None = None,
|
|
526
|
-
account_id: str | None = None,
|
|
527
757
|
collection_id: str | None = None,
|
|
758
|
+
source_id: str | None = None,
|
|
528
759
|
) -> int:
|
|
529
760
|
"""Count documents.
|
|
530
761
|
|
|
762
|
+
Note:
|
|
763
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
764
|
+
by using separate indices per account.
|
|
765
|
+
|
|
531
766
|
Args:
|
|
532
|
-
index_name: Index to count.
|
|
533
|
-
account_id: Filter by account.
|
|
767
|
+
index_name: Index to count (use tenant-specific name for isolation).
|
|
534
768
|
collection_id: Filter by collection.
|
|
769
|
+
source_id: Filter by source (for source deletion confirmation).
|
|
535
770
|
|
|
536
771
|
Returns:
|
|
537
772
|
Document count.
|
|
538
773
|
"""
|
|
539
774
|
return await self.search_service.count(
|
|
540
775
|
index_name=index_name,
|
|
541
|
-
account_id=account_id,
|
|
542
776
|
collection_id=collection_id,
|
|
777
|
+
source_id=source_id,
|
|
543
778
|
)
|
|
544
779
|
|
|
780
|
+
# === Collection and Stats Methods ===
|
|
781
|
+
|
|
782
|
+
async def get_collections(
|
|
783
|
+
self,
|
|
784
|
+
*,
|
|
785
|
+
index_name: str | None = None,
|
|
786
|
+
) -> list[dict[str, Any]]:
|
|
787
|
+
"""Get all collections with document counts.
|
|
788
|
+
|
|
789
|
+
Aggregates unique collection_ids from indexed documents.
|
|
790
|
+
|
|
791
|
+
Note:
|
|
792
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
793
|
+
by using separate indices per account.
|
|
794
|
+
|
|
795
|
+
Args:
|
|
796
|
+
index_name: Index to query (use tenant-specific name for isolation).
|
|
797
|
+
Uses default index if not provided.
|
|
798
|
+
|
|
799
|
+
Returns:
|
|
800
|
+
List of collection dictionaries with id, name, and document_count.
|
|
801
|
+
"""
|
|
802
|
+
index = index_name or self._default_index
|
|
803
|
+
return await self.search_service.get_collections(index_name=index)
|
|
804
|
+
|
|
805
|
+
async def get_stats(
|
|
806
|
+
self,
|
|
807
|
+
*,
|
|
808
|
+
index_name: str | None = None,
|
|
809
|
+
) -> dict[str, Any]:
|
|
810
|
+
"""Get index statistics.
|
|
811
|
+
|
|
812
|
+
Note:
|
|
813
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
814
|
+
by using separate indices per account.
|
|
815
|
+
|
|
816
|
+
Args:
|
|
817
|
+
index_name: Index to query (use tenant-specific name for isolation).
|
|
818
|
+
Uses default index if not provided.
|
|
819
|
+
|
|
820
|
+
Returns:
|
|
821
|
+
Dictionary with document_count, index_name, and other stats.
|
|
822
|
+
"""
|
|
823
|
+
index = index_name or self._default_index
|
|
824
|
+
return await self.search_service.get_stats(index_name=index)
|
|
825
|
+
|
|
826
|
+
async def list_documents(
|
|
827
|
+
self,
|
|
828
|
+
*,
|
|
829
|
+
index_name: str | None = None,
|
|
830
|
+
source_id: str | None = None,
|
|
831
|
+
collection_id: str | None = None,
|
|
832
|
+
limit: int = 50,
|
|
833
|
+
offset: int = 0,
|
|
834
|
+
) -> dict[str, Any]:
|
|
835
|
+
"""List documents with optional filters.
|
|
836
|
+
|
|
837
|
+
Note:
|
|
838
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
839
|
+
by using separate indices per account.
|
|
840
|
+
|
|
841
|
+
Args:
|
|
842
|
+
index_name: Index to query (use tenant-specific name for isolation).
|
|
843
|
+
Uses default index if not provided.
|
|
844
|
+
source_id: Optional source ID filter.
|
|
845
|
+
collection_id: Optional collection ID filter.
|
|
846
|
+
limit: Maximum documents to return (max 100).
|
|
847
|
+
offset: Number of documents to skip.
|
|
848
|
+
|
|
849
|
+
Returns:
|
|
850
|
+
Dictionary with documents, total, limit, offset.
|
|
851
|
+
"""
|
|
852
|
+
index = index_name or self._default_index
|
|
853
|
+
if not index:
|
|
854
|
+
raise ValueError("No index specified and no default index configured")
|
|
855
|
+
|
|
856
|
+
# Clamp limit to reasonable bounds
|
|
857
|
+
limit = min(max(1, limit), 100)
|
|
858
|
+
offset = max(0, offset)
|
|
859
|
+
|
|
860
|
+
return await self._searcher.list_documents(
|
|
861
|
+
index_name=index,
|
|
862
|
+
source_id=source_id,
|
|
863
|
+
collection_id=collection_id,
|
|
864
|
+
limit=limit,
|
|
865
|
+
offset=offset,
|
|
866
|
+
)
|
|
867
|
+
|
|
868
|
+
# === Agentic Search Status ===
|
|
869
|
+
|
|
870
|
+
@property
|
|
871
|
+
def is_agentic_configured(self) -> bool:
|
|
872
|
+
"""Check if agentic search is configured.
|
|
873
|
+
|
|
874
|
+
Returns:
|
|
875
|
+
True if at least one agent type is configured.
|
|
876
|
+
"""
|
|
877
|
+
if not hasattr(self, '_searcher') or not hasattr(self._searcher, '_config'):
|
|
878
|
+
return False
|
|
879
|
+
config = self._searcher._config
|
|
880
|
+
return bool(config.flow_agent_id or config.conversational_agent_id)
|
|
881
|
+
|
|
882
|
+
async def get_agentic_status(self) -> dict[str, Any]:
|
|
883
|
+
"""Get status of agentic search configuration.
|
|
884
|
+
|
|
885
|
+
Returns:
|
|
886
|
+
Dictionary with agent availability status:
|
|
887
|
+
- available: True if any agent is configured
|
|
888
|
+
- flow_agent: True if flow agent is configured
|
|
889
|
+
- conversational_agent: True if conversational agent is configured
|
|
890
|
+
"""
|
|
891
|
+
if not hasattr(self, '_searcher') or not hasattr(self._searcher, '_config'):
|
|
892
|
+
return {
|
|
893
|
+
"available": False,
|
|
894
|
+
"flow_agent": False,
|
|
895
|
+
"conversational_agent": False,
|
|
896
|
+
}
|
|
897
|
+
|
|
898
|
+
config = self._searcher._config
|
|
899
|
+
return {
|
|
900
|
+
"available": bool(config.flow_agent_id or config.conversational_agent_id),
|
|
901
|
+
"flow_agent": bool(config.flow_agent_id),
|
|
902
|
+
"conversational_agent": bool(config.conversational_agent_id),
|
|
903
|
+
}
|
|
904
|
+
|
|
905
|
+
async def agentic_search(
|
|
906
|
+
self,
|
|
907
|
+
query: str,
|
|
908
|
+
*,
|
|
909
|
+
agent_type: AgentType = AgentType.FLOW,
|
|
910
|
+
index_name: str | None = None,
|
|
911
|
+
collection_ids: list[str] | None = None,
|
|
912
|
+
source_ids: list[str] | None = None,
|
|
913
|
+
conversation_id: str | None = None,
|
|
914
|
+
include_reasoning: bool = True,
|
|
915
|
+
limit: int = 10,
|
|
916
|
+
**options: Any,
|
|
917
|
+
) -> AgenticSearchResult:
|
|
918
|
+
"""Execute agentic search with AI-powered reasoning.
|
|
919
|
+
|
|
920
|
+
Uses OpenSearch ML agents to understand queries, retrieve relevant
|
|
921
|
+
documents, and generate natural language answers.
|
|
922
|
+
|
|
923
|
+
Args:
|
|
924
|
+
query: Search query text.
|
|
925
|
+
agent_type: Type of agent (FLOW for fast RAG, CONVERSATIONAL for multi-turn).
|
|
926
|
+
index_name: Index to search (uses default if not provided).
|
|
927
|
+
collection_ids: Filter by collection IDs.
|
|
928
|
+
source_ids: Filter by source IDs.
|
|
929
|
+
conversation_id: Conversation ID for multi-turn (conversational agent).
|
|
930
|
+
include_reasoning: Include reasoning steps in response.
|
|
931
|
+
limit: Maximum source documents to retrieve.
|
|
932
|
+
**options: Additional agent options.
|
|
933
|
+
|
|
934
|
+
Returns:
|
|
935
|
+
AgenticSearchResult with answer, reasoning steps, and sources.
|
|
936
|
+
|
|
937
|
+
Raises:
|
|
938
|
+
AgenticSearchError: If agent execution fails.
|
|
939
|
+
ValueError: If agentic search is not configured.
|
|
940
|
+
|
|
941
|
+
Example:
|
|
942
|
+
```python
|
|
943
|
+
result = await knowledge.agentic_search(
|
|
944
|
+
"How does authentication work?",
|
|
945
|
+
agent_type=AgentType.FLOW,
|
|
946
|
+
)
|
|
947
|
+
print(result.answer)
|
|
948
|
+
for source in result.items:
|
|
949
|
+
print(f"- {source.title}")
|
|
950
|
+
```
|
|
951
|
+
"""
|
|
952
|
+
# Check if agentic search is configured
|
|
953
|
+
if not self.is_agentic_configured:
|
|
954
|
+
raise ValueError(
|
|
955
|
+
"Agentic search is not configured. "
|
|
956
|
+
"Run 'gnosisllm-knowledge agentic setup' and set agent IDs in environment."
|
|
957
|
+
)
|
|
958
|
+
|
|
959
|
+
# Get client and config from the searcher
|
|
960
|
+
if not hasattr(self._searcher, '_client') or not hasattr(self._searcher, '_config'):
|
|
961
|
+
raise ValueError("Searcher does not have OpenSearch client/config")
|
|
962
|
+
|
|
963
|
+
client = self._searcher._client
|
|
964
|
+
config = self._searcher._config
|
|
965
|
+
|
|
966
|
+
# Create agentic searcher
|
|
967
|
+
agentic_searcher = OpenSearchAgenticSearcher(client, config)
|
|
968
|
+
|
|
969
|
+
# Build agentic query
|
|
970
|
+
agentic_query = AgenticSearchQuery(
|
|
971
|
+
text=query,
|
|
972
|
+
agent_type=agent_type,
|
|
973
|
+
collection_ids=collection_ids,
|
|
974
|
+
source_ids=source_ids,
|
|
975
|
+
conversation_id=conversation_id,
|
|
976
|
+
include_reasoning=include_reasoning,
|
|
977
|
+
limit=limit,
|
|
978
|
+
)
|
|
979
|
+
|
|
980
|
+
# Determine index name
|
|
981
|
+
index = index_name or self._default_index
|
|
982
|
+
if not index:
|
|
983
|
+
raise ValueError("No index specified and no default index configured")
|
|
984
|
+
|
|
985
|
+
# Execute agentic search
|
|
986
|
+
return await agentic_searcher.agentic_search(agentic_query, index, **options)
|
|
987
|
+
|
|
545
988
|
async def close(self) -> None:
|
|
546
|
-
"""Close connections and clean up resources.
|
|
547
|
-
|
|
548
|
-
|
|
989
|
+
"""Close connections and clean up resources.
|
|
990
|
+
|
|
991
|
+
Closes the underlying AsyncOpenSearch client to prevent
|
|
992
|
+
unclosed aiohttp session warnings. Properly handles
|
|
993
|
+
CancelledError during event loop shutdown.
|
|
994
|
+
"""
|
|
995
|
+
import asyncio
|
|
996
|
+
|
|
997
|
+
# Close the OpenSearch client via the searcher
|
|
998
|
+
# Note: indexer, searcher, and setup share the same client instance,
|
|
999
|
+
# so closing via searcher is sufficient
|
|
1000
|
+
if hasattr(self._searcher, '_client') and self._searcher._client is not None:
|
|
1001
|
+
client = self._searcher._client
|
|
1002
|
+
try:
|
|
1003
|
+
await client.close()
|
|
1004
|
+
logger.debug("Closed OpenSearch client connection")
|
|
1005
|
+
except asyncio.CancelledError:
|
|
1006
|
+
# Event loop is shutting down - this is expected during cleanup
|
|
1007
|
+
logger.debug("OpenSearch client close cancelled (event loop shutting down)")
|
|
1008
|
+
except Exception as e:
|
|
1009
|
+
logger.warning(f"Error closing OpenSearch client: {e}")
|
|
1010
|
+
finally:
|
|
1011
|
+
# Clear client reference on all components that share it
|
|
1012
|
+
# This prevents any accidental reuse after close
|
|
1013
|
+
if hasattr(self._searcher, '_client'):
|
|
1014
|
+
self._searcher._client = None
|
|
1015
|
+
if hasattr(self._indexer, '_client'):
|
|
1016
|
+
self._indexer._client = None
|
|
1017
|
+
if self._setup and hasattr(self._setup, '_client'):
|
|
1018
|
+
self._setup._client = None
|