gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +502 -32
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  7. gnosisllm_knowledge/backends/memory/searcher.py +111 -10
  8. gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
  9. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  10. gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
  11. gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
  12. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  14. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  15. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  16. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  17. gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
  18. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  19. gnosisllm_knowledge/cli/app.py +436 -31
  20. gnosisllm_knowledge/cli/commands/agentic.py +26 -9
  21. gnosisllm_knowledge/cli/commands/load.py +169 -19
  22. gnosisllm_knowledge/cli/commands/memory.py +733 -0
  23. gnosisllm_knowledge/cli/commands/search.py +9 -10
  24. gnosisllm_knowledge/cli/commands/setup.py +49 -23
  25. gnosisllm_knowledge/cli/display/service.py +43 -0
  26. gnosisllm_knowledge/cli/utils/config.py +62 -4
  27. gnosisllm_knowledge/core/domain/__init__.py +54 -0
  28. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  29. gnosisllm_knowledge/core/domain/document.py +19 -19
  30. gnosisllm_knowledge/core/domain/memory.py +440 -0
  31. gnosisllm_knowledge/core/domain/result.py +11 -3
  32. gnosisllm_knowledge/core/domain/search.py +12 -25
  33. gnosisllm_knowledge/core/domain/source.py +11 -12
  34. gnosisllm_knowledge/core/events/__init__.py +8 -0
  35. gnosisllm_knowledge/core/events/types.py +198 -5
  36. gnosisllm_knowledge/core/exceptions.py +227 -0
  37. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  38. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  39. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  40. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  41. gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
  42. gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
  43. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  44. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  45. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  46. gnosisllm_knowledge/fetchers/config.py +27 -0
  47. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  48. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  49. gnosisllm_knowledge/loaders/__init__.py +5 -1
  50. gnosisllm_knowledge/loaders/base.py +3 -4
  51. gnosisllm_knowledge/loaders/discovery.py +338 -0
  52. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  53. gnosisllm_knowledge/loaders/factory.py +46 -0
  54. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  55. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  56. gnosisllm_knowledge/services/indexing.py +100 -93
  57. gnosisllm_knowledge/services/search.py +84 -31
  58. gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
  59. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
  60. gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  62. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
  63. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -1,8 +1,44 @@
1
- """High-level Knowledge API facade."""
1
+ """High-level Knowledge API facade.
2
+
3
+ This module provides the main entry point for the gnosisllm-knowledge library.
4
+ The Knowledge class is a high-level facade that abstracts the complexity of
5
+ loading, indexing, and searching knowledge documents.
6
+
7
+ Note:
8
+ This library is tenant-agnostic. Multi-tenancy should be handled at the
9
+ API layer by using separate indices per account (e.g.,
10
+ `knowledge-{account_id}`) rather than filtering by account_id.
11
+
12
+ Example:
13
+ ```python
14
+ # Create Knowledge instance for a specific tenant
15
+ knowledge = Knowledge.from_opensearch(
16
+ host="localhost",
17
+ port=9200,
18
+ )
19
+
20
+ # Use a tenant-specific index
21
+ tenant_index = f"knowledge-{account_id}"
22
+
23
+ # Load content
24
+ await knowledge.load(
25
+ "https://docs.example.com/sitemap.xml",
26
+ index_name=tenant_index,
27
+ collection_id="docs",
28
+ )
29
+
30
+ # Search (tenant isolation via index name)
31
+ results = await knowledge.search(
32
+ "how to configure",
33
+ index_name=tenant_index,
34
+ )
35
+ ```
36
+ """
2
37
 
3
38
  from __future__ import annotations
4
39
 
5
40
  import logging
41
+ from collections.abc import Callable
6
42
  from typing import TYPE_CHECKING, Any
7
43
 
8
44
  from gnosisllm_knowledge.backends.opensearch import (
@@ -11,15 +47,24 @@ from gnosisllm_knowledge.backends.opensearch import (
11
47
  OpenSearchKnowledgeSearcher,
12
48
  OpenSearchSetupAdapter,
13
49
  )
50
+ from gnosisllm_knowledge.backends.opensearch.agentic import OpenSearchAgenticSearcher
14
51
  from gnosisllm_knowledge.chunking import SentenceChunker
15
52
  from gnosisllm_knowledge.core.domain.result import IndexResult
16
- from gnosisllm_knowledge.core.domain.search import SearchMode, SearchResult
53
+ from gnosisllm_knowledge.core.domain.search import (
54
+ AgentType,
55
+ AgenticSearchQuery,
56
+ AgenticSearchResult,
57
+ SearchMode,
58
+ SearchResult,
59
+ )
17
60
  from gnosisllm_knowledge.core.events.emitter import EventEmitter
18
61
  from gnosisllm_knowledge.core.interfaces.setup import DiagnosticReport, HealthReport
62
+ from gnosisllm_knowledge.core.streaming.pipeline import PipelineConfig
19
63
  from gnosisllm_knowledge.fetchers import NeoreaderContentFetcher
20
64
  from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
21
65
  from gnosisllm_knowledge.loaders import LoaderFactory
22
66
  from gnosisllm_knowledge.services import KnowledgeIndexingService, KnowledgeSearchService
67
+ from gnosisllm_knowledge.services.streaming_pipeline import StreamingIndexingPipeline
23
68
 
24
69
  if TYPE_CHECKING:
25
70
  from opensearchpy import AsyncOpenSearch
@@ -120,6 +165,10 @@ class Knowledge:
120
165
  ) -> Knowledge:
121
166
  """Create Knowledge instance with OpenSearch backend.
122
167
 
168
+ This factory creates a Knowledge instance configured for OpenSearch.
169
+ The returned instance is tenant-agnostic - multi-tenancy should be
170
+ handled by using separate indices per account.
171
+
123
172
  Args:
124
173
  host: OpenSearch host.
125
174
  port: OpenSearch port.
@@ -137,6 +186,19 @@ class Knowledge:
137
186
  Note:
138
187
  Embeddings are generated automatically by OpenSearch ingest pipeline.
139
188
  Run 'gnosisllm-knowledge setup' to configure the ML model.
189
+
190
+ Example:
191
+ ```python
192
+ # Create a Knowledge instance
193
+ knowledge = Knowledge.from_opensearch(
194
+ host="localhost",
195
+ port=9200,
196
+ )
197
+
198
+ # Use tenant-specific index for isolation
199
+ tenant_index = f"gnosisllm-{account_id}-knowledge"
200
+ await knowledge.load(source, index_name=tenant_index)
201
+ ```
140
202
  """
141
203
  # Import OpenSearch client
142
204
  try:
@@ -159,11 +221,12 @@ class Knowledge:
159
221
  **kwargs,
160
222
  )
161
223
 
162
- # Create client
224
+ # Create client with proper timeout settings
163
225
  client_kwargs: dict[str, Any] = {
164
226
  "hosts": [{"host": config.host, "port": config.port}],
165
227
  "use_ssl": config.use_ssl,
166
228
  "verify_certs": config.verify_certs,
229
+ "timeout": max(config.read_timeout, config.agentic_timeout_seconds),
167
230
  }
168
231
 
169
232
  if config.username and config.password:
@@ -181,11 +244,16 @@ class Knowledge:
181
244
  # Create fetcher
182
245
  fetcher = None
183
246
  if neoreader_url:
184
- neoreader_config = NeoreaderConfig(base_url=neoreader_url)
247
+ neoreader_config = NeoreaderConfig(host=neoreader_url)
185
248
  fetcher = NeoreaderContentFetcher(neoreader_config)
186
249
 
187
- # Create loader factory
188
- loader_factory = LoaderFactory(default_fetcher=fetcher)
250
+ # Create chunker
251
+ chunker = SentenceChunker()
252
+
253
+ # Create loader factory (fetcher is optional, defaults will be used if None)
254
+ loader_factory = None
255
+ if fetcher:
256
+ loader_factory = LoaderFactory(fetcher=fetcher, chunker=chunker)
189
257
 
190
258
  return cls(
191
259
  indexer=indexer,
@@ -200,15 +268,29 @@ class Knowledge:
200
268
  def from_env(cls) -> Knowledge:
201
269
  """Create Knowledge instance from environment variables.
202
270
 
271
+ This factory creates a Knowledge instance using configuration from
272
+ environment variables. The returned instance is tenant-agnostic -
273
+ multi-tenancy should be handled by using separate indices per account.
274
+
203
275
  Returns:
204
276
  Configured Knowledge instance.
277
+
278
+ Example:
279
+ ```python
280
+ # Create from environment
281
+ knowledge = Knowledge.from_env()
282
+
283
+ # Use tenant-specific index for isolation
284
+ tenant_index = f"gnosisllm-{account_id}-knowledge"
285
+ await knowledge.search("query", index_name=tenant_index)
286
+ ```
205
287
  """
206
288
  config = OpenSearchConfig.from_env()
207
289
  neoreader_config = NeoreaderConfig.from_env()
208
290
 
209
291
  return cls.from_opensearch(
210
292
  config=config,
211
- neoreader_url=neoreader_config.base_url if neoreader_config.base_url else None,
293
+ neoreader_url=neoreader_config.host if neoreader_config.host else None,
212
294
  )
213
295
 
214
296
  @property
@@ -302,7 +384,6 @@ class Knowledge:
302
384
  source: str,
303
385
  *,
304
386
  index_name: str | None = None,
305
- account_id: str | None = None,
306
387
  collection_id: str | None = None,
307
388
  source_id: str | None = None,
308
389
  source_type: str | None = None,
@@ -313,10 +394,13 @@ class Knowledge:
313
394
 
314
395
  Automatically detects source type (sitemap, website, etc.).
315
396
 
397
+ Note:
398
+ This method is tenant-agnostic. Multi-tenancy should be handled
399
+ by using separate indices per account.
400
+
316
401
  Args:
317
402
  source: Source URL or path.
318
- index_name: Target index (uses default if not provided).
319
- account_id: Account ID for multi-tenancy.
403
+ index_name: Target index (use tenant-specific name for isolation).
320
404
  collection_id: Collection ID.
321
405
  source_id: Source ID (auto-generated if not provided).
322
406
  source_type: Explicit source type (auto-detected if not provided).
@@ -335,9 +419,9 @@ class Knowledge:
335
419
 
336
420
  # Auto-detect or use explicit source type
337
421
  if source_type:
338
- loader = self._loader_factory.create(source_type, self._fetcher)
422
+ loader = self._loader_factory.create(source_type)
339
423
  else:
340
- loader = self._loader_factory.create_for_source(source, self._fetcher)
424
+ loader = self._loader_factory.create_for_source(source)
341
425
 
342
426
  # Create service for this load operation
343
427
  service = KnowledgeIndexingService(
@@ -350,12 +434,102 @@ class Knowledge:
350
434
  return await service.load_and_index(
351
435
  source=source,
352
436
  index_name=index,
353
- account_id=account_id,
354
437
  collection_id=collection_id,
355
438
  source_id=source_id,
356
439
  **options,
357
440
  )
358
441
 
442
+ async def load_streaming(
443
+ self,
444
+ source: str,
445
+ *,
446
+ index_name: str | None = None,
447
+ collection_id: str | None = None,
448
+ collection_name: str | None = None,
449
+ source_id: str | None = None,
450
+ url_batch_size: int = 50,
451
+ fetch_concurrency: int = 10,
452
+ index_batch_size: int = 100,
453
+ on_progress: Callable[[int, int], None] | None = None,
454
+ **options: Any,
455
+ ) -> IndexResult:
456
+ """Load and index content using streaming pipeline with bounded memory.
457
+
458
+ This method is optimized for large sitemaps (10,000+ URLs) that would
459
+ otherwise exhaust memory. It processes URLs in batches, indexing
460
+ documents immediately rather than loading all content first.
461
+
462
+ Memory usage is bounded and independent of sitemap size:
463
+ - URL storage: O(url_batch_size)
464
+ - Document storage: O(index_batch_size)
465
+ - In-flight fetches: O(fetch_concurrency * avg_page_size)
466
+
467
+ Note:
468
+ This method is tenant-agnostic. Multi-tenancy should be handled
469
+ by using separate indices per account.
470
+
471
+ Args:
472
+ source: Sitemap URL.
473
+ index_name: Target index (use tenant-specific name for isolation).
474
+ collection_id: Collection ID.
475
+ collection_name: Collection name for display.
476
+ source_id: Source ID (auto-generated if not provided).
477
+ url_batch_size: URLs to discover per batch (default 50).
478
+ fetch_concurrency: Parallel URL fetches (default 10).
479
+ index_batch_size: Documents per index batch (default 100).
480
+ on_progress: Optional progress callback (urls_processed, docs_indexed).
481
+ **options: Additional loading options (max_urls, patterns, etc.).
482
+
483
+ Returns:
484
+ Index result with counts.
485
+
486
+ Example:
487
+ ```python
488
+ # Efficiently load 100k+ URL sitemap
489
+ result = await knowledge.load_streaming(
490
+ "https://large-site.com/sitemap.xml",
491
+ index_name="knowledge-account123", # Tenant-specific
492
+ url_batch_size=100,
493
+ fetch_concurrency=20,
494
+ max_urls=50000,
495
+ )
496
+ print(f"Indexed {result.indexed_count} documents")
497
+ ```
498
+ """
499
+ if self._loader_factory is None:
500
+ raise ValueError("Loader factory not configured")
501
+
502
+ index = index_name or self._default_index
503
+ if not index:
504
+ raise ValueError("No index specified and no default index configured")
505
+
506
+ # Create sitemap loader specifically for streaming
507
+ loader = self._loader_factory.create("sitemap")
508
+
509
+ # Configure pipeline
510
+ config = PipelineConfig(
511
+ url_batch_size=url_batch_size,
512
+ fetch_concurrency=fetch_concurrency,
513
+ index_batch_size=index_batch_size,
514
+ )
515
+
516
+ # Create streaming pipeline
517
+ pipeline = StreamingIndexingPipeline(
518
+ loader=loader,
519
+ indexer=self._indexer,
520
+ config=config,
521
+ events=self._events,
522
+ )
523
+
524
+ return await pipeline.execute(
525
+ source=source,
526
+ index_name=index,
527
+ collection_id=collection_id,
528
+ collection_name=collection_name,
529
+ source_id=source_id,
530
+ **options,
531
+ )
532
+
359
533
  # === Search Methods ===
360
534
 
361
535
  async def search(
@@ -366,7 +540,6 @@ class Knowledge:
366
540
  mode: SearchMode = SearchMode.HYBRID,
367
541
  limit: int = 10,
368
542
  offset: int = 0,
369
- account_id: str | None = None,
370
543
  collection_ids: list[str] | None = None,
371
544
  source_ids: list[str] | None = None,
372
545
  min_score: float | None = None,
@@ -374,13 +547,16 @@ class Knowledge:
374
547
  ) -> SearchResult:
375
548
  """Search for knowledge documents.
376
549
 
550
+ Note:
551
+ This method is tenant-agnostic. Multi-tenancy should be handled
552
+ by using separate indices per account.
553
+
377
554
  Args:
378
555
  query: Search query text.
379
- index_name: Index to search (uses default if not provided).
556
+ index_name: Index to search (use tenant-specific name for isolation).
380
557
  mode: Search mode (semantic, keyword, hybrid).
381
558
  limit: Maximum results.
382
559
  offset: Result offset for pagination.
383
- account_id: Account ID for multi-tenancy.
384
560
  collection_ids: Filter by collection IDs.
385
561
  source_ids: Filter by source IDs.
386
562
  min_score: Minimum score threshold.
@@ -395,7 +571,6 @@ class Knowledge:
395
571
  mode=mode,
396
572
  limit=limit,
397
573
  offset=offset,
398
- account_id=account_id,
399
574
  collection_ids=collection_ids,
400
575
  source_ids=source_ids,
401
576
  min_score=min_score,
@@ -473,19 +648,73 @@ class Knowledge:
473
648
 
474
649
  # === Management Methods ===
475
650
 
651
+ async def get_document(
652
+ self,
653
+ document_id: str,
654
+ *,
655
+ index_name: str | None = None,
656
+ ) -> dict[str, Any] | None:
657
+ """Get a single document by ID.
658
+
659
+ Note:
660
+ This method is tenant-agnostic. Multi-tenancy should be handled
661
+ by using separate indices per account.
662
+
663
+ Args:
664
+ document_id: Document ID to retrieve.
665
+ index_name: Index name (use tenant-specific name for isolation).
666
+ Uses default index if not provided.
667
+
668
+ Returns:
669
+ Document dict with all fields (excluding embeddings) or None if not found.
670
+ """
671
+ index = index_name or self._default_index
672
+ if not index:
673
+ raise ValueError("No index specified and no default index configured")
674
+
675
+ return await self._indexer.get(document_id, index)
676
+
677
+ async def delete_document(
678
+ self,
679
+ document_id: str,
680
+ *,
681
+ index_name: str | None = None,
682
+ ) -> bool:
683
+ """Delete a single document by ID.
684
+
685
+ Note:
686
+ This method is tenant-agnostic. Multi-tenancy should be handled
687
+ by using separate indices per account.
688
+
689
+ Args:
690
+ document_id: Document ID to delete.
691
+ index_name: Index name (use tenant-specific name for isolation).
692
+ Uses default index if not provided.
693
+
694
+ Returns:
695
+ True if deleted, False if not found.
696
+ """
697
+ index = index_name or self._default_index
698
+ if not index:
699
+ raise ValueError("No index specified and no default index configured")
700
+
701
+ return await self._indexer.delete(document_id, index)
702
+
476
703
  async def delete_source(
477
704
  self,
478
705
  source_id: str,
479
706
  *,
480
707
  index_name: str | None = None,
481
- account_id: str | None = None,
482
708
  ) -> int:
483
709
  """Delete all documents from a source.
484
710
 
711
+ Note:
712
+ This method is tenant-agnostic. Multi-tenancy should be handled
713
+ by using separate indices per account.
714
+
485
715
  Args:
486
716
  source_id: Source ID to delete.
487
- index_name: Index name.
488
- account_id: Account ID for multi-tenancy.
717
+ index_name: Index name (use tenant-specific name for isolation).
489
718
 
490
719
  Returns:
491
720
  Count of deleted documents.
@@ -494,21 +723,23 @@ class Knowledge:
494
723
  if not index:
495
724
  raise ValueError("No index specified")
496
725
 
497
- return await self.indexing.delete_source(source_id, index, account_id)
726
+ return await self.indexing.delete_source(source_id, index)
498
727
 
499
728
  async def delete_collection(
500
729
  self,
501
730
  collection_id: str,
502
731
  *,
503
732
  index_name: str | None = None,
504
- account_id: str | None = None,
505
733
  ) -> int:
506
734
  """Delete all documents from a collection.
507
735
 
736
+ Note:
737
+ This method is tenant-agnostic. Multi-tenancy should be handled
738
+ by using separate indices per account.
739
+
508
740
  Args:
509
741
  collection_id: Collection ID to delete.
510
- index_name: Index name.
511
- account_id: Account ID for multi-tenancy.
742
+ index_name: Index name (use tenant-specific name for isolation).
512
743
 
513
744
  Returns:
514
745
  Count of deleted documents.
@@ -517,32 +748,271 @@ class Knowledge:
517
748
  if not index:
518
749
  raise ValueError("No index specified")
519
750
 
520
- return await self.indexing.delete_collection(collection_id, index, account_id)
751
+ return await self.indexing.delete_collection(collection_id, index)
521
752
 
522
753
  async def count(
523
754
  self,
524
755
  *,
525
756
  index_name: str | None = None,
526
- account_id: str | None = None,
527
757
  collection_id: str | None = None,
758
+ source_id: str | None = None,
528
759
  ) -> int:
529
760
  """Count documents.
530
761
 
762
+ Note:
763
+ This method is tenant-agnostic. Multi-tenancy should be handled
764
+ by using separate indices per account.
765
+
531
766
  Args:
532
- index_name: Index to count.
533
- account_id: Filter by account.
767
+ index_name: Index to count (use tenant-specific name for isolation).
534
768
  collection_id: Filter by collection.
769
+ source_id: Filter by source (for source deletion confirmation).
535
770
 
536
771
  Returns:
537
772
  Document count.
538
773
  """
539
774
  return await self.search_service.count(
540
775
  index_name=index_name,
541
- account_id=account_id,
542
776
  collection_id=collection_id,
777
+ source_id=source_id,
543
778
  )
544
779
 
780
+ # === Collection and Stats Methods ===
781
+
782
+ async def get_collections(
783
+ self,
784
+ *,
785
+ index_name: str | None = None,
786
+ ) -> list[dict[str, Any]]:
787
+ """Get all collections with document counts.
788
+
789
+ Aggregates unique collection_ids from indexed documents.
790
+
791
+ Note:
792
+ This method is tenant-agnostic. Multi-tenancy should be handled
793
+ by using separate indices per account.
794
+
795
+ Args:
796
+ index_name: Index to query (use tenant-specific name for isolation).
797
+ Uses default index if not provided.
798
+
799
+ Returns:
800
+ List of collection dictionaries with id, name, and document_count.
801
+ """
802
+ index = index_name or self._default_index
803
+ return await self.search_service.get_collections(index_name=index)
804
+
805
+ async def get_stats(
806
+ self,
807
+ *,
808
+ index_name: str | None = None,
809
+ ) -> dict[str, Any]:
810
+ """Get index statistics.
811
+
812
+ Note:
813
+ This method is tenant-agnostic. Multi-tenancy should be handled
814
+ by using separate indices per account.
815
+
816
+ Args:
817
+ index_name: Index to query (use tenant-specific name for isolation).
818
+ Uses default index if not provided.
819
+
820
+ Returns:
821
+ Dictionary with document_count, index_name, and other stats.
822
+ """
823
+ index = index_name or self._default_index
824
+ return await self.search_service.get_stats(index_name=index)
825
+
826
+ async def list_documents(
827
+ self,
828
+ *,
829
+ index_name: str | None = None,
830
+ source_id: str | None = None,
831
+ collection_id: str | None = None,
832
+ limit: int = 50,
833
+ offset: int = 0,
834
+ ) -> dict[str, Any]:
835
+ """List documents with optional filters.
836
+
837
+ Note:
838
+ This method is tenant-agnostic. Multi-tenancy should be handled
839
+ by using separate indices per account.
840
+
841
+ Args:
842
+ index_name: Index to query (use tenant-specific name for isolation).
843
+ Uses default index if not provided.
844
+ source_id: Optional source ID filter.
845
+ collection_id: Optional collection ID filter.
846
+ limit: Maximum documents to return (max 100).
847
+ offset: Number of documents to skip.
848
+
849
+ Returns:
850
+ Dictionary with documents, total, limit, offset.
851
+ """
852
+ index = index_name or self._default_index
853
+ if not index:
854
+ raise ValueError("No index specified and no default index configured")
855
+
856
+ # Clamp limit to reasonable bounds
857
+ limit = min(max(1, limit), 100)
858
+ offset = max(0, offset)
859
+
860
+ return await self._searcher.list_documents(
861
+ index_name=index,
862
+ source_id=source_id,
863
+ collection_id=collection_id,
864
+ limit=limit,
865
+ offset=offset,
866
+ )
867
+
868
+ # === Agentic Search Status ===
869
+
870
+ @property
871
+ def is_agentic_configured(self) -> bool:
872
+ """Check if agentic search is configured.
873
+
874
+ Returns:
875
+ True if at least one agent type is configured.
876
+ """
877
+ if not hasattr(self, '_searcher') or not hasattr(self._searcher, '_config'):
878
+ return False
879
+ config = self._searcher._config
880
+ return bool(config.flow_agent_id or config.conversational_agent_id)
881
+
882
+ async def get_agentic_status(self) -> dict[str, Any]:
883
+ """Get status of agentic search configuration.
884
+
885
+ Returns:
886
+ Dictionary with agent availability status:
887
+ - available: True if any agent is configured
888
+ - flow_agent: True if flow agent is configured
889
+ - conversational_agent: True if conversational agent is configured
890
+ """
891
+ if not hasattr(self, '_searcher') or not hasattr(self._searcher, '_config'):
892
+ return {
893
+ "available": False,
894
+ "flow_agent": False,
895
+ "conversational_agent": False,
896
+ }
897
+
898
+ config = self._searcher._config
899
+ return {
900
+ "available": bool(config.flow_agent_id or config.conversational_agent_id),
901
+ "flow_agent": bool(config.flow_agent_id),
902
+ "conversational_agent": bool(config.conversational_agent_id),
903
+ }
904
+
905
+ async def agentic_search(
906
+ self,
907
+ query: str,
908
+ *,
909
+ agent_type: AgentType = AgentType.FLOW,
910
+ index_name: str | None = None,
911
+ collection_ids: list[str] | None = None,
912
+ source_ids: list[str] | None = None,
913
+ conversation_id: str | None = None,
914
+ include_reasoning: bool = True,
915
+ limit: int = 10,
916
+ **options: Any,
917
+ ) -> AgenticSearchResult:
918
+ """Execute agentic search with AI-powered reasoning.
919
+
920
+ Uses OpenSearch ML agents to understand queries, retrieve relevant
921
+ documents, and generate natural language answers.
922
+
923
+ Args:
924
+ query: Search query text.
925
+ agent_type: Type of agent (FLOW for fast RAG, CONVERSATIONAL for multi-turn).
926
+ index_name: Index to search (uses default if not provided).
927
+ collection_ids: Filter by collection IDs.
928
+ source_ids: Filter by source IDs.
929
+ conversation_id: Conversation ID for multi-turn (conversational agent).
930
+ include_reasoning: Include reasoning steps in response.
931
+ limit: Maximum source documents to retrieve.
932
+ **options: Additional agent options.
933
+
934
+ Returns:
935
+ AgenticSearchResult with answer, reasoning steps, and sources.
936
+
937
+ Raises:
938
+ AgenticSearchError: If agent execution fails.
939
+ ValueError: If agentic search is not configured.
940
+
941
+ Example:
942
+ ```python
943
+ result = await knowledge.agentic_search(
944
+ "How does authentication work?",
945
+ agent_type=AgentType.FLOW,
946
+ )
947
+ print(result.answer)
948
+ for source in result.items:
949
+ print(f"- {source.title}")
950
+ ```
951
+ """
952
+ # Check if agentic search is configured
953
+ if not self.is_agentic_configured:
954
+ raise ValueError(
955
+ "Agentic search is not configured. "
956
+ "Run 'gnosisllm-knowledge agentic setup' and set agent IDs in environment."
957
+ )
958
+
959
+ # Get client and config from the searcher
960
+ if not hasattr(self._searcher, '_client') or not hasattr(self._searcher, '_config'):
961
+ raise ValueError("Searcher does not have OpenSearch client/config")
962
+
963
+ client = self._searcher._client
964
+ config = self._searcher._config
965
+
966
+ # Create agentic searcher
967
+ agentic_searcher = OpenSearchAgenticSearcher(client, config)
968
+
969
+ # Build agentic query
970
+ agentic_query = AgenticSearchQuery(
971
+ text=query,
972
+ agent_type=agent_type,
973
+ collection_ids=collection_ids,
974
+ source_ids=source_ids,
975
+ conversation_id=conversation_id,
976
+ include_reasoning=include_reasoning,
977
+ limit=limit,
978
+ )
979
+
980
+ # Determine index name
981
+ index = index_name or self._default_index
982
+ if not index:
983
+ raise ValueError("No index specified and no default index configured")
984
+
985
+ # Execute agentic search
986
+ return await agentic_searcher.agentic_search(agentic_query, index, **options)
987
+
545
988
  async def close(self) -> None:
546
- """Close connections and clean up resources."""
547
- # Subclasses or future implementations can override this
548
- pass
989
+ """Close connections and clean up resources.
990
+
991
+ Closes the underlying AsyncOpenSearch client to prevent
992
+ unclosed aiohttp session warnings. Properly handles
993
+ CancelledError during event loop shutdown.
994
+ """
995
+ import asyncio
996
+
997
+ # Close the OpenSearch client via the searcher
998
+ # Note: indexer, searcher, and setup share the same client instance,
999
+ # so closing via searcher is sufficient
1000
+ if hasattr(self._searcher, '_client') and self._searcher._client is not None:
1001
+ client = self._searcher._client
1002
+ try:
1003
+ await client.close()
1004
+ logger.debug("Closed OpenSearch client connection")
1005
+ except asyncio.CancelledError:
1006
+ # Event loop is shutting down - this is expected during cleanup
1007
+ logger.debug("OpenSearch client close cancelled (event loop shutting down)")
1008
+ except Exception as e:
1009
+ logger.warning(f"Error closing OpenSearch client: {e}")
1010
+ finally:
1011
+ # Clear client reference on all components that share it
1012
+ # This prevents any accidental reuse after close
1013
+ if hasattr(self._searcher, '_client'):
1014
+ self._searcher._client = None
1015
+ if hasattr(self._indexer, '_client'):
1016
+ self._indexer._client = None
1017
+ if self._setup and hasattr(self._setup, '_client'):
1018
+ self._setup._client = None