gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +287 -7
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/opensearch/agentic.py +341 -39
  7. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  8. gnosisllm_knowledge/backends/opensearch/indexer.py +1 -0
  9. gnosisllm_knowledge/backends/opensearch/mappings.py +2 -1
  10. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  11. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  12. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +235 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  16. gnosisllm_knowledge/cli/app.py +378 -12
  17. gnosisllm_knowledge/cli/commands/agentic.py +11 -0
  18. gnosisllm_knowledge/cli/commands/memory.py +723 -0
  19. gnosisllm_knowledge/cli/commands/setup.py +24 -22
  20. gnosisllm_knowledge/cli/display/service.py +43 -0
  21. gnosisllm_knowledge/cli/utils/config.py +58 -0
  22. gnosisllm_knowledge/core/domain/__init__.py +41 -0
  23. gnosisllm_knowledge/core/domain/document.py +5 -0
  24. gnosisllm_knowledge/core/domain/memory.py +440 -0
  25. gnosisllm_knowledge/core/domain/result.py +11 -3
  26. gnosisllm_knowledge/core/domain/search.py +2 -0
  27. gnosisllm_knowledge/core/events/types.py +76 -0
  28. gnosisllm_knowledge/core/exceptions.py +134 -0
  29. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  30. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  31. gnosisllm_knowledge/core/interfaces/streaming.py +127 -0
  32. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  33. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  34. gnosisllm_knowledge/loaders/base.py +3 -4
  35. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  36. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  37. gnosisllm_knowledge/services/indexing.py +67 -75
  38. gnosisllm_knowledge/services/search.py +47 -11
  39. gnosisllm_knowledge/services/streaming_pipeline.py +302 -0
  40. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/METADATA +44 -1
  41. gnosisllm_knowledge-0.3.0.dist-info/RECORD +77 -0
  42. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  43. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/WHEEL +0 -0
  44. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/entry_points.txt +0 -0
@@ -38,7 +38,7 @@ Features:
38
38
  - SOLID principles throughout
39
39
  """
40
40
 
41
- from gnosisllm_knowledge.api import Knowledge
41
+ from gnosisllm_knowledge.api import Knowledge, Memory
42
42
  from gnosisllm_knowledge.backends import (
43
43
  AgenticSearchFallback,
44
44
  MemoryIndexer,
@@ -51,6 +51,20 @@ from gnosisllm_knowledge.backends import (
51
51
  )
52
52
  from gnosisllm_knowledge.chunking import FixedSizeChunker, SentenceChunker
53
53
  from gnosisllm_knowledge.core.domain.document import Document, DocumentStatus, TextChunk
54
+ from gnosisllm_knowledge.core.domain.memory import (
55
+ ContainerConfig,
56
+ ContainerInfo,
57
+ HistoryEntry,
58
+ MemoryEntry,
59
+ MemoryStats,
60
+ MemoryStrategy,
61
+ MemoryType,
62
+ Message,
63
+ Namespace,
64
+ RecallResult,
65
+ SessionInfo,
66
+ StrategyConfig,
67
+ )
54
68
  from gnosisllm_knowledge.core.domain.result import (
55
69
  BatchResult,
56
70
  IndexResult,
@@ -58,9 +72,9 @@ from gnosisllm_knowledge.core.domain.result import (
58
72
  ValidationResult,
59
73
  )
60
74
  from gnosisllm_knowledge.core.domain.search import (
61
- AgentType,
62
75
  AgenticSearchQuery,
63
76
  AgenticSearchResult,
77
+ AgentType,
64
78
  ReasoningStep,
65
79
  SearchMode,
66
80
  SearchQuery,
@@ -72,15 +86,27 @@ from gnosisllm_knowledge.core.exceptions import (
72
86
  AgenticSearchError,
73
87
  ConfigurationError,
74
88
  ConnectionError,
89
+ ContainerExistsError,
90
+ ContainerNotFoundError,
75
91
  IndexError,
92
+ InferenceError,
93
+ InferenceTimeoutError,
76
94
  KnowledgeError,
77
95
  LoadError,
96
+ MemoryConfigurationError,
97
+ MemoryError,
78
98
  SearchError,
99
+ SessionNotFoundError,
100
+ )
101
+ from gnosisllm_knowledge.core.streaming import (
102
+ BatchCollector,
103
+ BoundedQueue,
104
+ PipelineConfig,
79
105
  )
80
106
  from gnosisllm_knowledge.fetchers import (
81
107
  HTTPContentFetcher,
82
- NeoreaderContentFetcher,
83
108
  NeoreaderConfig,
109
+ NeoreaderContentFetcher,
84
110
  )
85
111
  from gnosisllm_knowledge.loaders import (
86
112
  LoaderFactory,
@@ -95,58 +121,84 @@ from gnosisllm_knowledge.services import (
95
121
  __version__ = "0.2.0"
96
122
 
97
123
  __all__ = [
98
- # Main API
99
- "Knowledge",
100
- # Domain Models
101
- "Document",
102
- "DocumentStatus",
103
- "TextChunk",
104
- "SearchQuery",
105
- "SearchResult",
106
- "SearchResultItem",
107
- "SearchMode",
108
124
  "AgentType",
125
+ "AgenticSearchError",
126
+ "AgenticSearchFallback",
109
127
  "AgenticSearchQuery",
110
128
  "AgenticSearchResult",
111
- "ReasoningStep",
112
- "LoadResult",
113
- "IndexResult",
129
+ "BatchCollector",
114
130
  "BatchResult",
115
- "ValidationResult",
131
+ "BoundedQueue",
132
+ "ConfigurationError",
133
+ "ConnectionError",
134
+ "ContainerConfig",
135
+ "ContainerExistsError",
136
+ "ContainerInfo",
137
+ "ContainerNotFoundError",
138
+ # Domain Models
139
+ "Document",
140
+ "DocumentStatus",
116
141
  # Events
117
142
  "Event",
118
- "EventType",
119
143
  "EventEmitter",
144
+ "EventType",
145
+ "FixedSizeChunker",
146
+ # Fetchers
147
+ "HTTPContentFetcher",
148
+ "HistoryEntry",
149
+ "IndexError",
150
+ "IndexResult",
151
+ "InferenceError",
152
+ "InferenceTimeoutError",
153
+ # Main API
154
+ "Knowledge",
120
155
  # Exceptions
121
156
  "KnowledgeError",
122
- "ConfigurationError",
123
- "ConnectionError",
157
+ # Services
158
+ "KnowledgeIndexingService",
159
+ "KnowledgeSearchService",
124
160
  "LoadError",
125
- "IndexError",
126
- "SearchError",
127
- "AgenticSearchError",
161
+ "LoadResult",
128
162
  # Loaders
129
163
  "LoaderFactory",
130
- "WebsiteLoader",
131
- "SitemapLoader",
132
- # Fetchers
133
- "HTTPContentFetcher",
134
- "NeoreaderContentFetcher",
164
+ "Memory",
165
+ "MemoryConfigurationError",
166
+ "MemoryEntry",
167
+ # Memory Exceptions
168
+ "MemoryError",
169
+ # Memory Backend (for testing)
170
+ "MemoryIndexer",
171
+ "MemorySearcher",
172
+ "MemoryStats",
173
+ # Memory Domain Models
174
+ "MemoryStrategy",
175
+ "MemoryType",
176
+ "Message",
177
+ "Namespace",
135
178
  "NeoreaderConfig",
136
- # Chunkers
137
- "SentenceChunker",
138
- "FixedSizeChunker",
179
+ "NeoreaderContentFetcher",
180
+ "OpenSearchAgenticSearcher",
139
181
  # OpenSearch Backend
140
182
  "OpenSearchConfig",
141
183
  "OpenSearchIndexer",
142
184
  "OpenSearchKnowledgeSearcher",
143
185
  "OpenSearchSetupAdapter",
144
- "OpenSearchAgenticSearcher",
145
- "AgenticSearchFallback",
146
- # Memory Backend (for testing)
147
- "MemoryIndexer",
148
- "MemorySearcher",
149
- # Services
150
- "KnowledgeIndexingService",
151
- "KnowledgeSearchService",
186
+ # Streaming Pipeline
187
+ "PipelineConfig",
188
+ "ReasoningStep",
189
+ "RecallResult",
190
+ "SearchError",
191
+ "SearchMode",
192
+ "SearchQuery",
193
+ "SearchResult",
194
+ "SearchResultItem",
195
+ # Chunkers
196
+ "SentenceChunker",
197
+ "SessionInfo",
198
+ "SessionNotFoundError",
199
+ "SitemapLoader",
200
+ "StrategyConfig",
201
+ "TextChunk",
202
+ "ValidationResult",
203
+ "WebsiteLoader",
152
204
  ]
@@ -1,5 +1,6 @@
1
- """High-level API for knowledge operations."""
1
+ """High-level API for knowledge and memory operations."""
2
2
 
3
3
  from gnosisllm_knowledge.api.knowledge import Knowledge
4
+ from gnosisllm_knowledge.api.memory import Memory
4
5
 
5
- __all__ = ["Knowledge"]
6
+ __all__ = ["Knowledge", "Memory"]
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import logging
6
+ from collections.abc import Callable
6
7
  from typing import TYPE_CHECKING, Any
7
8
 
8
9
  from gnosisllm_knowledge.backends.opensearch import (
@@ -11,15 +12,24 @@ from gnosisllm_knowledge.backends.opensearch import (
11
12
  OpenSearchKnowledgeSearcher,
12
13
  OpenSearchSetupAdapter,
13
14
  )
15
+ from gnosisllm_knowledge.backends.opensearch.agentic import OpenSearchAgenticSearcher
14
16
  from gnosisllm_knowledge.chunking import SentenceChunker
15
17
  from gnosisllm_knowledge.core.domain.result import IndexResult
16
- from gnosisllm_knowledge.core.domain.search import SearchMode, SearchResult
18
+ from gnosisllm_knowledge.core.domain.search import (
19
+ AgentType,
20
+ AgenticSearchQuery,
21
+ AgenticSearchResult,
22
+ SearchMode,
23
+ SearchResult,
24
+ )
17
25
  from gnosisllm_knowledge.core.events.emitter import EventEmitter
18
26
  from gnosisllm_knowledge.core.interfaces.setup import DiagnosticReport, HealthReport
27
+ from gnosisllm_knowledge.core.streaming.pipeline import PipelineConfig
19
28
  from gnosisllm_knowledge.fetchers import NeoreaderContentFetcher
20
29
  from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
21
30
  from gnosisllm_knowledge.loaders import LoaderFactory
22
31
  from gnosisllm_knowledge.services import KnowledgeIndexingService, KnowledgeSearchService
32
+ from gnosisllm_knowledge.services.streaming_pipeline import StreamingIndexingPipeline
23
33
 
24
34
  if TYPE_CHECKING:
25
35
  from opensearchpy import AsyncOpenSearch
@@ -159,11 +169,12 @@ class Knowledge:
159
169
  **kwargs,
160
170
  )
161
171
 
162
- # Create client
172
+ # Create client with proper timeout settings
163
173
  client_kwargs: dict[str, Any] = {
164
174
  "hosts": [{"host": config.host, "port": config.port}],
165
175
  "use_ssl": config.use_ssl,
166
176
  "verify_certs": config.verify_certs,
177
+ "timeout": max(config.read_timeout, config.agentic_timeout_seconds),
167
178
  }
168
179
 
169
180
  if config.username and config.password:
@@ -181,11 +192,16 @@ class Knowledge:
181
192
  # Create fetcher
182
193
  fetcher = None
183
194
  if neoreader_url:
184
- neoreader_config = NeoreaderConfig(base_url=neoreader_url)
195
+ neoreader_config = NeoreaderConfig(host=neoreader_url)
185
196
  fetcher = NeoreaderContentFetcher(neoreader_config)
186
197
 
187
- # Create loader factory
188
- loader_factory = LoaderFactory(default_fetcher=fetcher)
198
+ # Create chunker
199
+ chunker = SentenceChunker()
200
+
201
+ # Create loader factory (fetcher is optional, defaults will be used if None)
202
+ loader_factory = None
203
+ if fetcher:
204
+ loader_factory = LoaderFactory(fetcher=fetcher, chunker=chunker)
189
205
 
190
206
  return cls(
191
207
  indexer=indexer,
@@ -335,9 +351,9 @@ class Knowledge:
335
351
 
336
352
  # Auto-detect or use explicit source type
337
353
  if source_type:
338
- loader = self._loader_factory.create(source_type, self._fetcher)
354
+ loader = self._loader_factory.create(source_type)
339
355
  else:
340
- loader = self._loader_factory.create_for_source(source, self._fetcher)
356
+ loader = self._loader_factory.create_for_source(source)
341
357
 
342
358
  # Create service for this load operation
343
359
  service = KnowledgeIndexingService(
@@ -356,6 +372,95 @@ class Knowledge:
356
372
  **options,
357
373
  )
358
374
 
375
+ async def load_streaming(
376
+ self,
377
+ source: str,
378
+ *,
379
+ index_name: str | None = None,
380
+ account_id: str | None = None,
381
+ collection_id: str | None = None,
382
+ collection_name: str | None = None,
383
+ source_id: str | None = None,
384
+ url_batch_size: int = 50,
385
+ fetch_concurrency: int = 10,
386
+ index_batch_size: int = 100,
387
+ on_progress: Callable[[int, int], None] | None = None,
388
+ **options: Any,
389
+ ) -> IndexResult:
390
+ """Load and index content using streaming pipeline with bounded memory.
391
+
392
+ This method is optimized for large sitemaps (10,000+ URLs) that would
393
+ otherwise exhaust memory. It processes URLs in batches, indexing
394
+ documents immediately rather than loading all content first.
395
+
396
+ Memory usage is bounded and independent of sitemap size:
397
+ - URL storage: O(url_batch_size)
398
+ - Document storage: O(index_batch_size)
399
+ - In-flight fetches: O(fetch_concurrency * avg_page_size)
400
+
401
+ Args:
402
+ source: Sitemap URL.
403
+ index_name: Target index (uses default if not provided).
404
+ account_id: Account ID for multi-tenancy.
405
+ collection_id: Collection ID.
406
+ collection_name: Collection name for display.
407
+ source_id: Source ID (auto-generated if not provided).
408
+ url_batch_size: URLs to discover per batch (default 50).
409
+ fetch_concurrency: Parallel URL fetches (default 10).
410
+ index_batch_size: Documents per index batch (default 100).
411
+ on_progress: Optional progress callback (urls_processed, docs_indexed).
412
+ **options: Additional loading options (max_urls, patterns, etc.).
413
+
414
+ Returns:
415
+ Index result with counts.
416
+
417
+ Example:
418
+ ```python
419
+ # Efficiently load 100k+ URL sitemap
420
+ result = await knowledge.load_streaming(
421
+ "https://large-site.com/sitemap.xml",
422
+ url_batch_size=100,
423
+ fetch_concurrency=20,
424
+ max_urls=50000,
425
+ )
426
+ print(f"Indexed {result.indexed_count} documents")
427
+ ```
428
+ """
429
+ if self._loader_factory is None:
430
+ raise ValueError("Loader factory not configured")
431
+
432
+ index = index_name or self._default_index
433
+ if not index:
434
+ raise ValueError("No index specified and no default index configured")
435
+
436
+ # Create sitemap loader specifically for streaming
437
+ loader = self._loader_factory.create("sitemap")
438
+
439
+ # Configure pipeline
440
+ config = PipelineConfig(
441
+ url_batch_size=url_batch_size,
442
+ fetch_concurrency=fetch_concurrency,
443
+ index_batch_size=index_batch_size,
444
+ )
445
+
446
+ # Create streaming pipeline
447
+ pipeline = StreamingIndexingPipeline(
448
+ loader=loader,
449
+ indexer=self._indexer,
450
+ config=config,
451
+ events=self._events,
452
+ )
453
+
454
+ return await pipeline.execute(
455
+ source=source,
456
+ index_name=index,
457
+ account_id=account_id,
458
+ collection_id=collection_id,
459
+ collection_name=collection_name,
460
+ source_id=source_id,
461
+ **options,
462
+ )
463
+
359
464
  # === Search Methods ===
360
465
 
361
466
  async def search(
@@ -542,6 +647,181 @@ class Knowledge:
542
647
  collection_id=collection_id,
543
648
  )
544
649
 
650
+ # === Collection and Stats Methods ===
651
+
652
+ async def get_collections(self) -> list[dict[str, Any]]:
653
+ """Get all collections with document counts.
654
+
655
+ Aggregates unique collection_ids from indexed documents.
656
+
657
+ Returns:
658
+ List of collection dictionaries with id, name, and document_count.
659
+ """
660
+ return await self.search_service.get_collections()
661
+
662
+ async def get_stats(self) -> dict[str, Any]:
663
+ """Get index statistics.
664
+
665
+ Returns:
666
+ Dictionary with document_count, index_name, and other stats.
667
+ """
668
+ return await self.search_service.get_stats()
669
+
670
+ async def list_documents(
671
+ self,
672
+ *,
673
+ source_id: str | None = None,
674
+ collection_id: str | None = None,
675
+ limit: int = 50,
676
+ offset: int = 0,
677
+ ) -> dict[str, Any]:
678
+ """List documents with optional filters.
679
+
680
+ Args:
681
+ source_id: Optional source ID filter.
682
+ collection_id: Optional collection ID filter.
683
+ limit: Maximum documents to return (max 100).
684
+ offset: Number of documents to skip.
685
+
686
+ Returns:
687
+ Dictionary with documents, total, limit, offset.
688
+ """
689
+ index = self._default_index
690
+ if not index:
691
+ raise ValueError("No default index configured")
692
+
693
+ # Clamp limit to reasonable bounds
694
+ limit = min(max(1, limit), 100)
695
+ offset = max(0, offset)
696
+
697
+ return await self._searcher.list_documents(
698
+ index_name=index,
699
+ source_id=source_id,
700
+ collection_id=collection_id,
701
+ limit=limit,
702
+ offset=offset,
703
+ )
704
+
705
+ # === Agentic Search Status ===
706
+
707
+ @property
708
+ def is_agentic_configured(self) -> bool:
709
+ """Check if agentic search is configured.
710
+
711
+ Returns:
712
+ True if at least one agent type is configured.
713
+ """
714
+ if not hasattr(self, '_searcher') or not hasattr(self._searcher, '_config'):
715
+ return False
716
+ config = self._searcher._config
717
+ return bool(config.flow_agent_id or config.conversational_agent_id)
718
+
719
+ async def get_agentic_status(self) -> dict[str, Any]:
720
+ """Get status of agentic search configuration.
721
+
722
+ Returns:
723
+ Dictionary with agent availability status:
724
+ - available: True if any agent is configured
725
+ - flow_agent: True if flow agent is configured
726
+ - conversational_agent: True if conversational agent is configured
727
+ """
728
+ if not hasattr(self, '_searcher') or not hasattr(self._searcher, '_config'):
729
+ return {
730
+ "available": False,
731
+ "flow_agent": False,
732
+ "conversational_agent": False,
733
+ }
734
+
735
+ config = self._searcher._config
736
+ return {
737
+ "available": bool(config.flow_agent_id or config.conversational_agent_id),
738
+ "flow_agent": bool(config.flow_agent_id),
739
+ "conversational_agent": bool(config.conversational_agent_id),
740
+ }
741
+
742
+ async def agentic_search(
743
+ self,
744
+ query: str,
745
+ *,
746
+ agent_type: AgentType = AgentType.FLOW,
747
+ index_name: str | None = None,
748
+ collection_ids: list[str] | None = None,
749
+ source_ids: list[str] | None = None,
750
+ conversation_id: str | None = None,
751
+ include_reasoning: bool = True,
752
+ limit: int = 10,
753
+ **options: Any,
754
+ ) -> AgenticSearchResult:
755
+ """Execute agentic search with AI-powered reasoning.
756
+
757
+ Uses OpenSearch ML agents to understand queries, retrieve relevant
758
+ documents, and generate natural language answers.
759
+
760
+ Args:
761
+ query: Search query text.
762
+ agent_type: Type of agent (FLOW for fast RAG, CONVERSATIONAL for multi-turn).
763
+ index_name: Index to search (uses default if not provided).
764
+ collection_ids: Filter by collection IDs.
765
+ source_ids: Filter by source IDs.
766
+ conversation_id: Conversation ID for multi-turn (conversational agent).
767
+ include_reasoning: Include reasoning steps in response.
768
+ limit: Maximum source documents to retrieve.
769
+ **options: Additional agent options.
770
+
771
+ Returns:
772
+ AgenticSearchResult with answer, reasoning steps, and sources.
773
+
774
+ Raises:
775
+ AgenticSearchError: If agent execution fails.
776
+ ValueError: If agentic search is not configured.
777
+
778
+ Example:
779
+ ```python
780
+ result = await knowledge.agentic_search(
781
+ "How does authentication work?",
782
+ agent_type=AgentType.FLOW,
783
+ )
784
+ print(result.answer)
785
+ for source in result.items:
786
+ print(f"- {source.title}")
787
+ ```
788
+ """
789
+ # Check if agentic search is configured
790
+ if not self.is_agentic_configured:
791
+ raise ValueError(
792
+ "Agentic search is not configured. "
793
+ "Run 'gnosisllm-knowledge agentic setup' and set agent IDs in environment."
794
+ )
795
+
796
+ # Get client and config from the searcher
797
+ if not hasattr(self._searcher, '_client') or not hasattr(self._searcher, '_config'):
798
+ raise ValueError("Searcher does not have OpenSearch client/config")
799
+
800
+ client = self._searcher._client
801
+ config = self._searcher._config
802
+
803
+ # Create agentic searcher
804
+ agentic_searcher = OpenSearchAgenticSearcher(client, config)
805
+
806
+ # Build agentic query
807
+ agentic_query = AgenticSearchQuery(
808
+ text=query,
809
+ agent_type=agent_type,
810
+ collection_ids=collection_ids,
811
+ source_ids=source_ids,
812
+ conversation_id=conversation_id,
813
+ include_reasoning=include_reasoning,
814
+ limit=limit,
815
+ )
816
+
817
+ # Determine index name
818
+ index = index_name or self._default_index
819
+ if not index:
820
+ raise ValueError("No index specified and no default index configured")
821
+
822
+ # Execute agentic search
823
+ return await agentic_searcher.agentic_search(agentic_query, index, **options)
824
+
545
825
  async def close(self) -> None:
546
826
  """Close connections and clean up resources."""
547
827
  # Subclasses or future implementations can override this