gnosisllm-knowledge: gnosisllm_knowledge-0.3.0-py3-none-any.whl → gnosisllm_knowledge-0.4.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. gnosisllm_knowledge/api/knowledge.py +233 -35
  2. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  3. gnosisllm_knowledge/backends/memory/searcher.py +132 -10
  4. gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
  5. gnosisllm_knowledge/backends/opensearch/config.py +7 -0
  6. gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
  7. gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
  8. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  9. gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
  10. gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
  11. gnosisllm_knowledge/cli/app.py +58 -19
  12. gnosisllm_knowledge/cli/commands/agentic.py +15 -9
  13. gnosisllm_knowledge/cli/commands/load.py +169 -19
  14. gnosisllm_knowledge/cli/commands/memory.py +10 -0
  15. gnosisllm_knowledge/cli/commands/search.py +9 -10
  16. gnosisllm_knowledge/cli/commands/setup.py +25 -1
  17. gnosisllm_knowledge/cli/utils/config.py +4 -4
  18. gnosisllm_knowledge/core/domain/__init__.py +13 -0
  19. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  20. gnosisllm_knowledge/core/domain/document.py +14 -19
  21. gnosisllm_knowledge/core/domain/search.py +10 -25
  22. gnosisllm_knowledge/core/domain/source.py +11 -12
  23. gnosisllm_knowledge/core/events/__init__.py +8 -0
  24. gnosisllm_knowledge/core/events/types.py +122 -5
  25. gnosisllm_knowledge/core/exceptions.py +93 -0
  26. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  27. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  28. gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
  29. gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
  30. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  31. gnosisllm_knowledge/fetchers/config.py +27 -0
  32. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  33. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  34. gnosisllm_knowledge/loaders/__init__.py +5 -1
  35. gnosisllm_knowledge/loaders/discovery.py +338 -0
  36. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  37. gnosisllm_knowledge/loaders/factory.py +46 -0
  38. gnosisllm_knowledge/services/indexing.py +51 -21
  39. gnosisllm_knowledge/services/search.py +42 -28
  40. gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
  41. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
  42. gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
  43. gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
  44. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
  45. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/api/knowledge.py
@@ -1,4 +1,39 @@
1
- """High-level Knowledge API facade."""
1
+ """High-level Knowledge API facade.
2
+
3
+ This module provides the main entry point for the gnosisllm-knowledge library.
4
+ The Knowledge class is a high-level facade that abstracts the complexity of
5
+ loading, indexing, and searching knowledge documents.
6
+
7
+ Note:
8
+ This library is tenant-agnostic. Multi-tenancy should be handled at the
9
+ API layer by using separate indices per account (e.g.,
10
+ `knowledge-{account_id}`) rather than filtering by account_id.
11
+
12
+ Example:
13
+ ```python
14
+ # Create Knowledge instance for a specific tenant
15
+ knowledge = Knowledge.from_opensearch(
16
+ host="localhost",
17
+ port=9200,
18
+ )
19
+
20
+ # Use a tenant-specific index
21
+ tenant_index = f"knowledge-{account_id}"
22
+
23
+ # Load content
24
+ await knowledge.load(
25
+ "https://docs.example.com/sitemap.xml",
26
+ index_name=tenant_index,
27
+ collection_id="docs",
28
+ )
29
+
30
+ # Search (tenant isolation via index name)
31
+ results = await knowledge.search(
32
+ "how to configure",
33
+ index_name=tenant_index,
34
+ )
35
+ ```
36
+ """
2
37
 
3
38
  from __future__ import annotations
4
39
 
@@ -84,6 +119,7 @@ class Knowledge:
84
119
  loader_factory: LoaderFactory | None = None,
85
120
  default_index: str | None = None,
86
121
  events: EventEmitter | None = None,
122
+ config: OpenSearchConfig | None = None,
87
123
  ) -> None:
88
124
  """Initialize Knowledge with components.
89
125
 
@@ -96,6 +132,7 @@ class Knowledge:
96
132
  loader_factory: Optional loader factory.
97
133
  default_index: Default index name.
98
134
  events: Optional event emitter.
135
+ config: Optional OpenSearch config for settings like batch sizes.
99
136
 
100
137
  Note:
101
138
  Embeddings are generated automatically by OpenSearch ingest pipeline.
@@ -109,6 +146,7 @@ class Knowledge:
109
146
  self._loader_factory = loader_factory
110
147
  self._default_index = default_index
111
148
  self._events = events or EventEmitter()
149
+ self._config = config
112
150
 
113
151
  # Initialize services lazily
114
152
  self._indexing_service: KnowledgeIndexingService | None = None
@@ -130,6 +168,10 @@ class Knowledge:
130
168
  ) -> Knowledge:
131
169
  """Create Knowledge instance with OpenSearch backend.
132
170
 
171
+ This factory creates a Knowledge instance configured for OpenSearch.
172
+ The returned instance is tenant-agnostic - multi-tenancy should be
173
+ handled by using separate indices per account.
174
+
133
175
  Args:
134
176
  host: OpenSearch host.
135
177
  port: OpenSearch port.
@@ -147,6 +189,19 @@ class Knowledge:
147
189
  Note:
148
190
  Embeddings are generated automatically by OpenSearch ingest pipeline.
149
191
  Run 'gnosisllm-knowledge setup' to configure the ML model.
192
+
193
+ Example:
194
+ ```python
195
+ # Create a Knowledge instance
196
+ knowledge = Knowledge.from_opensearch(
197
+ host="localhost",
198
+ port=9200,
199
+ )
200
+
201
+ # Use tenant-specific index for isolation
202
+ tenant_index = f"gnosisllm-{account_id}-knowledge"
203
+ await knowledge.load(source, index_name=tenant_index)
204
+ ```
150
205
  """
151
206
  # Import OpenSearch client
152
207
  try:
@@ -210,21 +265,36 @@ class Knowledge:
210
265
  fetcher=fetcher,
211
266
  loader_factory=loader_factory,
212
267
  default_index=config.knowledge_index_name,
268
+ config=config,
213
269
  )
214
270
 
215
271
  @classmethod
216
272
  def from_env(cls) -> Knowledge:
217
273
  """Create Knowledge instance from environment variables.
218
274
 
275
+ This factory creates a Knowledge instance using configuration from
276
+ environment variables. The returned instance is tenant-agnostic -
277
+ multi-tenancy should be handled by using separate indices per account.
278
+
219
279
  Returns:
220
280
  Configured Knowledge instance.
281
+
282
+ Example:
283
+ ```python
284
+ # Create from environment
285
+ knowledge = Knowledge.from_env()
286
+
287
+ # Use tenant-specific index for isolation
288
+ tenant_index = f"gnosisllm-{account_id}-knowledge"
289
+ await knowledge.search("query", index_name=tenant_index)
290
+ ```
221
291
  """
222
292
  config = OpenSearchConfig.from_env()
223
293
  neoreader_config = NeoreaderConfig.from_env()
224
294
 
225
295
  return cls.from_opensearch(
226
296
  config=config,
227
- neoreader_url=neoreader_config.base_url if neoreader_config.base_url else None,
297
+ neoreader_url=neoreader_config.host if neoreader_config.host else None,
228
298
  )
229
299
 
230
300
  @property
@@ -318,7 +388,6 @@ class Knowledge:
318
388
  source: str,
319
389
  *,
320
390
  index_name: str | None = None,
321
- account_id: str | None = None,
322
391
  collection_id: str | None = None,
323
392
  source_id: str | None = None,
324
393
  source_type: str | None = None,
@@ -329,10 +398,13 @@ class Knowledge:
329
398
 
330
399
  Automatically detects source type (sitemap, website, etc.).
331
400
 
401
+ Note:
402
+ This method is tenant-agnostic. Multi-tenancy should be handled
403
+ by using separate indices per account.
404
+
332
405
  Args:
333
406
  source: Source URL or path.
334
- index_name: Target index (uses default if not provided).
335
- account_id: Account ID for multi-tenancy.
407
+ index_name: Target index (use tenant-specific name for isolation).
336
408
  collection_id: Collection ID.
337
409
  source_id: Source ID (auto-generated if not provided).
338
410
  source_type: Explicit source type (auto-detected if not provided).
@@ -363,12 +435,15 @@ class Knowledge:
363
435
  events=self._events,
364
436
  )
365
437
 
438
+ # Get batch size from config or use default
439
+ batch_size = self._config.indexing_batch_size if self._config else 10
440
+
366
441
  return await service.load_and_index(
367
442
  source=source,
368
443
  index_name=index,
369
- account_id=account_id,
370
444
  collection_id=collection_id,
371
445
  source_id=source_id,
446
+ batch_size=batch_size,
372
447
  **options,
373
448
  )
374
449
 
@@ -377,7 +452,6 @@ class Knowledge:
377
452
  source: str,
378
453
  *,
379
454
  index_name: str | None = None,
380
- account_id: str | None = None,
381
455
  collection_id: str | None = None,
382
456
  collection_name: str | None = None,
383
457
  source_id: str | None = None,
@@ -398,10 +472,13 @@ class Knowledge:
398
472
  - Document storage: O(index_batch_size)
399
473
  - In-flight fetches: O(fetch_concurrency * avg_page_size)
400
474
 
475
+ Note:
476
+ This method is tenant-agnostic. Multi-tenancy should be handled
477
+ by using separate indices per account.
478
+
401
479
  Args:
402
480
  source: Sitemap URL.
403
- index_name: Target index (uses default if not provided).
404
- account_id: Account ID for multi-tenancy.
481
+ index_name: Target index (use tenant-specific name for isolation).
405
482
  collection_id: Collection ID.
406
483
  collection_name: Collection name for display.
407
484
  source_id: Source ID (auto-generated if not provided).
@@ -419,6 +496,7 @@ class Knowledge:
419
496
  # Efficiently load 100k+ URL sitemap
420
497
  result = await knowledge.load_streaming(
421
498
  "https://large-site.com/sitemap.xml",
499
+ index_name="knowledge-account123", # Tenant-specific
422
500
  url_batch_size=100,
423
501
  fetch_concurrency=20,
424
502
  max_urls=50000,
@@ -454,7 +532,6 @@ class Knowledge:
454
532
  return await pipeline.execute(
455
533
  source=source,
456
534
  index_name=index,
457
- account_id=account_id,
458
535
  collection_id=collection_id,
459
536
  collection_name=collection_name,
460
537
  source_id=source_id,
@@ -471,7 +548,6 @@ class Knowledge:
471
548
  mode: SearchMode = SearchMode.HYBRID,
472
549
  limit: int = 10,
473
550
  offset: int = 0,
474
- account_id: str | None = None,
475
551
  collection_ids: list[str] | None = None,
476
552
  source_ids: list[str] | None = None,
477
553
  min_score: float | None = None,
@@ -479,13 +555,16 @@ class Knowledge:
479
555
  ) -> SearchResult:
480
556
  """Search for knowledge documents.
481
557
 
558
+ Note:
559
+ This method is tenant-agnostic. Multi-tenancy should be handled
560
+ by using separate indices per account.
561
+
482
562
  Args:
483
563
  query: Search query text.
484
- index_name: Index to search (uses default if not provided).
564
+ index_name: Index to search (use tenant-specific name for isolation).
485
565
  mode: Search mode (semantic, keyword, hybrid).
486
566
  limit: Maximum results.
487
567
  offset: Result offset for pagination.
488
- account_id: Account ID for multi-tenancy.
489
568
  collection_ids: Filter by collection IDs.
490
569
  source_ids: Filter by source IDs.
491
570
  min_score: Minimum score threshold.
@@ -500,7 +579,6 @@ class Knowledge:
500
579
  mode=mode,
501
580
  limit=limit,
502
581
  offset=offset,
503
- account_id=account_id,
504
582
  collection_ids=collection_ids,
505
583
  source_ids=source_ids,
506
584
  min_score=min_score,
@@ -578,19 +656,73 @@ class Knowledge:
578
656
 
579
657
  # === Management Methods ===
580
658
 
659
+ async def get_document(
660
+ self,
661
+ document_id: str,
662
+ *,
663
+ index_name: str | None = None,
664
+ ) -> dict[str, Any] | None:
665
+ """Get a single document by ID.
666
+
667
+ Note:
668
+ This method is tenant-agnostic. Multi-tenancy should be handled
669
+ by using separate indices per account.
670
+
671
+ Args:
672
+ document_id: Document ID to retrieve.
673
+ index_name: Index name (use tenant-specific name for isolation).
674
+ Uses default index if not provided.
675
+
676
+ Returns:
677
+ Document dict with all fields (excluding embeddings) or None if not found.
678
+ """
679
+ index = index_name or self._default_index
680
+ if not index:
681
+ raise ValueError("No index specified and no default index configured")
682
+
683
+ return await self._indexer.get(document_id, index)
684
+
685
+ async def delete_document(
686
+ self,
687
+ document_id: str,
688
+ *,
689
+ index_name: str | None = None,
690
+ ) -> bool:
691
+ """Delete a single document by ID.
692
+
693
+ Note:
694
+ This method is tenant-agnostic. Multi-tenancy should be handled
695
+ by using separate indices per account.
696
+
697
+ Args:
698
+ document_id: Document ID to delete.
699
+ index_name: Index name (use tenant-specific name for isolation).
700
+ Uses default index if not provided.
701
+
702
+ Returns:
703
+ True if deleted, False if not found.
704
+ """
705
+ index = index_name or self._default_index
706
+ if not index:
707
+ raise ValueError("No index specified and no default index configured")
708
+
709
+ return await self._indexer.delete(document_id, index)
710
+
581
711
  async def delete_source(
582
712
  self,
583
713
  source_id: str,
584
714
  *,
585
715
  index_name: str | None = None,
586
- account_id: str | None = None,
587
716
  ) -> int:
588
717
  """Delete all documents from a source.
589
718
 
719
+ Note:
720
+ This method is tenant-agnostic. Multi-tenancy should be handled
721
+ by using separate indices per account.
722
+
590
723
  Args:
591
724
  source_id: Source ID to delete.
592
- index_name: Index name.
593
- account_id: Account ID for multi-tenancy.
725
+ index_name: Index name (use tenant-specific name for isolation).
594
726
 
595
727
  Returns:
596
728
  Count of deleted documents.
@@ -599,21 +731,23 @@ class Knowledge:
599
731
  if not index:
600
732
  raise ValueError("No index specified")
601
733
 
602
- return await self.indexing.delete_source(source_id, index, account_id)
734
+ return await self.indexing.delete_source(source_id, index)
603
735
 
604
736
  async def delete_collection(
605
737
  self,
606
738
  collection_id: str,
607
739
  *,
608
740
  index_name: str | None = None,
609
- account_id: str | None = None,
610
741
  ) -> int:
611
742
  """Delete all documents from a collection.
612
743
 
744
+ Note:
745
+ This method is tenant-agnostic. Multi-tenancy should be handled
746
+ by using separate indices per account.
747
+
613
748
  Args:
614
749
  collection_id: Collection ID to delete.
615
- index_name: Index name.
616
- account_id: Account ID for multi-tenancy.
750
+ index_name: Index name (use tenant-specific name for isolation).
617
751
 
618
752
  Returns:
619
753
  Count of deleted documents.
@@ -622,54 +756,85 @@ class Knowledge:
622
756
  if not index:
623
757
  raise ValueError("No index specified")
624
758
 
625
- return await self.indexing.delete_collection(collection_id, index, account_id)
759
+ return await self.indexing.delete_collection(collection_id, index)
626
760
 
627
761
  async def count(
628
762
  self,
629
763
  *,
630
764
  index_name: str | None = None,
631
- account_id: str | None = None,
632
765
  collection_id: str | None = None,
766
+ source_id: str | None = None,
633
767
  ) -> int:
634
768
  """Count documents.
635
769
 
770
+ Note:
771
+ This method is tenant-agnostic. Multi-tenancy should be handled
772
+ by using separate indices per account.
773
+
636
774
  Args:
637
- index_name: Index to count.
638
- account_id: Filter by account.
775
+ index_name: Index to count (use tenant-specific name for isolation).
639
776
  collection_id: Filter by collection.
777
+ source_id: Filter by source (for source deletion confirmation).
640
778
 
641
779
  Returns:
642
780
  Document count.
643
781
  """
644
782
  return await self.search_service.count(
645
783
  index_name=index_name,
646
- account_id=account_id,
647
784
  collection_id=collection_id,
785
+ source_id=source_id,
648
786
  )
649
787
 
650
788
  # === Collection and Stats Methods ===
651
789
 
652
- async def get_collections(self) -> list[dict[str, Any]]:
790
+ async def get_collections(
791
+ self,
792
+ *,
793
+ index_name: str | None = None,
794
+ ) -> list[dict[str, Any]]:
653
795
  """Get all collections with document counts.
654
796
 
655
797
  Aggregates unique collection_ids from indexed documents.
656
798
 
799
+ Note:
800
+ This method is tenant-agnostic. Multi-tenancy should be handled
801
+ by using separate indices per account.
802
+
803
+ Args:
804
+ index_name: Index to query (use tenant-specific name for isolation).
805
+ Uses default index if not provided.
806
+
657
807
  Returns:
658
808
  List of collection dictionaries with id, name, and document_count.
659
809
  """
660
- return await self.search_service.get_collections()
810
+ index = index_name or self._default_index
811
+ return await self.search_service.get_collections(index_name=index)
661
812
 
662
- async def get_stats(self) -> dict[str, Any]:
813
+ async def get_stats(
814
+ self,
815
+ *,
816
+ index_name: str | None = None,
817
+ ) -> dict[str, Any]:
663
818
  """Get index statistics.
664
819
 
820
+ Note:
821
+ This method is tenant-agnostic. Multi-tenancy should be handled
822
+ by using separate indices per account.
823
+
824
+ Args:
825
+ index_name: Index to query (use tenant-specific name for isolation).
826
+ Uses default index if not provided.
827
+
665
828
  Returns:
666
829
  Dictionary with document_count, index_name, and other stats.
667
830
  """
668
- return await self.search_service.get_stats()
831
+ index = index_name or self._default_index
832
+ return await self.search_service.get_stats(index_name=index)
669
833
 
670
834
  async def list_documents(
671
835
  self,
672
836
  *,
837
+ index_name: str | None = None,
673
838
  source_id: str | None = None,
674
839
  collection_id: str | None = None,
675
840
  limit: int = 50,
@@ -677,7 +842,13 @@ class Knowledge:
677
842
  ) -> dict[str, Any]:
678
843
  """List documents with optional filters.
679
844
 
845
+ Note:
846
+ This method is tenant-agnostic. Multi-tenancy should be handled
847
+ by using separate indices per account.
848
+
680
849
  Args:
850
+ index_name: Index to query (use tenant-specific name for isolation).
851
+ Uses default index if not provided.
681
852
  source_id: Optional source ID filter.
682
853
  collection_id: Optional collection ID filter.
683
854
  limit: Maximum documents to return (max 100).
@@ -686,9 +857,9 @@ class Knowledge:
686
857
  Returns:
687
858
  Dictionary with documents, total, limit, offset.
688
859
  """
689
- index = self._default_index
860
+ index = index_name or self._default_index
690
861
  if not index:
691
- raise ValueError("No default index configured")
862
+ raise ValueError("No index specified and no default index configured")
692
863
 
693
864
  # Clamp limit to reasonable bounds
694
865
  limit = min(max(1, limit), 100)
@@ -823,6 +994,33 @@ class Knowledge:
823
994
  return await agentic_searcher.agentic_search(agentic_query, index, **options)
824
995
 
825
996
  async def close(self) -> None:
826
- """Close connections and clean up resources."""
827
- # Subclasses or future implementations can override this
828
- pass
997
+ """Close connections and clean up resources.
998
+
999
+ Closes the underlying AsyncOpenSearch client to prevent
1000
+ unclosed aiohttp session warnings. Properly handles
1001
+ CancelledError during event loop shutdown.
1002
+ """
1003
+ import asyncio
1004
+
1005
+ # Close the OpenSearch client via the searcher
1006
+ # Note: indexer, searcher, and setup share the same client instance,
1007
+ # so closing via searcher is sufficient
1008
+ if hasattr(self._searcher, '_client') and self._searcher._client is not None:
1009
+ client = self._searcher._client
1010
+ try:
1011
+ await client.close()
1012
+ logger.debug("Closed OpenSearch client connection")
1013
+ except asyncio.CancelledError:
1014
+ # Event loop is shutting down - this is expected during cleanup
1015
+ logger.debug("OpenSearch client close cancelled (event loop shutting down)")
1016
+ except Exception as e:
1017
+ logger.warning(f"Error closing OpenSearch client: {e}")
1018
+ finally:
1019
+ # Clear client reference on all components that share it
1020
+ # This prevents any accidental reuse after close
1021
+ if hasattr(self._searcher, '_client'):
1022
+ self._searcher._client = None
1023
+ if hasattr(self._indexer, '_client'):
1024
+ self._indexer._client = None
1025
+ if self._setup and hasattr(self._setup, '_client'):
1026
+ self._setup._client = None
gnosisllm_knowledge/backends/memory/indexer.py
@@ -1,4 +1,10 @@
1
- """In-memory document indexer for testing."""
1
+ """In-memory document indexer for testing.
2
+
3
+ Note:
4
+ This library is tenant-agnostic. Multi-tenancy is achieved through index
5
+ isolation (e.g., `knowledge-{account_id}`). The memory indexer does not
6
+ include tenant filtering logic - use separate index names per tenant.
7
+ """
2
8
 
3
9
  from __future__ import annotations
4
10
 
@@ -14,6 +20,9 @@ from gnosisllm_knowledge.core.domain.result import BatchResult, IndexResult
14
20
  class MemoryIndexer:
15
21
  """In-memory document indexer for testing.
16
22
 
23
+ This indexer is tenant-agnostic. Multi-tenancy is achieved through index
24
+ isolation by using tenant-specific index names.
25
+
17
26
  Stores documents in a dictionary for fast testing without
18
27
  requiring an external OpenSearch instance.
19
28
 
@@ -185,6 +194,22 @@ class MemoryIndexer:
185
194
  """
186
195
  return await self.index(document, index_name)
187
196
 
197
+ async def get(
198
+ self,
199
+ doc_id: str,
200
+ index_name: str,
201
+ ) -> dict[str, Any] | None:
202
+ """Get a document by ID (async interface).
203
+
204
+ Args:
205
+ doc_id: Document ID.
206
+ index_name: Index name.
207
+
208
+ Returns:
209
+ Document dictionary or None if not found.
210
+ """
211
+ return self._indices.get(index_name, {}).get(doc_id)
212
+
188
213
  async def delete(
189
214
  self,
190
215
  doc_id: str,
@@ -365,8 +390,8 @@ class MemoryIndexer:
365
390
  "url": document.url,
366
391
  "title": document.title,
367
392
  "source": document.source,
368
- "account_id": document.account_id,
369
393
  "collection_id": document.collection_id,
394
+ "collection_name": document.collection_name,
370
395
  "source_id": document.source_id,
371
396
  "chunk_index": document.chunk_index,
372
397
  "total_chunks": document.total_chunks,