gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/api/knowledge.py +233 -35
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +132 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
- gnosisllm_knowledge/backends/opensearch/config.py +7 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
- gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
- gnosisllm_knowledge/cli/app.py +58 -19
- gnosisllm_knowledge/cli/commands/agentic.py +15 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +10 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +25 -1
- gnosisllm_knowledge/cli/utils/config.py +4 -4
- gnosisllm_knowledge/core/domain/__init__.py +13 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +14 -19
- gnosisllm_knowledge/core/domain/search.py +10 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +122 -5
- gnosisllm_knowledge/core/exceptions.py +93 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/services/indexing.py +51 -21
- gnosisllm_knowledge/services/search.py +42 -28
- gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
- gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0
|
@@ -1,4 +1,39 @@
|
|
|
1
|
-
"""High-level Knowledge API facade.
|
|
1
|
+
"""High-level Knowledge API facade.
|
|
2
|
+
|
|
3
|
+
This module provides the main entry point for the gnosisllm-knowledge library.
|
|
4
|
+
The Knowledge class is a high-level facade that abstracts the complexity of
|
|
5
|
+
loading, indexing, and searching knowledge documents.
|
|
6
|
+
|
|
7
|
+
Note:
|
|
8
|
+
This library is tenant-agnostic. Multi-tenancy should be handled at the
|
|
9
|
+
API layer by using separate indices per account (e.g.,
|
|
10
|
+
`knowledge-{account_id}`) rather than filtering by account_id.
|
|
11
|
+
|
|
12
|
+
Example:
|
|
13
|
+
```python
|
|
14
|
+
# Create Knowledge instance for a specific tenant
|
|
15
|
+
knowledge = Knowledge.from_opensearch(
|
|
16
|
+
host="localhost",
|
|
17
|
+
port=9200,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Use a tenant-specific index
|
|
21
|
+
tenant_index = f"knowledge-{account_id}"
|
|
22
|
+
|
|
23
|
+
# Load content
|
|
24
|
+
await knowledge.load(
|
|
25
|
+
"https://docs.example.com/sitemap.xml",
|
|
26
|
+
index_name=tenant_index,
|
|
27
|
+
collection_id="docs",
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
# Search (tenant isolation via index name)
|
|
31
|
+
results = await knowledge.search(
|
|
32
|
+
"how to configure",
|
|
33
|
+
index_name=tenant_index,
|
|
34
|
+
)
|
|
35
|
+
```
|
|
36
|
+
"""
|
|
2
37
|
|
|
3
38
|
from __future__ import annotations
|
|
4
39
|
|
|
@@ -84,6 +119,7 @@ class Knowledge:
|
|
|
84
119
|
loader_factory: LoaderFactory | None = None,
|
|
85
120
|
default_index: str | None = None,
|
|
86
121
|
events: EventEmitter | None = None,
|
|
122
|
+
config: OpenSearchConfig | None = None,
|
|
87
123
|
) -> None:
|
|
88
124
|
"""Initialize Knowledge with components.
|
|
89
125
|
|
|
@@ -96,6 +132,7 @@ class Knowledge:
|
|
|
96
132
|
loader_factory: Optional loader factory.
|
|
97
133
|
default_index: Default index name.
|
|
98
134
|
events: Optional event emitter.
|
|
135
|
+
config: Optional OpenSearch config for settings like batch sizes.
|
|
99
136
|
|
|
100
137
|
Note:
|
|
101
138
|
Embeddings are generated automatically by OpenSearch ingest pipeline.
|
|
@@ -109,6 +146,7 @@ class Knowledge:
|
|
|
109
146
|
self._loader_factory = loader_factory
|
|
110
147
|
self._default_index = default_index
|
|
111
148
|
self._events = events or EventEmitter()
|
|
149
|
+
self._config = config
|
|
112
150
|
|
|
113
151
|
# Initialize services lazily
|
|
114
152
|
self._indexing_service: KnowledgeIndexingService | None = None
|
|
@@ -130,6 +168,10 @@ class Knowledge:
|
|
|
130
168
|
) -> Knowledge:
|
|
131
169
|
"""Create Knowledge instance with OpenSearch backend.
|
|
132
170
|
|
|
171
|
+
This factory creates a Knowledge instance configured for OpenSearch.
|
|
172
|
+
The returned instance is tenant-agnostic - multi-tenancy should be
|
|
173
|
+
handled by using separate indices per account.
|
|
174
|
+
|
|
133
175
|
Args:
|
|
134
176
|
host: OpenSearch host.
|
|
135
177
|
port: OpenSearch port.
|
|
@@ -147,6 +189,19 @@ class Knowledge:
|
|
|
147
189
|
Note:
|
|
148
190
|
Embeddings are generated automatically by OpenSearch ingest pipeline.
|
|
149
191
|
Run 'gnosisllm-knowledge setup' to configure the ML model.
|
|
192
|
+
|
|
193
|
+
Example:
|
|
194
|
+
```python
|
|
195
|
+
# Create a Knowledge instance
|
|
196
|
+
knowledge = Knowledge.from_opensearch(
|
|
197
|
+
host="localhost",
|
|
198
|
+
port=9200,
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
# Use tenant-specific index for isolation
|
|
202
|
+
tenant_index = f"gnosisllm-{account_id}-knowledge"
|
|
203
|
+
await knowledge.load(source, index_name=tenant_index)
|
|
204
|
+
```
|
|
150
205
|
"""
|
|
151
206
|
# Import OpenSearch client
|
|
152
207
|
try:
|
|
@@ -210,21 +265,36 @@ class Knowledge:
|
|
|
210
265
|
fetcher=fetcher,
|
|
211
266
|
loader_factory=loader_factory,
|
|
212
267
|
default_index=config.knowledge_index_name,
|
|
268
|
+
config=config,
|
|
213
269
|
)
|
|
214
270
|
|
|
215
271
|
@classmethod
|
|
216
272
|
def from_env(cls) -> Knowledge:
|
|
217
273
|
"""Create Knowledge instance from environment variables.
|
|
218
274
|
|
|
275
|
+
This factory creates a Knowledge instance using configuration from
|
|
276
|
+
environment variables. The returned instance is tenant-agnostic -
|
|
277
|
+
multi-tenancy should be handled by using separate indices per account.
|
|
278
|
+
|
|
219
279
|
Returns:
|
|
220
280
|
Configured Knowledge instance.
|
|
281
|
+
|
|
282
|
+
Example:
|
|
283
|
+
```python
|
|
284
|
+
# Create from environment
|
|
285
|
+
knowledge = Knowledge.from_env()
|
|
286
|
+
|
|
287
|
+
# Use tenant-specific index for isolation
|
|
288
|
+
tenant_index = f"gnosisllm-{account_id}-knowledge"
|
|
289
|
+
await knowledge.search("query", index_name=tenant_index)
|
|
290
|
+
```
|
|
221
291
|
"""
|
|
222
292
|
config = OpenSearchConfig.from_env()
|
|
223
293
|
neoreader_config = NeoreaderConfig.from_env()
|
|
224
294
|
|
|
225
295
|
return cls.from_opensearch(
|
|
226
296
|
config=config,
|
|
227
|
-
neoreader_url=neoreader_config.
|
|
297
|
+
neoreader_url=neoreader_config.host if neoreader_config.host else None,
|
|
228
298
|
)
|
|
229
299
|
|
|
230
300
|
@property
|
|
@@ -318,7 +388,6 @@ class Knowledge:
|
|
|
318
388
|
source: str,
|
|
319
389
|
*,
|
|
320
390
|
index_name: str | None = None,
|
|
321
|
-
account_id: str | None = None,
|
|
322
391
|
collection_id: str | None = None,
|
|
323
392
|
source_id: str | None = None,
|
|
324
393
|
source_type: str | None = None,
|
|
@@ -329,10 +398,13 @@ class Knowledge:
|
|
|
329
398
|
|
|
330
399
|
Automatically detects source type (sitemap, website, etc.).
|
|
331
400
|
|
|
401
|
+
Note:
|
|
402
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
403
|
+
by using separate indices per account.
|
|
404
|
+
|
|
332
405
|
Args:
|
|
333
406
|
source: Source URL or path.
|
|
334
|
-
index_name: Target index (
|
|
335
|
-
account_id: Account ID for multi-tenancy.
|
|
407
|
+
index_name: Target index (use tenant-specific name for isolation).
|
|
336
408
|
collection_id: Collection ID.
|
|
337
409
|
source_id: Source ID (auto-generated if not provided).
|
|
338
410
|
source_type: Explicit source type (auto-detected if not provided).
|
|
@@ -363,12 +435,15 @@ class Knowledge:
|
|
|
363
435
|
events=self._events,
|
|
364
436
|
)
|
|
365
437
|
|
|
438
|
+
# Get batch size from config or use default
|
|
439
|
+
batch_size = self._config.indexing_batch_size if self._config else 10
|
|
440
|
+
|
|
366
441
|
return await service.load_and_index(
|
|
367
442
|
source=source,
|
|
368
443
|
index_name=index,
|
|
369
|
-
account_id=account_id,
|
|
370
444
|
collection_id=collection_id,
|
|
371
445
|
source_id=source_id,
|
|
446
|
+
batch_size=batch_size,
|
|
372
447
|
**options,
|
|
373
448
|
)
|
|
374
449
|
|
|
@@ -377,7 +452,6 @@ class Knowledge:
|
|
|
377
452
|
source: str,
|
|
378
453
|
*,
|
|
379
454
|
index_name: str | None = None,
|
|
380
|
-
account_id: str | None = None,
|
|
381
455
|
collection_id: str | None = None,
|
|
382
456
|
collection_name: str | None = None,
|
|
383
457
|
source_id: str | None = None,
|
|
@@ -398,10 +472,13 @@ class Knowledge:
|
|
|
398
472
|
- Document storage: O(index_batch_size)
|
|
399
473
|
- In-flight fetches: O(fetch_concurrency * avg_page_size)
|
|
400
474
|
|
|
475
|
+
Note:
|
|
476
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
477
|
+
by using separate indices per account.
|
|
478
|
+
|
|
401
479
|
Args:
|
|
402
480
|
source: Sitemap URL.
|
|
403
|
-
index_name: Target index (
|
|
404
|
-
account_id: Account ID for multi-tenancy.
|
|
481
|
+
index_name: Target index (use tenant-specific name for isolation).
|
|
405
482
|
collection_id: Collection ID.
|
|
406
483
|
collection_name: Collection name for display.
|
|
407
484
|
source_id: Source ID (auto-generated if not provided).
|
|
@@ -419,6 +496,7 @@ class Knowledge:
|
|
|
419
496
|
# Efficiently load 100k+ URL sitemap
|
|
420
497
|
result = await knowledge.load_streaming(
|
|
421
498
|
"https://large-site.com/sitemap.xml",
|
|
499
|
+
index_name="knowledge-account123", # Tenant-specific
|
|
422
500
|
url_batch_size=100,
|
|
423
501
|
fetch_concurrency=20,
|
|
424
502
|
max_urls=50000,
|
|
@@ -454,7 +532,6 @@ class Knowledge:
|
|
|
454
532
|
return await pipeline.execute(
|
|
455
533
|
source=source,
|
|
456
534
|
index_name=index,
|
|
457
|
-
account_id=account_id,
|
|
458
535
|
collection_id=collection_id,
|
|
459
536
|
collection_name=collection_name,
|
|
460
537
|
source_id=source_id,
|
|
@@ -471,7 +548,6 @@ class Knowledge:
|
|
|
471
548
|
mode: SearchMode = SearchMode.HYBRID,
|
|
472
549
|
limit: int = 10,
|
|
473
550
|
offset: int = 0,
|
|
474
|
-
account_id: str | None = None,
|
|
475
551
|
collection_ids: list[str] | None = None,
|
|
476
552
|
source_ids: list[str] | None = None,
|
|
477
553
|
min_score: float | None = None,
|
|
@@ -479,13 +555,16 @@ class Knowledge:
|
|
|
479
555
|
) -> SearchResult:
|
|
480
556
|
"""Search for knowledge documents.
|
|
481
557
|
|
|
558
|
+
Note:
|
|
559
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
560
|
+
by using separate indices per account.
|
|
561
|
+
|
|
482
562
|
Args:
|
|
483
563
|
query: Search query text.
|
|
484
|
-
index_name: Index to search (
|
|
564
|
+
index_name: Index to search (use tenant-specific name for isolation).
|
|
485
565
|
mode: Search mode (semantic, keyword, hybrid).
|
|
486
566
|
limit: Maximum results.
|
|
487
567
|
offset: Result offset for pagination.
|
|
488
|
-
account_id: Account ID for multi-tenancy.
|
|
489
568
|
collection_ids: Filter by collection IDs.
|
|
490
569
|
source_ids: Filter by source IDs.
|
|
491
570
|
min_score: Minimum score threshold.
|
|
@@ -500,7 +579,6 @@ class Knowledge:
|
|
|
500
579
|
mode=mode,
|
|
501
580
|
limit=limit,
|
|
502
581
|
offset=offset,
|
|
503
|
-
account_id=account_id,
|
|
504
582
|
collection_ids=collection_ids,
|
|
505
583
|
source_ids=source_ids,
|
|
506
584
|
min_score=min_score,
|
|
@@ -578,19 +656,73 @@ class Knowledge:
|
|
|
578
656
|
|
|
579
657
|
# === Management Methods ===
|
|
580
658
|
|
|
659
|
+
async def get_document(
|
|
660
|
+
self,
|
|
661
|
+
document_id: str,
|
|
662
|
+
*,
|
|
663
|
+
index_name: str | None = None,
|
|
664
|
+
) -> dict[str, Any] | None:
|
|
665
|
+
"""Get a single document by ID.
|
|
666
|
+
|
|
667
|
+
Note:
|
|
668
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
669
|
+
by using separate indices per account.
|
|
670
|
+
|
|
671
|
+
Args:
|
|
672
|
+
document_id: Document ID to retrieve.
|
|
673
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
674
|
+
Uses default index if not provided.
|
|
675
|
+
|
|
676
|
+
Returns:
|
|
677
|
+
Document dict with all fields (excluding embeddings) or None if not found.
|
|
678
|
+
"""
|
|
679
|
+
index = index_name or self._default_index
|
|
680
|
+
if not index:
|
|
681
|
+
raise ValueError("No index specified and no default index configured")
|
|
682
|
+
|
|
683
|
+
return await self._indexer.get(document_id, index)
|
|
684
|
+
|
|
685
|
+
async def delete_document(
|
|
686
|
+
self,
|
|
687
|
+
document_id: str,
|
|
688
|
+
*,
|
|
689
|
+
index_name: str | None = None,
|
|
690
|
+
) -> bool:
|
|
691
|
+
"""Delete a single document by ID.
|
|
692
|
+
|
|
693
|
+
Note:
|
|
694
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
695
|
+
by using separate indices per account.
|
|
696
|
+
|
|
697
|
+
Args:
|
|
698
|
+
document_id: Document ID to delete.
|
|
699
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
700
|
+
Uses default index if not provided.
|
|
701
|
+
|
|
702
|
+
Returns:
|
|
703
|
+
True if deleted, False if not found.
|
|
704
|
+
"""
|
|
705
|
+
index = index_name or self._default_index
|
|
706
|
+
if not index:
|
|
707
|
+
raise ValueError("No index specified and no default index configured")
|
|
708
|
+
|
|
709
|
+
return await self._indexer.delete(document_id, index)
|
|
710
|
+
|
|
581
711
|
async def delete_source(
|
|
582
712
|
self,
|
|
583
713
|
source_id: str,
|
|
584
714
|
*,
|
|
585
715
|
index_name: str | None = None,
|
|
586
|
-
account_id: str | None = None,
|
|
587
716
|
) -> int:
|
|
588
717
|
"""Delete all documents from a source.
|
|
589
718
|
|
|
719
|
+
Note:
|
|
720
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
721
|
+
by using separate indices per account.
|
|
722
|
+
|
|
590
723
|
Args:
|
|
591
724
|
source_id: Source ID to delete.
|
|
592
|
-
index_name: Index name.
|
|
593
|
-
account_id: Account ID for multi-tenancy.
|
|
725
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
594
726
|
|
|
595
727
|
Returns:
|
|
596
728
|
Count of deleted documents.
|
|
@@ -599,21 +731,23 @@ class Knowledge:
|
|
|
599
731
|
if not index:
|
|
600
732
|
raise ValueError("No index specified")
|
|
601
733
|
|
|
602
|
-
return await self.indexing.delete_source(source_id, index
|
|
734
|
+
return await self.indexing.delete_source(source_id, index)
|
|
603
735
|
|
|
604
736
|
async def delete_collection(
|
|
605
737
|
self,
|
|
606
738
|
collection_id: str,
|
|
607
739
|
*,
|
|
608
740
|
index_name: str | None = None,
|
|
609
|
-
account_id: str | None = None,
|
|
610
741
|
) -> int:
|
|
611
742
|
"""Delete all documents from a collection.
|
|
612
743
|
|
|
744
|
+
Note:
|
|
745
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
746
|
+
by using separate indices per account.
|
|
747
|
+
|
|
613
748
|
Args:
|
|
614
749
|
collection_id: Collection ID to delete.
|
|
615
|
-
index_name: Index name.
|
|
616
|
-
account_id: Account ID for multi-tenancy.
|
|
750
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
617
751
|
|
|
618
752
|
Returns:
|
|
619
753
|
Count of deleted documents.
|
|
@@ -622,54 +756,85 @@ class Knowledge:
|
|
|
622
756
|
if not index:
|
|
623
757
|
raise ValueError("No index specified")
|
|
624
758
|
|
|
625
|
-
return await self.indexing.delete_collection(collection_id, index
|
|
759
|
+
return await self.indexing.delete_collection(collection_id, index)
|
|
626
760
|
|
|
627
761
|
async def count(
|
|
628
762
|
self,
|
|
629
763
|
*,
|
|
630
764
|
index_name: str | None = None,
|
|
631
|
-
account_id: str | None = None,
|
|
632
765
|
collection_id: str | None = None,
|
|
766
|
+
source_id: str | None = None,
|
|
633
767
|
) -> int:
|
|
634
768
|
"""Count documents.
|
|
635
769
|
|
|
770
|
+
Note:
|
|
771
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
772
|
+
by using separate indices per account.
|
|
773
|
+
|
|
636
774
|
Args:
|
|
637
|
-
index_name: Index to count.
|
|
638
|
-
account_id: Filter by account.
|
|
775
|
+
index_name: Index to count (use tenant-specific name for isolation).
|
|
639
776
|
collection_id: Filter by collection.
|
|
777
|
+
source_id: Filter by source (for source deletion confirmation).
|
|
640
778
|
|
|
641
779
|
Returns:
|
|
642
780
|
Document count.
|
|
643
781
|
"""
|
|
644
782
|
return await self.search_service.count(
|
|
645
783
|
index_name=index_name,
|
|
646
|
-
account_id=account_id,
|
|
647
784
|
collection_id=collection_id,
|
|
785
|
+
source_id=source_id,
|
|
648
786
|
)
|
|
649
787
|
|
|
650
788
|
# === Collection and Stats Methods ===
|
|
651
789
|
|
|
652
|
-
async def get_collections(
|
|
790
|
+
async def get_collections(
|
|
791
|
+
self,
|
|
792
|
+
*,
|
|
793
|
+
index_name: str | None = None,
|
|
794
|
+
) -> list[dict[str, Any]]:
|
|
653
795
|
"""Get all collections with document counts.
|
|
654
796
|
|
|
655
797
|
Aggregates unique collection_ids from indexed documents.
|
|
656
798
|
|
|
799
|
+
Note:
|
|
800
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
801
|
+
by using separate indices per account.
|
|
802
|
+
|
|
803
|
+
Args:
|
|
804
|
+
index_name: Index to query (use tenant-specific name for isolation).
|
|
805
|
+
Uses default index if not provided.
|
|
806
|
+
|
|
657
807
|
Returns:
|
|
658
808
|
List of collection dictionaries with id, name, and document_count.
|
|
659
809
|
"""
|
|
660
|
-
|
|
810
|
+
index = index_name or self._default_index
|
|
811
|
+
return await self.search_service.get_collections(index_name=index)
|
|
661
812
|
|
|
662
|
-
async def get_stats(
|
|
813
|
+
async def get_stats(
|
|
814
|
+
self,
|
|
815
|
+
*,
|
|
816
|
+
index_name: str | None = None,
|
|
817
|
+
) -> dict[str, Any]:
|
|
663
818
|
"""Get index statistics.
|
|
664
819
|
|
|
820
|
+
Note:
|
|
821
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
822
|
+
by using separate indices per account.
|
|
823
|
+
|
|
824
|
+
Args:
|
|
825
|
+
index_name: Index to query (use tenant-specific name for isolation).
|
|
826
|
+
Uses default index if not provided.
|
|
827
|
+
|
|
665
828
|
Returns:
|
|
666
829
|
Dictionary with document_count, index_name, and other stats.
|
|
667
830
|
"""
|
|
668
|
-
|
|
831
|
+
index = index_name or self._default_index
|
|
832
|
+
return await self.search_service.get_stats(index_name=index)
|
|
669
833
|
|
|
670
834
|
async def list_documents(
|
|
671
835
|
self,
|
|
672
836
|
*,
|
|
837
|
+
index_name: str | None = None,
|
|
673
838
|
source_id: str | None = None,
|
|
674
839
|
collection_id: str | None = None,
|
|
675
840
|
limit: int = 50,
|
|
@@ -677,7 +842,13 @@ class Knowledge:
|
|
|
677
842
|
) -> dict[str, Any]:
|
|
678
843
|
"""List documents with optional filters.
|
|
679
844
|
|
|
845
|
+
Note:
|
|
846
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
847
|
+
by using separate indices per account.
|
|
848
|
+
|
|
680
849
|
Args:
|
|
850
|
+
index_name: Index to query (use tenant-specific name for isolation).
|
|
851
|
+
Uses default index if not provided.
|
|
681
852
|
source_id: Optional source ID filter.
|
|
682
853
|
collection_id: Optional collection ID filter.
|
|
683
854
|
limit: Maximum documents to return (max 100).
|
|
@@ -686,9 +857,9 @@ class Knowledge:
|
|
|
686
857
|
Returns:
|
|
687
858
|
Dictionary with documents, total, limit, offset.
|
|
688
859
|
"""
|
|
689
|
-
index = self._default_index
|
|
860
|
+
index = index_name or self._default_index
|
|
690
861
|
if not index:
|
|
691
|
-
raise ValueError("No default index configured")
|
|
862
|
+
raise ValueError("No index specified and no default index configured")
|
|
692
863
|
|
|
693
864
|
# Clamp limit to reasonable bounds
|
|
694
865
|
limit = min(max(1, limit), 100)
|
|
@@ -823,6 +994,33 @@ class Knowledge:
|
|
|
823
994
|
return await agentic_searcher.agentic_search(agentic_query, index, **options)
|
|
824
995
|
|
|
825
996
|
async def close(self) -> None:
|
|
826
|
-
"""Close connections and clean up resources.
|
|
827
|
-
|
|
828
|
-
|
|
997
|
+
"""Close connections and clean up resources.
|
|
998
|
+
|
|
999
|
+
Closes the underlying AsyncOpenSearch client to prevent
|
|
1000
|
+
unclosed aiohttp session warnings. Properly handles
|
|
1001
|
+
CancelledError during event loop shutdown.
|
|
1002
|
+
"""
|
|
1003
|
+
import asyncio
|
|
1004
|
+
|
|
1005
|
+
# Close the OpenSearch client via the searcher
|
|
1006
|
+
# Note: indexer, searcher, and setup share the same client instance,
|
|
1007
|
+
# so closing via searcher is sufficient
|
|
1008
|
+
if hasattr(self._searcher, '_client') and self._searcher._client is not None:
|
|
1009
|
+
client = self._searcher._client
|
|
1010
|
+
try:
|
|
1011
|
+
await client.close()
|
|
1012
|
+
logger.debug("Closed OpenSearch client connection")
|
|
1013
|
+
except asyncio.CancelledError:
|
|
1014
|
+
# Event loop is shutting down - this is expected during cleanup
|
|
1015
|
+
logger.debug("OpenSearch client close cancelled (event loop shutting down)")
|
|
1016
|
+
except Exception as e:
|
|
1017
|
+
logger.warning(f"Error closing OpenSearch client: {e}")
|
|
1018
|
+
finally:
|
|
1019
|
+
# Clear client reference on all components that share it
|
|
1020
|
+
# This prevents any accidental reuse after close
|
|
1021
|
+
if hasattr(self._searcher, '_client'):
|
|
1022
|
+
self._searcher._client = None
|
|
1023
|
+
if hasattr(self._indexer, '_client'):
|
|
1024
|
+
self._indexer._client = None
|
|
1025
|
+
if self._setup and hasattr(self._setup, '_client'):
|
|
1026
|
+
self._setup._client = None
|
|
@@ -1,4 +1,10 @@
|
|
|
1
|
-
"""In-memory document indexer for testing.
|
|
1
|
+
"""In-memory document indexer for testing.
|
|
2
|
+
|
|
3
|
+
Note:
|
|
4
|
+
This library is tenant-agnostic. Multi-tenancy is achieved through index
|
|
5
|
+
isolation (e.g., `knowledge-{account_id}`). The memory indexer does not
|
|
6
|
+
include tenant filtering logic - use separate index names per tenant.
|
|
7
|
+
"""
|
|
2
8
|
|
|
3
9
|
from __future__ import annotations
|
|
4
10
|
|
|
@@ -14,6 +20,9 @@ from gnosisllm_knowledge.core.domain.result import BatchResult, IndexResult
|
|
|
14
20
|
class MemoryIndexer:
|
|
15
21
|
"""In-memory document indexer for testing.
|
|
16
22
|
|
|
23
|
+
This indexer is tenant-agnostic. Multi-tenancy is achieved through index
|
|
24
|
+
isolation by using tenant-specific index names.
|
|
25
|
+
|
|
17
26
|
Stores documents in a dictionary for fast testing without
|
|
18
27
|
requiring an external OpenSearch instance.
|
|
19
28
|
|
|
@@ -185,6 +194,22 @@ class MemoryIndexer:
|
|
|
185
194
|
"""
|
|
186
195
|
return await self.index(document, index_name)
|
|
187
196
|
|
|
197
|
+
async def get(
|
|
198
|
+
self,
|
|
199
|
+
doc_id: str,
|
|
200
|
+
index_name: str,
|
|
201
|
+
) -> dict[str, Any] | None:
|
|
202
|
+
"""Get a document by ID (async interface).
|
|
203
|
+
|
|
204
|
+
Args:
|
|
205
|
+
doc_id: Document ID.
|
|
206
|
+
index_name: Index name.
|
|
207
|
+
|
|
208
|
+
Returns:
|
|
209
|
+
Document dictionary or None if not found.
|
|
210
|
+
"""
|
|
211
|
+
return self._indices.get(index_name, {}).get(doc_id)
|
|
212
|
+
|
|
188
213
|
async def delete(
|
|
189
214
|
self,
|
|
190
215
|
doc_id: str,
|
|
@@ -365,8 +390,8 @@ class MemoryIndexer:
|
|
|
365
390
|
"url": document.url,
|
|
366
391
|
"title": document.title,
|
|
367
392
|
"source": document.source,
|
|
368
|
-
"account_id": document.account_id,
|
|
369
393
|
"collection_id": document.collection_id,
|
|
394
|
+
"collection_name": document.collection_name,
|
|
370
395
|
"source_id": document.source_id,
|
|
371
396
|
"chunk_index": document.chunk_index,
|
|
372
397
|
"total_chunks": document.total_chunks,
|