gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/api/knowledge.py +225 -35
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +111 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
- gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +9 -6
- gnosisllm_knowledge/cli/app.py +58 -19
- gnosisllm_knowledge/cli/commands/agentic.py +15 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +10 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +25 -1
- gnosisllm_knowledge/cli/utils/config.py +4 -4
- gnosisllm_knowledge/core/domain/__init__.py +13 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +14 -19
- gnosisllm_knowledge/core/domain/search.py +10 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +122 -5
- gnosisllm_knowledge/core/exceptions.py +93 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/services/indexing.py +35 -20
- gnosisllm_knowledge/services/search.py +37 -20
- gnosisllm_knowledge/services/streaming_pipeline.py +39 -7
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +30 -10
- gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
|
@@ -9,7 +9,11 @@ from typing import Any
|
|
|
9
9
|
from gnosisllm_knowledge.core.events.emitter import EventEmitter
|
|
10
10
|
from gnosisllm_knowledge.core.interfaces.chunker import ITextChunker
|
|
11
11
|
from gnosisllm_knowledge.core.interfaces.fetcher import IContentFetcher
|
|
12
|
+
from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
|
|
13
|
+
from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
|
|
14
|
+
from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
|
|
12
15
|
from gnosisllm_knowledge.loaders.base import BaseLoader
|
|
16
|
+
from gnosisllm_knowledge.loaders.discovery import DiscoveryLoader
|
|
13
17
|
from gnosisllm_knowledge.loaders.sitemap import SitemapLoader
|
|
14
18
|
from gnosisllm_knowledge.loaders.website import WebsiteLoader
|
|
15
19
|
|
|
@@ -20,6 +24,43 @@ LoaderCreator = Callable[
|
|
|
20
24
|
]
|
|
21
25
|
|
|
22
26
|
|
|
27
|
+
def _create_discovery_loader(
|
|
28
|
+
fetcher: IContentFetcher,
|
|
29
|
+
chunker: ITextChunker,
|
|
30
|
+
config: dict[str, Any] | None,
|
|
31
|
+
event_emitter: EventEmitter | None,
|
|
32
|
+
) -> DiscoveryLoader:
|
|
33
|
+
"""Factory function for creating DiscoveryLoader instances.
|
|
34
|
+
|
|
35
|
+
Creates a DiscoveryLoader with a NeoreaderDiscoveryClient. If the fetcher
|
|
36
|
+
is a NeoreaderContentFetcher, reuses its config to ensure consistency.
|
|
37
|
+
Otherwise, creates config from environment variables.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
fetcher: Content fetcher for retrieving URL content.
|
|
41
|
+
chunker: Text chunker for splitting content.
|
|
42
|
+
config: Optional configuration dictionary.
|
|
43
|
+
event_emitter: Optional event emitter for progress events.
|
|
44
|
+
|
|
45
|
+
Returns:
|
|
46
|
+
Configured DiscoveryLoader instance.
|
|
47
|
+
"""
|
|
48
|
+
# Get config from fetcher if it's NeoreaderContentFetcher, otherwise use env
|
|
49
|
+
if isinstance(fetcher, NeoreaderContentFetcher):
|
|
50
|
+
neoreader_config = fetcher.config
|
|
51
|
+
else:
|
|
52
|
+
neoreader_config = NeoreaderConfig.from_env()
|
|
53
|
+
|
|
54
|
+
discovery_client = NeoreaderDiscoveryClient(neoreader_config)
|
|
55
|
+
return DiscoveryLoader(
|
|
56
|
+
fetcher=fetcher,
|
|
57
|
+
chunker=chunker,
|
|
58
|
+
discovery_client=discovery_client,
|
|
59
|
+
config=config,
|
|
60
|
+
event_emitter=event_emitter,
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
|
|
23
64
|
class LoaderFactory:
|
|
24
65
|
"""Factory for creating content loaders (Registry Pattern).
|
|
25
66
|
|
|
@@ -29,6 +70,7 @@ class LoaderFactory:
|
|
|
29
70
|
Built-in loaders:
|
|
30
71
|
- website: Single URL loading
|
|
31
72
|
- sitemap: Sitemap XML with recursive discovery
|
|
73
|
+
- discovery: Website crawling via Neo Reader Discovery API
|
|
32
74
|
|
|
33
75
|
Example:
|
|
34
76
|
```python
|
|
@@ -40,6 +82,9 @@ class LoaderFactory:
|
|
|
40
82
|
# Explicit type
|
|
41
83
|
loader = factory.create("sitemap", config={"max_urls": 500})
|
|
42
84
|
|
|
85
|
+
# Discovery loader for full website crawling
|
|
86
|
+
loader = factory.create("discovery", config={"max_depth": 3, "max_pages": 100})
|
|
87
|
+
|
|
43
88
|
# Register custom loader
|
|
44
89
|
factory.register("custom", MyCustomLoader)
|
|
45
90
|
```
|
|
@@ -76,6 +121,7 @@ class LoaderFactory:
|
|
|
76
121
|
"""Register built-in loader types."""
|
|
77
122
|
self.register("website", lambda f, c, cfg, e: WebsiteLoader(f, c, cfg, e))
|
|
78
123
|
self.register("sitemap", lambda f, c, cfg, e: SitemapLoader(f, c, cfg, e))
|
|
124
|
+
self.register("discovery", _create_discovery_loader)
|
|
79
125
|
|
|
80
126
|
def register(self, name: str, creator: LoaderCreator) -> None:
|
|
81
127
|
"""Register a loader type.
|
|
@@ -1,4 +1,13 @@
|
|
|
1
|
-
"""Knowledge indexing service.
|
|
1
|
+
"""Knowledge indexing service.
|
|
2
|
+
|
|
3
|
+
This service orchestrates the document ingestion pipeline from source to index,
|
|
4
|
+
including loading, chunking, and indexing.
|
|
5
|
+
|
|
6
|
+
Note:
|
|
7
|
+
This service is tenant-agnostic. Multi-tenancy should be handled at the
|
|
8
|
+
API layer by using separate indices per account (e.g.,
|
|
9
|
+
`knowledge-{account_id}`) rather than filtering by account_id.
|
|
10
|
+
"""
|
|
2
11
|
|
|
3
12
|
from __future__ import annotations
|
|
4
13
|
|
|
@@ -82,7 +91,6 @@ class KnowledgeIndexingService:
|
|
|
82
91
|
source: str,
|
|
83
92
|
index_name: str,
|
|
84
93
|
*,
|
|
85
|
-
account_id: str | None = None,
|
|
86
94
|
collection_id: str | None = None,
|
|
87
95
|
source_id: str | None = None,
|
|
88
96
|
batch_size: int = 100,
|
|
@@ -93,10 +101,13 @@ class KnowledgeIndexingService:
|
|
|
93
101
|
Uses streaming to process and index documents as they're fetched,
|
|
94
102
|
avoiding memory issues with large sitemaps.
|
|
95
103
|
|
|
104
|
+
Note:
|
|
105
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
106
|
+
at the API layer by using separate indices per account.
|
|
107
|
+
|
|
96
108
|
Args:
|
|
97
109
|
source: Source URL or path.
|
|
98
|
-
index_name: Target index name.
|
|
99
|
-
account_id: Account ID for multi-tenancy.
|
|
110
|
+
index_name: Target index name (use tenant-specific name for isolation).
|
|
100
111
|
collection_id: Collection ID.
|
|
101
112
|
source_id: Source ID (auto-generated if not provided).
|
|
102
113
|
batch_size: Documents per batch for indexing.
|
|
@@ -127,14 +138,13 @@ class KnowledgeIndexingService:
|
|
|
127
138
|
# Stream documents and index in batches as they arrive
|
|
128
139
|
# Note: Loader already chunks content, so we don't re-chunk here
|
|
129
140
|
async for doc in self._loader.load_streaming(source, **options):
|
|
130
|
-
# Enrich document with
|
|
141
|
+
# Enrich document with collection info
|
|
131
142
|
enriched_doc = Document(
|
|
132
143
|
content=doc.content,
|
|
133
144
|
source=source,
|
|
134
145
|
doc_id=doc.doc_id,
|
|
135
146
|
url=doc.url,
|
|
136
147
|
title=doc.title,
|
|
137
|
-
account_id=account_id,
|
|
138
148
|
collection_id=collection_id,
|
|
139
149
|
source_id=source_id,
|
|
140
150
|
chunk_index=doc.chunk_index,
|
|
@@ -230,8 +240,8 @@ class KnowledgeIndexingService:
|
|
|
230
240
|
doc_id=f"{doc.doc_id}-chunk-{i}",
|
|
231
241
|
url=doc.url,
|
|
232
242
|
title=doc.title,
|
|
233
|
-
account_id=doc.account_id,
|
|
234
243
|
collection_id=doc.collection_id,
|
|
244
|
+
collection_name=doc.collection_name,
|
|
235
245
|
source_id=doc.source_id,
|
|
236
246
|
chunk_index=i,
|
|
237
247
|
total_chunks=len(chunks),
|
|
@@ -271,14 +281,16 @@ class KnowledgeIndexingService:
|
|
|
271
281
|
self,
|
|
272
282
|
source_id: str,
|
|
273
283
|
index_name: str,
|
|
274
|
-
account_id: str | None = None,
|
|
275
284
|
) -> int:
|
|
276
285
|
"""Delete all documents from a source.
|
|
277
286
|
|
|
287
|
+
Note:
|
|
288
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
289
|
+
at the API layer by using separate indices per account.
|
|
290
|
+
|
|
278
291
|
Args:
|
|
279
292
|
source_id: Source ID to delete.
|
|
280
|
-
index_name: Index name.
|
|
281
|
-
account_id: Optional account filter.
|
|
293
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
282
294
|
|
|
283
295
|
Returns:
|
|
284
296
|
Count of deleted documents.
|
|
@@ -287,21 +299,23 @@ class KnowledgeIndexingService:
|
|
|
287
299
|
build_delete_by_source_query,
|
|
288
300
|
)
|
|
289
301
|
|
|
290
|
-
query = build_delete_by_source_query(source_id
|
|
302
|
+
query = build_delete_by_source_query(source_id)
|
|
291
303
|
return await self._indexer.delete_by_query(query, index_name)
|
|
292
304
|
|
|
293
305
|
async def delete_collection(
|
|
294
306
|
self,
|
|
295
307
|
collection_id: str,
|
|
296
308
|
index_name: str,
|
|
297
|
-
account_id: str | None = None,
|
|
298
309
|
) -> int:
|
|
299
310
|
"""Delete all documents from a collection.
|
|
300
311
|
|
|
312
|
+
Note:
|
|
313
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
314
|
+
at the API layer by using separate indices per account.
|
|
315
|
+
|
|
301
316
|
Args:
|
|
302
317
|
collection_id: Collection ID to delete.
|
|
303
|
-
index_name: Index name.
|
|
304
|
-
account_id: Optional account filter.
|
|
318
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
305
319
|
|
|
306
320
|
Returns:
|
|
307
321
|
Count of deleted documents.
|
|
@@ -310,7 +324,7 @@ class KnowledgeIndexingService:
|
|
|
310
324
|
build_delete_by_collection_query,
|
|
311
325
|
)
|
|
312
326
|
|
|
313
|
-
query = build_delete_by_collection_query(collection_id
|
|
327
|
+
query = build_delete_by_collection_query(collection_id)
|
|
314
328
|
return await self._indexer.delete_by_query(query, index_name)
|
|
315
329
|
|
|
316
330
|
async def reindex_source(
|
|
@@ -319,17 +333,19 @@ class KnowledgeIndexingService:
|
|
|
319
333
|
source_id: str,
|
|
320
334
|
index_name: str,
|
|
321
335
|
*,
|
|
322
|
-
account_id: str | None = None,
|
|
323
336
|
collection_id: str | None = None,
|
|
324
337
|
**options: Any,
|
|
325
338
|
) -> IndexResult:
|
|
326
339
|
"""Reindex a source by deleting and re-loading.
|
|
327
340
|
|
|
341
|
+
Note:
|
|
342
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
343
|
+
at the API layer by using separate indices per account.
|
|
344
|
+
|
|
328
345
|
Args:
|
|
329
346
|
source: Source URL or path.
|
|
330
347
|
source_id: Existing source ID.
|
|
331
|
-
index_name: Index name.
|
|
332
|
-
account_id: Account ID.
|
|
348
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
333
349
|
collection_id: Collection ID.
|
|
334
350
|
**options: Additional options.
|
|
335
351
|
|
|
@@ -337,13 +353,12 @@ class KnowledgeIndexingService:
|
|
|
337
353
|
Index result.
|
|
338
354
|
"""
|
|
339
355
|
# Delete existing documents
|
|
340
|
-
await self.delete_source(source_id, index_name
|
|
356
|
+
await self.delete_source(source_id, index_name)
|
|
341
357
|
|
|
342
358
|
# Re-index
|
|
343
359
|
return await self.load_and_index(
|
|
344
360
|
source=source,
|
|
345
361
|
index_name=index_name,
|
|
346
|
-
account_id=account_id,
|
|
347
362
|
collection_id=collection_id,
|
|
348
363
|
source_id=source_id,
|
|
349
364
|
**options,
|
|
@@ -1,4 +1,12 @@
|
|
|
1
|
-
"""Knowledge search service.
|
|
1
|
+
"""Knowledge search service.
|
|
2
|
+
|
|
3
|
+
This service provides a high-level interface for searching knowledge documents
|
|
4
|
+
using semantic, keyword, and hybrid search modes.
|
|
5
|
+
|
|
6
|
+
Note:
|
|
7
|
+
This service is tenant-agnostic. Multi-tenancy should be handled at the
|
|
8
|
+
API layer by using separate indices per account (e.g., knowledge-{account_id}).
|
|
9
|
+
"""
|
|
2
10
|
|
|
3
11
|
from __future__ import annotations
|
|
4
12
|
|
|
@@ -70,7 +78,6 @@ class KnowledgeSearchService:
|
|
|
70
78
|
mode: SearchMode = SearchMode.HYBRID,
|
|
71
79
|
limit: int = 10,
|
|
72
80
|
offset: int = 0,
|
|
73
|
-
account_id: str | None = None,
|
|
74
81
|
collection_ids: list[str] | None = None,
|
|
75
82
|
source_ids: list[str] | None = None,
|
|
76
83
|
min_score: float | None = None,
|
|
@@ -78,13 +85,16 @@ class KnowledgeSearchService:
|
|
|
78
85
|
) -> SearchResult:
|
|
79
86
|
"""Search for knowledge documents.
|
|
80
87
|
|
|
88
|
+
Note:
|
|
89
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
90
|
+
at the API layer by using separate indices per account.
|
|
91
|
+
|
|
81
92
|
Args:
|
|
82
93
|
query: Search query text.
|
|
83
94
|
index_name: Index to search (uses default if not provided).
|
|
84
95
|
mode: Search mode (semantic, keyword, hybrid).
|
|
85
96
|
limit: Maximum results.
|
|
86
97
|
offset: Result offset for pagination.
|
|
87
|
-
account_id: Account ID for multi-tenancy.
|
|
88
98
|
collection_ids: Filter by collection IDs.
|
|
89
99
|
source_ids: Filter by source IDs.
|
|
90
100
|
min_score: Minimum score threshold.
|
|
@@ -105,7 +115,6 @@ class KnowledgeSearchService:
|
|
|
105
115
|
mode=mode,
|
|
106
116
|
limit=limit,
|
|
107
117
|
offset=offset,
|
|
108
|
-
account_id=account_id,
|
|
109
118
|
collection_ids=collection_ids,
|
|
110
119
|
source_ids=source_ids,
|
|
111
120
|
min_score=min_score,
|
|
@@ -133,17 +142,19 @@ class KnowledgeSearchService:
|
|
|
133
142
|
*,
|
|
134
143
|
index_name: str | None = None,
|
|
135
144
|
limit: int = 10,
|
|
136
|
-
account_id: str | None = None,
|
|
137
145
|
collection_ids: list[str] | None = None,
|
|
138
146
|
**options: Any,
|
|
139
147
|
) -> SearchResult:
|
|
140
148
|
"""Execute semantic (vector) search.
|
|
141
149
|
|
|
150
|
+
Note:
|
|
151
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
152
|
+
at the API layer by using separate indices per account.
|
|
153
|
+
|
|
142
154
|
Args:
|
|
143
155
|
query: Search query text.
|
|
144
156
|
index_name: Index to search.
|
|
145
157
|
limit: Maximum results.
|
|
146
|
-
account_id: Account ID for multi-tenancy.
|
|
147
158
|
collection_ids: Filter by collection IDs.
|
|
148
159
|
**options: Additional options.
|
|
149
160
|
|
|
@@ -155,7 +166,6 @@ class KnowledgeSearchService:
|
|
|
155
166
|
index_name=index_name,
|
|
156
167
|
mode=SearchMode.SEMANTIC,
|
|
157
168
|
limit=limit,
|
|
158
|
-
account_id=account_id,
|
|
159
169
|
collection_ids=collection_ids,
|
|
160
170
|
**options,
|
|
161
171
|
)
|
|
@@ -166,17 +176,19 @@ class KnowledgeSearchService:
|
|
|
166
176
|
*,
|
|
167
177
|
index_name: str | None = None,
|
|
168
178
|
limit: int = 10,
|
|
169
|
-
account_id: str | None = None,
|
|
170
179
|
collection_ids: list[str] | None = None,
|
|
171
180
|
**options: Any,
|
|
172
181
|
) -> SearchResult:
|
|
173
182
|
"""Execute keyword (BM25) search.
|
|
174
183
|
|
|
184
|
+
Note:
|
|
185
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
186
|
+
at the API layer by using separate indices per account.
|
|
187
|
+
|
|
175
188
|
Args:
|
|
176
189
|
query: Search query text.
|
|
177
190
|
index_name: Index to search.
|
|
178
191
|
limit: Maximum results.
|
|
179
|
-
account_id: Account ID for multi-tenancy.
|
|
180
192
|
collection_ids: Filter by collection IDs.
|
|
181
193
|
**options: Additional options.
|
|
182
194
|
|
|
@@ -188,7 +200,6 @@ class KnowledgeSearchService:
|
|
|
188
200
|
index_name=index_name,
|
|
189
201
|
mode=SearchMode.KEYWORD,
|
|
190
202
|
limit=limit,
|
|
191
|
-
account_id=account_id,
|
|
192
203
|
collection_ids=collection_ids,
|
|
193
204
|
**options,
|
|
194
205
|
)
|
|
@@ -199,7 +210,6 @@ class KnowledgeSearchService:
|
|
|
199
210
|
*,
|
|
200
211
|
index_name: str | None = None,
|
|
201
212
|
limit: int = 10,
|
|
202
|
-
account_id: str | None = None,
|
|
203
213
|
collection_ids: list[str] | None = None,
|
|
204
214
|
semantic_weight: float = 0.7,
|
|
205
215
|
keyword_weight: float = 0.3,
|
|
@@ -207,11 +217,14 @@ class KnowledgeSearchService:
|
|
|
207
217
|
) -> SearchResult:
|
|
208
218
|
"""Execute hybrid search (semantic + keyword).
|
|
209
219
|
|
|
220
|
+
Note:
|
|
221
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
222
|
+
at the API layer by using separate indices per account.
|
|
223
|
+
|
|
210
224
|
Args:
|
|
211
225
|
query: Search query text.
|
|
212
226
|
index_name: Index to search.
|
|
213
227
|
limit: Maximum results.
|
|
214
|
-
account_id: Account ID for multi-tenancy.
|
|
215
228
|
collection_ids: Filter by collection IDs.
|
|
216
229
|
semantic_weight: Weight for semantic score.
|
|
217
230
|
keyword_weight: Weight for keyword score.
|
|
@@ -225,7 +238,6 @@ class KnowledgeSearchService:
|
|
|
225
238
|
index_name=index_name,
|
|
226
239
|
mode=SearchMode.HYBRID,
|
|
227
240
|
limit=limit,
|
|
228
|
-
account_id=account_id,
|
|
229
241
|
collection_ids=collection_ids,
|
|
230
242
|
semantic_weight=semantic_weight,
|
|
231
243
|
keyword_weight=keyword_weight,
|
|
@@ -264,17 +276,19 @@ class KnowledgeSearchService:
|
|
|
264
276
|
index_name: str | None = None,
|
|
265
277
|
mode: SearchMode = SearchMode.HYBRID,
|
|
266
278
|
limit: int = 10,
|
|
267
|
-
account_id: str | None = None,
|
|
268
279
|
**options: Any,
|
|
269
280
|
) -> list[SearchResult]:
|
|
270
281
|
"""Execute multiple searches in parallel.
|
|
271
282
|
|
|
283
|
+
Note:
|
|
284
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
285
|
+
at the API layer by using separate indices per account.
|
|
286
|
+
|
|
272
287
|
Args:
|
|
273
288
|
queries: List of query texts.
|
|
274
289
|
index_name: Index to search.
|
|
275
290
|
mode: Search mode.
|
|
276
291
|
limit: Maximum results per query.
|
|
277
|
-
account_id: Account ID for multi-tenancy.
|
|
278
292
|
**options: Additional options.
|
|
279
293
|
|
|
280
294
|
Returns:
|
|
@@ -289,7 +303,6 @@ class KnowledgeSearchService:
|
|
|
289
303
|
text=query,
|
|
290
304
|
mode=mode,
|
|
291
305
|
limit=limit,
|
|
292
|
-
account_id=account_id,
|
|
293
306
|
)
|
|
294
307
|
for query in queries
|
|
295
308
|
]
|
|
@@ -310,15 +323,19 @@ class KnowledgeSearchService:
|
|
|
310
323
|
async def count(
|
|
311
324
|
self,
|
|
312
325
|
index_name: str | None = None,
|
|
313
|
-
account_id: str | None = None,
|
|
314
326
|
collection_id: str | None = None,
|
|
327
|
+
source_id: str | None = None,
|
|
315
328
|
) -> int:
|
|
316
329
|
"""Count documents in index.
|
|
317
330
|
|
|
331
|
+
Note:
|
|
332
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
333
|
+
at the API layer by using separate indices per account.
|
|
334
|
+
|
|
318
335
|
Args:
|
|
319
336
|
index_name: Index to count.
|
|
320
|
-
account_id: Filter by account.
|
|
321
337
|
collection_id: Filter by collection.
|
|
338
|
+
source_id: Filter by source (e.g., to confirm that a source's documents were deleted).
|
|
322
339
|
|
|
323
340
|
Returns:
|
|
324
341
|
Document count.
|
|
@@ -327,12 +344,12 @@ class KnowledgeSearchService:
|
|
|
327
344
|
if not index:
|
|
328
345
|
raise SearchError(message="No index specified")
|
|
329
346
|
|
|
330
|
-
# Build count query
|
|
347
|
+
# Build count query with optional filters
|
|
331
348
|
query = SearchQuery(
|
|
332
349
|
text="",
|
|
333
350
|
limit=0,
|
|
334
|
-
account_id=account_id,
|
|
335
351
|
collection_ids=[collection_id] if collection_id else None,
|
|
352
|
+
source_ids=[source_id] if source_id else None,
|
|
336
353
|
)
|
|
337
354
|
|
|
338
355
|
# Use a simple match_all to get total count
|
|
@@ -2,12 +2,19 @@
|
|
|
2
2
|
|
|
3
3
|
This module provides the StreamingIndexingPipeline that orchestrates
|
|
4
4
|
the load -> index pipeline with guaranteed bounded memory usage.
|
|
5
|
+
|
|
6
|
+
Note:
|
|
7
|
+
This module is tenant-agnostic. Multi-tenancy should be handled at the
|
|
8
|
+
API layer by using separate indices per account (e.g.,
|
|
9
|
+
knowledge-{account_id}) rather than filtering by account_id.
|
|
10
|
+
The account_id parameters are deprecated and will be ignored.
|
|
5
11
|
"""
|
|
6
12
|
|
|
7
13
|
from __future__ import annotations
|
|
8
14
|
|
|
9
15
|
import logging
|
|
10
16
|
import time
|
|
17
|
+
import warnings
|
|
11
18
|
from dataclasses import dataclass, field
|
|
12
19
|
from typing import TYPE_CHECKING, Any
|
|
13
20
|
|
|
@@ -141,10 +148,16 @@ class StreamingIndexingPipeline:
|
|
|
141
148
|
) -> IndexResult:
|
|
142
149
|
"""Execute the streaming pipeline.
|
|
143
150
|
|
|
151
|
+
Note:
|
|
152
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
153
|
+
at the API layer by using separate indices per account. The
|
|
154
|
+
account_id parameter is deprecated and will be ignored.
|
|
155
|
+
|
|
144
156
|
Args:
|
|
145
157
|
source: Sitemap URL.
|
|
146
158
|
index_name: Target OpenSearch index.
|
|
147
|
-
account_id:
|
|
159
|
+
account_id: Deprecated. This parameter is ignored.
|
|
160
|
+
Use index isolation (separate index per account) instead.
|
|
148
161
|
collection_id: Collection identifier.
|
|
149
162
|
collection_name: Collection name for display.
|
|
150
163
|
source_id: Source identifier.
|
|
@@ -153,6 +166,13 @@ class StreamingIndexingPipeline:
|
|
|
153
166
|
Returns:
|
|
154
167
|
Aggregated index result.
|
|
155
168
|
"""
|
|
169
|
+
if account_id is not None:
|
|
170
|
+
warnings.warn(
|
|
171
|
+
"account_id parameter is deprecated and will be ignored. "
|
|
172
|
+
"Use index isolation (separate index per account) instead.",
|
|
173
|
+
DeprecationWarning,
|
|
174
|
+
stacklevel=2,
|
|
175
|
+
)
|
|
156
176
|
start_time = time.time()
|
|
157
177
|
self._progress = StreamingProgress(current_phase="starting")
|
|
158
178
|
await self._emit_progress()
|
|
@@ -167,7 +187,6 @@ class StreamingIndexingPipeline:
|
|
|
167
187
|
self._enrich_document(
|
|
168
188
|
doc,
|
|
169
189
|
source=source,
|
|
170
|
-
account_id=account_id,
|
|
171
190
|
collection_id=collection_id,
|
|
172
191
|
collection_name=collection_name,
|
|
173
192
|
source_id=source_id,
|
|
@@ -248,31 +267,44 @@ class StreamingIndexingPipeline:
|
|
|
248
267
|
self,
|
|
249
268
|
doc: Document,
|
|
250
269
|
source: str,
|
|
251
|
-
account_id: str | None,
|
|
252
270
|
collection_id: str | None,
|
|
253
271
|
collection_name: str | None,
|
|
254
272
|
source_id: str | None,
|
|
273
|
+
account_id: str | None = None,
|
|
255
274
|
) -> Document:
|
|
256
|
-
"""Add
|
|
275
|
+
"""Add source info to document.
|
|
276
|
+
|
|
277
|
+
Note:
|
|
278
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
279
|
+
at the API layer by using separate indices per account. The
|
|
280
|
+
account_id parameter is deprecated and will be ignored.
|
|
257
281
|
|
|
258
282
|
Args:
|
|
259
283
|
doc: Original document.
|
|
260
284
|
source: Source URL.
|
|
261
|
-
account_id: Account identifier.
|
|
262
285
|
collection_id: Collection identifier.
|
|
263
286
|
collection_name: Collection name for display.
|
|
264
287
|
source_id: Source identifier.
|
|
288
|
+
account_id: Deprecated. This parameter is ignored.
|
|
289
|
+
Use index isolation (separate index per account) instead.
|
|
265
290
|
|
|
266
291
|
Returns:
|
|
267
|
-
New Document with
|
|
292
|
+
New Document with source info.
|
|
268
293
|
"""
|
|
294
|
+
if account_id is not None:
|
|
295
|
+
warnings.warn(
|
|
296
|
+
"account_id parameter is deprecated and will be ignored. "
|
|
297
|
+
"Use index isolation (separate index per account) instead.",
|
|
298
|
+
DeprecationWarning,
|
|
299
|
+
stacklevel=2,
|
|
300
|
+
)
|
|
301
|
+
|
|
269
302
|
return Document(
|
|
270
303
|
content=doc.content,
|
|
271
304
|
source=source,
|
|
272
305
|
doc_id=doc.doc_id,
|
|
273
306
|
url=doc.url,
|
|
274
307
|
title=doc.title,
|
|
275
|
-
account_id=account_id,
|
|
276
308
|
collection_id=collection_id,
|
|
277
309
|
collection_name=collection_name,
|
|
278
310
|
source_id=source_id,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gnosisllm-knowledge
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Enterprise-grade knowledge loading, indexing, and search for Python
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: knowledge-base,rag,semantic-search,vector-search,opensearch,llm,embeddings,enterprise
|
|
@@ -46,7 +46,7 @@ Enterprise-grade knowledge loading, indexing, and semantic search library for Py
|
|
|
46
46
|
- **Multiple Loaders**: Load content from websites, sitemaps, and files
|
|
47
47
|
- **Intelligent Chunking**: Sentence-aware text splitting with configurable overlap
|
|
48
48
|
- **OpenSearch Backend**: Production-ready with k-NN vector search
|
|
49
|
-
- **Multi-Tenancy**:
|
|
49
|
+
- **Multi-Tenancy**: Index isolation for complete tenant separation (tenant-agnostic library)
|
|
50
50
|
- **Event-Driven**: Observer pattern for progress tracking and monitoring
|
|
51
51
|
- **SOLID Architecture**: Clean, maintainable, and extensible codebase
|
|
52
52
|
|
|
@@ -144,14 +144,15 @@ gnosisllm-knowledge load <URL> [OPTIONS]
|
|
|
144
144
|
|
|
145
145
|
Options:
|
|
146
146
|
--type Source type: website, sitemap (auto-detects)
|
|
147
|
-
--index Target index name (
|
|
148
|
-
--account-id Multi-tenant account ID
|
|
147
|
+
--index Target index name (e.g., knowledge-tenant-123)
|
|
149
148
|
--collection-id Collection grouping ID
|
|
150
149
|
--batch-size Documents per batch (default: 100)
|
|
151
150
|
--max-urls Max URLs from sitemap (default: 1000)
|
|
152
151
|
--dry-run Preview without indexing
|
|
153
152
|
```
|
|
154
153
|
|
|
154
|
+
Multi-tenancy is achieved through index isolation. Use `--index` with tenant-specific names (e.g., `--index knowledge-tenant-123`).
|
|
155
|
+
|
|
155
156
|
### Search
|
|
156
157
|
|
|
157
158
|
Search indexed content with multiple modes:
|
|
@@ -161,14 +162,15 @@ gnosisllm-knowledge search <QUERY> [OPTIONS]
|
|
|
161
162
|
|
|
162
163
|
Options:
|
|
163
164
|
--mode Search mode: semantic, keyword, hybrid, agentic
|
|
164
|
-
--index Index to search (
|
|
165
|
+
--index Index to search (e.g., knowledge-tenant-123)
|
|
165
166
|
--limit Max results (default: 5)
|
|
166
|
-
--account-id Filter by account
|
|
167
167
|
--collection-ids Filter by collections (comma-separated)
|
|
168
168
|
--json Output as JSON for scripting
|
|
169
169
|
--interactive Interactive search session
|
|
170
170
|
```
|
|
171
171
|
|
|
172
|
+
Multi-tenancy is achieved through index isolation. Use `--index` with tenant-specific names.
|
|
173
|
+
|
|
172
174
|
## Architecture
|
|
173
175
|
|
|
174
176
|
```
|
|
@@ -319,22 +321,40 @@ agent_body = {
|
|
|
319
321
|
|
|
320
322
|
## Multi-Tenancy
|
|
321
323
|
|
|
324
|
+
This library is **tenant-agnostic**. Multi-tenancy is achieved through **index isolation** - each tenant gets their own OpenSearch index.
|
|
325
|
+
|
|
322
326
|
```python
|
|
323
|
-
#
|
|
327
|
+
# The calling application (e.g., API) constructs tenant-specific index names
|
|
328
|
+
index_name = f"knowledge-{account_id}"
|
|
329
|
+
|
|
330
|
+
# Create Knowledge instance for the tenant
|
|
331
|
+
knowledge = Knowledge.from_opensearch(
|
|
332
|
+
host="localhost",
|
|
333
|
+
port=9200,
|
|
334
|
+
index_prefix=index_name, # knowledge-tenant-123
|
|
335
|
+
)
|
|
336
|
+
|
|
337
|
+
# Load content to tenant's isolated index
|
|
324
338
|
await knowledge.load(
|
|
325
339
|
source="https://docs.example.com/sitemap.xml",
|
|
326
|
-
account_id="tenant-123",
|
|
327
340
|
collection_id="docs",
|
|
328
341
|
)
|
|
329
342
|
|
|
330
|
-
# Search within tenant
|
|
343
|
+
# Search within tenant's index (no account_id filter needed)
|
|
331
344
|
results = await knowledge.search(
|
|
332
345
|
"query",
|
|
333
|
-
account_id="tenant-123",
|
|
334
346
|
collection_ids=["docs"],
|
|
335
347
|
)
|
|
336
348
|
```
|
|
337
349
|
|
|
350
|
+
**Note**: For audit purposes, you can store `account_id` in document metadata:
|
|
351
|
+
```python
|
|
352
|
+
await knowledge.load(
|
|
353
|
+
source="https://docs.example.com/sitemap.xml",
|
|
354
|
+
document_defaults={"metadata": {"account_id": "tenant-123"}},
|
|
355
|
+
)
|
|
356
|
+
```
|
|
357
|
+
|
|
338
358
|
## Agentic Memory
|
|
339
359
|
|
|
340
360
|
Conversational memory with automatic fact extraction using OpenSearch's ML Memory plugin.
|