gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/api/knowledge.py +233 -35
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +132 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
- gnosisllm_knowledge/backends/opensearch/config.py +7 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
- gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
- gnosisllm_knowledge/cli/app.py +58 -19
- gnosisllm_knowledge/cli/commands/agentic.py +15 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +10 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +25 -1
- gnosisllm_knowledge/cli/utils/config.py +4 -4
- gnosisllm_knowledge/core/domain/__init__.py +13 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +14 -19
- gnosisllm_knowledge/core/domain/search.py +10 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +122 -5
- gnosisllm_knowledge/core/exceptions.py +93 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/services/indexing.py +51 -21
- gnosisllm_knowledge/services/search.py +42 -28
- gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
- gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0
|
@@ -9,7 +9,11 @@ from typing import Any
|
|
|
9
9
|
from gnosisllm_knowledge.core.events.emitter import EventEmitter
|
|
10
10
|
from gnosisllm_knowledge.core.interfaces.chunker import ITextChunker
|
|
11
11
|
from gnosisllm_knowledge.core.interfaces.fetcher import IContentFetcher
|
|
12
|
+
from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
|
|
13
|
+
from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
|
|
14
|
+
from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
|
|
12
15
|
from gnosisllm_knowledge.loaders.base import BaseLoader
|
|
16
|
+
from gnosisllm_knowledge.loaders.discovery import DiscoveryLoader
|
|
13
17
|
from gnosisllm_knowledge.loaders.sitemap import SitemapLoader
|
|
14
18
|
from gnosisllm_knowledge.loaders.website import WebsiteLoader
|
|
15
19
|
|
|
@@ -20,6 +24,43 @@ LoaderCreator = Callable[
|
|
|
20
24
|
]
|
|
21
25
|
|
|
22
26
|
|
|
27
|
+
def _create_discovery_loader(
|
|
28
|
+
fetcher: IContentFetcher,
|
|
29
|
+
chunker: ITextChunker,
|
|
30
|
+
config: dict[str, Any] | None,
|
|
31
|
+
event_emitter: EventEmitter | None,
|
|
32
|
+
) -> DiscoveryLoader:
|
|
33
|
+
"""Factory function for creating DiscoveryLoader instances.
|
|
34
|
+
|
|
35
|
+
Creates a DiscoveryLoader with a NeoreaderDiscoveryClient. If the fetcher
|
|
36
|
+
is a NeoreaderContentFetcher, reuses its config to ensure consistency.
|
|
37
|
+
Otherwise, creates config from environment variables.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
fetcher: Content fetcher for retrieving URL content.
|
|
41
|
+
chunker: Text chunker for splitting content.
|
|
42
|
+
config: Optional configuration dictionary.
|
|
43
|
+
event_emitter: Optional event emitter for progress events.
|
|
44
|
+
|
|
45
|
+
Returns:
|
|
46
|
+
Configured DiscoveryLoader instance.
|
|
47
|
+
"""
|
|
48
|
+
# Get config from fetcher if it's NeoreaderContentFetcher, otherwise use env
|
|
49
|
+
if isinstance(fetcher, NeoreaderContentFetcher):
|
|
50
|
+
neoreader_config = fetcher.config
|
|
51
|
+
else:
|
|
52
|
+
neoreader_config = NeoreaderConfig.from_env()
|
|
53
|
+
|
|
54
|
+
discovery_client = NeoreaderDiscoveryClient(neoreader_config)
|
|
55
|
+
return DiscoveryLoader(
|
|
56
|
+
fetcher=fetcher,
|
|
57
|
+
chunker=chunker,
|
|
58
|
+
discovery_client=discovery_client,
|
|
59
|
+
config=config,
|
|
60
|
+
event_emitter=event_emitter,
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
|
|
23
64
|
class LoaderFactory:
|
|
24
65
|
"""Factory for creating content loaders (Registry Pattern).
|
|
25
66
|
|
|
@@ -29,6 +70,7 @@ class LoaderFactory:
|
|
|
29
70
|
Built-in loaders:
|
|
30
71
|
- website: Single URL loading
|
|
31
72
|
- sitemap: Sitemap XML with recursive discovery
|
|
73
|
+
- discovery: Website crawling via Neo Reader Discovery API
|
|
32
74
|
|
|
33
75
|
Example:
|
|
34
76
|
```python
|
|
@@ -40,6 +82,9 @@ class LoaderFactory:
|
|
|
40
82
|
# Explicit type
|
|
41
83
|
loader = factory.create("sitemap", config={"max_urls": 500})
|
|
42
84
|
|
|
85
|
+
# Discovery loader for full website crawling
|
|
86
|
+
loader = factory.create("discovery", config={"max_depth": 3, "max_pages": 100})
|
|
87
|
+
|
|
43
88
|
# Register custom loader
|
|
44
89
|
factory.register("custom", MyCustomLoader)
|
|
45
90
|
```
|
|
@@ -76,6 +121,7 @@ class LoaderFactory:
|
|
|
76
121
|
"""Register built-in loader types."""
|
|
77
122
|
self.register("website", lambda f, c, cfg, e: WebsiteLoader(f, c, cfg, e))
|
|
78
123
|
self.register("sitemap", lambda f, c, cfg, e: SitemapLoader(f, c, cfg, e))
|
|
124
|
+
self.register("discovery", _create_discovery_loader)
|
|
79
125
|
|
|
80
126
|
def register(self, name: str, creator: LoaderCreator) -> None:
|
|
81
127
|
"""Register a loader type.
|
|
@@ -1,4 +1,13 @@
|
|
|
1
|
-
"""Knowledge indexing service.
|
|
1
|
+
"""Knowledge indexing service.
|
|
2
|
+
|
|
3
|
+
This service orchestrates the document ingestion pipeline from source to index,
|
|
4
|
+
including loading, chunking, and indexing.
|
|
5
|
+
|
|
6
|
+
Note:
|
|
7
|
+
This service is tenant-agnostic. Multi-tenancy should be handled at the
|
|
8
|
+
API layer by using separate indices per account (e.g.,
|
|
9
|
+
`knowledge-{account_id}`) rather than filtering by account_id.
|
|
10
|
+
"""
|
|
2
11
|
|
|
3
12
|
from __future__ import annotations
|
|
4
13
|
|
|
@@ -82,7 +91,6 @@ class KnowledgeIndexingService:
|
|
|
82
91
|
source: str,
|
|
83
92
|
index_name: str,
|
|
84
93
|
*,
|
|
85
|
-
account_id: str | None = None,
|
|
86
94
|
collection_id: str | None = None,
|
|
87
95
|
source_id: str | None = None,
|
|
88
96
|
batch_size: int = 100,
|
|
@@ -93,10 +101,13 @@ class KnowledgeIndexingService:
|
|
|
93
101
|
Uses streaming to process and index documents as they're fetched,
|
|
94
102
|
avoiding memory issues with large sitemaps.
|
|
95
103
|
|
|
104
|
+
Note:
|
|
105
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
106
|
+
at the API layer by using separate indices per account.
|
|
107
|
+
|
|
96
108
|
Args:
|
|
97
109
|
source: Source URL or path.
|
|
98
|
-
index_name: Target index name.
|
|
99
|
-
account_id: Account ID for multi-tenancy.
|
|
110
|
+
index_name: Target index name (use tenant-specific name for isolation).
|
|
100
111
|
collection_id: Collection ID.
|
|
101
112
|
source_id: Source ID (auto-generated if not provided).
|
|
102
113
|
batch_size: Documents per batch for indexing.
|
|
@@ -108,6 +119,16 @@ class KnowledgeIndexingService:
|
|
|
108
119
|
source_id = source_id or str(uuid.uuid4())
|
|
109
120
|
document_defaults = options.pop("document_defaults", {})
|
|
110
121
|
|
|
122
|
+
# Extract metadata from document_defaults to merge with doc.metadata later
|
|
123
|
+
# This allows callers to pass custom metadata (e.g., parent_collection_id)
|
|
124
|
+
# without conflicting with the explicit metadata= parameter
|
|
125
|
+
extra_metadata = document_defaults.pop("metadata", {})
|
|
126
|
+
|
|
127
|
+
# Ensure index exists with correct mappings before indexing
|
|
128
|
+
# This prevents OpenSearch from auto-creating the index with dynamic mapping
|
|
129
|
+
# which would map keyword fields (like collection_id) as text fields
|
|
130
|
+
await self._indexer.ensure_index(index_name)
|
|
131
|
+
|
|
111
132
|
# Emit batch started event
|
|
112
133
|
await self._events.emit_async(
|
|
113
134
|
BatchStartedEvent(
|
|
@@ -127,21 +148,22 @@ class KnowledgeIndexingService:
|
|
|
127
148
|
# Stream documents and index in batches as they arrive
|
|
128
149
|
# Note: Loader already chunks content, so we don't re-chunk here
|
|
129
150
|
async for doc in self._loader.load_streaming(source, **options):
|
|
130
|
-
# Enrich document with
|
|
151
|
+
# Enrich document with collection info
|
|
152
|
+
# Merge doc.metadata with extra_metadata from document_defaults
|
|
153
|
+
merged_metadata = {**doc.metadata, **extra_metadata}
|
|
131
154
|
enriched_doc = Document(
|
|
132
155
|
content=doc.content,
|
|
133
156
|
source=source,
|
|
134
157
|
doc_id=doc.doc_id,
|
|
135
158
|
url=doc.url,
|
|
136
159
|
title=doc.title,
|
|
137
|
-
account_id=account_id,
|
|
138
160
|
collection_id=collection_id,
|
|
139
161
|
source_id=source_id,
|
|
140
162
|
chunk_index=doc.chunk_index,
|
|
141
163
|
total_chunks=doc.total_chunks,
|
|
142
164
|
parent_doc_id=doc.parent_doc_id,
|
|
143
165
|
status=DocumentStatus.INDEXED,
|
|
144
|
-
metadata=
|
|
166
|
+
metadata=merged_metadata,
|
|
145
167
|
**document_defaults,
|
|
146
168
|
)
|
|
147
169
|
|
|
@@ -213,6 +235,9 @@ class KnowledgeIndexingService:
|
|
|
213
235
|
Returns:
|
|
214
236
|
Index result.
|
|
215
237
|
"""
|
|
238
|
+
# Ensure index exists with correct mappings before indexing
|
|
239
|
+
await self._indexer.ensure_index(index_name)
|
|
240
|
+
|
|
216
241
|
total_indexed = 0
|
|
217
242
|
total_failed = 0
|
|
218
243
|
errors: list[str] = []
|
|
@@ -230,8 +255,8 @@ class KnowledgeIndexingService:
|
|
|
230
255
|
doc_id=f"{doc.doc_id}-chunk-{i}",
|
|
231
256
|
url=doc.url,
|
|
232
257
|
title=doc.title,
|
|
233
|
-
account_id=doc.account_id,
|
|
234
258
|
collection_id=doc.collection_id,
|
|
259
|
+
collection_name=doc.collection_name,
|
|
235
260
|
source_id=doc.source_id,
|
|
236
261
|
chunk_index=i,
|
|
237
262
|
total_chunks=len(chunks),
|
|
@@ -271,14 +296,16 @@ class KnowledgeIndexingService:
|
|
|
271
296
|
self,
|
|
272
297
|
source_id: str,
|
|
273
298
|
index_name: str,
|
|
274
|
-
account_id: str | None = None,
|
|
275
299
|
) -> int:
|
|
276
300
|
"""Delete all documents from a source.
|
|
277
301
|
|
|
302
|
+
Note:
|
|
303
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
304
|
+
at the API layer by using separate indices per account.
|
|
305
|
+
|
|
278
306
|
Args:
|
|
279
307
|
source_id: Source ID to delete.
|
|
280
|
-
index_name: Index name.
|
|
281
|
-
account_id: Optional account filter.
|
|
308
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
282
309
|
|
|
283
310
|
Returns:
|
|
284
311
|
Count of deleted documents.
|
|
@@ -287,21 +314,23 @@ class KnowledgeIndexingService:
|
|
|
287
314
|
build_delete_by_source_query,
|
|
288
315
|
)
|
|
289
316
|
|
|
290
|
-
query = build_delete_by_source_query(source_id
|
|
317
|
+
query = build_delete_by_source_query(source_id)
|
|
291
318
|
return await self._indexer.delete_by_query(query, index_name)
|
|
292
319
|
|
|
293
320
|
async def delete_collection(
|
|
294
321
|
self,
|
|
295
322
|
collection_id: str,
|
|
296
323
|
index_name: str,
|
|
297
|
-
account_id: str | None = None,
|
|
298
324
|
) -> int:
|
|
299
325
|
"""Delete all documents from a collection.
|
|
300
326
|
|
|
327
|
+
Note:
|
|
328
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
329
|
+
at the API layer by using separate indices per account.
|
|
330
|
+
|
|
301
331
|
Args:
|
|
302
332
|
collection_id: Collection ID to delete.
|
|
303
|
-
index_name: Index name.
|
|
304
|
-
account_id: Optional account filter.
|
|
333
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
305
334
|
|
|
306
335
|
Returns:
|
|
307
336
|
Count of deleted documents.
|
|
@@ -310,7 +339,7 @@ class KnowledgeIndexingService:
|
|
|
310
339
|
build_delete_by_collection_query,
|
|
311
340
|
)
|
|
312
341
|
|
|
313
|
-
query = build_delete_by_collection_query(collection_id
|
|
342
|
+
query = build_delete_by_collection_query(collection_id)
|
|
314
343
|
return await self._indexer.delete_by_query(query, index_name)
|
|
315
344
|
|
|
316
345
|
async def reindex_source(
|
|
@@ -319,17 +348,19 @@ class KnowledgeIndexingService:
|
|
|
319
348
|
source_id: str,
|
|
320
349
|
index_name: str,
|
|
321
350
|
*,
|
|
322
|
-
account_id: str | None = None,
|
|
323
351
|
collection_id: str | None = None,
|
|
324
352
|
**options: Any,
|
|
325
353
|
) -> IndexResult:
|
|
326
354
|
"""Reindex a source by deleting and re-loading.
|
|
327
355
|
|
|
356
|
+
Note:
|
|
357
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
358
|
+
at the API layer by using separate indices per account.
|
|
359
|
+
|
|
328
360
|
Args:
|
|
329
361
|
source: Source URL or path.
|
|
330
362
|
source_id: Existing source ID.
|
|
331
|
-
index_name: Index name.
|
|
332
|
-
account_id: Account ID.
|
|
363
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
333
364
|
collection_id: Collection ID.
|
|
334
365
|
**options: Additional options.
|
|
335
366
|
|
|
@@ -337,13 +368,12 @@ class KnowledgeIndexingService:
|
|
|
337
368
|
Index result.
|
|
338
369
|
"""
|
|
339
370
|
# Delete existing documents
|
|
340
|
-
await self.delete_source(source_id, index_name
|
|
371
|
+
await self.delete_source(source_id, index_name)
|
|
341
372
|
|
|
342
373
|
# Re-index
|
|
343
374
|
return await self.load_and_index(
|
|
344
375
|
source=source,
|
|
345
376
|
index_name=index_name,
|
|
346
|
-
account_id=account_id,
|
|
347
377
|
collection_id=collection_id,
|
|
348
378
|
source_id=source_id,
|
|
349
379
|
**options,
|
|
@@ -1,4 +1,12 @@
|
|
|
1
|
-
"""Knowledge search service.
|
|
1
|
+
"""Knowledge search service.
|
|
2
|
+
|
|
3
|
+
This service provides a high-level interface for searching knowledge documents
|
|
4
|
+
using semantic, keyword, and hybrid search modes.
|
|
5
|
+
|
|
6
|
+
Note:
|
|
7
|
+
This service is tenant-agnostic. Multi-tenancy should be handled at the
|
|
8
|
+
API layer by using separate indices per account (e.g., knowledge-{account_id}).
|
|
9
|
+
"""
|
|
2
10
|
|
|
3
11
|
from __future__ import annotations
|
|
4
12
|
|
|
@@ -70,7 +78,6 @@ class KnowledgeSearchService:
|
|
|
70
78
|
mode: SearchMode = SearchMode.HYBRID,
|
|
71
79
|
limit: int = 10,
|
|
72
80
|
offset: int = 0,
|
|
73
|
-
account_id: str | None = None,
|
|
74
81
|
collection_ids: list[str] | None = None,
|
|
75
82
|
source_ids: list[str] | None = None,
|
|
76
83
|
min_score: float | None = None,
|
|
@@ -78,13 +85,16 @@ class KnowledgeSearchService:
|
|
|
78
85
|
) -> SearchResult:
|
|
79
86
|
"""Search for knowledge documents.
|
|
80
87
|
|
|
88
|
+
Note:
|
|
89
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
90
|
+
at the API layer by using separate indices per account.
|
|
91
|
+
|
|
81
92
|
Args:
|
|
82
93
|
query: Search query text.
|
|
83
94
|
index_name: Index to search (uses default if not provided).
|
|
84
95
|
mode: Search mode (semantic, keyword, hybrid).
|
|
85
96
|
limit: Maximum results.
|
|
86
97
|
offset: Result offset for pagination.
|
|
87
|
-
account_id: Account ID for multi-tenancy.
|
|
88
98
|
collection_ids: Filter by collection IDs.
|
|
89
99
|
source_ids: Filter by source IDs.
|
|
90
100
|
min_score: Minimum score threshold.
|
|
@@ -105,7 +115,6 @@ class KnowledgeSearchService:
|
|
|
105
115
|
mode=mode,
|
|
106
116
|
limit=limit,
|
|
107
117
|
offset=offset,
|
|
108
|
-
account_id=account_id,
|
|
109
118
|
collection_ids=collection_ids,
|
|
110
119
|
source_ids=source_ids,
|
|
111
120
|
min_score=min_score,
|
|
@@ -133,17 +142,19 @@ class KnowledgeSearchService:
|
|
|
133
142
|
*,
|
|
134
143
|
index_name: str | None = None,
|
|
135
144
|
limit: int = 10,
|
|
136
|
-
account_id: str | None = None,
|
|
137
145
|
collection_ids: list[str] | None = None,
|
|
138
146
|
**options: Any,
|
|
139
147
|
) -> SearchResult:
|
|
140
148
|
"""Execute semantic (vector) search.
|
|
141
149
|
|
|
150
|
+
Note:
|
|
151
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
152
|
+
at the API layer by using separate indices per account.
|
|
153
|
+
|
|
142
154
|
Args:
|
|
143
155
|
query: Search query text.
|
|
144
156
|
index_name: Index to search.
|
|
145
157
|
limit: Maximum results.
|
|
146
|
-
account_id: Account ID for multi-tenancy.
|
|
147
158
|
collection_ids: Filter by collection IDs.
|
|
148
159
|
**options: Additional options.
|
|
149
160
|
|
|
@@ -155,7 +166,6 @@ class KnowledgeSearchService:
|
|
|
155
166
|
index_name=index_name,
|
|
156
167
|
mode=SearchMode.SEMANTIC,
|
|
157
168
|
limit=limit,
|
|
158
|
-
account_id=account_id,
|
|
159
169
|
collection_ids=collection_ids,
|
|
160
170
|
**options,
|
|
161
171
|
)
|
|
@@ -166,17 +176,19 @@ class KnowledgeSearchService:
|
|
|
166
176
|
*,
|
|
167
177
|
index_name: str | None = None,
|
|
168
178
|
limit: int = 10,
|
|
169
|
-
account_id: str | None = None,
|
|
170
179
|
collection_ids: list[str] | None = None,
|
|
171
180
|
**options: Any,
|
|
172
181
|
) -> SearchResult:
|
|
173
182
|
"""Execute keyword (BM25) search.
|
|
174
183
|
|
|
184
|
+
Note:
|
|
185
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
186
|
+
at the API layer by using separate indices per account.
|
|
187
|
+
|
|
175
188
|
Args:
|
|
176
189
|
query: Search query text.
|
|
177
190
|
index_name: Index to search.
|
|
178
191
|
limit: Maximum results.
|
|
179
|
-
account_id: Account ID for multi-tenancy.
|
|
180
192
|
collection_ids: Filter by collection IDs.
|
|
181
193
|
**options: Additional options.
|
|
182
194
|
|
|
@@ -188,7 +200,6 @@ class KnowledgeSearchService:
|
|
|
188
200
|
index_name=index_name,
|
|
189
201
|
mode=SearchMode.KEYWORD,
|
|
190
202
|
limit=limit,
|
|
191
|
-
account_id=account_id,
|
|
192
203
|
collection_ids=collection_ids,
|
|
193
204
|
**options,
|
|
194
205
|
)
|
|
@@ -199,7 +210,6 @@ class KnowledgeSearchService:
|
|
|
199
210
|
*,
|
|
200
211
|
index_name: str | None = None,
|
|
201
212
|
limit: int = 10,
|
|
202
|
-
account_id: str | None = None,
|
|
203
213
|
collection_ids: list[str] | None = None,
|
|
204
214
|
semantic_weight: float = 0.7,
|
|
205
215
|
keyword_weight: float = 0.3,
|
|
@@ -207,11 +217,14 @@ class KnowledgeSearchService:
|
|
|
207
217
|
) -> SearchResult:
|
|
208
218
|
"""Execute hybrid search (semantic + keyword).
|
|
209
219
|
|
|
220
|
+
Note:
|
|
221
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
222
|
+
at the API layer by using separate indices per account.
|
|
223
|
+
|
|
210
224
|
Args:
|
|
211
225
|
query: Search query text.
|
|
212
226
|
index_name: Index to search.
|
|
213
227
|
limit: Maximum results.
|
|
214
|
-
account_id: Account ID for multi-tenancy.
|
|
215
228
|
collection_ids: Filter by collection IDs.
|
|
216
229
|
semantic_weight: Weight for semantic score.
|
|
217
230
|
keyword_weight: Weight for keyword score.
|
|
@@ -225,7 +238,6 @@ class KnowledgeSearchService:
|
|
|
225
238
|
index_name=index_name,
|
|
226
239
|
mode=SearchMode.HYBRID,
|
|
227
240
|
limit=limit,
|
|
228
|
-
account_id=account_id,
|
|
229
241
|
collection_ids=collection_ids,
|
|
230
242
|
semantic_weight=semantic_weight,
|
|
231
243
|
keyword_weight=keyword_weight,
|
|
@@ -264,17 +276,19 @@ class KnowledgeSearchService:
|
|
|
264
276
|
index_name: str | None = None,
|
|
265
277
|
mode: SearchMode = SearchMode.HYBRID,
|
|
266
278
|
limit: int = 10,
|
|
267
|
-
account_id: str | None = None,
|
|
268
279
|
**options: Any,
|
|
269
280
|
) -> list[SearchResult]:
|
|
270
281
|
"""Execute multiple searches in parallel.
|
|
271
282
|
|
|
283
|
+
Note:
|
|
284
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
285
|
+
at the API layer by using separate indices per account.
|
|
286
|
+
|
|
272
287
|
Args:
|
|
273
288
|
queries: List of query texts.
|
|
274
289
|
index_name: Index to search.
|
|
275
290
|
mode: Search mode.
|
|
276
291
|
limit: Maximum results per query.
|
|
277
|
-
account_id: Account ID for multi-tenancy.
|
|
278
292
|
**options: Additional options.
|
|
279
293
|
|
|
280
294
|
Returns:
|
|
@@ -289,7 +303,6 @@ class KnowledgeSearchService:
|
|
|
289
303
|
text=query,
|
|
290
304
|
mode=mode,
|
|
291
305
|
limit=limit,
|
|
292
|
-
account_id=account_id,
|
|
293
306
|
)
|
|
294
307
|
for query in queries
|
|
295
308
|
]
|
|
@@ -310,15 +323,22 @@ class KnowledgeSearchService:
|
|
|
310
323
|
async def count(
|
|
311
324
|
self,
|
|
312
325
|
index_name: str | None = None,
|
|
313
|
-
account_id: str | None = None,
|
|
314
326
|
collection_id: str | None = None,
|
|
327
|
+
source_id: str | None = None,
|
|
315
328
|
) -> int:
|
|
316
329
|
"""Count documents in index.
|
|
317
330
|
|
|
331
|
+
Uses native count API instead of search for efficiency and to avoid
|
|
332
|
+
hybrid search issues with empty queries.
|
|
333
|
+
|
|
334
|
+
Note:
|
|
335
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
336
|
+
at the API layer by using separate indices per account.
|
|
337
|
+
|
|
318
338
|
Args:
|
|
319
339
|
index_name: Index to count.
|
|
320
|
-
account_id: Filter by account.
|
|
321
340
|
collection_id: Filter by collection.
|
|
341
|
+
source_id: Filter by source (for source deletion confirmation).
|
|
322
342
|
|
|
323
343
|
Returns:
|
|
324
344
|
Document count.
|
|
@@ -327,18 +347,12 @@ class KnowledgeSearchService:
|
|
|
327
347
|
if not index:
|
|
328
348
|
raise SearchError(message="No index specified")
|
|
329
349
|
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
account_id=account_id,
|
|
335
|
-
collection_ids=[collection_id] if collection_id else None,
|
|
350
|
+
return await self._searcher.count(
|
|
351
|
+
index_name=index,
|
|
352
|
+
collection_id=collection_id,
|
|
353
|
+
source_id=source_id,
|
|
336
354
|
)
|
|
337
355
|
|
|
338
|
-
# Use a simple match_all to get total count
|
|
339
|
-
result = await self._searcher.search(query, index)
|
|
340
|
-
return result.total_hits
|
|
341
|
-
|
|
342
356
|
async def get_collections(
|
|
343
357
|
self,
|
|
344
358
|
index_name: str | None = None,
|
|
@@ -2,12 +2,19 @@
|
|
|
2
2
|
|
|
3
3
|
This module provides the StreamingIndexingPipeline that orchestrates
|
|
4
4
|
the load -> index pipeline with guaranteed bounded memory usage.
|
|
5
|
+
|
|
6
|
+
Note:
|
|
7
|
+
This module is tenant-agnostic. Multi-tenancy should be handled at the
|
|
8
|
+
API layer by using separate indices per account (e.g.,
|
|
9
|
+
gnosisllm-{account_id}-knowledge) rather than filtering by account_id.
|
|
10
|
+
The account_id parameters are deprecated and will be ignored.
|
|
5
11
|
"""
|
|
6
12
|
|
|
7
13
|
from __future__ import annotations
|
|
8
14
|
|
|
9
15
|
import logging
|
|
10
16
|
import time
|
|
17
|
+
import warnings
|
|
11
18
|
from dataclasses import dataclass, field
|
|
12
19
|
from typing import TYPE_CHECKING, Any
|
|
13
20
|
|
|
@@ -141,10 +148,16 @@ class StreamingIndexingPipeline:
|
|
|
141
148
|
) -> IndexResult:
|
|
142
149
|
"""Execute the streaming pipeline.
|
|
143
150
|
|
|
151
|
+
Note:
|
|
152
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
153
|
+
at the API layer by using separate indices per account. The
|
|
154
|
+
account_id parameter is deprecated and will be ignored.
|
|
155
|
+
|
|
144
156
|
Args:
|
|
145
157
|
source: Sitemap URL.
|
|
146
158
|
index_name: Target OpenSearch index.
|
|
147
|
-
account_id:
|
|
159
|
+
account_id: Deprecated. This parameter is ignored.
|
|
160
|
+
Use index isolation (separate index per account) instead.
|
|
148
161
|
collection_id: Collection within account.
|
|
149
162
|
collection_name: Collection name for display.
|
|
150
163
|
source_id: Source identifier.
|
|
@@ -153,6 +166,19 @@ class StreamingIndexingPipeline:
|
|
|
153
166
|
Returns:
|
|
154
167
|
Aggregated index result.
|
|
155
168
|
"""
|
|
169
|
+
if account_id is not None:
|
|
170
|
+
warnings.warn(
|
|
171
|
+
"account_id parameter is deprecated and will be ignored. "
|
|
172
|
+
"Use index isolation (separate index per account) instead.",
|
|
173
|
+
DeprecationWarning,
|
|
174
|
+
stacklevel=2,
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
# Ensure index exists with correct mappings before indexing
|
|
178
|
+
# This prevents OpenSearch from auto-creating the index with dynamic mapping
|
|
179
|
+
# which would map keyword fields (like collection_id) as text fields
|
|
180
|
+
await self._indexer.ensure_index(index_name)
|
|
181
|
+
|
|
156
182
|
start_time = time.time()
|
|
157
183
|
self._progress = StreamingProgress(current_phase="starting")
|
|
158
184
|
await self._emit_progress()
|
|
@@ -167,7 +193,6 @@ class StreamingIndexingPipeline:
|
|
|
167
193
|
self._enrich_document(
|
|
168
194
|
doc,
|
|
169
195
|
source=source,
|
|
170
|
-
account_id=account_id,
|
|
171
196
|
collection_id=collection_id,
|
|
172
197
|
collection_name=collection_name,
|
|
173
198
|
source_id=source_id,
|
|
@@ -248,31 +273,44 @@ class StreamingIndexingPipeline:
|
|
|
248
273
|
self,
|
|
249
274
|
doc: Document,
|
|
250
275
|
source: str,
|
|
251
|
-
account_id: str | None,
|
|
252
276
|
collection_id: str | None,
|
|
253
277
|
collection_name: str | None,
|
|
254
278
|
source_id: str | None,
|
|
279
|
+
account_id: str | None = None,
|
|
255
280
|
) -> Document:
|
|
256
|
-
"""Add
|
|
281
|
+
"""Add source info to document.
|
|
282
|
+
|
|
283
|
+
Note:
|
|
284
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
285
|
+
at the API layer by using separate indices per account. The
|
|
286
|
+
account_id parameter is deprecated and will be ignored.
|
|
257
287
|
|
|
258
288
|
Args:
|
|
259
289
|
doc: Original document.
|
|
260
290
|
source: Source URL.
|
|
261
|
-
account_id: Account identifier.
|
|
262
291
|
collection_id: Collection identifier.
|
|
263
292
|
collection_name: Collection name for display.
|
|
264
293
|
source_id: Source identifier.
|
|
294
|
+
account_id: Deprecated. This parameter is ignored.
|
|
295
|
+
Use index isolation (separate index per account) instead.
|
|
265
296
|
|
|
266
297
|
Returns:
|
|
267
|
-
New Document with
|
|
298
|
+
New Document with source info.
|
|
268
299
|
"""
|
|
300
|
+
if account_id is not None:
|
|
301
|
+
warnings.warn(
|
|
302
|
+
"account_id parameter is deprecated and will be ignored. "
|
|
303
|
+
"Use index isolation (separate index per account) instead.",
|
|
304
|
+
DeprecationWarning,
|
|
305
|
+
stacklevel=2,
|
|
306
|
+
)
|
|
307
|
+
|
|
269
308
|
return Document(
|
|
270
309
|
content=doc.content,
|
|
271
310
|
source=source,
|
|
272
311
|
doc_id=doc.doc_id,
|
|
273
312
|
url=doc.url,
|
|
274
313
|
title=doc.title,
|
|
275
|
-
account_id=account_id,
|
|
276
314
|
collection_id=collection_id,
|
|
277
315
|
collection_name=collection_name,
|
|
278
316
|
source_id=source_id,
|