gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/__init__.py +91 -39
- gnosisllm_knowledge/api/__init__.py +3 -2
- gnosisllm_knowledge/api/knowledge.py +502 -32
- gnosisllm_knowledge/api/memory.py +966 -0
- gnosisllm_knowledge/backends/__init__.py +14 -5
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +111 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
- gnosisllm_knowledge/backends/opensearch/config.py +49 -28
- gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
- gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
- gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
- gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
- gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
- gnosisllm_knowledge/cli/app.py +436 -31
- gnosisllm_knowledge/cli/commands/agentic.py +26 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +733 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +49 -23
- gnosisllm_knowledge/cli/display/service.py +43 -0
- gnosisllm_knowledge/cli/utils/config.py +62 -4
- gnosisllm_knowledge/core/domain/__init__.py +54 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +19 -19
- gnosisllm_knowledge/core/domain/memory.py +440 -0
- gnosisllm_knowledge/core/domain/result.py +11 -3
- gnosisllm_knowledge/core/domain/search.py +12 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +198 -5
- gnosisllm_knowledge/core/exceptions.py +227 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/memory.py +524 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
- gnosisllm_knowledge/core/streaming/__init__.py +36 -0
- gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/base.py +3 -4
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/loaders/sitemap.py +129 -1
- gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
- gnosisllm_knowledge/services/indexing.py +100 -93
- gnosisllm_knowledge/services/search.py +84 -31
- gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
- gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,4 +1,13 @@
|
|
|
1
|
-
"""Knowledge indexing service.
|
|
1
|
+
"""Knowledge indexing service.
|
|
2
|
+
|
|
3
|
+
This service orchestrates the document ingestion pipeline from source to index,
|
|
4
|
+
including loading, chunking, and indexing.
|
|
5
|
+
|
|
6
|
+
Note:
|
|
7
|
+
This service is tenant-agnostic. Multi-tenancy should be handled at the
|
|
8
|
+
API layer by using separate indices per account (e.g.,
|
|
9
|
+
`knowledge-{account_id}`) rather than filtering by account_id.
|
|
10
|
+
"""
|
|
2
11
|
|
|
3
12
|
from __future__ import annotations
|
|
4
13
|
|
|
@@ -82,112 +91,107 @@ class KnowledgeIndexingService:
|
|
|
82
91
|
source: str,
|
|
83
92
|
index_name: str,
|
|
84
93
|
*,
|
|
85
|
-
account_id: str | None = None,
|
|
86
94
|
collection_id: str | None = None,
|
|
87
95
|
source_id: str | None = None,
|
|
88
96
|
batch_size: int = 100,
|
|
89
97
|
**options: Any,
|
|
90
98
|
) -> IndexResult:
|
|
91
|
-
"""Load content from source and index it.
|
|
99
|
+
"""Load content from source and index it with streaming.
|
|
100
|
+
|
|
101
|
+
Uses streaming to process and index documents as they're fetched,
|
|
102
|
+
avoiding memory issues with large sitemaps.
|
|
103
|
+
|
|
104
|
+
Note:
|
|
105
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
106
|
+
at the API layer by using separate indices per account.
|
|
92
107
|
|
|
93
108
|
Args:
|
|
94
109
|
source: Source URL or path.
|
|
95
|
-
index_name: Target index name.
|
|
96
|
-
account_id: Account ID for multi-tenancy.
|
|
110
|
+
index_name: Target index name (use tenant-specific name for isolation).
|
|
97
111
|
collection_id: Collection ID.
|
|
98
112
|
source_id: Source ID (auto-generated if not provided).
|
|
99
|
-
batch_size: Documents per batch.
|
|
113
|
+
batch_size: Documents per batch for indexing.
|
|
100
114
|
**options: Additional loader/indexer options.
|
|
101
115
|
|
|
102
116
|
Returns:
|
|
103
117
|
Index result with counts.
|
|
104
118
|
"""
|
|
105
119
|
source_id = source_id or str(uuid.uuid4())
|
|
120
|
+
document_defaults = options.pop("document_defaults", {})
|
|
106
121
|
|
|
107
122
|
# Emit batch started event
|
|
108
123
|
await self._events.emit_async(
|
|
109
|
-
EventType.BATCH_STARTED,
|
|
110
124
|
BatchStartedEvent(
|
|
111
|
-
|
|
112
|
-
|
|
125
|
+
batch_index=0,
|
|
126
|
+
batch_size=batch_size,
|
|
127
|
+
total_batches=0, # Unknown for streaming
|
|
113
128
|
),
|
|
114
129
|
)
|
|
115
130
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
131
|
+
total_indexed = 0
|
|
132
|
+
total_failed = 0
|
|
133
|
+
errors: list[str] = []
|
|
134
|
+
batch: list[Document] = []
|
|
135
|
+
batch_index = 0
|
|
119
136
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
137
|
+
try:
|
|
138
|
+
# Stream documents and index in batches as they arrive
|
|
139
|
+
# Note: Loader already chunks content, so we don't re-chunk here
|
|
140
|
+
async for doc in self._loader.load_streaming(source, **options):
|
|
141
|
+
# Enrich document with collection info
|
|
142
|
+
enriched_doc = Document(
|
|
143
|
+
content=doc.content,
|
|
144
|
+
source=source,
|
|
145
|
+
doc_id=doc.doc_id,
|
|
146
|
+
url=doc.url,
|
|
147
|
+
title=doc.title,
|
|
148
|
+
collection_id=collection_id,
|
|
149
|
+
source_id=source_id,
|
|
150
|
+
chunk_index=doc.chunk_index,
|
|
151
|
+
total_chunks=doc.total_chunks,
|
|
152
|
+
parent_doc_id=doc.parent_doc_id,
|
|
153
|
+
status=DocumentStatus.INDEXED,
|
|
154
|
+
metadata=doc.metadata,
|
|
155
|
+
**document_defaults,
|
|
124
156
|
)
|
|
125
157
|
|
|
126
|
-
|
|
127
|
-
total_indexed = 0
|
|
128
|
-
total_failed = 0
|
|
129
|
-
errors: list[str] = []
|
|
130
|
-
|
|
131
|
-
batch: list[Document] = []
|
|
158
|
+
batch.append(enriched_doc)
|
|
132
159
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
title=doc.title,
|
|
144
|
-
source=source,
|
|
145
|
-
account_id=account_id,
|
|
146
|
-
collection_id=collection_id,
|
|
147
|
-
source_id=source_id,
|
|
148
|
-
chunk_index=i,
|
|
149
|
-
total_chunks=len(chunks),
|
|
150
|
-
parent_doc_id=doc.id,
|
|
151
|
-
status=DocumentStatus.INDEXED,
|
|
152
|
-
metadata=doc.metadata,
|
|
153
|
-
)
|
|
154
|
-
|
|
155
|
-
batch.append(chunk_doc)
|
|
156
|
-
|
|
157
|
-
# Index batch when full
|
|
158
|
-
if len(batch) >= batch_size:
|
|
159
|
-
result = await self._index_batch(batch, index_name)
|
|
160
|
-
total_indexed += result.documents_indexed
|
|
161
|
-
total_failed += result.documents_failed
|
|
162
|
-
if result.errors:
|
|
163
|
-
errors.extend(result.errors)
|
|
164
|
-
batch = []
|
|
160
|
+
# Index batch when full
|
|
161
|
+
if len(batch) >= batch_size:
|
|
162
|
+
result = await self._index_batch(batch, index_name)
|
|
163
|
+
total_indexed += result.indexed_count
|
|
164
|
+
total_failed += result.failed_count
|
|
165
|
+
if result.errors:
|
|
166
|
+
errors.extend(result.errors)
|
|
167
|
+
batch = []
|
|
168
|
+
batch_index += 1
|
|
169
|
+
logger.info(f"Indexed batch {batch_index}: {total_indexed} total documents")
|
|
165
170
|
|
|
166
171
|
# Index remaining documents
|
|
167
172
|
if batch:
|
|
168
173
|
result = await self._index_batch(batch, index_name)
|
|
169
|
-
total_indexed += result.documents_indexed
|
|
170
|
-
total_failed += result.documents_failed
|
|
174
|
+
total_indexed += result.indexed_count
|
|
175
|
+
total_failed += result.failed_count
|
|
171
176
|
if result.errors:
|
|
172
177
|
errors.extend(result.errors)
|
|
173
178
|
|
|
174
179
|
# Emit batch completed event
|
|
175
180
|
await self._events.emit_async(
|
|
176
|
-
EventType.BATCH_COMPLETED,
|
|
177
181
|
BatchCompletedEvent(
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
documents_failed=total_failed,
|
|
182
|
-
success=total_failed == 0,
|
|
182
|
+
batch_index=batch_index,
|
|
183
|
+
success_count=total_indexed,
|
|
184
|
+
failure_count=total_failed,
|
|
183
185
|
),
|
|
184
186
|
)
|
|
185
187
|
|
|
188
|
+
logger.info(f"Completed indexing from {source}: {total_indexed} documents")
|
|
189
|
+
|
|
186
190
|
return IndexResult(
|
|
187
191
|
success=total_failed == 0,
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
errors=errors if errors else
|
|
192
|
+
indexed_count=total_indexed,
|
|
193
|
+
failed_count=total_failed,
|
|
194
|
+
errors=errors if errors else [],
|
|
191
195
|
)
|
|
192
196
|
|
|
193
197
|
except Exception as e:
|
|
@@ -231,17 +235,17 @@ class KnowledgeIndexingService:
|
|
|
231
235
|
|
|
232
236
|
for i, chunk_obj in enumerate(chunks):
|
|
233
237
|
chunk_doc = Document(
|
|
234
|
-
id=f"{doc.id}-chunk-{i}",
|
|
235
238
|
content=chunk_obj.content,
|
|
239
|
+
source=doc.source,
|
|
240
|
+
doc_id=f"{doc.doc_id}-chunk-{i}",
|
|
236
241
|
url=doc.url,
|
|
237
242
|
title=doc.title,
|
|
238
|
-
source=doc.source,
|
|
239
|
-
account_id=doc.account_id,
|
|
240
243
|
collection_id=doc.collection_id,
|
|
244
|
+
collection_name=doc.collection_name,
|
|
241
245
|
source_id=doc.source_id,
|
|
242
246
|
chunk_index=i,
|
|
243
247
|
total_chunks=len(chunks),
|
|
244
|
-
parent_doc_id=doc.id,
|
|
248
|
+
parent_doc_id=doc.doc_id,
|
|
245
249
|
status=DocumentStatus.INDEXED,
|
|
246
250
|
metadata=doc.metadata,
|
|
247
251
|
)
|
|
@@ -252,8 +256,8 @@ class KnowledgeIndexingService:
|
|
|
252
256
|
# Index batch when full
|
|
253
257
|
if len(batch) >= batch_size:
|
|
254
258
|
result = await self._index_batch(batch, index_name)
|
|
255
|
-
total_indexed += result.documents_indexed
|
|
256
|
-
total_failed += result.documents_failed
|
|
259
|
+
total_indexed += result.indexed_count
|
|
260
|
+
total_failed += result.failed_count
|
|
257
261
|
if result.errors:
|
|
258
262
|
errors.extend(result.errors)
|
|
259
263
|
batch = []
|
|
@@ -261,30 +265,32 @@ class KnowledgeIndexingService:
|
|
|
261
265
|
# Index remaining
|
|
262
266
|
if batch:
|
|
263
267
|
result = await self._index_batch(batch, index_name)
|
|
264
|
-
total_indexed += result.documents_indexed
|
|
265
|
-
total_failed += result.documents_failed
|
|
268
|
+
total_indexed += result.indexed_count
|
|
269
|
+
total_failed += result.failed_count
|
|
266
270
|
if result.errors:
|
|
267
271
|
errors.extend(result.errors)
|
|
268
272
|
|
|
269
273
|
return IndexResult(
|
|
270
274
|
success=total_failed == 0,
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
errors=errors if errors else
|
|
275
|
+
indexed_count=total_indexed,
|
|
276
|
+
failed_count=total_failed,
|
|
277
|
+
errors=errors if errors else [],
|
|
274
278
|
)
|
|
275
279
|
|
|
276
280
|
async def delete_source(
|
|
277
281
|
self,
|
|
278
282
|
source_id: str,
|
|
279
283
|
index_name: str,
|
|
280
|
-
account_id: str | None = None,
|
|
281
284
|
) -> int:
|
|
282
285
|
"""Delete all documents from a source.
|
|
283
286
|
|
|
287
|
+
Note:
|
|
288
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
289
|
+
at the API layer by using separate indices per account.
|
|
290
|
+
|
|
284
291
|
Args:
|
|
285
292
|
source_id: Source ID to delete.
|
|
286
|
-
index_name: Index name.
|
|
287
|
-
account_id: Optional account filter.
|
|
293
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
288
294
|
|
|
289
295
|
Returns:
|
|
290
296
|
Count of deleted documents.
|
|
@@ -293,21 +299,23 @@ class KnowledgeIndexingService:
|
|
|
293
299
|
build_delete_by_source_query,
|
|
294
300
|
)
|
|
295
301
|
|
|
296
|
-
query = build_delete_by_source_query(source_id
|
|
302
|
+
query = build_delete_by_source_query(source_id)
|
|
297
303
|
return await self._indexer.delete_by_query(query, index_name)
|
|
298
304
|
|
|
299
305
|
async def delete_collection(
|
|
300
306
|
self,
|
|
301
307
|
collection_id: str,
|
|
302
308
|
index_name: str,
|
|
303
|
-
account_id: str | None = None,
|
|
304
309
|
) -> int:
|
|
305
310
|
"""Delete all documents from a collection.
|
|
306
311
|
|
|
312
|
+
Note:
|
|
313
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
314
|
+
at the API layer by using separate indices per account.
|
|
315
|
+
|
|
307
316
|
Args:
|
|
308
317
|
collection_id: Collection ID to delete.
|
|
309
|
-
index_name: Index name.
|
|
310
|
-
account_id: Optional account filter.
|
|
318
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
311
319
|
|
|
312
320
|
Returns:
|
|
313
321
|
Count of deleted documents.
|
|
@@ -316,7 +324,7 @@ class KnowledgeIndexingService:
|
|
|
316
324
|
build_delete_by_collection_query,
|
|
317
325
|
)
|
|
318
326
|
|
|
319
|
-
query = build_delete_by_collection_query(collection_id
|
|
327
|
+
query = build_delete_by_collection_query(collection_id)
|
|
320
328
|
return await self._indexer.delete_by_query(query, index_name)
|
|
321
329
|
|
|
322
330
|
async def reindex_source(
|
|
@@ -325,17 +333,19 @@ class KnowledgeIndexingService:
|
|
|
325
333
|
source_id: str,
|
|
326
334
|
index_name: str,
|
|
327
335
|
*,
|
|
328
|
-
account_id: str | None = None,
|
|
329
336
|
collection_id: str | None = None,
|
|
330
337
|
**options: Any,
|
|
331
338
|
) -> IndexResult:
|
|
332
339
|
"""Reindex a source by deleting and re-loading.
|
|
333
340
|
|
|
341
|
+
Note:
|
|
342
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
343
|
+
at the API layer by using separate indices per account.
|
|
344
|
+
|
|
334
345
|
Args:
|
|
335
346
|
source: Source URL or path.
|
|
336
347
|
source_id: Existing source ID.
|
|
337
|
-
index_name: Index name.
|
|
338
|
-
account_id: Account ID.
|
|
348
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
339
349
|
collection_id: Collection ID.
|
|
340
350
|
**options: Additional options.
|
|
341
351
|
|
|
@@ -343,13 +353,12 @@ class KnowledgeIndexingService:
|
|
|
343
353
|
Index result.
|
|
344
354
|
"""
|
|
345
355
|
# Delete existing documents
|
|
346
|
-
await self.delete_source(source_id, index_name
|
|
356
|
+
await self.delete_source(source_id, index_name)
|
|
347
357
|
|
|
348
358
|
# Re-index
|
|
349
359
|
return await self.load_and_index(
|
|
350
360
|
source=source,
|
|
351
361
|
index_name=index_name,
|
|
352
|
-
account_id=account_id,
|
|
353
362
|
collection_id=collection_id,
|
|
354
363
|
source_id=source_id,
|
|
355
364
|
**options,
|
|
@@ -375,12 +384,10 @@ class KnowledgeIndexingService:
|
|
|
375
384
|
for doc in documents:
|
|
376
385
|
if result.success:
|
|
377
386
|
await self._events.emit_async(
|
|
378
|
-
EventType.DOCUMENT_INDEXED,
|
|
379
387
|
DocumentIndexedEvent(
|
|
380
|
-
|
|
388
|
+
doc_id=doc.doc_id,
|
|
381
389
|
index_name=index_name,
|
|
382
|
-
|
|
383
|
-
total_chunks=doc.total_chunks,
|
|
390
|
+
success=True,
|
|
384
391
|
),
|
|
385
392
|
)
|
|
386
393
|
|
|
@@ -1,4 +1,12 @@
|
|
|
1
|
-
"""Knowledge search service.
|
|
1
|
+
"""Knowledge search service.
|
|
2
|
+
|
|
3
|
+
This service provides a high-level interface for searching knowledge documents
|
|
4
|
+
using semantic, keyword, and hybrid search modes.
|
|
5
|
+
|
|
6
|
+
Note:
|
|
7
|
+
This service is tenant-agnostic. Multi-tenancy should be handled at the
|
|
8
|
+
API layer by using separate indices per account (e.g., knowledge-{account_id}).
|
|
9
|
+
"""
|
|
2
10
|
|
|
3
11
|
from __future__ import annotations
|
|
4
12
|
|
|
@@ -70,7 +78,6 @@ class KnowledgeSearchService:
|
|
|
70
78
|
mode: SearchMode = SearchMode.HYBRID,
|
|
71
79
|
limit: int = 10,
|
|
72
80
|
offset: int = 0,
|
|
73
|
-
account_id: str | None = None,
|
|
74
81
|
collection_ids: list[str] | None = None,
|
|
75
82
|
source_ids: list[str] | None = None,
|
|
76
83
|
min_score: float | None = None,
|
|
@@ -78,13 +85,16 @@ class KnowledgeSearchService:
|
|
|
78
85
|
) -> SearchResult:
|
|
79
86
|
"""Search for knowledge documents.
|
|
80
87
|
|
|
88
|
+
Note:
|
|
89
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
90
|
+
at the API layer by using separate indices per account.
|
|
91
|
+
|
|
81
92
|
Args:
|
|
82
93
|
query: Search query text.
|
|
83
94
|
index_name: Index to search (uses default if not provided).
|
|
84
95
|
mode: Search mode (semantic, keyword, hybrid).
|
|
85
96
|
limit: Maximum results.
|
|
86
97
|
offset: Result offset for pagination.
|
|
87
|
-
account_id: Account ID for multi-tenancy.
|
|
88
98
|
collection_ids: Filter by collection IDs.
|
|
89
99
|
source_ids: Filter by source IDs.
|
|
90
100
|
min_score: Minimum score threshold.
|
|
@@ -105,7 +115,6 @@ class KnowledgeSearchService:
|
|
|
105
115
|
mode=mode,
|
|
106
116
|
limit=limit,
|
|
107
117
|
offset=offset,
|
|
108
|
-
account_id=account_id,
|
|
109
118
|
collection_ids=collection_ids,
|
|
110
119
|
source_ids=source_ids,
|
|
111
120
|
min_score=min_score,
|
|
@@ -114,17 +123,8 @@ class KnowledgeSearchService:
|
|
|
114
123
|
try:
|
|
115
124
|
result = await self._searcher.search(search_query, index, **options)
|
|
116
125
|
|
|
117
|
-
# Emit search event
|
|
118
|
-
await self._events.emit_async(
|
|
119
|
-
EventType.SEARCH_COMPLETED,
|
|
120
|
-
{
|
|
121
|
-
"query": query,
|
|
122
|
-
"mode": mode.value,
|
|
123
|
-
"results_count": len(result.items),
|
|
124
|
-
"total_hits": result.total_hits,
|
|
125
|
-
"duration_ms": result.duration_ms,
|
|
126
|
-
},
|
|
127
|
-
)
|
|
126
|
+
# TODO: Emit search event when SearchCompletedEvent is defined
|
|
127
|
+
# await self._events.emit_async(SearchCompletedEvent(...))
|
|
128
128
|
|
|
129
129
|
return result
|
|
130
130
|
|
|
@@ -142,17 +142,19 @@ class KnowledgeSearchService:
|
|
|
142
142
|
*,
|
|
143
143
|
index_name: str | None = None,
|
|
144
144
|
limit: int = 10,
|
|
145
|
-
account_id: str | None = None,
|
|
146
145
|
collection_ids: list[str] | None = None,
|
|
147
146
|
**options: Any,
|
|
148
147
|
) -> SearchResult:
|
|
149
148
|
"""Execute semantic (vector) search.
|
|
150
149
|
|
|
150
|
+
Note:
|
|
151
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
152
|
+
at the API layer by using separate indices per account.
|
|
153
|
+
|
|
151
154
|
Args:
|
|
152
155
|
query: Search query text.
|
|
153
156
|
index_name: Index to search.
|
|
154
157
|
limit: Maximum results.
|
|
155
|
-
account_id: Account ID for multi-tenancy.
|
|
156
158
|
collection_ids: Filter by collection IDs.
|
|
157
159
|
**options: Additional options.
|
|
158
160
|
|
|
@@ -164,7 +166,6 @@ class KnowledgeSearchService:
|
|
|
164
166
|
index_name=index_name,
|
|
165
167
|
mode=SearchMode.SEMANTIC,
|
|
166
168
|
limit=limit,
|
|
167
|
-
account_id=account_id,
|
|
168
169
|
collection_ids=collection_ids,
|
|
169
170
|
**options,
|
|
170
171
|
)
|
|
@@ -175,17 +176,19 @@ class KnowledgeSearchService:
|
|
|
175
176
|
*,
|
|
176
177
|
index_name: str | None = None,
|
|
177
178
|
limit: int = 10,
|
|
178
|
-
account_id: str | None = None,
|
|
179
179
|
collection_ids: list[str] | None = None,
|
|
180
180
|
**options: Any,
|
|
181
181
|
) -> SearchResult:
|
|
182
182
|
"""Execute keyword (BM25) search.
|
|
183
183
|
|
|
184
|
+
Note:
|
|
185
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
186
|
+
at the API layer by using separate indices per account.
|
|
187
|
+
|
|
184
188
|
Args:
|
|
185
189
|
query: Search query text.
|
|
186
190
|
index_name: Index to search.
|
|
187
191
|
limit: Maximum results.
|
|
188
|
-
account_id: Account ID for multi-tenancy.
|
|
189
192
|
collection_ids: Filter by collection IDs.
|
|
190
193
|
**options: Additional options.
|
|
191
194
|
|
|
@@ -197,7 +200,6 @@ class KnowledgeSearchService:
|
|
|
197
200
|
index_name=index_name,
|
|
198
201
|
mode=SearchMode.KEYWORD,
|
|
199
202
|
limit=limit,
|
|
200
|
-
account_id=account_id,
|
|
201
203
|
collection_ids=collection_ids,
|
|
202
204
|
**options,
|
|
203
205
|
)
|
|
@@ -208,7 +210,6 @@ class KnowledgeSearchService:
|
|
|
208
210
|
*,
|
|
209
211
|
index_name: str | None = None,
|
|
210
212
|
limit: int = 10,
|
|
211
|
-
account_id: str | None = None,
|
|
212
213
|
collection_ids: list[str] | None = None,
|
|
213
214
|
semantic_weight: float = 0.7,
|
|
214
215
|
keyword_weight: float = 0.3,
|
|
@@ -216,11 +217,14 @@ class KnowledgeSearchService:
|
|
|
216
217
|
) -> SearchResult:
|
|
217
218
|
"""Execute hybrid search (semantic + keyword).
|
|
218
219
|
|
|
220
|
+
Note:
|
|
221
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
222
|
+
at the API layer by using separate indices per account.
|
|
223
|
+
|
|
219
224
|
Args:
|
|
220
225
|
query: Search query text.
|
|
221
226
|
index_name: Index to search.
|
|
222
227
|
limit: Maximum results.
|
|
223
|
-
account_id: Account ID for multi-tenancy.
|
|
224
228
|
collection_ids: Filter by collection IDs.
|
|
225
229
|
semantic_weight: Weight for semantic score.
|
|
226
230
|
keyword_weight: Weight for keyword score.
|
|
@@ -234,7 +238,6 @@ class KnowledgeSearchService:
|
|
|
234
238
|
index_name=index_name,
|
|
235
239
|
mode=SearchMode.HYBRID,
|
|
236
240
|
limit=limit,
|
|
237
|
-
account_id=account_id,
|
|
238
241
|
collection_ids=collection_ids,
|
|
239
242
|
semantic_weight=semantic_weight,
|
|
240
243
|
keyword_weight=keyword_weight,
|
|
@@ -273,17 +276,19 @@ class KnowledgeSearchService:
|
|
|
273
276
|
index_name: str | None = None,
|
|
274
277
|
mode: SearchMode = SearchMode.HYBRID,
|
|
275
278
|
limit: int = 10,
|
|
276
|
-
account_id: str | None = None,
|
|
277
279
|
**options: Any,
|
|
278
280
|
) -> list[SearchResult]:
|
|
279
281
|
"""Execute multiple searches in parallel.
|
|
280
282
|
|
|
283
|
+
Note:
|
|
284
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
285
|
+
at the API layer by using separate indices per account.
|
|
286
|
+
|
|
281
287
|
Args:
|
|
282
288
|
queries: List of query texts.
|
|
283
289
|
index_name: Index to search.
|
|
284
290
|
mode: Search mode.
|
|
285
291
|
limit: Maximum results per query.
|
|
286
|
-
account_id: Account ID for multi-tenancy.
|
|
287
292
|
**options: Additional options.
|
|
288
293
|
|
|
289
294
|
Returns:
|
|
@@ -298,7 +303,6 @@ class KnowledgeSearchService:
|
|
|
298
303
|
text=query,
|
|
299
304
|
mode=mode,
|
|
300
305
|
limit=limit,
|
|
301
|
-
account_id=account_id,
|
|
302
306
|
)
|
|
303
307
|
for query in queries
|
|
304
308
|
]
|
|
@@ -319,15 +323,19 @@ class KnowledgeSearchService:
|
|
|
319
323
|
async def count(
|
|
320
324
|
self,
|
|
321
325
|
index_name: str | None = None,
|
|
322
|
-
account_id: str | None = None,
|
|
323
326
|
collection_id: str | None = None,
|
|
327
|
+
source_id: str | None = None,
|
|
324
328
|
) -> int:
|
|
325
329
|
"""Count documents in index.
|
|
326
330
|
|
|
331
|
+
Note:
|
|
332
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
333
|
+
at the API layer by using separate indices per account.
|
|
334
|
+
|
|
327
335
|
Args:
|
|
328
336
|
index_name: Index to count.
|
|
329
|
-
account_id: Filter by account.
|
|
330
337
|
collection_id: Filter by collection.
|
|
338
|
+
source_id: Filter by source (for source deletion confirmation).
|
|
331
339
|
|
|
332
340
|
Returns:
|
|
333
341
|
Document count.
|
|
@@ -336,14 +344,59 @@ class KnowledgeSearchService:
|
|
|
336
344
|
if not index:
|
|
337
345
|
raise SearchError(message="No index specified")
|
|
338
346
|
|
|
339
|
-
# Build count query
|
|
347
|
+
# Build count query with optional filters
|
|
340
348
|
query = SearchQuery(
|
|
341
349
|
text="",
|
|
342
350
|
limit=0,
|
|
343
|
-
account_id=account_id,
|
|
344
351
|
collection_ids=[collection_id] if collection_id else None,
|
|
352
|
+
source_ids=[source_id] if source_id else None,
|
|
345
353
|
)
|
|
346
354
|
|
|
347
355
|
# Use a simple match_all to get total count
|
|
348
356
|
result = await self._searcher.search(query, index)
|
|
349
357
|
return result.total_hits
|
|
358
|
+
|
|
359
|
+
async def get_collections(
|
|
360
|
+
self,
|
|
361
|
+
index_name: str | None = None,
|
|
362
|
+
) -> list[dict[str, Any]]:
|
|
363
|
+
"""Get all collections with document counts.
|
|
364
|
+
|
|
365
|
+
Args:
|
|
366
|
+
index_name: Index to query (uses default if not provided).
|
|
367
|
+
|
|
368
|
+
Returns:
|
|
369
|
+
List of collections with id, name, and document_count.
|
|
370
|
+
"""
|
|
371
|
+
index = index_name or self._default_index
|
|
372
|
+
if not index:
|
|
373
|
+
logger.warning("No index specified for get_collections")
|
|
374
|
+
return []
|
|
375
|
+
|
|
376
|
+
try:
|
|
377
|
+
return await self._searcher.get_collections(index)
|
|
378
|
+
except Exception as e:
|
|
379
|
+
logger.error(f"Failed to get collections: {e}")
|
|
380
|
+
return []
|
|
381
|
+
|
|
382
|
+
async def get_stats(
|
|
383
|
+
self,
|
|
384
|
+
index_name: str | None = None,
|
|
385
|
+
) -> dict[str, Any]:
|
|
386
|
+
"""Get index statistics.
|
|
387
|
+
|
|
388
|
+
Args:
|
|
389
|
+
index_name: Index to query (uses default if not provided).
|
|
390
|
+
|
|
391
|
+
Returns:
|
|
392
|
+
Dictionary with document_count, index_name, and other stats.
|
|
393
|
+
"""
|
|
394
|
+
index = index_name or self._default_index
|
|
395
|
+
if not index:
|
|
396
|
+
return {"document_count": 0, "index_name": "", "exists": False}
|
|
397
|
+
|
|
398
|
+
try:
|
|
399
|
+
return await self._searcher.get_stats(index)
|
|
400
|
+
except Exception as e:
|
|
401
|
+
logger.error(f"Failed to get stats: {e}")
|
|
402
|
+
return {"document_count": 0, "index_name": index, "error": str(e)}
|