gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/api/knowledge.py +233 -35
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +132 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
- gnosisllm_knowledge/backends/opensearch/config.py +7 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
- gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
- gnosisllm_knowledge/cli/app.py +58 -19
- gnosisllm_knowledge/cli/commands/agentic.py +15 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +10 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +25 -1
- gnosisllm_knowledge/cli/utils/config.py +4 -4
- gnosisllm_knowledge/core/domain/__init__.py +13 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +14 -19
- gnosisllm_knowledge/core/domain/search.py +10 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +122 -5
- gnosisllm_knowledge/core/exceptions.py +93 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/services/indexing.py +51 -21
- gnosisllm_knowledge/services/search.py +42 -28
- gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
- gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0
|
@@ -1,10 +1,16 @@
|
|
|
1
|
-
"""In-memory document searcher for testing.
|
|
1
|
+
"""In-memory document searcher for testing.
|
|
2
|
+
|
|
3
|
+
Note: This module is tenant-agnostic. Multi-tenancy should be handled
|
|
4
|
+
at the API layer by using separate indices per account (e.g.,
|
|
5
|
+
gnosisllm-{account_id}-knowledge) rather than filtering by account_id.
|
|
6
|
+
"""
|
|
2
7
|
|
|
3
8
|
from __future__ import annotations
|
|
4
9
|
|
|
5
10
|
import math
|
|
6
11
|
import re
|
|
7
12
|
import time
|
|
13
|
+
import warnings
|
|
8
14
|
from typing import Any, Callable
|
|
9
15
|
|
|
10
16
|
from gnosisllm_knowledge.backends.memory.indexer import MemoryIndexer
|
|
@@ -147,7 +153,7 @@ class MemorySearcher:
|
|
|
147
153
|
|
|
148
154
|
for doc in filtered_docs:
|
|
149
155
|
content = doc.get("content", "").lower()
|
|
150
|
-
title = doc.get("title"
|
|
156
|
+
title = (doc.get("title") or "").lower()
|
|
151
157
|
|
|
152
158
|
# Simple TF scoring
|
|
153
159
|
content_score = sum(
|
|
@@ -209,7 +215,7 @@ class MemorySearcher:
|
|
|
209
215
|
for doc in filtered_docs:
|
|
210
216
|
# Keyword score
|
|
211
217
|
content = doc.get("content", "").lower()
|
|
212
|
-
title = doc.get("title"
|
|
218
|
+
title = (doc.get("title") or "").lower()
|
|
213
219
|
keyword_score = sum(content.count(term) for term in query_terms)
|
|
214
220
|
keyword_score += sum(title.count(term) for term in query_terms) * 2
|
|
215
221
|
|
|
@@ -348,6 +354,122 @@ class MemorySearcher:
|
|
|
348
354
|
results.append(result)
|
|
349
355
|
return results
|
|
350
356
|
|
|
357
|
+
async def list_documents(
|
|
358
|
+
self,
|
|
359
|
+
index_name: str,
|
|
360
|
+
*,
|
|
361
|
+
source_id: str | None = None,
|
|
362
|
+
collection_id: str | None = None,
|
|
363
|
+
limit: int = 50,
|
|
364
|
+
offset: int = 0,
|
|
365
|
+
) -> dict[str, Any]:
|
|
366
|
+
"""List documents with optional filters.
|
|
367
|
+
|
|
368
|
+
Args:
|
|
369
|
+
index_name: Index to query.
|
|
370
|
+
source_id: Optional source ID filter.
|
|
371
|
+
collection_id: Optional collection ID filter.
|
|
372
|
+
limit: Maximum documents to return.
|
|
373
|
+
offset: Number of documents to skip.
|
|
374
|
+
|
|
375
|
+
Returns:
|
|
376
|
+
Dictionary with documents, total, limit, offset.
|
|
377
|
+
"""
|
|
378
|
+
documents = self._indexer.get_all(index_name)
|
|
379
|
+
|
|
380
|
+
# Apply filters
|
|
381
|
+
if source_id:
|
|
382
|
+
documents = [d for d in documents if d.get("source_id") == source_id]
|
|
383
|
+
if collection_id:
|
|
384
|
+
documents = [d for d in documents if d.get("collection_id") == collection_id]
|
|
385
|
+
|
|
386
|
+
total = len(documents)
|
|
387
|
+
|
|
388
|
+
# Apply pagination
|
|
389
|
+
paginated = documents[offset : offset + limit]
|
|
390
|
+
|
|
391
|
+
return {
|
|
392
|
+
"documents": paginated,
|
|
393
|
+
"total": total,
|
|
394
|
+
"limit": limit,
|
|
395
|
+
"offset": offset,
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
async def count(
|
|
399
|
+
self,
|
|
400
|
+
index_name: str,
|
|
401
|
+
collection_id: str | None = None,
|
|
402
|
+
source_id: str | None = None,
|
|
403
|
+
) -> int:
|
|
404
|
+
"""Count documents in index with optional filters.
|
|
405
|
+
|
|
406
|
+
Args:
|
|
407
|
+
index_name: Index to count.
|
|
408
|
+
collection_id: Filter by collection.
|
|
409
|
+
source_id: Filter by source.
|
|
410
|
+
|
|
411
|
+
Returns:
|
|
412
|
+
Document count.
|
|
413
|
+
"""
|
|
414
|
+
# Use efficient O(1) count when no filters
|
|
415
|
+
if not collection_id and not source_id:
|
|
416
|
+
return self._indexer.count(index_name)
|
|
417
|
+
|
|
418
|
+
# With filters, iterate over index values (memory backend is for testing only)
|
|
419
|
+
index_data = self._indexer._indices.get(index_name, {})
|
|
420
|
+
count = 0
|
|
421
|
+
for doc in index_data.values():
|
|
422
|
+
if collection_id and doc.get("collection_id") != collection_id:
|
|
423
|
+
continue
|
|
424
|
+
if source_id and doc.get("source_id") != source_id:
|
|
425
|
+
continue
|
|
426
|
+
count += 1
|
|
427
|
+
|
|
428
|
+
return count
|
|
429
|
+
|
|
430
|
+
async def get_collections(self, index_name: str) -> list[dict[str, Any]]:
|
|
431
|
+
"""Get unique collections with document counts.
|
|
432
|
+
|
|
433
|
+
Args:
|
|
434
|
+
index_name: Index to query.
|
|
435
|
+
|
|
436
|
+
Returns:
|
|
437
|
+
List of collections with id, name, and document_count.
|
|
438
|
+
"""
|
|
439
|
+
documents = self._indexer.get_all(index_name)
|
|
440
|
+
collections: dict[str, dict[str, Any]] = {}
|
|
441
|
+
|
|
442
|
+
for doc in documents:
|
|
443
|
+
col_id = doc.get("collection_id")
|
|
444
|
+
if not col_id:
|
|
445
|
+
continue
|
|
446
|
+
|
|
447
|
+
if col_id not in collections:
|
|
448
|
+
collections[col_id] = {
|
|
449
|
+
"id": col_id,
|
|
450
|
+
"name": doc.get("collection_name") or col_id,
|
|
451
|
+
"document_count": 0,
|
|
452
|
+
}
|
|
453
|
+
collections[col_id]["document_count"] += 1
|
|
454
|
+
|
|
455
|
+
return list(collections.values())
|
|
456
|
+
|
|
457
|
+
async def get_stats(self, index_name: str) -> dict[str, Any]:
|
|
458
|
+
"""Get index statistics.
|
|
459
|
+
|
|
460
|
+
Args:
|
|
461
|
+
index_name: Index to query.
|
|
462
|
+
|
|
463
|
+
Returns:
|
|
464
|
+
Dictionary with document_count and index info.
|
|
465
|
+
"""
|
|
466
|
+
count = self._indexer.count(index_name)
|
|
467
|
+
return {
|
|
468
|
+
"document_count": count,
|
|
469
|
+
"index_name": index_name,
|
|
470
|
+
"exists": count > 0 or index_name in self._indexer._indices,
|
|
471
|
+
}
|
|
472
|
+
|
|
351
473
|
def _apply_filters(
|
|
352
474
|
self,
|
|
353
475
|
documents: list[dict[str, Any]],
|
|
@@ -355,6 +477,10 @@ class MemorySearcher:
|
|
|
355
477
|
) -> list[dict[str, Any]]:
|
|
356
478
|
"""Apply query filters to documents.
|
|
357
479
|
|
|
480
|
+
Note:
|
|
481
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
482
|
+
at the API layer by using separate indices per account.
|
|
483
|
+
|
|
358
484
|
Args:
|
|
359
485
|
documents: Documents to filter.
|
|
360
486
|
query: Query with filter parameters.
|
|
@@ -364,10 +490,6 @@ class MemorySearcher:
|
|
|
364
490
|
"""
|
|
365
491
|
filtered = documents
|
|
366
492
|
|
|
367
|
-
# Account filter
|
|
368
|
-
if query.account_id:
|
|
369
|
-
filtered = [d for d in filtered if d.get("account_id") == query.account_id]
|
|
370
|
-
|
|
371
493
|
# Collection filter
|
|
372
494
|
if query.collection_ids:
|
|
373
495
|
filtered = [
|
|
@@ -378,9 +500,9 @@ class MemorySearcher:
|
|
|
378
500
|
if query.source_ids:
|
|
379
501
|
filtered = [d for d in filtered if d.get("source_id") in query.source_ids]
|
|
380
502
|
|
|
381
|
-
# Custom filters
|
|
382
|
-
if query.
|
|
383
|
-
for field, value in query.
|
|
503
|
+
# Custom metadata filters
|
|
504
|
+
if query.metadata_filters:
|
|
505
|
+
for field, value in query.metadata_filters.items():
|
|
384
506
|
if isinstance(value, list):
|
|
385
507
|
filtered = [d for d in filtered if d.get(field) in value]
|
|
386
508
|
else:
|
|
@@ -2,6 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
Uses OpenSearch ML agents for AI-powered search with reasoning capabilities.
|
|
4
4
|
Supports flow agents (fast RAG) and conversational agents (multi-turn with memory).
|
|
5
|
+
|
|
6
|
+
Note:
|
|
7
|
+
This module is **tenant-agnostic**. Multi-tenancy is achieved through index isolation:
|
|
8
|
+
each tenant's data resides in a separate OpenSearch index. The caller (e.g., gnosisllm-api)
|
|
9
|
+
is responsible for constructing the appropriate index name (e.g., `knowledge-{account_id}`).
|
|
10
|
+
The library operates on the provided index without any tenant-specific filtering logic.
|
|
5
11
|
"""
|
|
6
12
|
|
|
7
13
|
from __future__ import annotations
|
|
@@ -9,7 +15,6 @@ from __future__ import annotations
|
|
|
9
15
|
import asyncio
|
|
10
16
|
import json
|
|
11
17
|
import logging
|
|
12
|
-
import uuid
|
|
13
18
|
from datetime import UTC, datetime
|
|
14
19
|
from typing import TYPE_CHECKING, Any
|
|
15
20
|
|
|
@@ -297,13 +302,15 @@ class OpenSearchAgenticSearcher:
|
|
|
297
302
|
|
|
298
303
|
async def list_conversations(
|
|
299
304
|
self,
|
|
300
|
-
account_id: str | None = None,
|
|
301
305
|
limit: int = 100,
|
|
302
306
|
) -> list[dict[str, Any]]:
|
|
303
307
|
"""List active conversations.
|
|
304
308
|
|
|
309
|
+
Note:
|
|
310
|
+
This library is tenant-agnostic. Multi-tenancy is achieved through
|
|
311
|
+
index isolation (separate index per account).
|
|
312
|
+
|
|
305
313
|
Args:
|
|
306
|
-
account_id: Filter by account (multi-tenant).
|
|
307
314
|
limit: Maximum number of conversations.
|
|
308
315
|
|
|
309
316
|
Returns:
|
|
@@ -311,8 +318,6 @@ class OpenSearchAgenticSearcher:
|
|
|
311
318
|
"""
|
|
312
319
|
try:
|
|
313
320
|
body: dict[str, Any] = {"size": limit}
|
|
314
|
-
if account_id:
|
|
315
|
-
body["query"] = {"term": {"account_id": account_id}}
|
|
316
321
|
|
|
317
322
|
response = await self._client.transport.perform_request(
|
|
318
323
|
"POST",
|
|
@@ -365,16 +370,18 @@ class OpenSearchAgenticSearcher:
|
|
|
365
370
|
async def create_conversation(
|
|
366
371
|
self,
|
|
367
372
|
name: str | None = None,
|
|
368
|
-
account_id: str | None = None,
|
|
369
373
|
) -> str | None:
|
|
370
374
|
"""Create a new conversation memory.
|
|
371
375
|
|
|
372
376
|
Uses the OpenSearch Memory API to create a conversation memory.
|
|
373
377
|
The endpoint is POST /_plugins/_ml/memory (introduced in 2.12).
|
|
374
378
|
|
|
379
|
+
Note:
|
|
380
|
+
This library is tenant-agnostic. Multi-tenancy is achieved through
|
|
381
|
+
index isolation (separate index per account).
|
|
382
|
+
|
|
375
383
|
Args:
|
|
376
384
|
name: Optional name for the conversation.
|
|
377
|
-
account_id: Optional account ID for multi-tenancy.
|
|
378
385
|
|
|
379
386
|
Returns:
|
|
380
387
|
The new conversation/memory ID, or None if creation fails.
|
|
@@ -382,8 +389,6 @@ class OpenSearchAgenticSearcher:
|
|
|
382
389
|
body: dict[str, Any] = {}
|
|
383
390
|
if name:
|
|
384
391
|
body["name"] = name
|
|
385
|
-
if account_id:
|
|
386
|
-
body["account_id"] = account_id
|
|
387
392
|
|
|
388
393
|
try:
|
|
389
394
|
# POST /_plugins/_ml/memory creates a new memory (OpenSearch 2.12+)
|
|
@@ -109,6 +109,11 @@ class OpenSearchConfig:
|
|
|
109
109
|
bulk_batch_size: int = 500
|
|
110
110
|
bulk_max_concurrent: int = 3
|
|
111
111
|
|
|
112
|
+
# === Indexing Service ===
|
|
113
|
+
# Batch size for progressive indexing during load operations
|
|
114
|
+
# Documents are indexed in batches of this size as they stream in
|
|
115
|
+
indexing_batch_size: int = 10
|
|
116
|
+
|
|
112
117
|
@property
|
|
113
118
|
def url(self) -> str:
|
|
114
119
|
"""Get the full OpenSearch URL."""
|
|
@@ -213,4 +218,6 @@ class OpenSearchConfig:
|
|
|
213
218
|
# === Bulk Indexing ===
|
|
214
219
|
bulk_batch_size=int(os.getenv("OPENSEARCH_BULK_BATCH_SIZE", "500")),
|
|
215
220
|
bulk_max_concurrent=int(os.getenv("OPENSEARCH_BULK_MAX_CONCURRENT", "3")),
|
|
221
|
+
# === Indexing Service ===
|
|
222
|
+
indexing_batch_size=int(os.getenv("GNOSISLLM_INDEXING_BATCH_SIZE", "10")),
|
|
216
223
|
)
|
|
@@ -87,13 +87,15 @@ class OpenSearchIndexer:
|
|
|
87
87
|
# Embeddings are generated by OpenSearch ingest pipeline
|
|
88
88
|
doc_body = self._prepare_document(document)
|
|
89
89
|
|
|
90
|
-
# Index the document
|
|
90
|
+
# Index the document with ingest pipeline for embedding generation
|
|
91
91
|
refresh = options.get("refresh", False)
|
|
92
|
+
pipeline = self._config.ingest_pipeline_name
|
|
92
93
|
await self._client.index(
|
|
93
94
|
index=index_name,
|
|
94
95
|
id=document.doc_id,
|
|
95
96
|
body=doc_body,
|
|
96
97
|
refresh=refresh,
|
|
98
|
+
pipeline=pipeline,
|
|
97
99
|
)
|
|
98
100
|
|
|
99
101
|
return IndexResult(
|
|
@@ -272,6 +274,43 @@ class OpenSearchIndexer:
|
|
|
272
274
|
failed_count=0,
|
|
273
275
|
)
|
|
274
276
|
|
|
277
|
+
async def get(
|
|
278
|
+
self,
|
|
279
|
+
doc_id: str,
|
|
280
|
+
index_name: str,
|
|
281
|
+
) -> dict[str, Any] | None:
|
|
282
|
+
"""Get a document by ID.
|
|
283
|
+
|
|
284
|
+
Uses OpenSearch client's direct get() API (CRUD operation, not search).
|
|
285
|
+
|
|
286
|
+
Args:
|
|
287
|
+
doc_id: Document ID to retrieve.
|
|
288
|
+
index_name: Index name.
|
|
289
|
+
|
|
290
|
+
Returns:
|
|
291
|
+
Document dict (source fields) or None if not found.
|
|
292
|
+
Excludes embeddings from response for efficiency.
|
|
293
|
+
"""
|
|
294
|
+
try:
|
|
295
|
+
response = await self._client.get(
|
|
296
|
+
index=index_name,
|
|
297
|
+
id=doc_id,
|
|
298
|
+
_source_excludes=["content_embedding"],
|
|
299
|
+
)
|
|
300
|
+
source = response.get("_source", {})
|
|
301
|
+
# Include the document ID in the response
|
|
302
|
+
source["id"] = response.get("_id", doc_id)
|
|
303
|
+
return source
|
|
304
|
+
except Exception as e:
|
|
305
|
+
if "not_found" in str(e).lower():
|
|
306
|
+
return None
|
|
307
|
+
logger.error(f"Failed to get document {doc_id}: {e}")
|
|
308
|
+
raise IndexError(
|
|
309
|
+
message=f"Failed to get document: {e}",
|
|
310
|
+
details={"document_id": doc_id},
|
|
311
|
+
cause=e,
|
|
312
|
+
) from e
|
|
313
|
+
|
|
275
314
|
async def delete(
|
|
276
315
|
self,
|
|
277
316
|
doc_id: str,
|
|
@@ -434,7 +473,9 @@ class OpenSearchIndexer:
|
|
|
434
473
|
if not actions:
|
|
435
474
|
return IndexResult(success=True, index_name=index_name, indexed_count=0, failed_count=0)
|
|
436
475
|
|
|
437
|
-
|
|
476
|
+
# Use ingest pipeline for embedding generation
|
|
477
|
+
pipeline = self._config.ingest_pipeline_name
|
|
478
|
+
response = await self._client.bulk(body=actions, pipeline=pipeline)
|
|
438
479
|
|
|
439
480
|
indexed = 0
|
|
440
481
|
failed = 0
|
|
@@ -460,6 +501,11 @@ class OpenSearchIndexer:
|
|
|
460
501
|
def _prepare_document(self, document: Document) -> dict[str, Any]:
|
|
461
502
|
"""Prepare document for indexing.
|
|
462
503
|
|
|
504
|
+
Note:
|
|
505
|
+
This library is tenant-agnostic. Multi-tenancy is achieved through index
|
|
506
|
+
isolation. Tenant information should be passed in document.metadata if
|
|
507
|
+
needed for audit purposes.
|
|
508
|
+
|
|
463
509
|
Args:
|
|
464
510
|
document: Document to prepare.
|
|
465
511
|
|
|
@@ -479,7 +525,6 @@ class OpenSearchIndexer:
|
|
|
479
525
|
"url": document.url,
|
|
480
526
|
"title": document.title,
|
|
481
527
|
"source": document.source,
|
|
482
|
-
"account_id": document.account_id,
|
|
483
528
|
"collection_id": document.collection_id,
|
|
484
529
|
"collection_name": document.collection_name,
|
|
485
530
|
"source_id": document.source_id,
|
|
@@ -1,4 +1,10 @@
|
|
|
1
|
-
"""OpenSearch index mappings for knowledge documents.
|
|
1
|
+
"""OpenSearch index mappings for knowledge documents.
|
|
2
|
+
|
|
3
|
+
Note:
|
|
4
|
+
This library is tenant-agnostic. Multi-tenancy is achieved through index
|
|
5
|
+
isolation (e.g., `knowledge-{account_id}`). Index mappings do not include
|
|
6
|
+
tenant-specific fields like account_id.
|
|
7
|
+
"""
|
|
2
8
|
|
|
3
9
|
from __future__ import annotations
|
|
4
10
|
|
|
@@ -56,8 +62,7 @@ def get_knowledge_index_mappings(config: OpenSearchConfig) -> dict[str, Any]:
|
|
|
56
62
|
"fields": {"keyword": {"type": "keyword", "ignore_above": 512}},
|
|
57
63
|
},
|
|
58
64
|
"source": {"type": "keyword"},
|
|
59
|
-
# ===
|
|
60
|
-
"account_id": {"type": "keyword"},
|
|
65
|
+
# === Collection Fields ===
|
|
61
66
|
"collection_id": {"type": "keyword"},
|
|
62
67
|
"collection_name": {"type": "keyword"}, # For aggregation display
|
|
63
68
|
"source_id": {"type": "keyword"},
|
|
@@ -129,13 +134,16 @@ def get_memory_index_settings(config: OpenSearchConfig) -> dict[str, Any]:
|
|
|
129
134
|
def get_memory_index_mappings() -> dict[str, Any]:
|
|
130
135
|
"""Get index mappings for conversation memory.
|
|
131
136
|
|
|
137
|
+
Note:
|
|
138
|
+
This library is tenant-agnostic. Multi-tenancy is achieved through index
|
|
139
|
+
isolation. Use tenant-specific index names for conversation memory.
|
|
140
|
+
|
|
132
141
|
Returns:
|
|
133
142
|
Index mappings dictionary.
|
|
134
143
|
"""
|
|
135
144
|
return {
|
|
136
145
|
"properties": {
|
|
137
146
|
"conversation_id": {"type": "keyword"},
|
|
138
|
-
"account_id": {"type": "keyword"},
|
|
139
147
|
"user_id": {"type": "keyword"},
|
|
140
148
|
"message_index": {"type": "integer"},
|
|
141
149
|
"role": {"type": "keyword"}, # user, assistant, system
|
|
@@ -2,6 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
Uses OpenSearch neural search - embeddings are generated automatically
|
|
4
4
|
via the deployed model. No Python-side embedding generation needed.
|
|
5
|
+
|
|
6
|
+
Note: This module is tenant-agnostic. Multi-tenancy should be handled
|
|
7
|
+
at the API layer by using separate indices per account (e.g.,
|
|
8
|
+
`knowledge-{account_id}`) rather than filtering by account_id.
|
|
5
9
|
"""
|
|
6
10
|
|
|
7
11
|
from __future__ import annotations
|
|
@@ -18,9 +22,13 @@ class QueryBuilder:
|
|
|
18
22
|
model handles embedding generation automatically via ingest and
|
|
19
23
|
search pipelines.
|
|
20
24
|
|
|
25
|
+
Note:
|
|
26
|
+
This builder is tenant-agnostic. Multi-tenancy should be handled
|
|
27
|
+
by using separate indices per account.
|
|
28
|
+
|
|
21
29
|
Example:
|
|
22
30
|
```python
|
|
23
|
-
query = SearchQuery(text="how to configure",
|
|
31
|
+
query = SearchQuery(text="how to configure", collection_ids=["col-1"])
|
|
24
32
|
builder = QueryBuilder(query, model_id="abc123")
|
|
25
33
|
os_query = builder.build_hybrid_query()
|
|
26
34
|
```
|
|
@@ -204,12 +212,12 @@ class QueryBuilder:
|
|
|
204
212
|
},
|
|
205
213
|
}
|
|
206
214
|
|
|
207
|
-
# Apply filters
|
|
215
|
+
# Apply filters using post_filter for hybrid queries
|
|
216
|
+
# Hybrid queries cannot be wrapped in bool - they must be top-level
|
|
208
217
|
filters = self._build_filters()
|
|
209
218
|
if filters:
|
|
210
|
-
query["query"] = {
|
|
219
|
+
query["post_filter"] = {
|
|
211
220
|
"bool": {
|
|
212
|
-
"must": [query["query"]],
|
|
213
221
|
"filter": filters,
|
|
214
222
|
}
|
|
215
223
|
}
|
|
@@ -270,15 +278,15 @@ class QueryBuilder:
|
|
|
270
278
|
def _build_filters(self) -> list[dict[str, Any]]:
|
|
271
279
|
"""Build filter clauses from query parameters.
|
|
272
280
|
|
|
281
|
+
Note:
|
|
282
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
283
|
+
at the API layer by using separate indices per account.
|
|
284
|
+
|
|
273
285
|
Returns:
|
|
274
|
-
List of filter clauses.
|
|
286
|
+
List of filter clauses for collection, source, and metadata filters.
|
|
275
287
|
"""
|
|
276
288
|
filters: list[dict[str, Any]] = []
|
|
277
289
|
|
|
278
|
-
# Multi-tenant filter (required for security)
|
|
279
|
-
if self._query.account_id:
|
|
280
|
-
filters.append({"term": {"account_id": self._query.account_id}})
|
|
281
|
-
|
|
282
290
|
# Collection filter
|
|
283
291
|
if self._query.collection_ids:
|
|
284
292
|
filters.append({"terms": {"collection_id": self._query.collection_ids}})
|
|
@@ -357,67 +365,61 @@ class QueryBuilder:
|
|
|
357
365
|
]
|
|
358
366
|
|
|
359
367
|
|
|
360
|
-
def build_delete_by_source_query(
|
|
361
|
-
source_id: str,
|
|
362
|
-
account_id: str | None = None,
|
|
363
|
-
) -> dict[str, Any]:
|
|
368
|
+
def build_delete_by_source_query(source_id: str) -> dict[str, Any]:
|
|
364
369
|
"""Build query to delete documents by source.
|
|
365
370
|
|
|
371
|
+
Note:
|
|
372
|
+
This function is tenant-agnostic. Multi-tenancy should be handled
|
|
373
|
+
at the API layer by using separate indices per account.
|
|
374
|
+
|
|
366
375
|
Args:
|
|
367
376
|
source_id: Source ID to delete.
|
|
368
|
-
account_id: Optional account filter for multi-tenancy.
|
|
369
377
|
|
|
370
378
|
Returns:
|
|
371
379
|
Delete-by-query dictionary.
|
|
372
380
|
"""
|
|
373
|
-
filters = [{"term": {"source_id": source_id}}]
|
|
374
|
-
if account_id:
|
|
375
|
-
filters.append({"term": {"account_id": account_id}})
|
|
376
|
-
|
|
377
381
|
return {
|
|
378
382
|
"query": {
|
|
379
383
|
"bool": {
|
|
380
|
-
"filter": filters,
|
|
384
|
+
"filter": [{"term": {"source_id": source_id}}],
|
|
381
385
|
}
|
|
382
386
|
}
|
|
383
387
|
}
|
|
384
388
|
|
|
385
389
|
|
|
386
|
-
def build_delete_by_collection_query(
|
|
387
|
-
collection_id: str,
|
|
388
|
-
account_id: str | None = None,
|
|
389
|
-
) -> dict[str, Any]:
|
|
390
|
+
def build_delete_by_collection_query(collection_id: str) -> dict[str, Any]:
|
|
390
391
|
"""Build query to delete documents by collection.
|
|
391
392
|
|
|
393
|
+
Note:
|
|
394
|
+
This function is tenant-agnostic. Multi-tenancy should be handled
|
|
395
|
+
at the API layer by using separate indices per account.
|
|
396
|
+
|
|
392
397
|
Args:
|
|
393
398
|
collection_id: Collection ID to delete.
|
|
394
|
-
account_id: Optional account filter for multi-tenancy.
|
|
395
399
|
|
|
396
400
|
Returns:
|
|
397
401
|
Delete-by-query dictionary.
|
|
398
402
|
"""
|
|
399
|
-
filters = [{"term": {"collection_id": collection_id}}]
|
|
400
|
-
if account_id:
|
|
401
|
-
filters.append({"term": {"account_id": account_id}})
|
|
402
|
-
|
|
403
403
|
return {
|
|
404
404
|
"query": {
|
|
405
405
|
"bool": {
|
|
406
|
-
"filter": filters,
|
|
406
|
+
"filter": [{"term": {"collection_id": collection_id}}],
|
|
407
407
|
}
|
|
408
408
|
}
|
|
409
409
|
}
|
|
410
410
|
|
|
411
411
|
|
|
412
412
|
def build_count_query(
|
|
413
|
-
account_id: str | None = None,
|
|
414
413
|
collection_id: str | None = None,
|
|
415
414
|
source_id: str | None = None,
|
|
416
415
|
) -> dict[str, Any]:
|
|
417
416
|
"""Build query to count documents.
|
|
418
417
|
|
|
418
|
+
Note:
|
|
419
|
+
This function is tenant-agnostic. Multi-tenancy should be handled
|
|
420
|
+
at the API layer by using separate indices per account.
|
|
421
|
+
|
|
419
422
|
Args:
|
|
420
|
-
account_id: Optional account filter.
|
|
421
423
|
collection_id: Optional collection filter.
|
|
422
424
|
source_id: Optional source filter.
|
|
423
425
|
|
|
@@ -426,8 +428,6 @@ def build_count_query(
|
|
|
426
428
|
"""
|
|
427
429
|
filters: list[dict[str, Any]] = []
|
|
428
430
|
|
|
429
|
-
if account_id:
|
|
430
|
-
filters.append({"term": {"account_id": account_id}})
|
|
431
431
|
if collection_id:
|
|
432
432
|
filters.append({"term": {"collection_id": collection_id}})
|
|
433
433
|
if source_id:
|
|
@@ -2,6 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
Uses OpenSearch neural search - embeddings are generated automatically
|
|
4
4
|
by the deployed ML model. No Python-side embedding generation needed.
|
|
5
|
+
|
|
6
|
+
Note: This module is tenant-agnostic. Multi-tenancy should be handled
|
|
7
|
+
at the API layer by using separate indices per account (e.g.,
|
|
8
|
+
`knowledge-{account_id}`) rather than filtering by account_id.
|
|
5
9
|
"""
|
|
6
10
|
|
|
7
11
|
from __future__ import annotations
|
|
@@ -502,11 +506,65 @@ class OpenSearchKnowledgeSearcher:
|
|
|
502
506
|
"error": str(e),
|
|
503
507
|
}
|
|
504
508
|
|
|
509
|
+
async def count(
|
|
510
|
+
self,
|
|
511
|
+
index_name: str,
|
|
512
|
+
collection_id: str | None = None,
|
|
513
|
+
source_id: str | None = None,
|
|
514
|
+
) -> int:
|
|
515
|
+
"""Count documents in index with optional filters.
|
|
516
|
+
|
|
517
|
+
Uses native _count API instead of search for efficiency and to avoid
|
|
518
|
+
hybrid search issues with empty queries.
|
|
519
|
+
|
|
520
|
+
Args:
|
|
521
|
+
index_name: Index to query.
|
|
522
|
+
collection_id: Filter by collection.
|
|
523
|
+
source_id: Filter by source.
|
|
524
|
+
|
|
525
|
+
Returns:
|
|
526
|
+
Document count.
|
|
527
|
+
"""
|
|
528
|
+
try:
|
|
529
|
+
# Check if index exists first
|
|
530
|
+
exists = await self._client.indices.exists(index=index_name)
|
|
531
|
+
if not exists:
|
|
532
|
+
logger.debug(f"Index {index_name} does not exist, returning count 0")
|
|
533
|
+
return 0
|
|
534
|
+
|
|
535
|
+
# Build query with optional filters
|
|
536
|
+
query: dict[str, Any] = {"match_all": {}}
|
|
537
|
+
|
|
538
|
+
filters = []
|
|
539
|
+
if collection_id:
|
|
540
|
+
filters.append({"term": {"collection_id": collection_id}})
|
|
541
|
+
if source_id:
|
|
542
|
+
filters.append({"term": {"source_id": source_id}})
|
|
543
|
+
|
|
544
|
+
if filters:
|
|
545
|
+
query = {"bool": {"filter": filters}}
|
|
546
|
+
|
|
547
|
+
# Use native _count API
|
|
548
|
+
response = await self._client.count(
|
|
549
|
+
index=index_name,
|
|
550
|
+
body={"query": query},
|
|
551
|
+
)
|
|
552
|
+
|
|
553
|
+
count = response.get("count", 0)
|
|
554
|
+
logger.debug(f"Count for {index_name}: {count} (collection={collection_id}, source={source_id})")
|
|
555
|
+
return count
|
|
556
|
+
|
|
557
|
+
except Exception as e:
|
|
558
|
+
logger.error(f"Failed to count documents in {index_name}: {e}")
|
|
559
|
+
raise SearchError(
|
|
560
|
+
message=f"Count failed: {e}",
|
|
561
|
+
details={"index": index_name, "collection_id": collection_id, "source_id": source_id},
|
|
562
|
+
) from e
|
|
563
|
+
|
|
505
564
|
async def list_documents(
|
|
506
565
|
self,
|
|
507
566
|
index_name: str,
|
|
508
567
|
*,
|
|
509
|
-
account_id: str | None = None,
|
|
510
568
|
source_id: str | None = None,
|
|
511
569
|
collection_id: str | None = None,
|
|
512
570
|
limit: int = 50,
|
|
@@ -514,9 +572,12 @@ class OpenSearchKnowledgeSearcher:
|
|
|
514
572
|
) -> dict[str, Any]:
|
|
515
573
|
"""List documents with optional filters.
|
|
516
574
|
|
|
575
|
+
Note:
|
|
576
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
577
|
+
at the API layer by using separate indices per account.
|
|
578
|
+
|
|
517
579
|
Args:
|
|
518
|
-
index_name: Index to query.
|
|
519
|
-
account_id: Optional account ID filter.
|
|
580
|
+
index_name: Index to query (use tenant-specific name for isolation).
|
|
520
581
|
source_id: Optional source ID filter.
|
|
521
582
|
collection_id: Optional collection ID filter.
|
|
522
583
|
limit: Maximum documents to return.
|
|
@@ -540,9 +601,6 @@ class OpenSearchKnowledgeSearcher:
|
|
|
540
601
|
# Build filter clauses
|
|
541
602
|
filters: list[dict[str, Any]] = []
|
|
542
603
|
|
|
543
|
-
if account_id:
|
|
544
|
-
filters.append({"term": {"account_id": account_id}})
|
|
545
|
-
|
|
546
604
|
if source_id:
|
|
547
605
|
filters.append({"term": {"source_id": source_id}})
|
|
548
606
|
|