gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. gnosisllm_knowledge/api/knowledge.py +233 -35
  2. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  3. gnosisllm_knowledge/backends/memory/searcher.py +132 -10
  4. gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
  5. gnosisllm_knowledge/backends/opensearch/config.py +7 -0
  6. gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
  7. gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
  8. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  9. gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
  10. gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
  11. gnosisllm_knowledge/cli/app.py +58 -19
  12. gnosisllm_knowledge/cli/commands/agentic.py +15 -9
  13. gnosisllm_knowledge/cli/commands/load.py +169 -19
  14. gnosisllm_knowledge/cli/commands/memory.py +10 -0
  15. gnosisllm_knowledge/cli/commands/search.py +9 -10
  16. gnosisllm_knowledge/cli/commands/setup.py +25 -1
  17. gnosisllm_knowledge/cli/utils/config.py +4 -4
  18. gnosisllm_knowledge/core/domain/__init__.py +13 -0
  19. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  20. gnosisllm_knowledge/core/domain/document.py +14 -19
  21. gnosisllm_knowledge/core/domain/search.py +10 -25
  22. gnosisllm_knowledge/core/domain/source.py +11 -12
  23. gnosisllm_knowledge/core/events/__init__.py +8 -0
  24. gnosisllm_knowledge/core/events/types.py +122 -5
  25. gnosisllm_knowledge/core/exceptions.py +93 -0
  26. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  27. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  28. gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
  29. gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
  30. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  31. gnosisllm_knowledge/fetchers/config.py +27 -0
  32. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  33. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  34. gnosisllm_knowledge/loaders/__init__.py +5 -1
  35. gnosisllm_knowledge/loaders/discovery.py +338 -0
  36. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  37. gnosisllm_knowledge/loaders/factory.py +46 -0
  38. gnosisllm_knowledge/services/indexing.py +51 -21
  39. gnosisllm_knowledge/services/search.py +42 -28
  40. gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
  41. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
  42. gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
  43. gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
  44. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
  45. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/backends/memory/searcher.py

@@ -1,10 +1,16 @@
-"""In-memory document searcher for testing."""
+"""In-memory document searcher for testing.
+
+Note: This module is tenant-agnostic. Multi-tenancy should be handled
+at the API layer by using separate indices per account (e.g.,
+gnosisllm-{account_id}-knowledge) rather than filtering by account_id.
+"""
 
 from __future__ import annotations
 
 import math
 import re
 import time
+import warnings
 from typing import Any, Callable
 
 from gnosisllm_knowledge.backends.memory.indexer import MemoryIndexer
@@ -147,7 +153,7 @@ class MemorySearcher:
 
         for doc in filtered_docs:
             content = doc.get("content", "").lower()
-            title = doc.get("title", "").lower()
+            title = (doc.get("title") or "").lower()
 
             # Simple TF scoring
             content_score = sum(
@@ -209,7 +215,7 @@ class MemorySearcher:
         for doc in filtered_docs:
             # Keyword score
             content = doc.get("content", "").lower()
-            title = doc.get("title", "").lower()
+            title = (doc.get("title") or "").lower()
             keyword_score = sum(content.count(term) for term in query_terms)
             keyword_score += sum(title.count(term) for term in query_terms) * 2
 
@@ -348,6 +354,122 @@ class MemorySearcher:
             results.append(result)
         return results
 
+    async def list_documents(
+        self,
+        index_name: str,
+        *,
+        source_id: str | None = None,
+        collection_id: str | None = None,
+        limit: int = 50,
+        offset: int = 0,
+    ) -> dict[str, Any]:
+        """List documents with optional filters.
+
+        Args:
+            index_name: Index to query.
+            source_id: Optional source ID filter.
+            collection_id: Optional collection ID filter.
+            limit: Maximum documents to return.
+            offset: Number of documents to skip.
+
+        Returns:
+            Dictionary with documents, total, limit, offset.
+        """
+        documents = self._indexer.get_all(index_name)
+
+        # Apply filters
+        if source_id:
+            documents = [d for d in documents if d.get("source_id") == source_id]
+        if collection_id:
+            documents = [d for d in documents if d.get("collection_id") == collection_id]
+
+        total = len(documents)
+
+        # Apply pagination
+        paginated = documents[offset : offset + limit]
+
+        return {
+            "documents": paginated,
+            "total": total,
+            "limit": limit,
+            "offset": offset,
+        }
+
+    async def count(
+        self,
+        index_name: str,
+        collection_id: str | None = None,
+        source_id: str | None = None,
+    ) -> int:
+        """Count documents in index with optional filters.
+
+        Args:
+            index_name: Index to count.
+            collection_id: Filter by collection.
+            source_id: Filter by source.
+
+        Returns:
+            Document count.
+        """
+        # Use efficient O(1) count when no filters
+        if not collection_id and not source_id:
+            return self._indexer.count(index_name)
+
+        # With filters, iterate over index values (memory backend is for testing only)
+        index_data = self._indexer._indices.get(index_name, {})
+        count = 0
+        for doc in index_data.values():
+            if collection_id and doc.get("collection_id") != collection_id:
+                continue
+            if source_id and doc.get("source_id") != source_id:
+                continue
+            count += 1
+
+        return count
+
+    async def get_collections(self, index_name: str) -> list[dict[str, Any]]:
+        """Get unique collections with document counts.
+
+        Args:
+            index_name: Index to query.
+
+        Returns:
+            List of collections with id, name, and document_count.
+        """
+        documents = self._indexer.get_all(index_name)
+        collections: dict[str, dict[str, Any]] = {}
+
+        for doc in documents:
+            col_id = doc.get("collection_id")
+            if not col_id:
+                continue
+
+            if col_id not in collections:
+                collections[col_id] = {
+                    "id": col_id,
+                    "name": doc.get("collection_name") or col_id,
+                    "document_count": 0,
+                }
+            collections[col_id]["document_count"] += 1
+
+        return list(collections.values())
+
+    async def get_stats(self, index_name: str) -> dict[str, Any]:
+        """Get index statistics.
+
+        Args:
+            index_name: Index to query.
+
+        Returns:
+            Dictionary with document_count and index info.
+        """
+        count = self._indexer.count(index_name)
+        return {
+            "document_count": count,
+            "index_name": index_name,
+            "exists": count > 0 or index_name in self._indexer._indices,
+        }
+
     def _apply_filters(
         self,
         documents: list[dict[str, Any]],
@@ -355,6 +477,10 @@
     ) -> list[dict[str, Any]]:
         """Apply query filters to documents.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             documents: Documents to filter.
             query: Query with filter parameters.
@@ -364,10 +490,6 @@
         """
         filtered = documents
 
-        # Account filter
-        if query.account_id:
-            filtered = [d for d in filtered if d.get("account_id") == query.account_id]
-
         # Collection filter
         if query.collection_ids:
             filtered = [
@@ -378,9 +500,9 @@
         if query.source_ids:
             filtered = [d for d in filtered if d.get("source_id") in query.source_ids]
 
-        # Custom filters
-        if query.filters:
-            for field, value in query.filters.items():
+        # Custom metadata filters
+        if query.metadata_filters:
+            for field, value in query.metadata_filters.items():
                 if isinstance(value, list):
                     filtered = [d for d in filtered if d.get(field) in value]
                 else:
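The memory backend now mirrors the OpenSearch backend's read APIs (`list_documents`, `count`, `get_collections`, `get_stats`), which keeps tests backend-agnostic. A minimal usage sketch; the `MemorySearcher(indexer)` wiring is an assumption based on the `self._indexer` attribute visible in these hunks, not a confirmed constructor signature:

```python
import asyncio

from gnosisllm_knowledge.backends.memory.indexer import MemoryIndexer
from gnosisllm_knowledge.backends.memory.searcher import MemorySearcher


async def main() -> None:
    indexer = MemoryIndexer()
    searcher = MemorySearcher(indexer)  # assumed wiring via self._indexer

    # Paginated listing with an optional source filter
    page = await searcher.list_documents("knowledge-test", source_id="src-1", limit=10)
    print(page["total"], len(page["documents"]))

    # O(1) count when unfiltered; a linear scan when filtered
    print(await searcher.count("knowledge-test"))
    print(await searcher.count("knowledge-test", collection_id="col-1"))

    # Collections aggregated from collection_id / collection_name fields
    for col in await searcher.get_collections("knowledge-test"):
        print(col["id"], col["name"], col["document_count"])


asyncio.run(main())
```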
gnosisllm_knowledge/backends/opensearch/agentic.py

@@ -2,6 +2,12 @@
 
 Uses OpenSearch ML agents for AI-powered search with reasoning capabilities.
 Supports flow agents (fast RAG) and conversational agents (multi-turn with memory).
+
+Note:
+    This module is **tenant-agnostic**. Multi-tenancy is achieved through index isolation:
+    each tenant's data resides in a separate OpenSearch index. The caller (e.g., gnosisllm-api)
+    is responsible for constructing the appropriate index name (e.g., `knowledge-{account_id}`).
+    The library operates on the provided index without any tenant-specific filtering logic.
 """
 
 from __future__ import annotations
@@ -9,7 +15,6 @@ from __future__ import annotations
 import asyncio
 import json
 import logging
-import uuid
 from datetime import UTC, datetime
 from typing import TYPE_CHECKING, Any
 
@@ -297,13 +302,15 @@ class OpenSearchAgenticSearcher:
 
     async def list_conversations(
         self,
-        account_id: str | None = None,
         limit: int = 100,
    ) -> list[dict[str, Any]]:
         """List active conversations.
 
+        Note:
+            This library is tenant-agnostic. Multi-tenancy is achieved through
+            index isolation (separate index per account).
+
         Args:
-            account_id: Filter by account (multi-tenant).
             limit: Maximum number of conversations.
 
         Returns:
@@ -311,8 +318,6 @@
         """
         try:
             body: dict[str, Any] = {"size": limit}
-            if account_id:
-                body["query"] = {"term": {"account_id": account_id}}
 
             response = await self._client.transport.perform_request(
                 "POST",
@@ -365,16 +370,18 @@
     async def create_conversation(
         self,
         name: str | None = None,
-        account_id: str | None = None,
    ) -> str | None:
         """Create a new conversation memory.
 
         Uses the OpenSearch Memory API to create a conversation memory.
         The endpoint is POST /_plugins/_ml/memory (introduced in 2.12).
 
+        Note:
+            This library is tenant-agnostic. Multi-tenancy is achieved through
+            index isolation (separate index per account).
+
         Args:
             name: Optional name for the conversation.
-            account_id: Optional account ID for multi-tenancy.
 
         Returns:
             The new conversation/memory ID, or None if creation fails.
@@ -382,8 +389,6 @@
         body: dict[str, Any] = {}
         if name:
             body["name"] = name
-        if account_id:
-            body["account_id"] = account_id
 
         try:
             # POST /_plugins/_ml/memory creates a new memory (OpenSearch 2.12+)
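Across these hunks the pattern is consistent: `account_id` parameters disappear and tenancy moves into the index name. A sketch of what the calling layer might do; the helper below is hypothetical and not part of this package:

```python
import re


def tenant_index(account_id: str, base: str = "knowledge") -> str:
    """Hypothetical API-layer helper: derive a per-tenant index name.

    OpenSearch index names must be lowercase and avoid characters such as
    spaces, commas, and '*', so sanitize the account ID before embedding it.
    """
    safe = re.sub(r"[^a-z0-9_-]+", "-", account_id.lower())
    return f"{base}-{safe}"


# The library then operates on the tenant's index with no account filtering:
# results = await searcher.search(query, index_name=tenant_index("Acct 42"))
```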
gnosisllm_knowledge/backends/opensearch/config.py

@@ -109,6 +109,11 @@ class OpenSearchConfig:
     bulk_batch_size: int = 500
     bulk_max_concurrent: int = 3
 
+    # === Indexing Service ===
+    # Batch size for progressive indexing during load operations.
+    # Documents are indexed in batches of this size as they stream in.
+    indexing_batch_size: int = 10
+
     @property
     def url(self) -> str:
         """Get the full OpenSearch URL."""
@@ -213,4 +218,6 @@
             # === Bulk Indexing ===
             bulk_batch_size=int(os.getenv("OPENSEARCH_BULK_BATCH_SIZE", "500")),
             bulk_max_concurrent=int(os.getenv("OPENSEARCH_BULK_MAX_CONCURRENT", "3")),
+            # === Indexing Service ===
+            indexing_batch_size=int(os.getenv("GNOSISLLM_INDEXING_BATCH_SIZE", "10")),
         )
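The new knob is also wired into the environment-based constructor, so deployments can tune it without code changes. A sketch, assuming the `os.getenv` block above lives in a factory classmethod such as `from_env()` (the exact factory name is not visible in this diff):

```python
import os

# Smaller batches surface documents in search results sooner during a
# streaming load; larger batches reduce per-request overhead.
os.environ["GNOSISLLM_INDEXING_BATCH_SIZE"] = "25"

from gnosisllm_knowledge.backends.opensearch.config import OpenSearchConfig

config = OpenSearchConfig.from_env()  # assumed factory for the env-based constructor
assert config.indexing_batch_size == 25
```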
gnosisllm_knowledge/backends/opensearch/indexer.py

@@ -87,13 +87,15 @@ class OpenSearchIndexer:
         # Embeddings are generated by OpenSearch ingest pipeline
         doc_body = self._prepare_document(document)
 
-        # Index the document
+        # Index the document with ingest pipeline for embedding generation
         refresh = options.get("refresh", False)
+        pipeline = self._config.ingest_pipeline_name
         await self._client.index(
             index=index_name,
             id=document.doc_id,
             body=doc_body,
             refresh=refresh,
+            pipeline=pipeline,
         )
 
         return IndexResult(
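This change routes single-document writes through the same ingest pipeline as bulk writes, so the embedding field is populated server-side. A standalone sketch of the equivalent raw client call; the index and pipeline names are illustrative:

```python
from opensearchpy import AsyncOpenSearch

client = AsyncOpenSearch(hosts=["https://localhost:9200"])


async def index_one() -> None:
    # The named ingest pipeline (typically holding a text_embedding processor)
    # fills the embedding field at ingest time - no client-side model call.
    await client.index(
        index="knowledge-demo",
        id="doc-1",
        body={"title": "Hello", "content": "Neural search demo"},
        pipeline="knowledge-ingest",  # illustrative pipeline name
        refresh=True,
    )
```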
@@ -272,6 +274,43 @@
             failed_count=0,
         )
 
+    async def get(
+        self,
+        doc_id: str,
+        index_name: str,
+    ) -> dict[str, Any] | None:
+        """Get a document by ID.
+
+        Uses OpenSearch client's direct get() API (CRUD operation, not search).
+
+        Args:
+            doc_id: Document ID to retrieve.
+            index_name: Index name.
+
+        Returns:
+            Document dict (source fields) or None if not found.
+            Excludes embeddings from response for efficiency.
+        """
+        try:
+            response = await self._client.get(
+                index=index_name,
+                id=doc_id,
+                _source_excludes=["content_embedding"],
+            )
+            source = response.get("_source", {})
+            # Include the document ID in the response
+            source["id"] = response.get("_id", doc_id)
+            return source
+        except Exception as e:
+            if "not_found" in str(e).lower():
+                return None
+            logger.error(f"Failed to get document {doc_id}: {e}")
+            raise IndexError(
+                message=f"Failed to get document: {e}",
+                details={"document_id": doc_id},
+                cause=e,
+            ) from e
+
     async def delete(
         self,
         doc_id: str,
@@ -434,7 +473,9 @@
         if not actions:
             return IndexResult(success=True, index_name=index_name, indexed_count=0, failed_count=0)
 
-        response = await self._client.bulk(body=actions)
+        # Use ingest pipeline for embedding generation
+        pipeline = self._config.ingest_pipeline_name
+        response = await self._client.bulk(body=actions, pipeline=pipeline)
 
         indexed = 0
         failed = 0
@@ -460,6 +501,11 @@
     def _prepare_document(self, document: Document) -> dict[str, Any]:
         """Prepare document for indexing.
 
+        Note:
+            This library is tenant-agnostic. Multi-tenancy is achieved through index
+            isolation. Tenant information should be passed in document.metadata if
+            needed for audit purposes.
+
         Args:
             document: Document to prepare.
 
@@ -479,7 +525,6 @@
             "url": document.url,
             "title": document.title,
             "source": document.source,
-            "account_id": document.account_id,
             "collection_id": document.collection_id,
             "collection_name": document.collection_name,
             "source_id": document.source_id,
gnosisllm_knowledge/backends/opensearch/mappings.py

@@ -1,4 +1,10 @@
-"""OpenSearch index mappings for knowledge documents."""
+"""OpenSearch index mappings for knowledge documents.
+
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation (e.g., `knowledge-{account_id}`). Index mappings do not include
+    tenant-specific fields like account_id.
+"""
 
 from __future__ import annotations
 
@@ -56,8 +62,7 @@ def get_knowledge_index_mappings(config: OpenSearchConfig) -> dict[str, Any]:
                 "fields": {"keyword": {"type": "keyword", "ignore_above": 512}},
             },
             "source": {"type": "keyword"},
-            # === Multi-tenant Fields ===
-            "account_id": {"type": "keyword"},
+            # === Collection Fields ===
             "collection_id": {"type": "keyword"},
             "collection_name": {"type": "keyword"},  # For aggregation display
             "source_id": {"type": "keyword"},
@@ -129,13 +134,16 @@ def get_memory_index_settings(config: OpenSearchConfig) -> dict[str, Any]:
 def get_memory_index_mappings() -> dict[str, Any]:
     """Get index mappings for conversation memory.
 
+    Note:
+        This library is tenant-agnostic. Multi-tenancy is achieved through index
+        isolation. Use tenant-specific index names for conversation memory.
+
     Returns:
         Index mappings dictionary.
     """
     return {
         "properties": {
             "conversation_id": {"type": "keyword"},
-            "account_id": {"type": "keyword"},
             "user_id": {"type": "keyword"},
             "message_index": {"type": "integer"},
             "role": {"type": "keyword"},  # user, assistant, system
gnosisllm_knowledge/backends/opensearch/queries.py

@@ -2,6 +2,10 @@
 
 Uses OpenSearch neural search - embeddings are generated automatically
 via the deployed model. No Python-side embedding generation needed.
+
+Note: This module is tenant-agnostic. Multi-tenancy should be handled
+at the API layer by using separate indices per account (e.g.,
+`knowledge-{account_id}`) rather than filtering by account_id.
 """
 
 from __future__ import annotations
@@ -18,9 +22,13 @@ class QueryBuilder:
     model handles embedding generation automatically via ingest and
     search pipelines.
 
+    Note:
+        This builder is tenant-agnostic. Multi-tenancy should be handled
+        by using separate indices per account.
+
     Example:
         ```python
-        query = SearchQuery(text="how to configure", account_id="acc123")
+        query = SearchQuery(text="how to configure", collection_ids=["col-1"])
         builder = QueryBuilder(query, model_id="abc123")
         os_query = builder.build_hybrid_query()
         ```
@@ -204,12 +212,12 @@
             },
         }
 
-        # Apply filters at top level for hybrid
+        # Apply filters using post_filter for hybrid queries
+        # Hybrid queries cannot be wrapped in bool - they must be top-level
         filters = self._build_filters()
         if filters:
-            query["query"] = {
+            query["post_filter"] = {
                 "bool": {
-                    "must": [query["query"]],
                     "filter": filters,
                 }
             }
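OpenSearch requires the `hybrid` clause to sit at the top level of `query`, so wrapping it in a `bool` (as 0.3.0 did) is invalid; moving filters to `post_filter` applies them after the hybrid scoring phase instead. A hand-written sketch of the shape the builder now emits; the two sub-query legs and field values are illustrative:

```python
query_body = {
    "query": {
        "hybrid": {
            "queries": [
                {"match": {"content": "how to configure"}},  # lexical leg
                {
                    "neural": {  # semantic leg, embedded server-side
                        "content_embedding": {
                            "query_text": "how to configure",
                            "model_id": "abc123",
                            "k": 50,
                        }
                    }
                },
            ]
        }
    },
    # Applied after scoring: results are narrowed without perturbing
    # the hybrid score normalization.
    "post_filter": {"bool": {"filter": [{"terms": {"collection_id": ["col-1"]}}]}},
}
```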
@@ -270,15 +278,15 @@
     def _build_filters(self) -> list[dict[str, Any]]:
         """Build filter clauses from query parameters.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Returns:
-            List of filter clauses.
+            List of filter clauses for collection, source, and metadata filters.
         """
         filters: list[dict[str, Any]] = []
 
-        # Multi-tenant filter (required for security)
-        if self._query.account_id:
-            filters.append({"term": {"account_id": self._query.account_id}})
-
         # Collection filter
         if self._query.collection_ids:
             filters.append({"terms": {"collection_id": self._query.collection_ids}})
@@ -357,67 +365,61 @@
     ]
 
 
-def build_delete_by_source_query(
-    source_id: str,
-    account_id: str | None = None,
-) -> dict[str, Any]:
+def build_delete_by_source_query(source_id: str) -> dict[str, Any]:
     """Build query to delete documents by source.
 
+    Note:
+        This function is tenant-agnostic. Multi-tenancy should be handled
+        at the API layer by using separate indices per account.
+
     Args:
         source_id: Source ID to delete.
-        account_id: Optional account filter for multi-tenancy.
 
     Returns:
         Delete-by-query dictionary.
     """
-    filters = [{"term": {"source_id": source_id}}]
-    if account_id:
-        filters.append({"term": {"account_id": account_id}})
-
     return {
         "query": {
             "bool": {
-                "filter": filters,
+                "filter": [{"term": {"source_id": source_id}}],
             }
         }
     }
 
 
-def build_delete_by_collection_query(
-    collection_id: str,
-    account_id: str | None = None,
-) -> dict[str, Any]:
+def build_delete_by_collection_query(collection_id: str) -> dict[str, Any]:
     """Build query to delete documents by collection.
 
+    Note:
+        This function is tenant-agnostic. Multi-tenancy should be handled
+        at the API layer by using separate indices per account.
+
     Args:
         collection_id: Collection ID to delete.
-        account_id: Optional account filter for multi-tenancy.
 
     Returns:
         Delete-by-query dictionary.
     """
-    filters = [{"term": {"collection_id": collection_id}}]
-    if account_id:
-        filters.append({"term": {"account_id": account_id}})
-
     return {
         "query": {
             "bool": {
-                "filter": filters,
+                "filter": [{"term": {"collection_id": collection_id}}],
             }
         }
     }
 
 
 def build_count_query(
-    account_id: str | None = None,
     collection_id: str | None = None,
     source_id: str | None = None,
 ) -> dict[str, Any]:
     """Build query to count documents.
 
+    Note:
+        This function is tenant-agnostic. Multi-tenancy should be handled
+        at the API layer by using separate indices per account.
+
     Args:
-        account_id: Optional account filter.
         collection_id: Optional collection filter.
         source_id: Optional source filter.
 
@@ -426,8 +428,6 @@ def build_count_query(
     """
     filters: list[dict[str, Any]] = []
 
-    if account_id:
-        filters.append({"term": {"account_id": account_id}})
     if collection_id:
         filters.append({"term": {"collection_id": collection_id}})
     if source_id:
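The slimmed-down builders pair directly with `delete_by_query`; tenant scoping again comes from the index name. A usage sketch against a raw client:

```python
from opensearchpy import AsyncOpenSearch

from gnosisllm_knowledge.backends.opensearch.queries import build_delete_by_source_query


async def purge_source(client: AsyncOpenSearch, index_name: str, source_id: str) -> None:
    # index_name already encodes the tenant (e.g., "knowledge-acct42"),
    # so no account_id filter is needed in the query body.
    await client.delete_by_query(
        index=index_name,
        body=build_delete_by_source_query(source_id),
        conflicts="proceed",  # skip version conflicts rather than aborting
    )
```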
gnosisllm_knowledge/backends/opensearch/searcher.py

@@ -2,6 +2,10 @@
 
 Uses OpenSearch neural search - embeddings are generated automatically
 by the deployed ML model. No Python-side embedding generation needed.
+
+Note: This module is tenant-agnostic. Multi-tenancy should be handled
+at the API layer by using separate indices per account (e.g.,
+`knowledge-{account_id}`) rather than filtering by account_id.
 """
 
 from __future__ import annotations
@@ -502,11 +506,65 @@
                 "error": str(e),
             }
 
+    async def count(
+        self,
+        index_name: str,
+        collection_id: str | None = None,
+        source_id: str | None = None,
+    ) -> int:
+        """Count documents in index with optional filters.
+
+        Uses native _count API instead of search for efficiency and to avoid
+        hybrid search issues with empty queries.
+
+        Args:
+            index_name: Index to query.
+            collection_id: Filter by collection.
+            source_id: Filter by source.
+
+        Returns:
+            Document count.
+        """
+        try:
+            # Check if index exists first
+            exists = await self._client.indices.exists(index=index_name)
+            if not exists:
+                logger.debug(f"Index {index_name} does not exist, returning count 0")
+                return 0
+
+            # Build query with optional filters
+            query: dict[str, Any] = {"match_all": {}}
+
+            filters = []
+            if collection_id:
+                filters.append({"term": {"collection_id": collection_id}})
+            if source_id:
+                filters.append({"term": {"source_id": source_id}})
+
+            if filters:
+                query = {"bool": {"filter": filters}}
+
+            # Use native _count API
+            response = await self._client.count(
+                index=index_name,
+                body={"query": query},
+            )
+
+            count = response.get("count", 0)
+            logger.debug(f"Count for {index_name}: {count} (collection={collection_id}, source={source_id})")
+            return count
+
+        except Exception as e:
+            logger.error(f"Failed to count documents in {index_name}: {e}")
+            raise SearchError(
+                message=f"Count failed: {e}",
+                details={"index": index_name, "collection_id": collection_id, "source_id": source_id},
+            ) from e
+
     async def list_documents(
         self,
         index_name: str,
         *,
-        account_id: str | None = None,
         source_id: str | None = None,
         collection_id: str | None = None,
         limit: int = 50,
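The `count()` above maps to the REST `_count` endpoint rather than a `size: 0` search, so it never touches the hybrid search pipeline. A roughly equivalent standalone call:

```python
from opensearchpy import AsyncOpenSearch


async def count_collection(client: AsyncOpenSearch, index: str, collection_id: str) -> int:
    # Equivalent to: POST /{index}/_count with a filter-only bool query.
    response = await client.count(
        index=index,
        body={"query": {"bool": {"filter": [{"term": {"collection_id": collection_id}}]}}},
    )
    return response["count"]
```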
@@ -514,9 +572,12 @@
     ) -> dict[str, Any]:
         """List documents with optional filters.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
-            index_name: Index to query.
-            account_id: Optional account ID filter.
+            index_name: Index to query (use tenant-specific name for isolation).
             source_id: Optional source ID filter.
             collection_id: Optional collection ID filter.
             limit: Maximum documents to return.
@@ -540,9 +601,6 @@
         # Build filter clauses
         filters: list[dict[str, Any]] = []
 
-        if account_id:
-            filters.append({"term": {"account_id": account_id}})
-
         if source_id:
             filters.append({"term": {"source_id": source_id}})