gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +502 -32
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  7. gnosisllm_knowledge/backends/memory/searcher.py +111 -10
  8. gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
  9. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  10. gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
  11. gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
  12. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  14. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  15. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  16. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  17. gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
  18. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  19. gnosisllm_knowledge/cli/app.py +436 -31
  20. gnosisllm_knowledge/cli/commands/agentic.py +26 -9
  21. gnosisllm_knowledge/cli/commands/load.py +169 -19
  22. gnosisllm_knowledge/cli/commands/memory.py +733 -0
  23. gnosisllm_knowledge/cli/commands/search.py +9 -10
  24. gnosisllm_knowledge/cli/commands/setup.py +49 -23
  25. gnosisllm_knowledge/cli/display/service.py +43 -0
  26. gnosisllm_knowledge/cli/utils/config.py +62 -4
  27. gnosisllm_knowledge/core/domain/__init__.py +54 -0
  28. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  29. gnosisllm_knowledge/core/domain/document.py +19 -19
  30. gnosisllm_knowledge/core/domain/memory.py +440 -0
  31. gnosisllm_knowledge/core/domain/result.py +11 -3
  32. gnosisllm_knowledge/core/domain/search.py +12 -25
  33. gnosisllm_knowledge/core/domain/source.py +11 -12
  34. gnosisllm_knowledge/core/events/__init__.py +8 -0
  35. gnosisllm_knowledge/core/events/types.py +198 -5
  36. gnosisllm_knowledge/core/exceptions.py +227 -0
  37. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  38. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  39. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  40. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  41. gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
  42. gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
  43. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  44. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  45. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  46. gnosisllm_knowledge/fetchers/config.py +27 -0
  47. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  48. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  49. gnosisllm_knowledge/loaders/__init__.py +5 -1
  50. gnosisllm_knowledge/loaders/base.py +3 -4
  51. gnosisllm_knowledge/loaders/discovery.py +338 -0
  52. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  53. gnosisllm_knowledge/loaders/factory.py +46 -0
  54. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  55. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  56. gnosisllm_knowledge/services/indexing.py +100 -93
  57. gnosisllm_knowledge/services/search.py +84 -31
  58. gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
  59. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
  60. gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  62. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
  63. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/services/indexing.py
@@ -1,4 +1,13 @@
-"""Knowledge indexing service."""
+"""Knowledge indexing service.
+
+This service orchestrates the document ingestion pipeline from source to index,
+including loading, chunking, and indexing.
+
+Note:
+    This service is tenant-agnostic. Multi-tenancy should be handled at the
+    API layer by using separate indices per account (e.g.,
+    `knowledge-{account_id}`) rather than filtering by account_id.
+"""
 
 from __future__ import annotations
 
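The rewritten module docstring above shifts multi-tenancy from a per-document account_id filter to per-account indices. A minimal sketch of that naming convention, following the `knowledge-{account_id}` example in the docstring (the helper name and the validation rule are illustrative, not part of the package):

# Hypothetical helper: builds the tenant-specific index name described in the
# new module docstring ("knowledge-{account_id}").
def index_for_account(account_id: str) -> str:
    if not account_id or "/" in account_id:
        raise ValueError("account_id must be a non-empty string without '/'")
    return f"knowledge-{account_id}"

# All service calls for tenant "acme" then target its own index.
index_name = index_for_account("acme")  # -> "knowledge-acme"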
@@ -82,112 +91,107 @@ class KnowledgeIndexingService:
         source: str,
         index_name: str,
         *,
-        account_id: str | None = None,
         collection_id: str | None = None,
         source_id: str | None = None,
         batch_size: int = 100,
         **options: Any,
     ) -> IndexResult:
-        """Load content from source and index it.
+        """Load content from source and index it with streaming.
+
+        Uses streaming to process and index documents as they're fetched,
+        avoiding memory issues with large sitemaps.
+
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
 
         Args:
             source: Source URL or path.
-            index_name: Target index name.
-            account_id: Account ID for multi-tenancy.
+            index_name: Target index name (use tenant-specific name for isolation).
             collection_id: Collection ID.
             source_id: Source ID (auto-generated if not provided).
-            batch_size: Documents per batch.
+            batch_size: Documents per batch for indexing.
             **options: Additional loader/indexer options.
 
         Returns:
             Index result with counts.
         """
         source_id = source_id or str(uuid.uuid4())
+        document_defaults = options.pop("document_defaults", {})
 
         # Emit batch started event
         await self._events.emit_async(
-            EventType.BATCH_STARTED,
             BatchStartedEvent(
-                source=source,
-                source_id=source_id,
+                batch_index=0,
+                batch_size=batch_size,
+                total_batches=0,  # Unknown for streaming
             ),
         )
 
-        try:
-            # Load documents
-            load_result = await self._loader.load(source, **options)
+        total_indexed = 0
+        total_failed = 0
+        errors: list[str] = []
+        batch: list[Document] = []
+        batch_index = 0
 
-            if not load_result.success:
-                raise LoadError(
-                    message=f"Failed to load from {source}",
-                    details={"errors": load_result.errors},
+        try:
+            # Stream documents and index in batches as they arrive
+            # Note: Loader already chunks content, so we don't re-chunk here
+            async for doc in self._loader.load_streaming(source, **options):
+                # Enrich document with collection info
+                enriched_doc = Document(
+                    content=doc.content,
+                    source=source,
+                    doc_id=doc.doc_id,
+                    url=doc.url,
+                    title=doc.title,
+                    collection_id=collection_id,
+                    source_id=source_id,
+                    chunk_index=doc.chunk_index,
+                    total_chunks=doc.total_chunks,
+                    parent_doc_id=doc.parent_doc_id,
+                    status=DocumentStatus.INDEXED,
+                    metadata=doc.metadata,
+                    **document_defaults,
                 )
 
-            # Process and index documents
-            total_indexed = 0
-            total_failed = 0
-            errors: list[str] = []
-
-            batch: list[Document] = []
+                batch.append(enriched_doc)
 
-            for doc in load_result.documents:
-                # Chunk the document
-                chunks = self._chunker.chunk(doc.content)
-
-                for i, chunk in enumerate(chunks):
-                    # Create chunk document
-                    chunk_doc = Document(
-                        id=f"{doc.id}-chunk-{i}",
-                        content=chunk.content,
-                        url=doc.url,
-                        title=doc.title,
-                        source=source,
-                        account_id=account_id,
-                        collection_id=collection_id,
-                        source_id=source_id,
-                        chunk_index=i,
-                        total_chunks=len(chunks),
-                        parent_doc_id=doc.id,
-                        status=DocumentStatus.INDEXED,
-                        metadata=doc.metadata,
-                    )
-
-                    batch.append(chunk_doc)
-
-                    # Index batch when full
-                    if len(batch) >= batch_size:
-                        result = await self._index_batch(batch, index_name)
-                        total_indexed += result.documents_indexed
-                        total_failed += result.documents_failed
-                        if result.errors:
-                            errors.extend(result.errors)
-                        batch = []
+                # Index batch when full
+                if len(batch) >= batch_size:
+                    result = await self._index_batch(batch, index_name)
+                    total_indexed += result.indexed_count
+                    total_failed += result.failed_count
+                    if result.errors:
+                        errors.extend(result.errors)
+                    batch = []
+                    batch_index += 1
+                    logger.info(f"Indexed batch {batch_index}: {total_indexed} total documents")
 
             # Index remaining documents
             if batch:
                 result = await self._index_batch(batch, index_name)
-                total_indexed += result.documents_indexed
-                total_failed += result.documents_failed
+                total_indexed += result.indexed_count
+                total_failed += result.failed_count
                 if result.errors:
                     errors.extend(result.errors)
 
             # Emit batch completed event
             await self._events.emit_async(
-                EventType.BATCH_COMPLETED,
                 BatchCompletedEvent(
-                    source=source,
-                    source_id=source_id,
-                    documents_indexed=total_indexed,
-                    documents_failed=total_failed,
-                    success=total_failed == 0,
+                    batch_index=batch_index,
+                    success_count=total_indexed,
+                    failure_count=total_failed,
                 ),
             )
 
+            logger.info(f"Completed indexing from {source}: {total_indexed} documents")
+
             return IndexResult(
                 success=total_failed == 0,
-                documents_indexed=total_indexed,
-                documents_failed=total_failed,
-                errors=errors if errors else None,
+                indexed_count=total_indexed,
+                failed_count=total_failed,
+                errors=errors if errors else [],
             )
 
         except Exception as e:
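For callers, the upshot of this hunk is that account_id is gone from `load_and_index`, isolation comes from the index name, and the result counters are renamed to `indexed_count` and `failed_count`. A hedged usage sketch: it assumes an already constructed `KnowledgeIndexingService` named `service`, uses only keyword arguments visible in the diff, and the sitemap URL is a placeholder.

# Sketch only: `service` is a pre-wired KnowledgeIndexingService instance.
async def ingest_docs(service, account_id: str) -> None:
    result = await service.load_and_index(
        source="https://docs.example.com/sitemap.xml",
        index_name=f"knowledge-{account_id}",  # tenant isolation via index name
        collection_id="docs",
        batch_size=100,
    )
    # 0.4.0 renames the counters and always returns a list for errors.
    print(f"indexed={result.indexed_count} failed={result.failed_count}")
    if not result.success:
        for err in result.errors:
            print("error:", err)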
@@ -231,17 +235,17 @@ class KnowledgeIndexingService:
 
         for i, chunk_obj in enumerate(chunks):
             chunk_doc = Document(
-                id=f"{doc.id}-chunk-{i}",
                 content=chunk_obj.content,
+                source=doc.source,
+                doc_id=f"{doc.doc_id}-chunk-{i}",
                 url=doc.url,
                 title=doc.title,
-                source=doc.source,
-                account_id=doc.account_id,
                 collection_id=doc.collection_id,
+                collection_name=doc.collection_name,
                 source_id=doc.source_id,
                 chunk_index=i,
                 total_chunks=len(chunks),
-                parent_doc_id=doc.id,
+                parent_doc_id=doc.doc_id,
                 status=DocumentStatus.INDEXED,
                 metadata=doc.metadata,
             )
@@ -252,8 +256,8 @@ class KnowledgeIndexingService:
             # Index batch when full
             if len(batch) >= batch_size:
                 result = await self._index_batch(batch, index_name)
-                total_indexed += result.documents_indexed
-                total_failed += result.documents_failed
+                total_indexed += result.indexed_count
+                total_failed += result.failed_count
                 if result.errors:
                     errors.extend(result.errors)
                 batch = []
@@ -261,30 +265,32 @@ class KnowledgeIndexingService:
         # Index remaining
         if batch:
             result = await self._index_batch(batch, index_name)
-            total_indexed += result.documents_indexed
-            total_failed += result.documents_failed
+            total_indexed += result.indexed_count
+            total_failed += result.failed_count
            if result.errors:
                errors.extend(result.errors)
 
         return IndexResult(
             success=total_failed == 0,
-            documents_indexed=total_indexed,
-            documents_failed=total_failed,
-            errors=errors if errors else None,
+            indexed_count=total_indexed,
+            failed_count=total_failed,
+            errors=errors if errors else [],
         )
 
     async def delete_source(
         self,
         source_id: str,
         index_name: str,
-        account_id: str | None = None,
     ) -> int:
         """Delete all documents from a source.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             source_id: Source ID to delete.
-            index_name: Index name.
-            account_id: Optional account filter.
+            index_name: Index name (use tenant-specific name for isolation).
 
         Returns:
             Count of deleted documents.
@@ -293,21 +299,23 @@ class KnowledgeIndexingService:
             build_delete_by_source_query,
         )
 
-        query = build_delete_by_source_query(source_id, account_id)
+        query = build_delete_by_source_query(source_id)
         return await self._indexer.delete_by_query(query, index_name)
 
     async def delete_collection(
         self,
         collection_id: str,
         index_name: str,
-        account_id: str | None = None,
     ) -> int:
         """Delete all documents from a collection.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             collection_id: Collection ID to delete.
-            index_name: Index name.
-            account_id: Optional account filter.
+            index_name: Index name (use tenant-specific name for isolation).
 
         Returns:
             Count of deleted documents.
@@ -316,7 +324,7 @@ class KnowledgeIndexingService:
             build_delete_by_collection_query,
         )
 
-        query = build_delete_by_collection_query(collection_id, account_id)
+        query = build_delete_by_collection_query(collection_id)
         return await self._indexer.delete_by_query(query, index_name)
 
     async def reindex_source(
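The deletion helpers follow the same pattern: `build_delete_by_source_query` and `build_delete_by_collection_query` now take only the ID, and the account_id filter disappears from `delete_source` and `delete_collection`. A sketch of the caller side, again assuming an existing `service` instance:

# Sketch only: prune one source and one collection from a tenant's own index.
async def prune(service, account_id: str, source_id: str, collection_id: str) -> None:
    index_name = f"knowledge-{account_id}"
    removed_from_source = await service.delete_source(source_id, index_name)
    removed_from_collection = await service.delete_collection(collection_id, index_name)
    print(f"deleted {removed_from_source} source docs, {removed_from_collection} collection docs")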
@@ -325,17 +333,19 @@ class KnowledgeIndexingService:
         source_id: str,
         index_name: str,
         *,
-        account_id: str | None = None,
         collection_id: str | None = None,
         **options: Any,
     ) -> IndexResult:
         """Reindex a source by deleting and re-loading.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             source: Source URL or path.
             source_id: Existing source ID.
-            index_name: Index name.
-            account_id: Account ID.
+            index_name: Index name (use tenant-specific name for isolation).
             collection_id: Collection ID.
             **options: Additional options.
 
@@ -343,13 +353,12 @@ class KnowledgeIndexingService:
             Index result.
         """
         # Delete existing documents
-        await self.delete_source(source_id, index_name, account_id)
+        await self.delete_source(source_id, index_name)
 
         # Re-index
         return await self.load_and_index(
             source=source,
             index_name=index_name,
-            account_id=account_id,
             collection_id=collection_id,
             source_id=source_id,
             **options,
@@ -375,12 +384,10 @@ class KnowledgeIndexingService:
         for doc in documents:
             if result.success:
                 await self._events.emit_async(
-                    EventType.DOCUMENT_INDEXED,
                     DocumentIndexedEvent(
-                        document_id=doc.id,
+                        doc_id=doc.doc_id,
                         index_name=index_name,
-                        chunk_index=doc.chunk_index,
-                        total_chunks=doc.total_chunks,
+                        success=True,
                     ),
                 )
 
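Event emission also changes shape: `emit_async` now takes the event object alone (the leading EventType argument is gone), and the per-document payload shrinks to `doc_id`, `index_name`, and `success`. A sketch of what a consumer might log, assuming the event classes are importable from `gnosisllm_knowledge.core.events.types` (the path is inferred from the file list above, and the handler functions are illustrative rather than the package's registration API):

# Field names match the 0.4.0 diff; the import path and handler wiring are assumptions.
from gnosisllm_knowledge.core.events.types import BatchCompletedEvent, DocumentIndexedEvent

def on_document_indexed(event: DocumentIndexedEvent) -> None:
    print(f"indexed {event.doc_id} into {event.index_name} (ok={event.success})")

def on_batch_completed(event: BatchCompletedEvent) -> None:
    print(f"batch {event.batch_index}: {event.success_count} ok, {event.failure_count} failed")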
gnosisllm_knowledge/services/search.py
@@ -1,4 +1,12 @@
-"""Knowledge search service."""
+"""Knowledge search service.
+
+This service provides a high-level interface for searching knowledge documents
+using semantic, keyword, and hybrid search modes.
+
+Note:
+    This service is tenant-agnostic. Multi-tenancy should be handled at the
+    API layer by using separate indices per account (e.g., knowledge-{account_id}).
+"""
 
 from __future__ import annotations
 
@@ -70,7 +78,6 @@ class KnowledgeSearchService:
         mode: SearchMode = SearchMode.HYBRID,
         limit: int = 10,
         offset: int = 0,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         source_ids: list[str] | None = None,
         min_score: float | None = None,
@@ -78,13 +85,16 @@ class KnowledgeSearchService:
     ) -> SearchResult:
         """Search for knowledge documents.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             query: Search query text.
             index_name: Index to search (uses default if not provided).
             mode: Search mode (semantic, keyword, hybrid).
             limit: Maximum results.
             offset: Result offset for pagination.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             source_ids: Filter by source IDs.
             min_score: Minimum score threshold.
@@ -105,7 +115,6 @@ class KnowledgeSearchService:
             mode=mode,
             limit=limit,
             offset=offset,
-            account_id=account_id,
             collection_ids=collection_ids,
             source_ids=source_ids,
             min_score=min_score,
@@ -114,17 +123,8 @@ class KnowledgeSearchService:
         try:
             result = await self._searcher.search(search_query, index, **options)
 
-            # Emit search event
-            await self._events.emit_async(
-                EventType.SEARCH_COMPLETED,
-                {
-                    "query": query,
-                    "mode": mode.value,
-                    "results_count": len(result.items),
-                    "total_hits": result.total_hits,
-                    "duration_ms": result.duration_ms,
-                },
-            )
+            # TODO: Emit search event when SearchCompletedEvent is defined
+            # await self._events.emit_async(SearchCompletedEvent(...))
 
             return result
 
@@ -142,17 +142,19 @@ class KnowledgeSearchService:
         *,
         index_name: str | None = None,
         limit: int = 10,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         **options: Any,
     ) -> SearchResult:
         """Execute semantic (vector) search.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             query: Search query text.
             index_name: Index to search.
             limit: Maximum results.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             **options: Additional options.
 
@@ -164,7 +166,6 @@ class KnowledgeSearchService:
             index_name=index_name,
             mode=SearchMode.SEMANTIC,
             limit=limit,
-            account_id=account_id,
             collection_ids=collection_ids,
             **options,
         )
@@ -175,17 +176,19 @@ class KnowledgeSearchService:
         *,
         index_name: str | None = None,
         limit: int = 10,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         **options: Any,
     ) -> SearchResult:
         """Execute keyword (BM25) search.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             query: Search query text.
             index_name: Index to search.
             limit: Maximum results.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             **options: Additional options.
 
@@ -197,7 +200,6 @@ class KnowledgeSearchService:
             index_name=index_name,
             mode=SearchMode.KEYWORD,
             limit=limit,
-            account_id=account_id,
             collection_ids=collection_ids,
             **options,
         )
@@ -208,7 +210,6 @@ class KnowledgeSearchService:
         *,
         index_name: str | None = None,
         limit: int = 10,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         semantic_weight: float = 0.7,
         keyword_weight: float = 0.3,
@@ -216,11 +217,14 @@ class KnowledgeSearchService:
     ) -> SearchResult:
         """Execute hybrid search (semantic + keyword).
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             query: Search query text.
             index_name: Index to search.
             limit: Maximum results.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             semantic_weight: Weight for semantic score.
             keyword_weight: Weight for keyword score.
@@ -234,7 +238,6 @@ class KnowledgeSearchService:
             index_name=index_name,
             mode=SearchMode.HYBRID,
             limit=limit,
-            account_id=account_id,
             collection_ids=collection_ids,
             semantic_weight=semantic_weight,
             keyword_weight=keyword_weight,
@@ -273,17 +276,19 @@ class KnowledgeSearchService:
         index_name: str | None = None,
         mode: SearchMode = SearchMode.HYBRID,
         limit: int = 10,
-        account_id: str | None = None,
         **options: Any,
     ) -> list[SearchResult]:
         """Execute multiple searches in parallel.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             queries: List of query texts.
             index_name: Index to search.
             mode: Search mode.
             limit: Maximum results per query.
-            account_id: Account ID for multi-tenancy.
             **options: Additional options.
 
         Returns:
@@ -298,7 +303,6 @@ class KnowledgeSearchService:
                 text=query,
                 mode=mode,
                 limit=limit,
-                account_id=account_id,
             )
             for query in queries
         ]
@@ -319,15 +323,19 @@ class KnowledgeSearchService:
     async def count(
         self,
         index_name: str | None = None,
-        account_id: str | None = None,
         collection_id: str | None = None,
+        source_id: str | None = None,
     ) -> int:
         """Count documents in index.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             index_name: Index to count.
-            account_id: Filter by account.
             collection_id: Filter by collection.
+            source_id: Filter by source (for source deletion confirmation).
 
         Returns:
             Document count.
@@ -336,14 +344,59 @@ class KnowledgeSearchService:
         if not index:
             raise SearchError(message="No index specified")
 
-        # Build count query
+        # Build count query with optional filters
         query = SearchQuery(
             text="",
             limit=0,
-            account_id=account_id,
             collection_ids=[collection_id] if collection_id else None,
+            source_ids=[source_id] if source_id else None,
         )
 
         # Use a simple match_all to get total count
         result = await self._searcher.search(query, index)
         return result.total_hits
+
+    async def get_collections(
+        self,
+        index_name: str | None = None,
+    ) -> list[dict[str, Any]]:
+        """Get all collections with document counts.
+
+        Args:
+            index_name: Index to query (uses default if not provided).
+
+        Returns:
+            List of collections with id, name, and document_count.
+        """
+        index = index_name or self._default_index
+        if not index:
+            logger.warning("No index specified for get_collections")
+            return []
+
+        try:
+            return await self._searcher.get_collections(index)
+        except Exception as e:
+            logger.error(f"Failed to get collections: {e}")
+            return []
+
+    async def get_stats(
+        self,
+        index_name: str | None = None,
+    ) -> dict[str, Any]:
+        """Get index statistics.
+
+        Args:
+            index_name: Index to query (uses default if not provided).
+
+        Returns:
+            Dictionary with document_count, index_name, and other stats.
+        """
+        index = index_name or self._default_index
+        if not index:
+            return {"document_count": 0, "index_name": "", "exists": False}
+
+        try:
+            return await self._searcher.get_stats(index)
+        except Exception as e:
+            logger.error(f"Failed to get stats: {e}")
+            return {"document_count": 0, "index_name": index, "error": str(e)}