gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (63)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +502 -32
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  7. gnosisllm_knowledge/backends/memory/searcher.py +111 -10
  8. gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
  9. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  10. gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
  11. gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
  12. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  14. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  15. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  16. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  17. gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
  18. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  19. gnosisllm_knowledge/cli/app.py +436 -31
  20. gnosisllm_knowledge/cli/commands/agentic.py +26 -9
  21. gnosisllm_knowledge/cli/commands/load.py +169 -19
  22. gnosisllm_knowledge/cli/commands/memory.py +733 -0
  23. gnosisllm_knowledge/cli/commands/search.py +9 -10
  24. gnosisllm_knowledge/cli/commands/setup.py +49 -23
  25. gnosisllm_knowledge/cli/display/service.py +43 -0
  26. gnosisllm_knowledge/cli/utils/config.py +62 -4
  27. gnosisllm_knowledge/core/domain/__init__.py +54 -0
  28. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  29. gnosisllm_knowledge/core/domain/document.py +19 -19
  30. gnosisllm_knowledge/core/domain/memory.py +440 -0
  31. gnosisllm_knowledge/core/domain/result.py +11 -3
  32. gnosisllm_knowledge/core/domain/search.py +12 -25
  33. gnosisllm_knowledge/core/domain/source.py +11 -12
  34. gnosisllm_knowledge/core/events/__init__.py +8 -0
  35. gnosisllm_knowledge/core/events/types.py +198 -5
  36. gnosisllm_knowledge/core/exceptions.py +227 -0
  37. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  38. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  39. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  40. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  41. gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
  42. gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
  43. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  44. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  45. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  46. gnosisllm_knowledge/fetchers/config.py +27 -0
  47. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  48. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  49. gnosisllm_knowledge/loaders/__init__.py +5 -1
  50. gnosisllm_knowledge/loaders/base.py +3 -4
  51. gnosisllm_knowledge/loaders/discovery.py +338 -0
  52. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  53. gnosisllm_knowledge/loaders/factory.py +46 -0
  54. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  55. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  56. gnosisllm_knowledge/services/indexing.py +100 -93
  57. gnosisllm_knowledge/services/search.py +84 -31
  58. gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
  59. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
  60. gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  62. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
  63. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -9,18 +9,27 @@ from gnosisllm_knowledge.backends.opensearch import (
9
9
  OpenSearchKnowledgeSearcher,
10
10
  OpenSearchSetupAdapter,
11
11
  )
12
+ from gnosisllm_knowledge.backends.opensearch.memory import (
13
+ MemoryConfig,
14
+ MemorySetup,
15
+ OpenSearchMemoryClient,
16
+ )
12
17
  from gnosisllm_knowledge.backends.opensearch.queries import QueryBuilder
13
18
 
14
19
  __all__ = [
20
+ "AgenticSearchFallback",
21
+ # OpenSearch Memory
22
+ "MemoryConfig",
23
+ # Memory (for testing)
24
+ "MemoryIndexer",
25
+ "MemorySearcher",
26
+ "MemorySetup",
27
+ "OpenSearchAgenticSearcher",
15
28
  # OpenSearch
16
29
  "OpenSearchConfig",
17
30
  "OpenSearchIndexer",
18
31
  "OpenSearchKnowledgeSearcher",
32
+ "OpenSearchMemoryClient",
19
33
  "OpenSearchSetupAdapter",
20
- "OpenSearchAgenticSearcher",
21
- "AgenticSearchFallback",
22
34
  "QueryBuilder",
23
- # Memory (for testing)
24
- "MemoryIndexer",
25
- "MemorySearcher",
26
35
  ]
@@ -1,4 +1,10 @@
1
- """In-memory document indexer for testing."""
1
+ """In-memory document indexer for testing.
2
+
3
+ Note:
4
+ This library is tenant-agnostic. Multi-tenancy is achieved through index
5
+ isolation (e.g., `knowledge-{account_id}`). The memory indexer does not
6
+ include tenant filtering logic - use separate index names per tenant.
7
+ """
2
8
 
3
9
  from __future__ import annotations
4
10
 
@@ -14,6 +20,9 @@ from gnosisllm_knowledge.core.domain.result import BatchResult, IndexResult
14
20
  class MemoryIndexer:
15
21
  """In-memory document indexer for testing.
16
22
 
23
+ This indexer is tenant-agnostic. Multi-tenancy is achieved through index
24
+ isolation by using tenant-specific index names.
25
+
17
26
  Stores documents in a dictionary for fast testing without
18
27
  requiring an external OpenSearch instance.
19
28
 
@@ -185,6 +194,22 @@ class MemoryIndexer:
185
194
  """
186
195
  return await self.index(document, index_name)
187
196
 
197
+ async def get(
198
+ self,
199
+ doc_id: str,
200
+ index_name: str,
201
+ ) -> dict[str, Any] | None:
202
+ """Get a document by ID (async interface).
203
+
204
+ Args:
205
+ doc_id: Document ID.
206
+ index_name: Index name.
207
+
208
+ Returns:
209
+ Document dictionary or None if not found.
210
+ """
211
+ return self._indices.get(index_name, {}).get(doc_id)
212
+
188
213
  async def delete(
189
214
  self,
190
215
  doc_id: str,
@@ -365,8 +390,8 @@ class MemoryIndexer:
365
390
  "url": document.url,
366
391
  "title": document.title,
367
392
  "source": document.source,
368
- "account_id": document.account_id,
369
393
  "collection_id": document.collection_id,
394
+ "collection_name": document.collection_name,
370
395
  "source_id": document.source_id,
371
396
  "chunk_index": document.chunk_index,
372
397
  "total_chunks": document.total_chunks,
@@ -1,10 +1,16 @@
1
- """In-memory document searcher for testing."""
1
+ """In-memory document searcher for testing.
2
+
3
+ Note: This module is tenant-agnostic. Multi-tenancy should be handled
4
+ at the API layer by using separate indices per account (e.g.,
5
+ gnosisllm-{account_id}-knowledge) rather than filtering by account_id.
6
+ """
2
7
 
3
8
  from __future__ import annotations
4
9
 
5
10
  import math
6
11
  import re
7
12
  import time
13
+ import warnings
8
14
  from typing import Any, Callable
9
15
 
10
16
  from gnosisllm_knowledge.backends.memory.indexer import MemoryIndexer
@@ -147,7 +153,7 @@ class MemorySearcher:
147
153
 
148
154
  for doc in filtered_docs:
149
155
  content = doc.get("content", "").lower()
150
- title = doc.get("title", "").lower()
156
+ title = (doc.get("title") or "").lower()
151
157
 
152
158
  # Simple TF scoring
153
159
  content_score = sum(
@@ -209,7 +215,7 @@ class MemorySearcher:
209
215
  for doc in filtered_docs:
210
216
  # Keyword score
211
217
  content = doc.get("content", "").lower()
212
- title = doc.get("title", "").lower()
218
+ title = (doc.get("title") or "").lower()
213
219
  keyword_score = sum(content.count(term) for term in query_terms)
214
220
  keyword_score += sum(title.count(term) for term in query_terms) * 2
215
221
 
@@ -348,6 +354,101 @@ class MemorySearcher:
348
354
  results.append(result)
349
355
  return results
350
356
 
357
+ async def list_documents(
358
+ self,
359
+ index_name: str,
360
+ *,
361
+ source_id: str | None = None,
362
+ collection_id: str | None = None,
363
+ limit: int = 50,
364
+ offset: int = 0,
365
+ ) -> dict[str, Any]:
366
+ """List documents with optional filters.
367
+
368
+ Args:
369
+ index_name: Index to query.
370
+ source_id: Optional source ID filter.
371
+ collection_id: Optional collection ID filter.
372
+ limit: Maximum documents to return.
373
+ offset: Number of documents to skip.
374
+
375
+ Returns:
376
+ Dictionary with documents, total, limit, offset.
377
+ """
378
+ documents = self._indexer.get_all(index_name)
379
+
380
+ # Apply filters
381
+ if source_id:
382
+ documents = [d for d in documents if d.get("source_id") == source_id]
383
+ if collection_id:
384
+ documents = [d for d in documents if d.get("collection_id") == collection_id]
385
+
386
+ total = len(documents)
387
+
388
+ # Apply pagination
389
+ paginated = documents[offset : offset + limit]
390
+
391
+ return {
392
+ "documents": paginated,
393
+ "total": total,
394
+ "limit": limit,
395
+ "offset": offset,
396
+ }
397
+
398
+ def count(self, index_name: str) -> int:
399
+ """Count documents in index.
400
+
401
+ Args:
402
+ index_name: Index to count.
403
+
404
+ Returns:
405
+ Document count.
406
+ """
407
+ return self._indexer.count(index_name)
408
+
409
+ async def get_collections(self, index_name: str) -> list[dict[str, Any]]:
410
+ """Get unique collections with document counts.
411
+
412
+ Args:
413
+ index_name: Index to query.
414
+
415
+ Returns:
416
+ List of collections with id, name, and document_count.
417
+ """
418
+ documents = self._indexer.get_all(index_name)
419
+ collections: dict[str, dict[str, Any]] = {}
420
+
421
+ for doc in documents:
422
+ col_id = doc.get("collection_id")
423
+ if not col_id:
424
+ continue
425
+
426
+ if col_id not in collections:
427
+ collections[col_id] = {
428
+ "id": col_id,
429
+ "name": doc.get("collection_name") or col_id,
430
+ "document_count": 0,
431
+ }
432
+ collections[col_id]["document_count"] += 1
433
+
434
+ return list(collections.values())
435
+
436
+ async def get_stats(self, index_name: str) -> dict[str, Any]:
437
+ """Get index statistics.
438
+
439
+ Args:
440
+ index_name: Index to query.
441
+
442
+ Returns:
443
+ Dictionary with document_count and index info.
444
+ """
445
+ count = self._indexer.count(index_name)
446
+ return {
447
+ "document_count": count,
448
+ "index_name": index_name,
449
+ "exists": count > 0 or index_name in self._indexer._indices,
450
+ }
451
+
351
452
  def _apply_filters(
352
453
  self,
353
454
  documents: list[dict[str, Any]],
@@ -355,6 +456,10 @@ class MemorySearcher:
355
456
  ) -> list[dict[str, Any]]:
356
457
  """Apply query filters to documents.
357
458
 
459
+ Note:
460
+ This method is tenant-agnostic. Multi-tenancy should be handled
461
+ at the API layer by using separate indices per account.
462
+
358
463
  Args:
359
464
  documents: Documents to filter.
360
465
  query: Query with filter parameters.
@@ -364,10 +469,6 @@ class MemorySearcher:
364
469
  """
365
470
  filtered = documents
366
471
 
367
- # Account filter
368
- if query.account_id:
369
- filtered = [d for d in filtered if d.get("account_id") == query.account_id]
370
-
371
472
  # Collection filter
372
473
  if query.collection_ids:
373
474
  filtered = [
@@ -378,9 +479,9 @@ class MemorySearcher:
378
479
  if query.source_ids:
379
480
  filtered = [d for d in filtered if d.get("source_id") in query.source_ids]
380
481
 
381
- # Custom filters
382
- if query.filters:
383
- for field, value in query.filters.items():
482
+ # Custom metadata filters
483
+ if query.metadata_filters:
484
+ for field, value in query.metadata_filters.items():
384
485
  if isinstance(value, list):
385
486
  filtered = [d for d in filtered if d.get(field) in value]
386
487
  else: