gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/__init__.py +91 -39
- gnosisllm_knowledge/api/__init__.py +3 -2
- gnosisllm_knowledge/api/knowledge.py +502 -32
- gnosisllm_knowledge/api/memory.py +966 -0
- gnosisllm_knowledge/backends/__init__.py +14 -5
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +111 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
- gnosisllm_knowledge/backends/opensearch/config.py +49 -28
- gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
- gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
- gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
- gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
- gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
- gnosisllm_knowledge/cli/app.py +436 -31
- gnosisllm_knowledge/cli/commands/agentic.py +26 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +733 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +49 -23
- gnosisllm_knowledge/cli/display/service.py +43 -0
- gnosisllm_knowledge/cli/utils/config.py +62 -4
- gnosisllm_knowledge/core/domain/__init__.py +54 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +19 -19
- gnosisllm_knowledge/core/domain/memory.py +440 -0
- gnosisllm_knowledge/core/domain/result.py +11 -3
- gnosisllm_knowledge/core/domain/search.py +12 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +198 -5
- gnosisllm_knowledge/core/exceptions.py +227 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/memory.py +524 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
- gnosisllm_knowledge/core/streaming/__init__.py +36 -0
- gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/base.py +3 -4
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/loaders/sitemap.py +129 -1
- gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
- gnosisllm_knowledge/services/indexing.py +100 -93
- gnosisllm_knowledge/services/search.py +84 -31
- gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
- gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
|
@@ -9,18 +9,27 @@ from gnosisllm_knowledge.backends.opensearch import (
|
|
|
9
9
|
OpenSearchKnowledgeSearcher,
|
|
10
10
|
OpenSearchSetupAdapter,
|
|
11
11
|
)
|
|
12
|
+
from gnosisllm_knowledge.backends.opensearch.memory import (
|
|
13
|
+
MemoryConfig,
|
|
14
|
+
MemorySetup,
|
|
15
|
+
OpenSearchMemoryClient,
|
|
16
|
+
)
|
|
12
17
|
from gnosisllm_knowledge.backends.opensearch.queries import QueryBuilder
|
|
13
18
|
|
|
14
19
|
__all__ = [
|
|
20
|
+
"AgenticSearchFallback",
|
|
21
|
+
# OpenSearch Memory
|
|
22
|
+
"MemoryConfig",
|
|
23
|
+
# Memory (for testing)
|
|
24
|
+
"MemoryIndexer",
|
|
25
|
+
"MemorySearcher",
|
|
26
|
+
"MemorySetup",
|
|
27
|
+
"OpenSearchAgenticSearcher",
|
|
15
28
|
# OpenSearch
|
|
16
29
|
"OpenSearchConfig",
|
|
17
30
|
"OpenSearchIndexer",
|
|
18
31
|
"OpenSearchKnowledgeSearcher",
|
|
32
|
+
"OpenSearchMemoryClient",
|
|
19
33
|
"OpenSearchSetupAdapter",
|
|
20
|
-
"OpenSearchAgenticSearcher",
|
|
21
|
-
"AgenticSearchFallback",
|
|
22
34
|
"QueryBuilder",
|
|
23
|
-
# Memory (for testing)
|
|
24
|
-
"MemoryIndexer",
|
|
25
|
-
"MemorySearcher",
|
|
26
35
|
]
|
|
@@ -1,4 +1,10 @@
|
|
|
1
|
-
"""In-memory document indexer for testing.
|
|
1
|
+
"""In-memory document indexer for testing.
|
|
2
|
+
|
|
3
|
+
Note:
|
|
4
|
+
This library is tenant-agnostic. Multi-tenancy is achieved through index
|
|
5
|
+
isolation (e.g., `knowledge-{account_id}`). The memory indexer does not
|
|
6
|
+
include tenant filtering logic - use separate index names per tenant.
|
|
7
|
+
"""
|
|
2
8
|
|
|
3
9
|
from __future__ import annotations
|
|
4
10
|
|
|
@@ -14,6 +20,9 @@ from gnosisllm_knowledge.core.domain.result import BatchResult, IndexResult
|
|
|
14
20
|
class MemoryIndexer:
|
|
15
21
|
"""In-memory document indexer for testing.
|
|
16
22
|
|
|
23
|
+
This indexer is tenant-agnostic. Multi-tenancy is achieved through index
|
|
24
|
+
isolation by using tenant-specific index names.
|
|
25
|
+
|
|
17
26
|
Stores documents in a dictionary for fast testing without
|
|
18
27
|
requiring an external OpenSearch instance.
|
|
19
28
|
|
|
@@ -185,6 +194,22 @@ class MemoryIndexer:
|
|
|
185
194
|
"""
|
|
186
195
|
return await self.index(document, index_name)
|
|
187
196
|
|
|
197
|
+
async def get(
|
|
198
|
+
self,
|
|
199
|
+
doc_id: str,
|
|
200
|
+
index_name: str,
|
|
201
|
+
) -> dict[str, Any] | None:
|
|
202
|
+
"""Get a document by ID (async interface).
|
|
203
|
+
|
|
204
|
+
Args:
|
|
205
|
+
doc_id: Document ID.
|
|
206
|
+
index_name: Index name.
|
|
207
|
+
|
|
208
|
+
Returns:
|
|
209
|
+
Document dictionary or None if not found.
|
|
210
|
+
"""
|
|
211
|
+
return self._indices.get(index_name, {}).get(doc_id)
|
|
212
|
+
|
|
188
213
|
async def delete(
|
|
189
214
|
self,
|
|
190
215
|
doc_id: str,
|
|
@@ -365,8 +390,8 @@ class MemoryIndexer:
|
|
|
365
390
|
"url": document.url,
|
|
366
391
|
"title": document.title,
|
|
367
392
|
"source": document.source,
|
|
368
|
-
"account_id": document.account_id,
|
|
369
393
|
"collection_id": document.collection_id,
|
|
394
|
+
"collection_name": document.collection_name,
|
|
370
395
|
"source_id": document.source_id,
|
|
371
396
|
"chunk_index": document.chunk_index,
|
|
372
397
|
"total_chunks": document.total_chunks,
|
|
@@ -1,10 +1,16 @@
|
|
|
1
|
-
"""In-memory document searcher for testing.
|
|
1
|
+
"""In-memory document searcher for testing.
|
|
2
|
+
|
|
3
|
+
Note: This module is tenant-agnostic. Multi-tenancy should be handled
|
|
4
|
+
at the API layer by using separate indices per account (e.g.,
|
|
5
|
+
gnosisllm-{account_id}-knowledge) rather than filtering by account_id.
|
|
6
|
+
"""
|
|
2
7
|
|
|
3
8
|
from __future__ import annotations
|
|
4
9
|
|
|
5
10
|
import math
|
|
6
11
|
import re
|
|
7
12
|
import time
|
|
13
|
+
import warnings
|
|
8
14
|
from typing import Any, Callable
|
|
9
15
|
|
|
10
16
|
from gnosisllm_knowledge.backends.memory.indexer import MemoryIndexer
|
|
@@ -147,7 +153,7 @@ class MemorySearcher:
|
|
|
147
153
|
|
|
148
154
|
for doc in filtered_docs:
|
|
149
155
|
content = doc.get("content", "").lower()
|
|
150
|
-
title = doc.get("title"
|
|
156
|
+
title = (doc.get("title") or "").lower()
|
|
151
157
|
|
|
152
158
|
# Simple TF scoring
|
|
153
159
|
content_score = sum(
|
|
@@ -209,7 +215,7 @@ class MemorySearcher:
|
|
|
209
215
|
for doc in filtered_docs:
|
|
210
216
|
# Keyword score
|
|
211
217
|
content = doc.get("content", "").lower()
|
|
212
|
-
title = doc.get("title"
|
|
218
|
+
title = (doc.get("title") or "").lower()
|
|
213
219
|
keyword_score = sum(content.count(term) for term in query_terms)
|
|
214
220
|
keyword_score += sum(title.count(term) for term in query_terms) * 2
|
|
215
221
|
|
|
@@ -348,6 +354,101 @@ class MemorySearcher:
|
|
|
348
354
|
results.append(result)
|
|
349
355
|
return results
|
|
350
356
|
|
|
357
|
+
async def list_documents(
|
|
358
|
+
self,
|
|
359
|
+
index_name: str,
|
|
360
|
+
*,
|
|
361
|
+
source_id: str | None = None,
|
|
362
|
+
collection_id: str | None = None,
|
|
363
|
+
limit: int = 50,
|
|
364
|
+
offset: int = 0,
|
|
365
|
+
) -> dict[str, Any]:
|
|
366
|
+
"""List documents with optional filters.
|
|
367
|
+
|
|
368
|
+
Args:
|
|
369
|
+
index_name: Index to query.
|
|
370
|
+
source_id: Optional source ID filter.
|
|
371
|
+
collection_id: Optional collection ID filter.
|
|
372
|
+
limit: Maximum documents to return.
|
|
373
|
+
offset: Number of documents to skip.
|
|
374
|
+
|
|
375
|
+
Returns:
|
|
376
|
+
Dictionary with documents, total, limit, offset.
|
|
377
|
+
"""
|
|
378
|
+
documents = self._indexer.get_all(index_name)
|
|
379
|
+
|
|
380
|
+
# Apply filters
|
|
381
|
+
if source_id:
|
|
382
|
+
documents = [d for d in documents if d.get("source_id") == source_id]
|
|
383
|
+
if collection_id:
|
|
384
|
+
documents = [d for d in documents if d.get("collection_id") == collection_id]
|
|
385
|
+
|
|
386
|
+
total = len(documents)
|
|
387
|
+
|
|
388
|
+
# Apply pagination
|
|
389
|
+
paginated = documents[offset : offset + limit]
|
|
390
|
+
|
|
391
|
+
return {
|
|
392
|
+
"documents": paginated,
|
|
393
|
+
"total": total,
|
|
394
|
+
"limit": limit,
|
|
395
|
+
"offset": offset,
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
def count(self, index_name: str) -> int:
|
|
399
|
+
"""Count documents in index.
|
|
400
|
+
|
|
401
|
+
Args:
|
|
402
|
+
index_name: Index to count.
|
|
403
|
+
|
|
404
|
+
Returns:
|
|
405
|
+
Document count.
|
|
406
|
+
"""
|
|
407
|
+
return self._indexer.count(index_name)
|
|
408
|
+
|
|
409
|
+
async def get_collections(self, index_name: str) -> list[dict[str, Any]]:
|
|
410
|
+
"""Get unique collections with document counts.
|
|
411
|
+
|
|
412
|
+
Args:
|
|
413
|
+
index_name: Index to query.
|
|
414
|
+
|
|
415
|
+
Returns:
|
|
416
|
+
List of collections with id, name, and document_count.
|
|
417
|
+
"""
|
|
418
|
+
documents = self._indexer.get_all(index_name)
|
|
419
|
+
collections: dict[str, dict[str, Any]] = {}
|
|
420
|
+
|
|
421
|
+
for doc in documents:
|
|
422
|
+
col_id = doc.get("collection_id")
|
|
423
|
+
if not col_id:
|
|
424
|
+
continue
|
|
425
|
+
|
|
426
|
+
if col_id not in collections:
|
|
427
|
+
collections[col_id] = {
|
|
428
|
+
"id": col_id,
|
|
429
|
+
"name": doc.get("collection_name") or col_id,
|
|
430
|
+
"document_count": 0,
|
|
431
|
+
}
|
|
432
|
+
collections[col_id]["document_count"] += 1
|
|
433
|
+
|
|
434
|
+
return list(collections.values())
|
|
435
|
+
|
|
436
|
+
async def get_stats(self, index_name: str) -> dict[str, Any]:
|
|
437
|
+
"""Get index statistics.
|
|
438
|
+
|
|
439
|
+
Args:
|
|
440
|
+
index_name: Index to query.
|
|
441
|
+
|
|
442
|
+
Returns:
|
|
443
|
+
Dictionary with document_count and index info.
|
|
444
|
+
"""
|
|
445
|
+
count = self._indexer.count(index_name)
|
|
446
|
+
return {
|
|
447
|
+
"document_count": count,
|
|
448
|
+
"index_name": index_name,
|
|
449
|
+
"exists": count > 0 or index_name in self._indexer._indices,
|
|
450
|
+
}
|
|
451
|
+
|
|
351
452
|
def _apply_filters(
|
|
352
453
|
self,
|
|
353
454
|
documents: list[dict[str, Any]],
|
|
@@ -355,6 +456,10 @@ class MemorySearcher:
|
|
|
355
456
|
) -> list[dict[str, Any]]:
|
|
356
457
|
"""Apply query filters to documents.
|
|
357
458
|
|
|
459
|
+
Note:
|
|
460
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
461
|
+
at the API layer by using separate indices per account.
|
|
462
|
+
|
|
358
463
|
Args:
|
|
359
464
|
documents: Documents to filter.
|
|
360
465
|
query: Query with filter parameters.
|
|
@@ -364,10 +469,6 @@ class MemorySearcher:
|
|
|
364
469
|
"""
|
|
365
470
|
filtered = documents
|
|
366
471
|
|
|
367
|
-
# Account filter
|
|
368
|
-
if query.account_id:
|
|
369
|
-
filtered = [d for d in filtered if d.get("account_id") == query.account_id]
|
|
370
|
-
|
|
371
472
|
# Collection filter
|
|
372
473
|
if query.collection_ids:
|
|
373
474
|
filtered = [
|
|
@@ -378,9 +479,9 @@ class MemorySearcher:
|
|
|
378
479
|
if query.source_ids:
|
|
379
480
|
filtered = [d for d in filtered if d.get("source_id") in query.source_ids]
|
|
380
481
|
|
|
381
|
-
# Custom filters
|
|
382
|
-
if query.
|
|
383
|
-
for field, value in query.
|
|
482
|
+
# Custom metadata filters
|
|
483
|
+
if query.metadata_filters:
|
|
484
|
+
for field, value in query.metadata_filters.items():
|
|
384
485
|
if isinstance(value, list):
|
|
385
486
|
filtered = [d for d in filtered if d.get(field) in value]
|
|
386
487
|
else:
|