gnosisllm-knowledge 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/__init__.py +152 -0
- gnosisllm_knowledge/api/__init__.py +5 -0
- gnosisllm_knowledge/api/knowledge.py +548 -0
- gnosisllm_knowledge/backends/__init__.py +26 -0
- gnosisllm_knowledge/backends/memory/__init__.py +9 -0
- gnosisllm_knowledge/backends/memory/indexer.py +384 -0
- gnosisllm_knowledge/backends/memory/searcher.py +516 -0
- gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
- gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
- gnosisllm_knowledge/backends/opensearch/config.py +195 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
- gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
- gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
- gnosisllm_knowledge/chunking/__init__.py +9 -0
- gnosisllm_knowledge/chunking/fixed.py +138 -0
- gnosisllm_knowledge/chunking/sentence.py +239 -0
- gnosisllm_knowledge/cli/__init__.py +18 -0
- gnosisllm_knowledge/cli/app.py +509 -0
- gnosisllm_knowledge/cli/commands/__init__.py +7 -0
- gnosisllm_knowledge/cli/commands/agentic.py +529 -0
- gnosisllm_knowledge/cli/commands/load.py +369 -0
- gnosisllm_knowledge/cli/commands/search.py +440 -0
- gnosisllm_knowledge/cli/commands/setup.py +228 -0
- gnosisllm_knowledge/cli/display/__init__.py +5 -0
- gnosisllm_knowledge/cli/display/service.py +555 -0
- gnosisllm_knowledge/cli/utils/__init__.py +5 -0
- gnosisllm_knowledge/cli/utils/config.py +207 -0
- gnosisllm_knowledge/core/__init__.py +87 -0
- gnosisllm_knowledge/core/domain/__init__.py +43 -0
- gnosisllm_knowledge/core/domain/document.py +240 -0
- gnosisllm_knowledge/core/domain/result.py +176 -0
- gnosisllm_knowledge/core/domain/search.py +327 -0
- gnosisllm_knowledge/core/domain/source.py +139 -0
- gnosisllm_knowledge/core/events/__init__.py +23 -0
- gnosisllm_knowledge/core/events/emitter.py +216 -0
- gnosisllm_knowledge/core/events/types.py +226 -0
- gnosisllm_knowledge/core/exceptions.py +407 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
- gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
- gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
- gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
- gnosisllm_knowledge/core/interfaces/loader.py +102 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
- gnosisllm_knowledge/core/interfaces/setup.py +164 -0
- gnosisllm_knowledge/fetchers/__init__.py +12 -0
- gnosisllm_knowledge/fetchers/config.py +77 -0
- gnosisllm_knowledge/fetchers/http.py +167 -0
- gnosisllm_knowledge/fetchers/neoreader.py +204 -0
- gnosisllm_knowledge/loaders/__init__.py +13 -0
- gnosisllm_knowledge/loaders/base.py +399 -0
- gnosisllm_knowledge/loaders/factory.py +202 -0
- gnosisllm_knowledge/loaders/sitemap.py +285 -0
- gnosisllm_knowledge/loaders/website.py +57 -0
- gnosisllm_knowledge/py.typed +0 -0
- gnosisllm_knowledge/services/__init__.py +9 -0
- gnosisllm_knowledge/services/indexing.py +387 -0
- gnosisllm_knowledge/services/search.py +349 -0
- gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
- gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
- gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Search backend implementations."""
|
|
2
|
+
|
|
3
|
+
from gnosisllm_knowledge.backends.memory import MemoryIndexer, MemorySearcher
|
|
4
|
+
from gnosisllm_knowledge.backends.opensearch import (
|
|
5
|
+
AgenticSearchFallback,
|
|
6
|
+
OpenSearchAgenticSearcher,
|
|
7
|
+
OpenSearchConfig,
|
|
8
|
+
OpenSearchIndexer,
|
|
9
|
+
OpenSearchKnowledgeSearcher,
|
|
10
|
+
OpenSearchSetupAdapter,
|
|
11
|
+
)
|
|
12
|
+
from gnosisllm_knowledge.backends.opensearch.queries import QueryBuilder
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
# OpenSearch
|
|
16
|
+
"OpenSearchConfig",
|
|
17
|
+
"OpenSearchIndexer",
|
|
18
|
+
"OpenSearchKnowledgeSearcher",
|
|
19
|
+
"OpenSearchSetupAdapter",
|
|
20
|
+
"OpenSearchAgenticSearcher",
|
|
21
|
+
"AgenticSearchFallback",
|
|
22
|
+
"QueryBuilder",
|
|
23
|
+
# Memory (for testing)
|
|
24
|
+
"MemoryIndexer",
|
|
25
|
+
"MemorySearcher",
|
|
26
|
+
]
|
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
"""In-memory document indexer for testing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from typing import Any, AsyncIterator, Callable, Sequence
|
|
9
|
+
|
|
10
|
+
from gnosisllm_knowledge.core.domain.document import Document
|
|
11
|
+
from gnosisllm_knowledge.core.domain.result import BatchResult, IndexResult
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class MemoryIndexer:
    """In-memory document indexer for testing.

    Stores documents in a nested dictionary (index name -> doc ID -> stored
    body) for fast testing without requiring an external OpenSearch instance.

    Example:
        ```python
        indexer = MemoryIndexer()

        # Index documents
        await indexer.index(document, "test-index")

        # Bulk index
        await indexer.bulk_index(documents, "test-index")

        # Get stored documents
        docs = indexer.get_all("test-index")
        ```
    """

    def __init__(
        self,
        embedding_fn: Callable[[str], list[float]] | None = None,
    ) -> None:
        """Initialize the memory indexer.

        Args:
            embedding_fn: Optional function to generate embeddings.
        """
        # index name -> doc_id -> stored document body
        self._indices: dict[str, dict[str, dict[str, Any]]] = defaultdict(dict)
        self._embedding_fn = embedding_fn

    async def index(
        self,
        document: Document,
        index_name: str,
        **options: Any,
    ) -> IndexResult:
        """Index a single document.

        Args:
            document: Document to index.
            index_name: Target index name.
            **options: Additional options (ignored by the memory backend).

        Returns:
            Index result.
        """
        doc_body = self._prepare_document(document)

        # Only compute an embedding when one was not already provided.
        if self._embedding_fn and "embedding" not in doc_body:
            doc_body["embedding"] = self._embedding_fn(document.content)

        self._indices[index_name][document.doc_id] = doc_body

        return IndexResult(
            success=True,
            document_id=document.doc_id,
            index_name=index_name,
            indexed_count=1,
            failed_count=0,
        )

    async def bulk_index(
        self,
        documents: Sequence[Document],
        index_name: str,
        **options: Any,
    ) -> IndexResult:
        """Index multiple documents.

        Args:
            documents: Documents to index.
            index_name: Target index name.
            **options: Additional options (ignored by the memory backend).

        Returns:
            Index result.
        """
        indexed = 0
        for doc in documents:
            await self.index(doc, index_name)
            indexed += 1

        return IndexResult(
            success=True,
            index_name=index_name,
            indexed_count=indexed,
            failed_count=0,
        )

    async def bulk_index_streaming(
        self,
        documents: AsyncIterator[Document],
        index_name: str,
        *,
        batch_size: int = 500,
        max_concurrent_batches: int = 3,
        on_batch_complete: Callable[[BatchResult], None] | None = None,
    ) -> IndexResult:
        """Stream-index documents in batches.

        Args:
            documents: Async iterator of documents.
            index_name: Target index name.
            batch_size: Documents per batch.
            max_concurrent_batches: Ignored for memory backend.
            on_batch_complete: Callback for batch completion.

        Returns:
            Index result.
        """
        indexed = 0
        current_batch: list[Document] = []

        async for doc in documents:
            current_batch.append(doc)
            if len(current_batch) >= batch_size:
                indexed += await self._flush_batch(
                    current_batch, index_name, on_batch_complete
                )
                current_batch = []

        # Flush the trailing partial batch, if any.
        if current_batch:
            indexed += await self._flush_batch(
                current_batch, index_name, on_batch_complete
            )

        return IndexResult(
            success=True,
            index_name=index_name,
            indexed_count=indexed,
            failed_count=0,
        )

    async def _flush_batch(
        self,
        batch: list[Document],
        index_name: str,
        on_batch_complete: Callable[[BatchResult], None] | None,
    ) -> int:
        """Index one batch and report completion.

        Args:
            batch: Documents to index.
            index_name: Target index name.
            on_batch_complete: Optional completion callback.

        Returns:
            Number of documents in the batch (all succeed in memory).
        """
        await self.bulk_index(batch, index_name)
        if on_batch_complete:
            on_batch_complete(
                BatchResult(
                    total=len(batch),
                    succeeded=len(batch),
                    failed=0,
                )
            )
        return len(batch)

    async def upsert(
        self,
        document: Document,
        index_name: str,
    ) -> IndexResult:
        """Upsert a document.

        In-memory storage is keyed by doc ID, so a plain index is an upsert.

        Args:
            document: Document to upsert.
            index_name: Target index name.

        Returns:
            Index result.
        """
        return await self.index(document, index_name)

    async def delete(
        self,
        doc_id: str,
        index_name: str,
    ) -> bool:
        """Delete a document by ID.

        Args:
            doc_id: Document ID.
            index_name: Index name.

        Returns:
            True if deleted.
        """
        if index_name in self._indices and doc_id in self._indices[index_name]:
            del self._indices[index_name][doc_id]
            return True
        return False

    async def bulk_delete(
        self,
        doc_ids: Sequence[str],
        index_name: str,
    ) -> int:
        """Delete multiple documents.

        Args:
            doc_ids: Document IDs.
            index_name: Index name.

        Returns:
            Count of deleted documents.
        """
        deleted = 0
        for doc_id in doc_ids:
            if await self.delete(doc_id, index_name):
                deleted += 1
        return deleted

    async def delete_by_query(
        self,
        query: dict[str, Any],
        index_name: str,
    ) -> int:
        """Delete documents matching query.

        Simple implementation that only evaluates ``term`` clauses found
        under ``query.bool.filter``; an empty filter list matches everything.

        Args:
            query: Query dictionary with filters.
            index_name: Index name.

        Returns:
            Count of deleted documents.
        """
        if index_name not in self._indices:
            return 0

        filters = query.get("query", {}).get("bool", {}).get("filter", [])

        # Collect matches first, then delete, to avoid mutating while iterating.
        to_delete = [
            doc_id
            for doc_id, doc in self._indices[index_name].items()
            if self._matches_term_filters(doc, filters)
        ]

        for doc_id in to_delete:
            del self._indices[index_name][doc_id]

        return len(to_delete)

    @staticmethod
    def _matches_term_filters(doc: dict[str, Any], filters: list[Any]) -> bool:
        """Return True if ``doc`` satisfies every ``term`` clause in ``filters``.

        Non-``term`` clauses are ignored, matching the original behavior.
        """
        for f in filters:
            if "term" not in f:
                continue
            for field, value in f["term"].items():
                if doc.get(field) != value:
                    return False
        return True

    async def ensure_index(
        self,
        index_name: str,
        **options: Any,
    ) -> bool:
        """Ensure index exists.

        Args:
            index_name: Index name.
            **options: Ignored.

        Returns:
            True if created, False if it already existed.
        """
        if index_name not in self._indices:
            self._indices[index_name] = {}
            return True
        return False

    async def delete_index(self, index_name: str) -> bool:
        """Delete an index.

        Args:
            index_name: Index to delete.

        Returns:
            True if deleted.
        """
        if index_name in self._indices:
            del self._indices[index_name]
            return True
        return False

    async def refresh_index(self, index_name: str) -> bool:
        """Refresh index (no-op for memory).

        Args:
            index_name: Index name.

        Returns:
            Always True.
        """
        return True

    def get_all(self, index_name: str) -> list[dict[str, Any]]:
        """Get all documents in an index.

        Args:
            index_name: Index name.

        Returns:
            List of document dictionaries (empty for unknown indices).
        """
        return list(self._indices.get(index_name, {}).values())

    def get_by_id(self, doc_id: str, index_name: str) -> dict[str, Any] | None:
        """Get document by ID.

        Args:
            doc_id: Document ID.
            index_name: Index name.

        Returns:
            Document dictionary or None.
        """
        return self._indices.get(index_name, {}).get(doc_id)

    def count(self, index_name: str) -> int:
        """Count documents in index.

        Args:
            index_name: Index name.

        Returns:
            Document count (0 for unknown indices).
        """
        return len(self._indices.get(index_name, {}))

    def clear_all(self) -> None:
        """Clear all indices."""
        self._indices.clear()

    def _prepare_document(self, document: Document) -> dict[str, Any]:
        """Prepare document for storage.

        Args:
            document: Document to prepare.

        Returns:
            Dictionary for storage.
        """
        # Fall back to hashing the content when no hash was supplied.
        content_hash = document.content_hash
        if not content_hash:
            content_hash = hashlib.sha256(document.content.encode()).hexdigest()

        now = datetime.now(timezone.utc)

        return {
            "id": document.doc_id,
            "content": document.content,
            "url": document.url,
            "title": document.title,
            "source": document.source,
            "account_id": document.account_id,
            "collection_id": document.collection_id,
            "source_id": document.source_id,
            "chunk_index": document.chunk_index,
            "total_chunks": document.total_chunks,
            "parent_doc_id": document.parent_doc_id,
            "quality_score": document.quality_score,
            "language": document.language,
            "content_hash": content_hash,
            "word_count": document.word_count or len(document.content.split()),
            "status": document.status.value,
            "pii_detected": document.pii_detected,
            "pii_redacted": document.pii_redacted,
            "metadata": document.metadata,
            "created_at": document.created_at.isoformat() if document.created_at else now.isoformat(),
            "indexed_at": now.isoformat(),
        }
|