gnosisllm_knowledge-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,26 @@
+ """Search backend implementations."""
+
+ from gnosisllm_knowledge.backends.memory import MemoryIndexer, MemorySearcher
+ from gnosisllm_knowledge.backends.opensearch import (
+     AgenticSearchFallback,
+     OpenSearchAgenticSearcher,
+     OpenSearchConfig,
+     OpenSearchIndexer,
+     OpenSearchKnowledgeSearcher,
+     OpenSearchSetupAdapter,
+ )
+ from gnosisllm_knowledge.backends.opensearch.queries import QueryBuilder
+
+ __all__ = [
+     # OpenSearch
+     "OpenSearchConfig",
+     "OpenSearchIndexer",
+     "OpenSearchKnowledgeSearcher",
+     "OpenSearchSetupAdapter",
+     "OpenSearchAgenticSearcher",
+     "AgenticSearchFallback",
+     "QueryBuilder",
+     # Memory (for testing)
+     "MemoryIndexer",
+     "MemorySearcher",
+ ]
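The `backends` package flattens the public classes of both backends into one import path, so calling code does not need to know which submodule a class lives in. Below is a minimal sketch of how a caller might pick a backend through that flat namespace; the `OpenSearchConfig`/`OpenSearchIndexer` construction is a guess for illustration only, since their `__init__` signatures are not part of this diff (the `MemoryIndexer()` call does match the constructor shown later in this diff).

```python
# Sketch only: the OpenSearch wiring below is assumed, not taken from this diff.
from gnosisllm_knowledge.backends import MemoryIndexer, OpenSearchConfig, OpenSearchIndexer


def make_indexer(use_memory: bool):
    if use_memory:
        # MemoryIndexer's only constructor argument is an optional embedding_fn.
        return MemoryIndexer()
    config = OpenSearchConfig()        # hypothetical: real parameters not shown here
    return OpenSearchIndexer(config)   # hypothetical: real signature not shown here
```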
@@ -0,0 +1,9 @@
+ """In-memory backend for testing."""
+
+ from gnosisllm_knowledge.backends.memory.indexer import MemoryIndexer
+ from gnosisllm_knowledge.backends.memory.searcher import MemorySearcher
+
+ __all__ = [
+     "MemoryIndexer",
+     "MemorySearcher",
+ ]
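This re-export exists so tests can grab both halves of the in-memory backend from one place. A possible pytest fixture built on it, assuming `MemorySearcher` takes no constructor arguments (its module is listed above but not shown in this section):

```python
import pytest

from gnosisllm_knowledge.backends.memory import MemoryIndexer, MemorySearcher


@pytest.fixture
def memory_backend():
    # MemoryIndexer() matches the constructor shown in indexer.py below;
    # the no-argument MemorySearcher() call is an assumption.
    return MemoryIndexer(), MemorySearcher()
```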
@@ -0,0 +1,384 @@
+ """In-memory document indexer for testing."""
+
+ from __future__ import annotations
+
+ import hashlib
+ from collections import defaultdict
+ from datetime import datetime, timezone
+ from typing import Any, AsyncIterator, Callable, Sequence
+
+ from gnosisllm_knowledge.core.domain.document import Document
+ from gnosisllm_knowledge.core.domain.result import BatchResult, IndexResult
+
+
+ class MemoryIndexer:
+     """In-memory document indexer for testing.
+
+     Stores documents in a dictionary for fast testing without
+     requiring an external OpenSearch instance.
+
+     Example:
+         ```python
+         indexer = MemoryIndexer()
+
+         # Index documents
+         await indexer.index(document, "test-index")
+
+         # Bulk index
+         await indexer.bulk_index(documents, "test-index")
+
+         # Get stored documents
+         docs = indexer.get_all("test-index")
+         ```
+     """
+
+     def __init__(
+         self,
+         embedding_fn: Callable[[str], list[float]] | None = None,
+     ) -> None:
+         """Initialize the memory indexer.
+
+         Args:
+             embedding_fn: Optional function to generate embeddings.
+         """
+         self._indices: dict[str, dict[str, dict[str, Any]]] = defaultdict(dict)
+         self._embedding_fn = embedding_fn
+
+     async def index(
+         self,
+         document: Document,
+         index_name: str,
+         **options: Any,
+     ) -> IndexResult:
+         """Index a single document.
+
+         Args:
+             document: Document to index.
+             index_name: Target index name.
+             **options: Additional options.
+
+         Returns:
+             Index result.
+         """
+         doc_body = self._prepare_document(document)
+
+         if self._embedding_fn and "embedding" not in doc_body:
+             doc_body["embedding"] = self._embedding_fn(document.content)
+
+         self._indices[index_name][document.doc_id] = doc_body
+
+         return IndexResult(
+             success=True,
+             document_id=document.doc_id,
+             index_name=index_name,
+             indexed_count=1,
+             failed_count=0,
+         )
+
+     async def bulk_index(
+         self,
+         documents: Sequence[Document],
+         index_name: str,
+         **options: Any,
+     ) -> IndexResult:
+         """Index multiple documents.
+
+         Args:
+             documents: Documents to index.
+             index_name: Target index name.
+             **options: Additional options.
+
+         Returns:
+             Index result.
+         """
+         indexed = 0
+         for doc in documents:
+             await self.index(doc, index_name)
+             indexed += 1
+
+         return IndexResult(
+             success=True,
+             index_name=index_name,
+             indexed_count=indexed,
+             failed_count=0,
+         )
+
+     async def bulk_index_streaming(
+         self,
+         documents: AsyncIterator[Document],
+         index_name: str,
+         *,
+         batch_size: int = 500,
+         max_concurrent_batches: int = 3,
+         on_batch_complete: Callable[[BatchResult], None] | None = None,
+     ) -> IndexResult:
+         """Stream-index documents.
+
+         Args:
+             documents: Async iterator of documents.
+             index_name: Target index name.
+             batch_size: Documents per batch.
+             max_concurrent_batches: Ignored for memory backend.
+             on_batch_complete: Callback for batch completion.
+
+         Returns:
+             Index result.
+         """
+         indexed = 0
+         batch_num = 0
+         current_batch: list[Document] = []
+
+         async for doc in documents:
+             current_batch.append(doc)
+
+             if len(current_batch) >= batch_size:
+                 batch_num += 1
+                 await self.bulk_index(current_batch, index_name)
+                 indexed += len(current_batch)
+
+                 if on_batch_complete:
+                     on_batch_complete(
+                         BatchResult(
+                             total=len(current_batch),
+                             succeeded=len(current_batch),
+                             failed=0,
+                         )
+                     )
+
+                 current_batch = []
+
+         # Process remaining
+         if current_batch:
+             batch_num += 1
+             await self.bulk_index(current_batch, index_name)
+             indexed += len(current_batch)
+
+             if on_batch_complete:
+                 on_batch_complete(
+                     BatchResult(
+                         total=len(current_batch),
+                         succeeded=len(current_batch),
+                         failed=0,
+                     )
+                 )
+
+         return IndexResult(
+             success=True,
+             index_name=index_name,
+             indexed_count=indexed,
+             failed_count=0,
+         )
+
+     async def upsert(
+         self,
+         document: Document,
+         index_name: str,
+     ) -> IndexResult:
+         """Upsert a document.
+
+         Args:
+             document: Document to upsert.
+             index_name: Target index name.
+
+         Returns:
+             Index result.
+         """
+         return await self.index(document, index_name)
+
+     async def delete(
+         self,
+         doc_id: str,
+         index_name: str,
+     ) -> bool:
+         """Delete a document by ID.
+
+         Args:
+             doc_id: Document ID.
+             index_name: Index name.
+
+         Returns:
+             True if deleted.
+         """
+         if index_name in self._indices and doc_id in self._indices[index_name]:
+             del self._indices[index_name][doc_id]
+             return True
+         return False
+
+     async def bulk_delete(
+         self,
+         doc_ids: Sequence[str],
+         index_name: str,
+     ) -> int:
+         """Delete multiple documents.
+
+         Args:
+             doc_ids: Document IDs.
+             index_name: Index name.
+
+         Returns:
+             Count of deleted documents.
+         """
+         deleted = 0
+         for doc_id in doc_ids:
+             if await self.delete(doc_id, index_name):
+                 deleted += 1
+         return deleted
+
+     async def delete_by_query(
+         self,
+         query: dict[str, Any],
+         index_name: str,
+     ) -> int:
+         """Delete documents matching query.
+
+         Simple implementation that checks filter terms.
+
+         Args:
+             query: Query dictionary with filters.
+             index_name: Index name.
+
+         Returns:
+             Count of deleted documents.
+         """
+         if index_name not in self._indices:
+             return 0
+
+         filters = query.get("query", {}).get("bool", {}).get("filter", [])
+         to_delete = []
+
+         for doc_id, doc in self._indices[index_name].items():
+             matches = True
+             for f in filters:
+                 if "term" in f:
+                     for field, value in f["term"].items():
+                         if doc.get(field) != value:
+                             matches = False
+                             break
+             if matches:
+                 to_delete.append(doc_id)
+
+         for doc_id in to_delete:
+             del self._indices[index_name][doc_id]
+
+         return len(to_delete)
+
+     async def ensure_index(
+         self,
+         index_name: str,
+         **options: Any,
+     ) -> bool:
+         """Ensure index exists.
+
+         Args:
+             index_name: Index name.
+             **options: Ignored.
+
+         Returns:
+             True if created.
+         """
+         if index_name not in self._indices:
+             self._indices[index_name] = {}
+             return True
+         return False
+
+     async def delete_index(self, index_name: str) -> bool:
+         """Delete an index.
+
+         Args:
+             index_name: Index to delete.
+
+         Returns:
+             True if deleted.
+         """
+         if index_name in self._indices:
+             del self._indices[index_name]
+             return True
+         return False
+
+     async def refresh_index(self, index_name: str) -> bool:
+         """Refresh index (no-op for memory).
+
+         Args:
+             index_name: Index name.
+
+         Returns:
+             Always True.
+         """
+         return True
+
+     def get_all(self, index_name: str) -> list[dict[str, Any]]:
+         """Get all documents in an index.
+
+         Args:
+             index_name: Index name.
+
+         Returns:
+             List of document dictionaries.
+         """
+         return list(self._indices.get(index_name, {}).values())
+
+     def get_by_id(self, doc_id: str, index_name: str) -> dict[str, Any] | None:
+         """Get document by ID.
+
+         Args:
+             doc_id: Document ID.
+             index_name: Index name.
+
+         Returns:
+             Document dictionary or None.
+         """
+         return self._indices.get(index_name, {}).get(doc_id)
+
+     def count(self, index_name: str) -> int:
+         """Count documents in index.
+
+         Args:
+             index_name: Index name.
+
+         Returns:
+             Document count.
+         """
+         return len(self._indices.get(index_name, {}))
+
+     def clear_all(self) -> None:
+         """Clear all indices."""
+         self._indices.clear()
+
+     def _prepare_document(self, document: Document) -> dict[str, Any]:
+         """Prepare document for storage.
+
+         Args:
+             document: Document to prepare.
+
+         Returns:
+             Dictionary for storage.
+         """
+         content_hash = document.content_hash
+         if not content_hash:
+             content_hash = hashlib.sha256(document.content.encode()).hexdigest()
+
+         now = datetime.now(timezone.utc)
+
+         return {
+             "id": document.doc_id,
+             "content": document.content,
+             "url": document.url,
+             "title": document.title,
+             "source": document.source,
+             "account_id": document.account_id,
+             "collection_id": document.collection_id,
+             "source_id": document.source_id,
+             "chunk_index": document.chunk_index,
+             "total_chunks": document.total_chunks,
+             "parent_doc_id": document.parent_doc_id,
+             "quality_score": document.quality_score,
+             "language": document.language,
+             "content_hash": content_hash,
+             "word_count": document.word_count or len(document.content.split()),
+             "status": document.status.value,
+             "pii_detected": document.pii_detected,
+             "pii_redacted": document.pii_redacted,
+             "metadata": document.metadata,
+             "created_at": document.created_at.isoformat() if document.created_at else now.isoformat(),
+             "indexed_at": now.isoformat(),
+         }
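Taken together, the module gives tests an async indexer whose state can be inspected synchronously (`get_all`, `get_by_id`, `count`). A rough end-to-end sketch of that workflow follows; the `Document(...)` construction is an assumption, since `core/domain/document.py` is listed above but not shown in this section, and the real constructor may require different fields or provide defaults for the ones `_prepare_document` reads (status, metadata, and so on).

```python
# Sketch only: Document's constructor is assumed; everything else uses the API shown above.
import asyncio

from gnosisllm_knowledge.backends.memory import MemoryIndexer
from gnosisllm_knowledge.core.domain.document import Document


async def main() -> None:
    indexer = MemoryIndexer(embedding_fn=lambda text: [float(len(text))])  # toy embedding

    doc = Document(doc_id="doc-1", content="hello world", source="unit-test")  # hypothetical fields
    await indexer.index(doc, "test-index")
    assert indexer.count("test-index") == 1

    # delete_by_query only understands bool/filter/term clauses, per the implementation above.
    deleted = await indexer.delete_by_query(
        {"query": {"bool": {"filter": [{"term": {"source": "unit-test"}}]}}},
        "test-index",
    )
    assert deleted == 1


asyncio.run(main())
```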