gnosisllm_knowledge-0.2.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (64)
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,387 @@
+"""Knowledge indexing service."""
+
+from __future__ import annotations
+
+import logging
+import uuid
+from typing import TYPE_CHECKING, Any
+
+from gnosisllm_knowledge.core.domain.document import Document, DocumentStatus
+from gnosisllm_knowledge.core.domain.result import IndexResult, LoadResult
+from gnosisllm_knowledge.core.domain.source import SourceConfig
+from gnosisllm_knowledge.core.events.emitter import EventEmitter
+from gnosisllm_knowledge.core.events.types import (
+    BatchCompletedEvent,
+    BatchStartedEvent,
+    DocumentIndexedEvent,
+    EventType,
+)
+from gnosisllm_knowledge.core.exceptions import IndexError, LoadError
+
+if TYPE_CHECKING:
+    from gnosisllm_knowledge.core.interfaces.chunker import ITextChunker
+    from gnosisllm_knowledge.core.interfaces.indexer import IDocumentIndexer
+    from gnosisllm_knowledge.core.interfaces.loader import IContentLoader
+
+logger = logging.getLogger(__name__)
+
+
+class KnowledgeIndexingService:
+    """Service for loading, chunking, and indexing knowledge documents.
+
+    Orchestrates the document ingestion pipeline from source to index.
+
+    Example:
+        ```python
+        service = KnowledgeIndexingService(
+            loader=WebsiteLoader(fetcher),
+            chunker=SentenceChunker(),
+            indexer=OpenSearchIndexer(client, config),
+        )
+
+        # Load and index from a URL
+        result = await service.load_and_index(
+            source="https://docs.example.com",
+            index_name="knowledge",
+            collection_id="docs",
+        )
+        ```
+    """
+
+    def __init__(
+        self,
+        loader: IContentLoader,
+        chunker: ITextChunker,
+        indexer: IDocumentIndexer,
+        events: EventEmitter | None = None,
+    ) -> None:
+        """Initialize the indexing service.
+
+        Args:
+            loader: Content loader for fetching documents.
+            chunker: Text chunker for splitting documents.
+            indexer: Document indexer for storing documents.
+            events: Optional event emitter for progress tracking.
+
+        Note:
+            Embeddings are generated automatically by OpenSearch ingest pipeline.
+            No Python-side embedding function is needed.
+        """
+        self._loader = loader
+        self._chunker = chunker
+        self._indexer = indexer
+        self._events = events or EventEmitter()
+
+    @property
+    def events(self) -> EventEmitter:
+        """Get the event emitter."""
+        return self._events
+
+    async def load_and_index(
+        self,
+        source: str,
+        index_name: str,
+        *,
+        account_id: str | None = None,
+        collection_id: str | None = None,
+        source_id: str | None = None,
+        batch_size: int = 100,
+        **options: Any,
+    ) -> IndexResult:
+        """Load content from source and index it.
+
+        Args:
+            source: Source URL or path.
+            index_name: Target index name.
+            account_id: Account ID for multi-tenancy.
+            collection_id: Collection ID.
+            source_id: Source ID (auto-generated if not provided).
+            batch_size: Documents per batch.
+            **options: Additional loader/indexer options.
+
+        Returns:
+            Index result with counts.
+        """
+        source_id = source_id or str(uuid.uuid4())
+
+        # Emit batch started event
+        await self._events.emit_async(
+            EventType.BATCH_STARTED,
+            BatchStartedEvent(
+                source=source,
+                source_id=source_id,
+            ),
+        )
+
+        try:
+            # Load documents
+            load_result = await self._loader.load(source, **options)
+
+            if not load_result.success:
+                raise LoadError(
+                    message=f"Failed to load from {source}",
+                    details={"errors": load_result.errors},
+                )
+
+            # Process and index documents
+            total_indexed = 0
+            total_failed = 0
+            errors: list[str] = []
+
+            batch: list[Document] = []
+
+            for doc in load_result.documents:
+                # Chunk the document
+                chunks = self._chunker.chunk(doc.content)
+
+                for i, chunk in enumerate(chunks):
+                    # Create chunk document
+                    chunk_doc = Document(
+                        id=f"{doc.id}-chunk-{i}",
+                        content=chunk.content,
+                        url=doc.url,
+                        title=doc.title,
+                        source=source,
+                        account_id=account_id,
+                        collection_id=collection_id,
+                        source_id=source_id,
+                        chunk_index=i,
+                        total_chunks=len(chunks),
+                        parent_doc_id=doc.id,
+                        status=DocumentStatus.INDEXED,
+                        metadata=doc.metadata,
+                    )
+
+                    batch.append(chunk_doc)
+
+                    # Index batch when full
+                    if len(batch) >= batch_size:
+                        result = await self._index_batch(batch, index_name)
+                        total_indexed += result.documents_indexed
+                        total_failed += result.documents_failed
+                        if result.errors:
+                            errors.extend(result.errors)
+                        batch = []
+
+            # Index remaining documents
+            if batch:
+                result = await self._index_batch(batch, index_name)
+                total_indexed += result.documents_indexed
+                total_failed += result.documents_failed
+                if result.errors:
+                    errors.extend(result.errors)
+
+            # Emit batch completed event
+            await self._events.emit_async(
+                EventType.BATCH_COMPLETED,
+                BatchCompletedEvent(
+                    source=source,
+                    source_id=source_id,
+                    documents_indexed=total_indexed,
+                    documents_failed=total_failed,
+                    success=total_failed == 0,
+                ),
+            )
+
+            return IndexResult(
+                success=total_failed == 0,
+                documents_indexed=total_indexed,
+                documents_failed=total_failed,
+                errors=errors if errors else None,
+            )
+
+        except Exception as e:
+            logger.error(f"Failed to load and index from {source}: {e}")
+            raise IndexError(
+                message=f"Failed to index from {source}",
+                details={"source": source},
+                cause=e,
+            ) from e
+
+    async def index_documents(
+        self,
+        documents: list[Document],
+        index_name: str,
+        *,
+        chunk: bool = True,
+        batch_size: int = 100,
+        **options: Any,
+    ) -> IndexResult:
+        """Index a list of documents.
+
+        Args:
+            documents: Documents to index.
+            index_name: Target index name.
+            chunk: Whether to chunk documents.
+            batch_size: Documents per batch.
+            **options: Additional indexer options.
+
+        Returns:
+            Index result.
+        """
+        total_indexed = 0
+        total_failed = 0
+        errors: list[str] = []
+        batch: list[Document] = []
+
+        for doc in documents:
+            if chunk:
+                # Chunk the document
+                chunks = self._chunker.chunk(doc.content)
+
+                for i, chunk_obj in enumerate(chunks):
+                    chunk_doc = Document(
+                        id=f"{doc.id}-chunk-{i}",
+                        content=chunk_obj.content,
+                        url=doc.url,
+                        title=doc.title,
+                        source=doc.source,
+                        account_id=doc.account_id,
+                        collection_id=doc.collection_id,
+                        source_id=doc.source_id,
+                        chunk_index=i,
+                        total_chunks=len(chunks),
+                        parent_doc_id=doc.id,
+                        status=DocumentStatus.INDEXED,
+                        metadata=doc.metadata,
+                    )
+                    batch.append(chunk_doc)
+            else:
+                batch.append(doc)
+
+            # Index batch when full
+            if len(batch) >= batch_size:
+                result = await self._index_batch(batch, index_name)
+                total_indexed += result.documents_indexed
+                total_failed += result.documents_failed
+                if result.errors:
+                    errors.extend(result.errors)
+                batch = []
+
+        # Index remaining
+        if batch:
+            result = await self._index_batch(batch, index_name)
+            total_indexed += result.documents_indexed
+            total_failed += result.documents_failed
+            if result.errors:
+                errors.extend(result.errors)
+
+        return IndexResult(
+            success=total_failed == 0,
+            documents_indexed=total_indexed,
+            documents_failed=total_failed,
+            errors=errors if errors else None,
+        )
+
+    async def delete_source(
+        self,
+        source_id: str,
+        index_name: str,
+        account_id: str | None = None,
+    ) -> int:
+        """Delete all documents from a source.
+
+        Args:
+            source_id: Source ID to delete.
+            index_name: Index name.
+            account_id: Optional account filter.
+
+        Returns:
+            Count of deleted documents.
+        """
+        from gnosisllm_knowledge.backends.opensearch.queries import (
+            build_delete_by_source_query,
+        )
+
+        query = build_delete_by_source_query(source_id, account_id)
+        return await self._indexer.delete_by_query(query, index_name)
+
+    async def delete_collection(
+        self,
+        collection_id: str,
+        index_name: str,
+        account_id: str | None = None,
+    ) -> int:
+        """Delete all documents from a collection.
+
+        Args:
+            collection_id: Collection ID to delete.
+            index_name: Index name.
+            account_id: Optional account filter.
+
+        Returns:
+            Count of deleted documents.
+        """
+        from gnosisllm_knowledge.backends.opensearch.queries import (
+            build_delete_by_collection_query,
+        )
+
+        query = build_delete_by_collection_query(collection_id, account_id)
+        return await self._indexer.delete_by_query(query, index_name)
+
+    async def reindex_source(
+        self,
+        source: str,
+        source_id: str,
+        index_name: str,
+        *,
+        account_id: str | None = None,
+        collection_id: str | None = None,
+        **options: Any,
+    ) -> IndexResult:
+        """Reindex a source by deleting and re-loading.
+
+        Args:
+            source: Source URL or path.
+            source_id: Existing source ID.
+            index_name: Index name.
+            account_id: Account ID.
+            collection_id: Collection ID.
+            **options: Additional options.
+
+        Returns:
+            Index result.
+        """
+        # Delete existing documents
+        await self.delete_source(source_id, index_name, account_id)
+
+        # Re-index
+        return await self.load_and_index(
+            source=source,
+            index_name=index_name,
+            account_id=account_id,
+            collection_id=collection_id,
+            source_id=source_id,
+            **options,
+        )
+
+    async def _index_batch(
+        self,
+        documents: list[Document],
+        index_name: str,
+    ) -> IndexResult:
+        """Index a batch of documents.
+
+        Args:
+            documents: Documents to index.
+            index_name: Target index.
+
+        Returns:
+            Batch index result.
+        """
+        result = await self._indexer.bulk_index(documents, index_name)
+
+        # Emit events for indexed documents
+        for doc in documents:
+            if result.success:
+                await self._events.emit_async(
+                    EventType.DOCUMENT_INDEXED,
+                    DocumentIndexedEvent(
+                        document_id=doc.id,
+                        index_name=index_name,
+                        chunk_index=doc.chunk_index,
+                        total_chunks=doc.total_chunks,
+                    ),
+                )
+
+        return result
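
Taken end to end, this file wires a loader, chunker, and indexer behind a single async pipeline. A minimal caller sketch follows, assuming the collaborator classes named in the class docstring (`WebsiteLoader`, `SentenceChunker`, `OpenSearchIndexer`) are exported from the corresponding modules listed in this wheel; those import paths, and the fetcher/client/config placeholders, are assumptions rather than API confirmed by this diff.

```python
import asyncio

# Confirmed by this diff:
from gnosisllm_knowledge.services.indexing import KnowledgeIndexingService

# Assumed exports: the module files exist in the wheel's file list, and the
# class names come from the class docstring above, but neither import path
# is shown in this diff.
from gnosisllm_knowledge.chunking.sentence import SentenceChunker
from gnosisllm_knowledge.loaders.website import WebsiteLoader
from gnosisllm_knowledge.backends.opensearch.indexer import OpenSearchIndexer

fetcher = ...  # a fetcher from gnosisllm_knowledge.fetchers (construction not shown)
client = ...   # an async OpenSearch client (construction not shown in this diff)
config = ...   # backend config from backends/opensearch/config.py (not shown)


async def main() -> None:
    service = KnowledgeIndexingService(
        loader=WebsiteLoader(fetcher),
        chunker=SentenceChunker(),
        indexer=OpenSearchIndexer(client, config),
    )

    # Crawl, chunk, and bulk-index; each chunk document inherits the
    # account/collection/source identifiers for multi-tenant filtering.
    result = await service.load_and_index(
        source="https://docs.example.com",
        index_name="knowledge",
        account_id="acct-1",
        collection_id="docs",
    )
    print(f"indexed={result.documents_indexed} failed={result.documents_failed}")

    # Rebuild a source in place: delete_source() followed by load_and_index()
    # under the original source_id, exactly as reindex_source() does internally.
    await service.reindex_source(
        source="https://docs.example.com",
        source_id="existing-source-id",
        index_name="knowledge",
        account_id="acct-1",
        collection_id="docs",
    )


asyncio.run(main())
```

Note that `load_and_index` generates a random `source_id` when none is supplied, so capturing or supplying one up front is what makes the later `delete_source` and `reindex_source` calls targeted.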