gnosisllm_knowledge-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/__init__.py +152 -0
- gnosisllm_knowledge/api/__init__.py +5 -0
- gnosisllm_knowledge/api/knowledge.py +548 -0
- gnosisllm_knowledge/backends/__init__.py +26 -0
- gnosisllm_knowledge/backends/memory/__init__.py +9 -0
- gnosisllm_knowledge/backends/memory/indexer.py +384 -0
- gnosisllm_knowledge/backends/memory/searcher.py +516 -0
- gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
- gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
- gnosisllm_knowledge/backends/opensearch/config.py +195 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
- gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
- gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
- gnosisllm_knowledge/chunking/__init__.py +9 -0
- gnosisllm_knowledge/chunking/fixed.py +138 -0
- gnosisllm_knowledge/chunking/sentence.py +239 -0
- gnosisllm_knowledge/cli/__init__.py +18 -0
- gnosisllm_knowledge/cli/app.py +509 -0
- gnosisllm_knowledge/cli/commands/__init__.py +7 -0
- gnosisllm_knowledge/cli/commands/agentic.py +529 -0
- gnosisllm_knowledge/cli/commands/load.py +369 -0
- gnosisllm_knowledge/cli/commands/search.py +440 -0
- gnosisllm_knowledge/cli/commands/setup.py +228 -0
- gnosisllm_knowledge/cli/display/__init__.py +5 -0
- gnosisllm_knowledge/cli/display/service.py +555 -0
- gnosisllm_knowledge/cli/utils/__init__.py +5 -0
- gnosisllm_knowledge/cli/utils/config.py +207 -0
- gnosisllm_knowledge/core/__init__.py +87 -0
- gnosisllm_knowledge/core/domain/__init__.py +43 -0
- gnosisllm_knowledge/core/domain/document.py +240 -0
- gnosisllm_knowledge/core/domain/result.py +176 -0
- gnosisllm_knowledge/core/domain/search.py +327 -0
- gnosisllm_knowledge/core/domain/source.py +139 -0
- gnosisllm_knowledge/core/events/__init__.py +23 -0
- gnosisllm_knowledge/core/events/emitter.py +216 -0
- gnosisllm_knowledge/core/events/types.py +226 -0
- gnosisllm_knowledge/core/exceptions.py +407 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
- gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
- gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
- gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
- gnosisllm_knowledge/core/interfaces/loader.py +102 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
- gnosisllm_knowledge/core/interfaces/setup.py +164 -0
- gnosisllm_knowledge/fetchers/__init__.py +12 -0
- gnosisllm_knowledge/fetchers/config.py +77 -0
- gnosisllm_knowledge/fetchers/http.py +167 -0
- gnosisllm_knowledge/fetchers/neoreader.py +204 -0
- gnosisllm_knowledge/loaders/__init__.py +13 -0
- gnosisllm_knowledge/loaders/base.py +399 -0
- gnosisllm_knowledge/loaders/factory.py +202 -0
- gnosisllm_knowledge/loaders/sitemap.py +285 -0
- gnosisllm_knowledge/loaders/website.py +57 -0
- gnosisllm_knowledge/py.typed +0 -0
- gnosisllm_knowledge/services/__init__.py +9 -0
- gnosisllm_knowledge/services/indexing.py +387 -0
- gnosisllm_knowledge/services/search.py +349 -0
- gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
- gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
- gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
gnosisllm_knowledge/services/indexing.py
@@ -0,0 +1,387 @@
+"""Knowledge indexing service."""
+
+from __future__ import annotations
+
+import logging
+import uuid
+from typing import TYPE_CHECKING, Any
+
+from gnosisllm_knowledge.core.domain.document import Document, DocumentStatus
+from gnosisllm_knowledge.core.domain.result import IndexResult, LoadResult
+from gnosisllm_knowledge.core.domain.source import SourceConfig
+from gnosisllm_knowledge.core.events.emitter import EventEmitter
+from gnosisllm_knowledge.core.events.types import (
+    BatchCompletedEvent,
+    BatchStartedEvent,
+    DocumentIndexedEvent,
+    EventType,
+)
+from gnosisllm_knowledge.core.exceptions import IndexError, LoadError
+
+if TYPE_CHECKING:
+    from gnosisllm_knowledge.core.interfaces.chunker import ITextChunker
+    from gnosisllm_knowledge.core.interfaces.indexer import IDocumentIndexer
+    from gnosisllm_knowledge.core.interfaces.loader import IContentLoader
+
+logger = logging.getLogger(__name__)
+
+
+class KnowledgeIndexingService:
+    """Service for loading, chunking, and indexing knowledge documents.
+
+    Orchestrates the document ingestion pipeline from source to index.
+
+    Example:
+        ```python
+        service = KnowledgeIndexingService(
+            loader=WebsiteLoader(fetcher),
+            chunker=SentenceChunker(),
+            indexer=OpenSearchIndexer(client, config),
+        )
+
+        # Load and index from a URL
+        result = await service.load_and_index(
+            source="https://docs.example.com",
+            index_name="knowledge",
+            collection_id="docs",
+        )
+        ```
+    """
+
+    def __init__(
+        self,
+        loader: IContentLoader,
+        chunker: ITextChunker,
+        indexer: IDocumentIndexer,
+        events: EventEmitter | None = None,
+    ) -> None:
+        """Initialize the indexing service.
+
+        Args:
+            loader: Content loader for fetching documents.
+            chunker: Text chunker for splitting documents.
+            indexer: Document indexer for storing documents.
+            events: Optional event emitter for progress tracking.
+
+        Note:
+            Embeddings are generated automatically by the OpenSearch ingest pipeline.
+            No Python-side embedding function is needed.
+        """
+        self._loader = loader
+        self._chunker = chunker
+        self._indexer = indexer
+        self._events = events or EventEmitter()
+
+    @property
+    def events(self) -> EventEmitter:
+        """Get the event emitter."""
+        return self._events
+
+    async def load_and_index(
+        self,
+        source: str,
+        index_name: str,
+        *,
+        account_id: str | None = None,
+        collection_id: str | None = None,
+        source_id: str | None = None,
+        batch_size: int = 100,
+        **options: Any,
+    ) -> IndexResult:
+        """Load content from source and index it.
+
+        Args:
+            source: Source URL or path.
+            index_name: Target index name.
+            account_id: Account ID for multi-tenancy.
+            collection_id: Collection ID.
+            source_id: Source ID (auto-generated if not provided).
+            batch_size: Documents per batch.
+            **options: Additional loader/indexer options.
+
+        Returns:
+            Index result with counts.
+        """
+        source_id = source_id or str(uuid.uuid4())
+
+        # Emit batch started event
+        await self._events.emit_async(
+            EventType.BATCH_STARTED,
+            BatchStartedEvent(
+                source=source,
+                source_id=source_id,
+            ),
+        )
+
+        try:
+            # Load documents
+            load_result = await self._loader.load(source, **options)
+
+            if not load_result.success:
+                raise LoadError(
+                    message=f"Failed to load from {source}",
+                    details={"errors": load_result.errors},
+                )
+
+            # Process and index documents
+            total_indexed = 0
+            total_failed = 0
+            errors: list[str] = []
+
+            batch: list[Document] = []
+
+            for doc in load_result.documents:
+                # Chunk the document
+                chunks = self._chunker.chunk(doc.content)
+
+                for i, chunk in enumerate(chunks):
+                    # Create chunk document
+                    chunk_doc = Document(
+                        id=f"{doc.id}-chunk-{i}",
+                        content=chunk.content,
+                        url=doc.url,
+                        title=doc.title,
+                        source=source,
+                        account_id=account_id,
+                        collection_id=collection_id,
+                        source_id=source_id,
+                        chunk_index=i,
+                        total_chunks=len(chunks),
+                        parent_doc_id=doc.id,
+                        status=DocumentStatus.INDEXED,
+                        metadata=doc.metadata,
+                    )
+
+                    batch.append(chunk_doc)
+
+                    # Index batch when full
+                    if len(batch) >= batch_size:
+                        result = await self._index_batch(batch, index_name)
+                        total_indexed += result.documents_indexed
+                        total_failed += result.documents_failed
+                        if result.errors:
+                            errors.extend(result.errors)
+                        batch = []
+
+            # Index remaining documents
+            if batch:
+                result = await self._index_batch(batch, index_name)
+                total_indexed += result.documents_indexed
+                total_failed += result.documents_failed
+                if result.errors:
+                    errors.extend(result.errors)
+
+            # Emit batch completed event
+            await self._events.emit_async(
+                EventType.BATCH_COMPLETED,
+                BatchCompletedEvent(
+                    source=source,
+                    source_id=source_id,
+                    documents_indexed=total_indexed,
+                    documents_failed=total_failed,
+                    success=total_failed == 0,
+                ),
+            )
+
+            return IndexResult(
+                success=total_failed == 0,
+                documents_indexed=total_indexed,
+                documents_failed=total_failed,
+                errors=errors if errors else None,
+            )
+
+        except Exception as e:
+            logger.error(f"Failed to load and index from {source}: {e}")
+            raise IndexError(
+                message=f"Failed to index from {source}",
+                details={"source": source},
+                cause=e,
+            ) from e
+
+    async def index_documents(
+        self,
+        documents: list[Document],
+        index_name: str,
+        *,
+        chunk: bool = True,
+        batch_size: int = 100,
+        **options: Any,
+    ) -> IndexResult:
+        """Index a list of documents.
+
+        Args:
+            documents: Documents to index.
+            index_name: Target index name.
+            chunk: Whether to chunk documents.
+            batch_size: Documents per batch.
+            **options: Additional indexer options.
+
+        Returns:
+            Index result.
+        """
+        total_indexed = 0
+        total_failed = 0
+        errors: list[str] = []
+        batch: list[Document] = []
+
+        for doc in documents:
+            if chunk:
+                # Chunk the document
+                chunks = self._chunker.chunk(doc.content)
+
+                for i, chunk_obj in enumerate(chunks):
+                    chunk_doc = Document(
+                        id=f"{doc.id}-chunk-{i}",
+                        content=chunk_obj.content,
+                        url=doc.url,
+                        title=doc.title,
+                        source=doc.source,
+                        account_id=doc.account_id,
+                        collection_id=doc.collection_id,
+                        source_id=doc.source_id,
+                        chunk_index=i,
+                        total_chunks=len(chunks),
+                        parent_doc_id=doc.id,
+                        status=DocumentStatus.INDEXED,
+                        metadata=doc.metadata,
+                    )
+                    batch.append(chunk_doc)
+            else:
+                batch.append(doc)
+
+            # Index batch when full
+            if len(batch) >= batch_size:
+                result = await self._index_batch(batch, index_name)
+                total_indexed += result.documents_indexed
+                total_failed += result.documents_failed
+                if result.errors:
+                    errors.extend(result.errors)
+                batch = []
+
+        # Index remaining
+        if batch:
+            result = await self._index_batch(batch, index_name)
+            total_indexed += result.documents_indexed
+            total_failed += result.documents_failed
+            if result.errors:
+                errors.extend(result.errors)
+
+        return IndexResult(
+            success=total_failed == 0,
+            documents_indexed=total_indexed,
+            documents_failed=total_failed,
+            errors=errors if errors else None,
+        )
+
+    async def delete_source(
+        self,
+        source_id: str,
+        index_name: str,
+        account_id: str | None = None,
+    ) -> int:
+        """Delete all documents from a source.
+
+        Args:
+            source_id: Source ID to delete.
+            index_name: Index name.
+            account_id: Optional account filter.
+
+        Returns:
+            Count of deleted documents.
+        """
+        from gnosisllm_knowledge.backends.opensearch.queries import (
+            build_delete_by_source_query,
+        )
+
+        query = build_delete_by_source_query(source_id, account_id)
+        return await self._indexer.delete_by_query(query, index_name)
+
+    async def delete_collection(
+        self,
+        collection_id: str,
+        index_name: str,
+        account_id: str | None = None,
+    ) -> int:
+        """Delete all documents from a collection.
+
+        Args:
+            collection_id: Collection ID to delete.
+            index_name: Index name.
+            account_id: Optional account filter.
+
+        Returns:
+            Count of deleted documents.
+        """
+        from gnosisllm_knowledge.backends.opensearch.queries import (
+            build_delete_by_collection_query,
+        )
+
+        query = build_delete_by_collection_query(collection_id, account_id)
+        return await self._indexer.delete_by_query(query, index_name)
+
+    async def reindex_source(
+        self,
+        source: str,
+        source_id: str,
+        index_name: str,
+        *,
+        account_id: str | None = None,
+        collection_id: str | None = None,
+        **options: Any,
+    ) -> IndexResult:
+        """Reindex a source by deleting and re-loading.
+
+        Args:
+            source: Source URL or path.
+            source_id: Existing source ID.
+            index_name: Index name.
+            account_id: Account ID.
+            collection_id: Collection ID.
+            **options: Additional options.
+
+        Returns:
+            Index result.
+        """
+        # Delete existing documents
+        await self.delete_source(source_id, index_name, account_id)
+
+        # Re-index
+        return await self.load_and_index(
+            source=source,
+            index_name=index_name,
+            account_id=account_id,
+            collection_id=collection_id,
+            source_id=source_id,
+            **options,
+        )
+
+    async def _index_batch(
+        self,
+        documents: list[Document],
+        index_name: str,
+    ) -> IndexResult:
+        """Index a batch of documents.
+
+        Args:
+            documents: Documents to index.
+            index_name: Target index.
+
+        Returns:
+            Batch index result.
+        """
+        result = await self._indexer.bulk_index(documents, index_name)
+
+        # Emit events for indexed documents
+        for doc in documents:
+            if result.success:
+                await self._events.emit_async(
+                    EventType.DOCUMENT_INDEXED,
+                    DocumentIndexedEvent(
+                        document_id=doc.id,
+                        index_name=index_name,
+                        chunk_index=doc.chunk_index,
+                        total_chunks=doc.total_chunks,
+                    ),
+                )
+
+        return result
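
For orientation, the pipeline this service orchestrates can be wired end to end roughly as follows. This is a minimal sketch, not verified against the wheel: the class names `WebsiteLoader`, `SentenceChunker`, and `OpenSearchIndexer` come from the class docstring above, while the module paths and the `HttpFetcher` name are assumptions inferred from the file list.

```python
# Assumed module paths (inferred from the file list above); the HttpFetcher
# class name is likewise an assumption.
from gnosisllm_knowledge.backends.opensearch.indexer import OpenSearchIndexer
from gnosisllm_knowledge.chunking.sentence import SentenceChunker
from gnosisllm_knowledge.fetchers.http import HttpFetcher
from gnosisllm_knowledge.loaders.website import WebsiteLoader
from gnosisllm_knowledge.services.indexing import KnowledgeIndexingService


async def ingest(client, config) -> None:
    # Wire the three stages the service orchestrates: load -> chunk -> index.
    # Embeddings are produced server-side by the OpenSearch ingest pipeline,
    # so no embedding function is passed here (per the __init__ Note).
    service = KnowledgeIndexingService(
        loader=WebsiteLoader(HttpFetcher()),
        chunker=SentenceChunker(),
        indexer=OpenSearchIndexer(client, config),
    )

    # Chunk documents are flushed to the indexer every `batch_size` documents.
    result = await service.load_and_index(
        source="https://docs.example.com",
        index_name="knowledge",
        collection_id="docs",
        batch_size=100,
    )
    if not result.success:
        print(result.errors)
```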
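When documents are already prepared, `index_documents` can skip the chunking pass. A sketch under the assumption that `Document` accepts the same keyword fields the service itself passes when building chunk documents (its full constructor is not visible in this diff):

```python
from gnosisllm_knowledge.core.domain.document import Document

# Inside an async context, with `service` wired as in the previous sketch.
docs = [
    Document(
        id="manual-1",
        content="An already-chunked passage of text.",
        url="https://docs.example.com/page",
        title="Example page",
        source="manual",
        collection_id="docs",
        source_id="manual-import",
        metadata={"kind": "manual"},
    ),
]

# chunk=False indexes the documents as-is instead of splitting them again.
result = await service.index_documents(docs, "knowledge", chunk=False)
```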
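Deletion is scoped by `source_id` or `collection_id`, optionally filtered by account for multi-tenant indices; both methods return the number of chunk documents removed. The IDs below are illustrative placeholders:

```python
# Remove everything previously loaded under one source ID, restricted to a
# single tenant.
deleted = await service.delete_source(
    "manual-import",
    "knowledge",
    account_id="acct-42",
)

# Or drop an entire collection for that tenant.
deleted = await service.delete_collection("docs", "knowledge", account_id="acct-42")
```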
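Note that `reindex_source` deletes the existing documents before reloading, so the operation is not atomic: if the reload fails, the service raises the package's `IndexError` and the old chunks are already gone. A defensive sketch:

```python
# The package's IndexError shadows the builtin, so alias it on import.
from gnosisllm_knowledge.core.exceptions import IndexError as KnowledgeIndexError

try:
    result = await service.reindex_source(
        source="https://docs.example.com",
        source_id="manual-import",
        index_name="knowledge",
        collection_id="docs",
    )
except KnowledgeIndexError:
    # The delete already happened; the source stays unsearchable until a
    # retry succeeds, so schedule one rather than assuming old content remains.
    raise
```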