gnosisllm-knowledge 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/__init__.py +152 -0
- gnosisllm_knowledge/api/__init__.py +5 -0
- gnosisllm_knowledge/api/knowledge.py +548 -0
- gnosisllm_knowledge/backends/__init__.py +26 -0
- gnosisllm_knowledge/backends/memory/__init__.py +9 -0
- gnosisllm_knowledge/backends/memory/indexer.py +384 -0
- gnosisllm_knowledge/backends/memory/searcher.py +516 -0
- gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
- gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
- gnosisllm_knowledge/backends/opensearch/config.py +195 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
- gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
- gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
- gnosisllm_knowledge/chunking/__init__.py +9 -0
- gnosisllm_knowledge/chunking/fixed.py +138 -0
- gnosisllm_knowledge/chunking/sentence.py +239 -0
- gnosisllm_knowledge/cli/__init__.py +18 -0
- gnosisllm_knowledge/cli/app.py +509 -0
- gnosisllm_knowledge/cli/commands/__init__.py +7 -0
- gnosisllm_knowledge/cli/commands/agentic.py +529 -0
- gnosisllm_knowledge/cli/commands/load.py +369 -0
- gnosisllm_knowledge/cli/commands/search.py +440 -0
- gnosisllm_knowledge/cli/commands/setup.py +228 -0
- gnosisllm_knowledge/cli/display/__init__.py +5 -0
- gnosisllm_knowledge/cli/display/service.py +555 -0
- gnosisllm_knowledge/cli/utils/__init__.py +5 -0
- gnosisllm_knowledge/cli/utils/config.py +207 -0
- gnosisllm_knowledge/core/__init__.py +87 -0
- gnosisllm_knowledge/core/domain/__init__.py +43 -0
- gnosisllm_knowledge/core/domain/document.py +240 -0
- gnosisllm_knowledge/core/domain/result.py +176 -0
- gnosisllm_knowledge/core/domain/search.py +327 -0
- gnosisllm_knowledge/core/domain/source.py +139 -0
- gnosisllm_knowledge/core/events/__init__.py +23 -0
- gnosisllm_knowledge/core/events/emitter.py +216 -0
- gnosisllm_knowledge/core/events/types.py +226 -0
- gnosisllm_knowledge/core/exceptions.py +407 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
- gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
- gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
- gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
- gnosisllm_knowledge/core/interfaces/loader.py +102 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
- gnosisllm_knowledge/core/interfaces/setup.py +164 -0
- gnosisllm_knowledge/fetchers/__init__.py +12 -0
- gnosisllm_knowledge/fetchers/config.py +77 -0
- gnosisllm_knowledge/fetchers/http.py +167 -0
- gnosisllm_knowledge/fetchers/neoreader.py +204 -0
- gnosisllm_knowledge/loaders/__init__.py +13 -0
- gnosisllm_knowledge/loaders/base.py +399 -0
- gnosisllm_knowledge/loaders/factory.py +202 -0
- gnosisllm_knowledge/loaders/sitemap.py +285 -0
- gnosisllm_knowledge/loaders/website.py +57 -0
- gnosisllm_knowledge/py.typed +0 -0
- gnosisllm_knowledge/services/__init__.py +9 -0
- gnosisllm_knowledge/services/indexing.py +387 -0
- gnosisllm_knowledge/services/search.py +349 -0
- gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
- gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
- gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
"""Base loader with common functionality (Template Method Pattern)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from collections.abc import AsyncIterator, Callable
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from gnosisllm_knowledge.core.domain.document import Document
|
|
12
|
+
from gnosisllm_knowledge.core.domain.result import LoadResult, ValidationResult
|
|
13
|
+
from gnosisllm_knowledge.core.events.emitter import EventEmitter
|
|
14
|
+
from gnosisllm_knowledge.core.events.types import DocumentLoadedEvent, EventType
|
|
15
|
+
from gnosisllm_knowledge.core.interfaces.chunker import ITextChunker
|
|
16
|
+
from gnosisllm_knowledge.core.interfaces.fetcher import IContentFetcher
|
|
17
|
+
|
|
18
|
+
# Default concurrency for parallel URL fetching
DEFAULT_MAX_CONCURRENT = 10


class BaseLoader(ABC):
    """Base class for content loaders (Template Method Pattern).

    This class provides common functionality for loading content from
    various sources. Subclasses implement the `_get_urls` method to
    define how URLs are discovered from the source.

    Features:
    - Parallel URL fetching with configurable concurrency
    - Streaming support for memory-efficient processing
    - Event emission for progress tracking
    - Automatic chunking of fetched content
    - Configurable options per source type

    Example:
        ```python
        class MyLoader(BaseLoader):
            @property
            def name(self) -> str:
                return "my-loader"

            def supports(self, source: str) -> bool:
                return source.startswith("my://")

            async def _get_urls(self, source: str, **options) -> list[str]:
                # Return list of URLs to fetch
                return [source.replace("my://", "https://")]
        ```
    """

    def __init__(
        self,
        fetcher: IContentFetcher,
        chunker: ITextChunker,
        config: dict[str, Any] | None = None,
        event_emitter: EventEmitter | None = None,
    ) -> None:
        """Initialize the loader with dependencies.

        Args:
            fetcher: Content fetcher for retrieving URL content.
            chunker: Text chunker for splitting content into documents.
            config: Optional configuration dictionary. Recognized key:
                ``max_concurrent`` (int) — parallel fetch limit.
            event_emitter: Optional event emitter for progress events.
        """
        self._fetcher = fetcher
        self._chunker = chunker
        self._config = config or {}
        self._events = event_emitter or EventEmitter()
        self._logger = logging.getLogger(self.__class__.__name__)
        self._max_concurrent = self._config.get("max_concurrent", DEFAULT_MAX_CONCURRENT)

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the loader name for registry identification."""
        ...

    @abstractmethod
    def supports(self, source: str) -> bool:
        """Check if this loader supports the given source.

        Args:
            source: The source URL or path.

        Returns:
            True if this loader can handle the source.
        """
        ...

    @abstractmethod
    async def _get_urls(self, source: str, **options: Any) -> list[str]:
        """Get list of URLs to process from the source.

        This method is implemented by subclasses to define how
        URLs are discovered from the source.

        Args:
            source: The source URL or path.
            **options: Loader-specific options.

        Returns:
            List of URLs to fetch and process.
        """
        ...

    async def validate_source(self, source: str) -> ValidationResult:
        """Validate that the source is accessible and valid.

        Default implementation checks if fetcher can reach the source.
        Subclasses can override for specific validation logic.

        Args:
            source: The source URL or path.

        Returns:
            ValidationResult with validation status.
        """
        # Keep the try body minimal: only URL discovery can reasonably fail here.
        try:
            urls = await self._get_urls(source)
        except Exception as e:
            return ValidationResult(
                valid=False,
                message=str(e),
                errors=[str(e)],
            )
        if not urls:
            return ValidationResult(
                valid=False,
                message=f"No URLs found in source: {source}",
            )
        return ValidationResult(
            valid=True,
            message=f"Source valid with {len(urls)} URLs",
            metadata={"url_count": len(urls)},
        )

    async def load(self, source: str, **options: Any) -> LoadResult:
        """Load all documents from source with parallel URL fetching.

        Args:
            source: The source URL or path.
            **options: Loader-specific options.

        Returns:
            LoadResult with loaded documents and metadata. Failed URLs
            are counted in ``urls_failed``; documents are also exposed
            via ``metadata["documents"]``.
        """
        import time

        start_time = time.time()
        documents: list[Document] = []
        urls_processed = 0
        urls_failed = 0

        try:
            urls = await self._get_urls(source, **options)

            if not urls:
                return LoadResult(
                    source=source,
                    source_type=self.name,
                    document_count=0,
                    success=True,
                    duration_ms=(time.time() - start_time) * 1000,
                )

            self._logger.info(
                f"Loading {len(urls)} URLs with concurrency={self._max_concurrent}"
            )

            # Use semaphore to limit concurrent fetches
            semaphore = asyncio.Semaphore(self._max_concurrent)

            async def fetch_with_semaphore(url: str) -> list[Document]:
                async with semaphore:
                    return await self._load_url(url, source, **options)

            # Process all URLs in parallel (limited by semaphore)
            results = await asyncio.gather(
                *[fetch_with_semaphore(url) for url in urls],
                return_exceptions=True,
            )

            # Flatten results, handling exceptions gracefully.
            # _load_url re-raises on failure, so exceptions surface here
            # and urls_failed reflects reality (previously it was always 0
            # because _load_url swallowed every error and returned []).
            for i, result in enumerate(results):
                if isinstance(result, Exception):
                    self._logger.error(f"Failed to load URL {urls[i]}: {result}")
                    urls_failed += 1
                elif isinstance(result, list):
                    documents.extend(result)
                    urls_processed += 1

            duration_ms = (time.time() - start_time) * 1000

            return LoadResult(
                source=source,
                source_type=self.name,
                document_count=len(documents),
                success=True,
                duration_ms=duration_ms,
                urls_processed=urls_processed,
                urls_failed=urls_failed,
                bytes_loaded=sum(len(d.content) for d in documents),
                metadata={"documents": documents},
            )

        except Exception as e:
            self._logger.error(f"Load failed: {e}")
            return LoadResult(
                source=source,
                source_type=self.name,
                document_count=0,
                success=False,
                error_message=str(e),
                duration_ms=(time.time() - start_time) * 1000,
            )

    async def load_streaming(
        self,
        source: str,
        **options: Any,
    ) -> AsyncIterator[Document]:
        """Stream documents from source for memory-efficient processing.

        Yields documents as they are loaded, allowing for immediate
        processing without waiting for all documents to load. URLs that
        fail to load are logged and skipped.

        Args:
            source: The source URL or path.
            **options: Loader-specific options.

        Yields:
            Document objects as they are loaded.
        """
        urls = await self._get_urls(source, **options)

        if not urls:
            return

        self._logger.info(
            f"Streaming {len(urls)} URLs with concurrency={self._max_concurrent}"
        )

        semaphore = asyncio.Semaphore(self._max_concurrent)

        async def fetch_with_semaphore(url: str) -> tuple[str, list[Document]]:
            async with semaphore:
                docs = await self._load_url(url, source, **options)
                return (url, docs)

        # Create tasks for all URLs
        tasks = [asyncio.create_task(fetch_with_semaphore(url)) for url in urls]

        # Process results as they complete (streaming behavior)
        for coro in asyncio.as_completed(tasks):
            try:
                url, url_docs = await coro
                for doc in url_docs:
                    yield doc
            except Exception as e:
                self._logger.error(f"Failed to load URL: {e}")

    async def load_with_callback(
        self,
        source: str,
        callback: Callable[[list[Document]], Any],
        batch_size: int = 5,
        **options: Any,
    ) -> int:
        """Load documents with a callback for batch processing.

        Documents are accumulated as URLs complete and flushed to the
        callback in batches of ``batch_size`` (plus one final partial
        batch). Failed URLs are logged and skipped.

        Args:
            source: The source URL or path.
            callback: Callback function called with each batch. May be
                sync or async (see ``_invoke_callback``).
            batch_size: Number of documents per batch.
            **options: Loader-specific options.

        Returns:
            Total number of documents loaded.
        """
        urls = await self._get_urls(source, **options)

        if not urls:
            return 0

        # Fixed log message: this path loads via callback batches, it was
        # previously mislabeled "Streaming" (copy-paste from load_streaming).
        self._logger.info(
            f"Loading {len(urls)} URLs in batches of {batch_size} "
            f"with concurrency={self._max_concurrent}"
        )

        total = 0
        batch: list[Document] = []
        semaphore = asyncio.Semaphore(self._max_concurrent)

        async def fetch_with_semaphore(url: str) -> tuple[str, list[Document]]:
            async with semaphore:
                docs = await self._load_url(url, source, **options)
                return (url, docs)

        # Create tasks for all URLs
        tasks = [asyncio.create_task(fetch_with_semaphore(url)) for url in urls]

        # Process results as they complete
        for coro in asyncio.as_completed(tasks):
            try:
                url, url_docs = await coro
                batch.extend(url_docs)

                # Stream batches as they fill up
                while len(batch) >= batch_size:
                    await self._invoke_callback(callback, batch[:batch_size])
                    total += batch_size
                    batch = batch[batch_size:]

            except Exception as e:
                self._logger.error(f"Failed to load URL: {e}")

        # Flush remaining batch
        if batch:
            await self._invoke_callback(callback, batch)
            total += len(batch)

        return total

    async def _invoke_callback(
        self, callback: Callable[[list[Document]], Any], documents: list[Document]
    ) -> None:
        """Invoke callback, handling both sync and async callbacks.

        Args:
            callback: The callback function to invoke.
            documents: List of documents to pass to callback.
        """
        import inspect

        result = callback(documents)
        if inspect.iscoroutine(result):
            await result

    async def _load_url(
        self,
        url: str,
        source: str,
        **options: Any,
    ) -> list[Document]:
        """Load and chunk content from a single URL.

        Args:
            url: The URL to fetch.
            source: The original source identifier.
            **options: Loader-specific options.

        Returns:
            List of Document objects from the URL.

        Raises:
            Exception: Any fetch/chunk error is logged, emitted as an
                error event, then re-raised so callers can count and
                handle failures (previously errors were swallowed and
                an empty list returned, making failures invisible).
        """
        try:
            result = await self._fetcher.fetch(url, **options)
            chunks = self._chunker.chunk(result.content)

            documents: list[Document] = []
            for chunk in chunks:
                doc = Document(
                    content=chunk.content,
                    source=source,
                    url=url,
                    title=result.title,
                    chunk_index=chunk.index,
                    total_chunks=len(chunks),
                    metadata={
                        "loader": self.name,
                        "fetch_url": url,
                        "content_type": result.content_type,
                    },
                )
                documents.append(doc)

            # Emit event
            self._events.emit(
                DocumentLoadedEvent(
                    url=url,
                    source=source,
                    chunks_count=len(chunks),
                    content_length=len(result.content),
                )
            )

            return documents

        except Exception as e:
            self._logger.error(f"Failed to load URL {url}: {e}")
            # Ad-hoc event object: the events module has no dedicated
            # error-event class, so one is synthesized on the fly.
            self._events.emit(
                type(
                    "Event",
                    (),
                    {"event_type": EventType.ERROR, "data": {"url": url, "error": str(e)}},
                )()  # type: ignore[arg-type]
            )
            raise
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""Loader factory with registry pattern."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from gnosisllm_knowledge.core.events.emitter import EventEmitter
|
|
10
|
+
from gnosisllm_knowledge.core.interfaces.chunker import ITextChunker
|
|
11
|
+
from gnosisllm_knowledge.core.interfaces.fetcher import IContentFetcher
|
|
12
|
+
from gnosisllm_knowledge.loaders.base import BaseLoader
|
|
13
|
+
from gnosisllm_knowledge.loaders.sitemap import SitemapLoader
|
|
14
|
+
from gnosisllm_knowledge.loaders.website import WebsiteLoader
|
|
15
|
+
|
|
16
|
+
# Type for loader factory functions
|
|
17
|
+
LoaderCreator = Callable[
|
|
18
|
+
[IContentFetcher, ITextChunker, dict[str, Any] | None, EventEmitter | None],
|
|
19
|
+
BaseLoader,
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class LoaderFactory:
    """Factory for creating content loaders (Registry Pattern).

    The factory maintains a registry of loader types and can create
    the appropriate loader based on source URL or explicit type.
    Registry keys are case-insensitive (stored lowercase).

    Built-in loaders:
    - website: Single URL loading
    - sitemap: Sitemap XML with recursive discovery

    Example:
        ```python
        factory = LoaderFactory(fetcher, chunker)

        # Auto-detect source type
        loader = factory.create_for_source("https://example.com/sitemap.xml")

        # Explicit type
        loader = factory.create("sitemap", config={"max_urls": 500})

        # Register custom loader
        factory.register("custom", MyCustomLoader)
        ```
    """

    def __init__(
        self,
        fetcher: IContentFetcher,
        chunker: ITextChunker,
        default_config: dict[str, Any] | None = None,
        event_emitter: EventEmitter | None = None,
    ) -> None:
        """Initialize the factory with dependencies.

        Args:
            fetcher: Content fetcher for all loaders.
            chunker: Text chunker for all loaders.
            default_config: Default configuration for all loaders.
            event_emitter: Event emitter for all loaders.
        """
        self._fetcher = fetcher
        self._chunker = chunker
        self._default_config = default_config or {}
        self._events = event_emitter or EventEmitter()
        self._logger = logging.getLogger(__name__)

        # Registry of loader creators, keyed by lowercase type name
        self._registry: dict[str, LoaderCreator] = {}

        # Register built-in loaders
        self._register_builtin_loaders()

    def _register_builtin_loaders(self) -> None:
        """Register built-in loader types."""
        self.register("website", lambda f, c, cfg, e: WebsiteLoader(f, c, cfg, e))
        self.register("sitemap", lambda f, c, cfg, e: SitemapLoader(f, c, cfg, e))

    def register(self, name: str, creator: LoaderCreator) -> None:
        """Register a loader type.

        Args:
            name: Loader type name (stored case-insensitively).
            creator: Factory function that creates the loader.

        Example:
            ```python
            factory.register("custom", lambda f, c, cfg, e: MyLoader(f, c, cfg, e))
            ```
        """
        self._registry[name.lower()] = creator
        self._logger.debug(f"Registered loader type: {name}")

    def unregister(self, name: str) -> bool:
        """Unregister a loader type.

        Args:
            name: Loader type name to remove.

        Returns:
            True if removed, False if not found.
        """
        name = name.lower()
        if name in self._registry:
            del self._registry[name]
            return True
        return False

    def list_types(self) -> list[str]:
        """List all registered loader types.

        Returns:
            List of loader type names.
        """
        return list(self._registry.keys())

    def create(
        self,
        loader_type: str,
        config: dict[str, Any] | None = None,
    ) -> BaseLoader:
        """Create a loader by type name.

        Args:
            loader_type: Type of loader to create (case-insensitive).
            config: Optional configuration (merged over defaults).

        Returns:
            Configured loader instance.

        Raises:
            ValueError: If loader type is not registered.
        """
        loader_type = loader_type.lower()

        if loader_type not in self._registry:
            available = ", ".join(self._registry.keys())
            raise ValueError(
                f"Unknown loader type: {loader_type}. Available: {available}"
            )

        # Merge default config with provided config (provided wins)
        merged_config = {**self._default_config, **(config or {})}

        creator = self._registry[loader_type]
        return creator(self._fetcher, self._chunker, merged_config, self._events)

    def create_for_source(
        self,
        source: str,
        config: dict[str, Any] | None = None,
    ) -> BaseLoader:
        """Create the appropriate loader for a source URL.

        Auto-detects the loader type based on the source URL by asking
        each registered loader, in registration order, whether it
        supports the source; the first match wins.

        Args:
            source: The source URL or path.
            config: Optional configuration.

        Returns:
            Loader that supports the source.

        Raises:
            ValueError: If no loader supports the source.
        """
        # Hoisted out of the loop: the merged config does not depend on
        # the loader type being probed.
        merged_config = {**self._default_config, **(config or {})}

        # Check each registered loader
        for loader_type, creator in self._registry.items():
            loader = creator(self._fetcher, self._chunker, merged_config, self._events)
            if loader.supports(source):
                self._logger.debug(f"Auto-selected loader type '{loader_type}' for {source}")
                return loader

        raise ValueError(f"No loader supports source: {source}")

    def detect_type(self, source: str) -> str | None:
        """Detect the loader type for a source URL.

        Note: instantiates each registered loader (with an empty config
        and no emitter) just to probe ``supports``.

        Args:
            source: The source URL or path.

        Returns:
            Loader type name or None if not detected.
        """
        for loader_type, creator in self._registry.items():
            loader = creator(self._fetcher, self._chunker, {}, None)
            if loader.supports(source):
                return loader_type
        return None

    def supports(self, source: str) -> bool:
        """Check if any loader supports the source.

        Args:
            source: The source URL or path.

        Returns:
            True if at least one loader supports it.
        """
        return self.detect_type(source) is not None
|