gnosisllm-knowledge 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/__init__.py +91 -39
- gnosisllm_knowledge/api/__init__.py +3 -2
- gnosisllm_knowledge/api/knowledge.py +287 -7
- gnosisllm_knowledge/api/memory.py +966 -0
- gnosisllm_knowledge/backends/__init__.py +14 -5
- gnosisllm_knowledge/backends/opensearch/agentic.py +341 -39
- gnosisllm_knowledge/backends/opensearch/config.py +49 -28
- gnosisllm_knowledge/backends/opensearch/indexer.py +1 -0
- gnosisllm_knowledge/backends/opensearch/mappings.py +2 -1
- gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
- gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
- gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
- gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
- gnosisllm_knowledge/backends/opensearch/searcher.py +235 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
- gnosisllm_knowledge/cli/app.py +378 -12
- gnosisllm_knowledge/cli/commands/agentic.py +11 -0
- gnosisllm_knowledge/cli/commands/memory.py +723 -0
- gnosisllm_knowledge/cli/commands/setup.py +24 -22
- gnosisllm_knowledge/cli/display/service.py +43 -0
- gnosisllm_knowledge/cli/utils/config.py +58 -0
- gnosisllm_knowledge/core/domain/__init__.py +41 -0
- gnosisllm_knowledge/core/domain/document.py +5 -0
- gnosisllm_knowledge/core/domain/memory.py +440 -0
- gnosisllm_knowledge/core/domain/result.py +11 -3
- gnosisllm_knowledge/core/domain/search.py +2 -0
- gnosisllm_knowledge/core/events/types.py +76 -0
- gnosisllm_knowledge/core/exceptions.py +134 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
- gnosisllm_knowledge/core/interfaces/memory.py +524 -0
- gnosisllm_knowledge/core/interfaces/streaming.py +127 -0
- gnosisllm_knowledge/core/streaming/__init__.py +36 -0
- gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
- gnosisllm_knowledge/loaders/base.py +3 -4
- gnosisllm_knowledge/loaders/sitemap.py +129 -1
- gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
- gnosisllm_knowledge/services/indexing.py +67 -75
- gnosisllm_knowledge/services/search.py +47 -11
- gnosisllm_knowledge/services/streaming_pipeline.py +302 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/METADATA +44 -1
- gnosisllm_knowledge-0.3.0.dist-info/RECORD +77 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/core/interfaces/streaming.py (new file)
@@ -0,0 +1,127 @@
+"""Streaming interfaces for memory-efficient processing.
+
+These protocols define contracts for streaming operations that process
+data in bounded batches rather than loading everything into memory.
+"""
+
+from __future__ import annotations
+
+from collections.abc import AsyncIterator, Awaitable, Callable
+from typing import TYPE_CHECKING, Any, Protocol
+
+if TYPE_CHECKING:
+    from gnosisllm_knowledge.core.domain.document import Document
+    from gnosisllm_knowledge.core.domain.result import IndexResult
+
+
+class IStreamingUrlDiscoverer(Protocol):
+    """Protocol for streaming URL discovery.
+
+    Implementations yield URLs as they are discovered rather than
+    collecting all URLs first. This enables processing to begin
+    immediately and keeps memory usage bounded.
+
+    Example:
+        ```python
+        discoverer: IStreamingUrlDiscoverer = StreamingSitemapDiscoverer()
+
+        async for url_batch in discoverer.discover_urls_streaming(
+            source="https://example.com/sitemap.xml",
+            batch_size=50,
+        ):
+            for url in url_batch:
+                await process(url)
+        ```
+    """
+
+    async def discover_urls_streaming(
+        self,
+        source: str,
+        batch_size: int = 100,
+        **options: Any,
+    ) -> AsyncIterator[list[str]]:
+        """Yield batches of discovered URLs.
+
+        Args:
+            source: The sitemap or source URL.
+            batch_size: Number of URLs per batch.
+            **options: Discoverer-specific options (max_urls, patterns, etc.)
+
+        Yields:
+            Batches of discovered URLs as they're found.
+        """
+        ...
+
+
+class IStreamingLoader(Protocol):
+    """Protocol for streaming content loading.
+
+    Processes URLs in bounded batches with immediate indexing,
+    preventing memory accumulation.
+    """
+
+    async def load_streaming_with_indexing(
+        self,
+        source: str,
+        index_callback: Callable[[list[Document]], Awaitable[IndexResult]],
+        url_batch_size: int = 50,
+        doc_batch_size: int = 100,
+        **options: Any,
+    ) -> IndexResult:
+        """Load and index with streaming, calling callback for each batch.
+
+        This method:
+        1. Discovers URLs in batches (not all at once)
+        2. Fetches content for each URL batch
+        3. Indexes documents immediately after fetching
+        4. Moves to next batch only after indexing completes
+
+        Memory usage is bounded by:
+        - url_batch_size * avg_url_length (URL strings)
+        - doc_batch_size * avg_doc_size (document content)
+        - fetch_concurrency * avg_page_size (in-flight fetches)
+
+        Args:
+            source: Source URL.
+            index_callback: Called with each batch of documents to index.
+            url_batch_size: URLs to process per iteration.
+            doc_batch_size: Documents per index batch.
+            **options: Additional options.
+
+        Returns:
+            Aggregated index result.
+        """
+        ...
+
+
+class IStreamingPipeline(Protocol):
+    """Protocol for streaming indexing pipelines.
+
+    Orchestrates the full streaming load -> index pipeline with
+    bounded memory guarantees.
+    """
+
+    async def execute(
+        self,
+        source: str,
+        index_name: str,
+        *,
+        account_id: str | None = None,
+        collection_id: str | None = None,
+        source_id: str | None = None,
+        **options: Any,
+    ) -> IndexResult:
+        """Execute the streaming pipeline.
+
+        Args:
+            source: Sitemap URL.
+            index_name: Target OpenSearch index.
+            account_id: For multi-tenancy filtering.
+            collection_id: Collection within account.
+            source_id: Source identifier.
+            **options: Additional loader options.
+
+        Returns:
+            Aggregated index result.
+        """
+        ...
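These are structural `typing.Protocol` contracts, so an implementation only needs methods with matching shapes; it does not have to inherit from anything in the package. A minimal sketch under that assumption (`StaticUrlDiscoverer` and the sample URLs are illustrative, not part of the wheel):

```python
# Illustrative sketch; StaticUrlDiscoverer is not part of the package. It has the
# IStreamingUrlDiscoverer shape: an async generator yielding bounded URL batches.
import asyncio
from collections.abc import AsyncIterator
from typing import Any


class StaticUrlDiscoverer:
    def __init__(self, urls: list[str]) -> None:
        self._urls = urls

    async def discover_urls_streaming(
        self,
        source: str,
        batch_size: int = 100,
        **options: Any,
    ) -> AsyncIterator[list[str]]:
        # Yield fixed-size batches so callers never hold the full URL list.
        for i in range(0, len(self._urls), batch_size):
            yield self._urls[i : i + batch_size]


async def main() -> None:
    discoverer = StaticUrlDiscoverer(
        ["https://example.com/a", "https://example.com/b", "https://example.com/c"]
    )
    async for url_batch in discoverer.discover_urls_streaming("unused", batch_size=2):
        print(url_batch)  # ['.../a', '.../b'] then ['.../c']


asyncio.run(main())
```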
gnosisllm_knowledge/core/streaming/__init__.py (new file)
@@ -0,0 +1,36 @@
+"""Streaming pipeline configuration and utilities.
+
+This module provides infrastructure for memory-efficient streaming
+pipelines with bounded queues and backpressure support.
+
+Example:
+    ```python
+    from gnosisllm_knowledge.core.streaming import (
+        PipelineConfig,
+        BoundedQueue,
+        BatchCollector,
+    )
+
+    # Configure pipeline for large sitemap processing
+    config = PipelineConfig(
+        url_batch_size=50,
+        fetch_concurrency=10,
+        index_batch_size=100,
+    )
+
+    # Use bounded queue for backpressure
+    queue: BoundedQueue[str] = BoundedQueue(maxsize=100)
+    ```
+"""
+
+from gnosisllm_knowledge.core.streaming.pipeline import (
+    BatchCollector,
+    BoundedQueue,
+    PipelineConfig,
+)
+
+__all__ = [
+    "BatchCollector",
+    "BoundedQueue",
+    "PipelineConfig",
+]
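The exported `PipelineConfig` knobs are what bound the pipeline's memory footprint. A rough back-of-envelope sketch, assuming illustrative average sizes (the `estimate_peak_bytes` helper is not a package API), mirrors the bound described in the loader docstring later in this diff:

```python
# Back-of-envelope only; estimate_peak_bytes and the average sizes are assumptions.
from gnosisllm_knowledge.core.streaming import PipelineConfig


def estimate_peak_bytes(
    config: PipelineConfig,
    avg_url_bytes: int = 100,       # assumed average URL length
    avg_doc_bytes: int = 20_000,    # assumed average chunked document size
    avg_page_bytes: int = 200_000,  # assumed average raw page size
) -> int:
    """Rough upper bound on bytes the pipeline holds at any one time."""
    urls_in_memory = config.url_batch_size * avg_url_bytes
    in_flight_fetches = config.fetch_concurrency * avg_page_bytes
    docs_per_index_batch = config.index_batch_size * avg_doc_bytes
    return urls_in_memory + in_flight_fetches + docs_per_index_batch


config = PipelineConfig(url_batch_size=50, fetch_concurrency=10, index_batch_size=100)
print(f"~{estimate_peak_bytes(config) / 1_000_000:.1f} MB peak")
```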
gnosisllm_knowledge/core/streaming/pipeline.py (new file)
@@ -0,0 +1,228 @@
+"""Bounded streaming pipeline with backpressure support.
+
+This module provides infrastructure for memory-efficient streaming pipelines
+with bounded queues that apply backpressure when downstream processing is slow.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from collections.abc import AsyncIterator
+from dataclasses import dataclass
+from typing import Generic, TypeVar
+
+T = TypeVar("T")
+
+
+@dataclass
+class PipelineConfig:
+    """Configuration for streaming pipeline stages.
+
+    Attributes:
+        url_batch_size: Number of URLs to discover before yielding a batch.
+        fetch_concurrency: Maximum parallel URL fetches.
+        fetch_queue_size: Maximum URLs waiting to be fetched.
+        index_batch_size: Documents per index batch.
+        index_queue_size: Maximum docs waiting to be indexed.
+        fetch_timeout_seconds: Timeout for each URL fetch.
+        index_timeout_seconds: Timeout for each index batch.
+    """
+
+    # URL discovery
+    url_batch_size: int = 100
+
+    # Content fetching
+    fetch_concurrency: int = 10
+    fetch_queue_size: int = 50
+
+    # Indexing
+    index_batch_size: int = 100
+    index_queue_size: int = 200
+
+    # Timeouts
+    fetch_timeout_seconds: float = 30.0
+    index_timeout_seconds: float = 60.0
+
+
+class BoundedQueue(Generic[T]):
+    """Async queue with bounded size and backpressure.
+
+    This queue provides backpressure: when full, put() blocks until space
+    is available. This prevents memory from growing unboundedly when
+    producers are faster than consumers.
+
+    Example:
+        ```python
+        queue: BoundedQueue[str] = BoundedQueue(maxsize=10)
+
+        # Producer task
+        async def producer():
+            for url in urls:
+                await queue.put(url)  # Blocks if queue is full
+            queue.close()
+
+        # Consumer task
+        async def consumer():
+            async for item in queue:
+                await process(item)
+        ```
+    """
+
+    def __init__(self, maxsize: int = 0) -> None:
+        """Initialize the bounded queue.
+
+        Args:
+            maxsize: Maximum queue size. 0 means unlimited (no backpressure).
+        """
+        self._queue: asyncio.Queue[T | None] = asyncio.Queue(maxsize=maxsize)
+        self._closed = False
+        self._consumer_count = 0
+
+    async def put(self, item: T) -> None:
+        """Put an item in the queue, blocking if full (backpressure).
+
+        Args:
+            item: The item to add to the queue.
+
+        Raises:
+            RuntimeError: If the queue has been closed.
+        """
+        if self._closed:
+            raise RuntimeError("Queue is closed")
+        await self._queue.put(item)
+
+    def put_nowait(self, item: T) -> None:
+        """Put an item without waiting (raises if full).
+
+        Args:
+            item: The item to add to the queue.
+
+        Raises:
+            RuntimeError: If the queue has been closed.
+            asyncio.QueueFull: If the queue is full.
+        """
+        if self._closed:
+            raise RuntimeError("Queue is closed")
+        self._queue.put_nowait(item)
+
+    async def get(self) -> T | None:
+        """Get an item from the queue.
+
+        Returns:
+            The next item, or None if queue is closed and empty.
+        """
+        item = await self._queue.get()
+        self._queue.task_done()
+        return item
+
+    def close(self) -> None:
+        """Signal that no more items will be added.
+
+        After closing, consumers will receive None when the queue
+        is empty, signaling them to stop.
+        """
+        if not self._closed:
+            self._closed = True
+            # Put sentinel to unblock any waiting consumers
+            try:
+                self._queue.put_nowait(None)
+            except asyncio.QueueFull:
+                # Queue is full, consumer will eventually get the items
+                pass
+
+    @property
+    def is_closed(self) -> bool:
+        """Check if the queue has been closed."""
+        return self._closed
+
+    def qsize(self) -> int:
+        """Return the current queue size."""
+        return self._queue.qsize()
+
+    def empty(self) -> bool:
+        """Return True if the queue is empty."""
+        return self._queue.empty()
+
+    def full(self) -> bool:
+        """Return True if the queue is full."""
+        return self._queue.full()
+
+    def __aiter__(self) -> AsyncIterator[T]:
+        """Return async iterator for consuming items."""
+        return self
+
+    async def __anext__(self) -> T:
+        """Get next item from queue.
+
+        Raises:
+            StopAsyncIteration: When queue is closed and empty.
+        """
+        item = await self.get()
+        if item is None:
+            raise StopAsyncIteration
+        return item
+
+
+class BatchCollector(Generic[T]):
+    """Collects items into batches of a specified size.
+
+    Useful for grouping streaming items into batches for
+    efficient bulk processing.
+
+    Example:
+        ```python
+        collector = BatchCollector[Document](batch_size=100)
+
+        async for doc in document_stream:
+            batch = collector.add(doc)
+            if batch:
+                await index_batch(batch)
+
+        # Flush remaining items
+        final_batch = collector.flush()
+        if final_batch:
+            await index_batch(final_batch)
+        ```
+    """
+
+    def __init__(self, batch_size: int) -> None:
+        """Initialize the batch collector.
+
+        Args:
+            batch_size: Number of items per batch.
+        """
+        self._batch_size = batch_size
+        self._buffer: list[T] = []
+
+    def add(self, item: T) -> list[T] | None:
+        """Add an item to the current batch.
+
+        Args:
+            item: The item to add.
+
+        Returns:
+            A complete batch if batch_size is reached, otherwise None.
+        """
+        self._buffer.append(item)
+        if len(self._buffer) >= self._batch_size:
+            batch = self._buffer
+            self._buffer = []
+            return batch
+        return None
+
+    def flush(self) -> list[T] | None:
+        """Flush any remaining items as a partial batch.
+
+        Returns:
+            The remaining items, or None if empty.
+        """
+        if self._buffer:
+            batch = self._buffer
+            self._buffer = []
+            return batch
+        return None
+
+    @property
+    def pending_count(self) -> int:
+        """Return the number of items waiting in the buffer."""
+        return len(self._buffer)
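A minimal composition sketch showing how the two new primitives fit together: `BoundedQueue` applies backpressure between a producer and a consumer, and `BatchCollector` regroups the consumed stream into bulk-sized batches. The integer items and `print` calls stand in for real fetch/index work:

```python
# Minimal composition sketch; the integer items and prints stand in for real work.
import asyncio

from gnosisllm_knowledge.core.streaming import BatchCollector, BoundedQueue


async def producer(queue: BoundedQueue[int]) -> None:
    for item in range(10):
        await queue.put(item)  # blocks when the queue is full (backpressure)
    queue.close()              # consumers receive None and stop iterating


async def consumer(queue: BoundedQueue[int]) -> None:
    collector: BatchCollector[int] = BatchCollector(batch_size=4)
    async for item in queue:
        batch = collector.add(item)
        if batch:
            print("bulk-process", batch)
    tail = collector.flush()   # flush the final partial batch
    if tail:
        print("bulk-process", tail)


async def main() -> None:
    queue: BoundedQueue[int] = BoundedQueue(maxsize=3)
    await asyncio.gather(producer(queue), consumer(queue))


asyncio.run(main())
```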
gnosisllm_knowledge/loaders/base.py
@@ -161,8 +161,8 @@ class BaseLoader(ABC):
             return LoadResult(
                 source=source,
                 source_type=self.name,
-                document_count=0,
                 success=True,
+                documents=[],
                 duration_ms=(time.time() - start_time) * 1000,
             )
 
@@ -197,13 +197,12 @@ class BaseLoader(ABC):
             return LoadResult(
                 source=source,
                 source_type=self.name,
-                document_count=len(documents),
                 success=True,
+                documents=documents,
                 duration_ms=duration_ms,
                 urls_processed=urls_processed,
                 urls_failed=urls_failed,
                 bytes_loaded=sum(len(d.content) for d in documents),
-                metadata={"documents": documents},
             )
 
         except Exception as e:
@@ -211,8 +210,8 @@ class BaseLoader(ABC):
             return LoadResult(
                 source=source,
                 source_type=self.name,
-                document_count=0,
                 success=False,
+                documents=[],
                 error_message=str(e),
                 duration_ms=(time.time() - start_time) * 1000,
             )
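For callers, the `LoadResult` change is a small breaking one: loaded documents move from `metadata["documents"]` to a first-class `documents` field, and the separate `document_count` field is dropped. A hedged compatibility sketch (the `loaded_documents` helper is illustrative, not a package API):

```python
# Illustrative compatibility helper; not a package API.
from typing import Any


def loaded_documents(result: Any) -> list:
    """Return loaded documents from a LoadResult under either 0.2.0 or 0.3.0."""
    docs = getattr(result, "documents", None)
    if docs is not None:
        return docs  # 0.3.0: first-class field
    return result.metadata.get("documents", [])  # 0.2.0: stored in metadata


# 0.3.0 has no document_count field; derive the count instead:
# count = len(loaded_documents(result))
```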
gnosisllm_knowledge/loaders/sitemap.py
@@ -6,13 +6,18 @@ import asyncio
 import fnmatch
 import logging
 import re
+from collections.abc import Awaitable, Callable
 from typing import Any
 from xml.etree import ElementTree
 
 import httpx
 
-from gnosisllm_knowledge.core.
+from gnosisllm_knowledge.core.domain.document import Document
+from gnosisllm_knowledge.core.domain.result import IndexResult
+from gnosisllm_knowledge.core.events.types import SitemapDiscoveryEvent, UrlBatchProcessedEvent
+from gnosisllm_knowledge.core.streaming.pipeline import PipelineConfig
 from gnosisllm_knowledge.loaders.base import BaseLoader
+from gnosisllm_knowledge.loaders.sitemap_streaming import StreamingSitemapDiscoverer
 
 # XML namespace for sitemaps
 SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
gnosisllm_knowledge/loaders/sitemap.py
@@ -283,3 +288,126 @@ class SitemapLoader(BaseLoader):
         except re.error:
             # Invalid regex, try substring match
             return pattern in url
+
+    async def load_streaming_with_indexing(
+        self,
+        source: str,
+        index_callback: Callable[[list[Document]], Awaitable[IndexResult]],
+        url_batch_size: int = 50,
+        doc_batch_size: int = 100,
+        config: PipelineConfig | None = None,
+        **options: Any,
+    ) -> IndexResult:
+        """Load sitemap with streaming URL discovery and progressive indexing.
+
+        This method:
+        1. Discovers URLs in batches (not all at once)
+        2. Fetches content for each URL batch
+        3. Indexes documents immediately after fetching
+        4. Moves to next batch only after indexing completes
+
+        Memory usage is bounded by:
+        - url_batch_size * avg_url_length (URL strings)
+        - doc_batch_size * avg_doc_size (document content)
+        - fetch_concurrency * avg_page_size (in-flight fetches)
+
+        Args:
+            source: Sitemap URL.
+            index_callback: Called with each batch of documents to index.
+            url_batch_size: URLs to process per iteration.
+            doc_batch_size: Documents per index batch.
+            config: Pipeline configuration.
+            **options: Additional loader options.
+
+        Returns:
+            Aggregated IndexResult.
+        """
+        config = config or PipelineConfig(url_batch_size=url_batch_size)
+        discoverer = StreamingSitemapDiscoverer(config=config)
+
+        total_indexed = 0
+        total_failed = 0
+        all_errors: list[dict[str, Any]] = []
+        batch_index = 0
+        total_urls_processed = 0
+
+        async for url_batch in discoverer.discover_urls_streaming(
+            sitemap_url=source,
+            batch_size=url_batch_size,
+            max_urls=options.get("max_urls", 10000),
+            max_depth=options.get("max_depth", 3),
+            allowed_patterns=options.get("allowed_patterns", []),
+            blocked_patterns=options.get("blocked_patterns", []),
+        ):
+            # Fetch content for this batch of URLs
+            documents = await self._fetch_url_batch(url_batch, source, **options)
+            total_urls_processed += len(url_batch)
+
+            # Emit URL batch processed event
+            self._events.emit(
+                UrlBatchProcessedEvent(
+                    batch_index=batch_index,
+                    urls_in_batch=len(url_batch),
+                    documents_created=len(documents),
+                    total_urls_processed=total_urls_processed,
+                )
+            )
+
+            # Index in sub-batches
+            for i in range(0, len(documents), doc_batch_size):
+                doc_batch = documents[i : i + doc_batch_size]
+                result = await index_callback(doc_batch)
+                total_indexed += result.indexed_count
+                total_failed += result.failed_count
+                if result.errors:
+                    all_errors.extend(result.errors)
+
+            # Memory freed: url_batch and documents go out of scope
+            self._sitemap_logger.info(
+                f"Processed URL batch {batch_index}: {len(url_batch)} URLs, "
+                f"{len(documents)} docs, {total_indexed} total indexed"
+            )
+            batch_index += 1
+
+        return IndexResult(
+            success=total_failed == 0,
+            indexed_count=total_indexed,
+            failed_count=total_failed,
+            errors=all_errors,
+        )
+
+    async def _fetch_url_batch(
+        self,
+        urls: list[str],
+        source: str,
+        **options: Any,
+    ) -> list[Document]:
+        """Fetch and chunk content for a batch of URLs.
+
+        Args:
+            urls: List of URLs to fetch.
+            source: Original source identifier.
+            **options: Loader options.
+
+        Returns:
+            List of Document objects from all URLs.
+        """
+        semaphore = asyncio.Semaphore(self._max_concurrent)
+
+        async def fetch_one(url: str) -> list[Document]:
+            async with semaphore:
+                return await self._load_url(url, source, **options)
+
+        results = await asyncio.gather(
+            *[fetch_one(url) for url in urls],
+            return_exceptions=True,
+        )
+
+        documents: list[Document] = []
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                self._sitemap_logger.error(f"Failed to fetch {urls[i]}: {result}")
+            elif isinstance(result, list):
+                documents.extend(result)
+
+        return documents
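A usage sketch for the new streaming entry point. How `SitemapLoader` and the backend indexer are constructed is not shown in this diff, so both are assumed here, and `indexer.index_documents` is a placeholder for whatever bulk-indexing call the application uses:

```python
# Usage sketch; SitemapLoader construction and the backend indexer are assumed.
from gnosisllm_knowledge.core.domain.document import Document
from gnosisllm_knowledge.core.domain.result import IndexResult


async def index_sitemap(loader, indexer) -> IndexResult:
    """Stream a sitemap through `loader`, handing each bounded batch to `indexer`."""

    async def index_batch(docs: list[Document]) -> IndexResult:
        # Placeholder for the application's bulk-index call.
        return await indexer.index_documents(docs)

    return await loader.load_streaming_with_indexing(
        source="https://example.com/sitemap.xml",
        index_callback=index_batch,
        url_batch_size=50,   # URLs discovered and fetched per iteration
        doc_batch_size=100,  # documents handed to index_callback at a time
        max_urls=10_000,     # forwarded via **options to the discoverer
    )
```

The aggregated `IndexResult` exposes `indexed_count`, `failed_count`, and `errors`, so callers can report progress without ever holding the full document set in memory.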