gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (44)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +287 -7
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/opensearch/agentic.py +341 -39
  7. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  8. gnosisllm_knowledge/backends/opensearch/indexer.py +1 -0
  9. gnosisllm_knowledge/backends/opensearch/mappings.py +2 -1
  10. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  11. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  12. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +235 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  16. gnosisllm_knowledge/cli/app.py +378 -12
  17. gnosisllm_knowledge/cli/commands/agentic.py +11 -0
  18. gnosisllm_knowledge/cli/commands/memory.py +723 -0
  19. gnosisllm_knowledge/cli/commands/setup.py +24 -22
  20. gnosisllm_knowledge/cli/display/service.py +43 -0
  21. gnosisllm_knowledge/cli/utils/config.py +58 -0
  22. gnosisllm_knowledge/core/domain/__init__.py +41 -0
  23. gnosisllm_knowledge/core/domain/document.py +5 -0
  24. gnosisllm_knowledge/core/domain/memory.py +440 -0
  25. gnosisllm_knowledge/core/domain/result.py +11 -3
  26. gnosisllm_knowledge/core/domain/search.py +2 -0
  27. gnosisllm_knowledge/core/events/types.py +76 -0
  28. gnosisllm_knowledge/core/exceptions.py +134 -0
  29. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  30. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  31. gnosisllm_knowledge/core/interfaces/streaming.py +127 -0
  32. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  33. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  34. gnosisllm_knowledge/loaders/base.py +3 -4
  35. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  36. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  37. gnosisllm_knowledge/services/indexing.py +67 -75
  38. gnosisllm_knowledge/services/search.py +47 -11
  39. gnosisllm_knowledge/services/streaming_pipeline.py +302 -0
  40. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/METADATA +44 -1
  41. gnosisllm_knowledge-0.3.0.dist-info/RECORD +77 -0
  42. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  43. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/WHEEL +0 -0
  44. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/core/interfaces/streaming.py
@@ -0,0 +1,127 @@
+ """Streaming interfaces for memory-efficient processing.
+
+ These protocols define contracts for streaming operations that process
+ data in bounded batches rather than loading everything into memory.
+ """
+
+ from __future__ import annotations
+
+ from collections.abc import AsyncIterator, Awaitable, Callable
+ from typing import TYPE_CHECKING, Any, Protocol
+
+ if TYPE_CHECKING:
+     from gnosisllm_knowledge.core.domain.document import Document
+     from gnosisllm_knowledge.core.domain.result import IndexResult
+
+
+ class IStreamingUrlDiscoverer(Protocol):
+     """Protocol for streaming URL discovery.
+
+     Implementations yield URLs as they are discovered rather than
+     collecting all URLs first. This enables processing to begin
+     immediately and keeps memory usage bounded.
+
+     Example:
+         ```python
+         discoverer: IStreamingUrlDiscoverer = StreamingSitemapDiscoverer()
+
+         async for url_batch in discoverer.discover_urls_streaming(
+             source="https://example.com/sitemap.xml",
+             batch_size=50,
+         ):
+             for url in url_batch:
+                 await process(url)
+         ```
+     """
+
+     async def discover_urls_streaming(
+         self,
+         source: str,
+         batch_size: int = 100,
+         **options: Any,
+     ) -> AsyncIterator[list[str]]:
+         """Yield batches of discovered URLs.
+
+         Args:
+             source: The sitemap or source URL.
+             batch_size: Number of URLs per batch.
+             **options: Discoverer-specific options (max_urls, patterns, etc.)
+
+         Yields:
+             Batches of discovered URLs as they're found.
+         """
+         ...
+
+
+ class IStreamingLoader(Protocol):
+     """Protocol for streaming content loading.
+
+     Processes URLs in bounded batches with immediate indexing,
+     preventing memory accumulation.
+     """
+
+     async def load_streaming_with_indexing(
+         self,
+         source: str,
+         index_callback: Callable[[list[Document]], Awaitable[IndexResult]],
+         url_batch_size: int = 50,
+         doc_batch_size: int = 100,
+         **options: Any,
+     ) -> IndexResult:
+         """Load and index with streaming, calling callback for each batch.
+
+         This method:
+         1. Discovers URLs in batches (not all at once)
+         2. Fetches content for each URL batch
+         3. Indexes documents immediately after fetching
+         4. Moves to next batch only after indexing completes
+
+         Memory usage is bounded by:
+         - url_batch_size * avg_url_length (URL strings)
+         - doc_batch_size * avg_doc_size (document content)
+         - fetch_concurrency * avg_page_size (in-flight fetches)
+
+         Args:
+             source: Source URL.
+             index_callback: Called with each batch of documents to index.
+             url_batch_size: URLs to process per iteration.
+             doc_batch_size: Documents per index batch.
+             **options: Additional options.
+
+         Returns:
+             Aggregated index result.
+         """
+         ...
+
+
+ class IStreamingPipeline(Protocol):
+     """Protocol for streaming indexing pipelines.
+
+     Orchestrates the full streaming load -> index pipeline with
+     bounded memory guarantees.
+     """
+
+     async def execute(
+         self,
+         source: str,
+         index_name: str,
+         *,
+         account_id: str | None = None,
+         collection_id: str | None = None,
+         source_id: str | None = None,
+         **options: Any,
+     ) -> IndexResult:
+         """Execute the streaming pipeline.
+
+         Args:
+             source: Sitemap URL.
+             index_name: Target OpenSearch index.
+             account_id: For multi-tenancy filtering.
+             collection_id: Collection within account.
+             source_id: Source identifier.
+             **options: Additional loader options.
+
+         Returns:
+             Aggregated index result.
+         """
+         ...
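The three interfaces above are `typing.Protocol` classes, so conformance is structural: any object with matching method signatures can be passed where `IStreamingUrlDiscoverer`, `IStreamingLoader`, or `IStreamingPipeline` is expected, with no inheritance required. As a hedged illustration (the class below and its fixed URL list are invented for this note, not part of the package), a minimal discoverer satisfying `IStreamingUrlDiscoverer` can be written as an async generator, matching how the protocol's docstring example consumes it with `async for`:

```python
from collections.abc import AsyncIterator
from typing import Any


class InMemoryUrlDiscoverer:
    """Hypothetical discoverer that streams a fixed URL list in bounded batches."""

    def __init__(self, urls: list[str]) -> None:
        self._urls = urls

    async def discover_urls_streaming(
        self,
        source: str,
        batch_size: int = 100,
        **options: Any,
    ) -> AsyncIterator[list[str]]:
        # Yield bounded slices instead of materializing the whole URL list,
        # which is the point of the streaming contract.
        for start in range(0, len(self._urls), batch_size):
            yield self._urls[start : start + batch_size]
```

Because the method is an async generator, callers iterate it directly with `async for`, keeping only one batch of URLs in memory at a time.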
gnosisllm_knowledge/core/streaming/__init__.py
@@ -0,0 +1,36 @@
+ """Streaming pipeline configuration and utilities.
+
+ This module provides infrastructure for memory-efficient streaming
+ pipelines with bounded queues and backpressure support.
+
+ Example:
+     ```python
+     from gnosisllm_knowledge.core.streaming import (
+         PipelineConfig,
+         BoundedQueue,
+         BatchCollector,
+     )
+
+     # Configure pipeline for large sitemap processing
+     config = PipelineConfig(
+         url_batch_size=50,
+         fetch_concurrency=10,
+         index_batch_size=100,
+     )
+
+     # Use bounded queue for backpressure
+     queue: BoundedQueue[str] = BoundedQueue(maxsize=100)
+     ```
+ """
+
+ from gnosisllm_knowledge.core.streaming.pipeline import (
+     BatchCollector,
+     BoundedQueue,
+     PipelineConfig,
+ )
+
+ __all__ = [
+     "BatchCollector",
+     "BoundedQueue",
+     "PipelineConfig",
+ ]
gnosisllm_knowledge/core/streaming/pipeline.py
@@ -0,0 +1,228 @@
+ """Bounded streaming pipeline with backpressure support.
+
+ This module provides infrastructure for memory-efficient streaming pipelines
+ with bounded queues that apply backpressure when downstream processing is slow.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ from collections.abc import AsyncIterator
+ from dataclasses import dataclass
+ from typing import Generic, TypeVar
+
+ T = TypeVar("T")
+
+
+ @dataclass
+ class PipelineConfig:
+     """Configuration for streaming pipeline stages.
+
+     Attributes:
+         url_batch_size: Number of URLs to discover before yielding a batch.
+         fetch_concurrency: Maximum parallel URL fetches.
+         fetch_queue_size: Maximum URLs waiting to be fetched.
+         index_batch_size: Documents per index batch.
+         index_queue_size: Maximum docs waiting to be indexed.
+         fetch_timeout_seconds: Timeout for each URL fetch.
+         index_timeout_seconds: Timeout for each index batch.
+     """
+
+     # URL discovery
+     url_batch_size: int = 100
+
+     # Content fetching
+     fetch_concurrency: int = 10
+     fetch_queue_size: int = 50
+
+     # Indexing
+     index_batch_size: int = 100
+     index_queue_size: int = 200
+
+     # Timeouts
+     fetch_timeout_seconds: float = 30.0
+     index_timeout_seconds: float = 60.0
+
+
+ class BoundedQueue(Generic[T]):
+     """Async queue with bounded size and backpressure.
+
+     This queue provides backpressure: when full, put() blocks until space
+     is available. This prevents memory from growing unboundedly when
+     producers are faster than consumers.
+
+     Example:
+         ```python
+         queue: BoundedQueue[str] = BoundedQueue(maxsize=10)
+
+         # Producer task
+         async def producer():
+             for url in urls:
+                 await queue.put(url)  # Blocks if queue is full
+             queue.close()
+
+         # Consumer task
+         async def consumer():
+             async for item in queue:
+                 await process(item)
+         ```
+     """
+
+     def __init__(self, maxsize: int = 0) -> None:
+         """Initialize the bounded queue.
+
+         Args:
+             maxsize: Maximum queue size. 0 means unlimited (no backpressure).
+         """
+         self._queue: asyncio.Queue[T | None] = asyncio.Queue(maxsize=maxsize)
+         self._closed = False
+         self._consumer_count = 0
+
+     async def put(self, item: T) -> None:
+         """Put an item in the queue, blocking if full (backpressure).
+
+         Args:
+             item: The item to add to the queue.
+
+         Raises:
+             RuntimeError: If the queue has been closed.
+         """
+         if self._closed:
+             raise RuntimeError("Queue is closed")
+         await self._queue.put(item)
+
+     def put_nowait(self, item: T) -> None:
+         """Put an item without waiting (raises if full).
+
+         Args:
+             item: The item to add to the queue.
+
+         Raises:
+             RuntimeError: If the queue has been closed.
+             asyncio.QueueFull: If the queue is full.
+         """
+         if self._closed:
+             raise RuntimeError("Queue is closed")
+         self._queue.put_nowait(item)
+
+     async def get(self) -> T | None:
+         """Get an item from the queue.
+
+         Returns:
+             The next item, or None if queue is closed and empty.
+         """
+         item = await self._queue.get()
+         self._queue.task_done()
+         return item
+
+     def close(self) -> None:
+         """Signal that no more items will be added.
+
+         After closing, consumers will receive None when the queue
+         is empty, signaling them to stop.
+         """
+         if not self._closed:
+             self._closed = True
+             # Put sentinel to unblock any waiting consumers
+             try:
+                 self._queue.put_nowait(None)
+             except asyncio.QueueFull:
+                 # Queue is full, consumer will eventually get the items
+                 pass
+
+     @property
+     def is_closed(self) -> bool:
+         """Check if the queue has been closed."""
+         return self._closed
+
+     def qsize(self) -> int:
+         """Return the current queue size."""
+         return self._queue.qsize()
+
+     def empty(self) -> bool:
+         """Return True if the queue is empty."""
+         return self._queue.empty()
+
+     def full(self) -> bool:
+         """Return True if the queue is full."""
+         return self._queue.full()
+
+     def __aiter__(self) -> AsyncIterator[T]:
+         """Return async iterator for consuming items."""
+         return self
+
+     async def __anext__(self) -> T:
+         """Get next item from queue.
+
+         Raises:
+             StopAsyncIteration: When queue is closed and empty.
+         """
+         item = await self.get()
+         if item is None:
+             raise StopAsyncIteration
+         return item
+
+
+ class BatchCollector(Generic[T]):
+     """Collects items into batches of a specified size.
+
+     Useful for grouping streaming items into batches for
+     efficient bulk processing.
+
+     Example:
+         ```python
+         collector = BatchCollector[Document](batch_size=100)
+
+         async for doc in document_stream:
+             batch = collector.add(doc)
+             if batch:
+                 await index_batch(batch)
+
+         # Flush remaining items
+         final_batch = collector.flush()
+         if final_batch:
+             await index_batch(final_batch)
+         ```
+     """
+
+     def __init__(self, batch_size: int) -> None:
+         """Initialize the batch collector.
+
+         Args:
+             batch_size: Number of items per batch.
+         """
+         self._batch_size = batch_size
+         self._buffer: list[T] = []
+
+     def add(self, item: T) -> list[T] | None:
+         """Add an item to the current batch.
+
+         Args:
+             item: The item to add.
+
+         Returns:
+             A complete batch if batch_size is reached, otherwise None.
+         """
+         self._buffer.append(item)
+         if len(self._buffer) >= self._batch_size:
+             batch = self._buffer
+             self._buffer = []
+             return batch
+         return None
+
+     def flush(self) -> list[T] | None:
+         """Flush any remaining items as a partial batch.
+
+         Returns:
+             The remaining items, or None if empty.
+         """
+         if self._buffer:
+             batch = self._buffer
+             self._buffer = []
+             return batch
+         return None
+
+     @property
+     def pending_count(self) -> int:
+         """Return the number of items waiting in the buffer."""
+         return len(self._buffer)
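A short, self-contained sketch of how `BoundedQueue` and `BatchCollector` from the new `gnosisllm_knowledge.core.streaming` module compose; it assumes only the import path and APIs shown in the hunk above, and the URLs, sizes, and `print` calls are illustrative. The item count is kept below `maxsize` so the `None` sentinel written by `close()` always fits (note from the implementation that `close()` silently drops the sentinel if the queue happens to be full at that moment):

```python
import asyncio

from gnosisllm_knowledge.core.streaming import BatchCollector, BoundedQueue


async def main() -> None:
    # maxsize bounds memory: put() suspends the producer whenever the queue is full.
    queue: BoundedQueue[str] = BoundedQueue(maxsize=8)
    collector: BatchCollector[str] = BatchCollector(batch_size=3)

    async def producer() -> None:
        for i in range(6):  # kept under maxsize so close() can always enqueue its sentinel
            await queue.put(f"https://example.com/page/{i}")
        queue.close()  # the consumer's async-for ends once the sentinel is drained

    async def consumer() -> None:
        async for url in queue:
            batch = collector.add(url)
            if batch:
                print("would index batch:", batch)
        leftover = collector.flush()
        if leftover:
            print("would index final partial batch:", leftover)

    await asyncio.gather(producer(), consumer())


asyncio.run(main())
```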
gnosisllm_knowledge/loaders/base.py
@@ -161,8 +161,8 @@ class BaseLoader(ABC):
              return LoadResult(
                  source=source,
                  source_type=self.name,
-                 document_count=0,
                  success=True,
+                 documents=[],
                  duration_ms=(time.time() - start_time) * 1000,
              )

@@ -197,13 +197,12 @@ class BaseLoader(ABC):
              return LoadResult(
                  source=source,
                  source_type=self.name,
-                 document_count=len(documents),
                  success=True,
+                 documents=documents,
                  duration_ms=duration_ms,
                  urls_processed=urls_processed,
                  urls_failed=urls_failed,
                  bytes_loaded=sum(len(d.content) for d in documents),
-                 metadata={"documents": documents},
              )

          except Exception as e:
@@ -211,8 +210,8 @@
              return LoadResult(
                  source=source,
                  source_type=self.name,
-                 document_count=0,
                  success=False,
+                 documents=[],
                  error_message=str(e),
                  duration_ms=(time.time() - start_time) * 1000,
              )
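The three `loaders/base.py` hunks above change the shape of `LoadResult`: loaded documents move from `metadata={"documents": ...}` to a first-class `documents` field, and the separate `document_count` field goes away. A hedged sketch of what that looks like on the caller side; the import path for `LoadResult` is an assumption, the field values are illustrative, and the constructor call mirrors the fields used in the first hunk:

```python
from gnosisllm_knowledge.core.domain.result import LoadResult  # assumed location

# 0.3.0 shape: documents are a real field on the result.
result = LoadResult(
    source="https://example.com/sitemap.xml",
    source_type="sitemap",
    success=True,
    documents=[],
    duration_ms=0.0,
)

# Callers now read result.documents directly instead of
# result.metadata["documents"], and derive the count with len():
docs = result.documents
count = len(docs)  # replaces the removed document_count field
```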
gnosisllm_knowledge/loaders/sitemap.py
@@ -6,13 +6,18 @@ import asyncio
  import fnmatch
  import logging
  import re
+ from collections.abc import Awaitable, Callable
  from typing import Any
  from xml.etree import ElementTree

  import httpx

- from gnosisllm_knowledge.core.events.types import SitemapDiscoveryEvent
+ from gnosisllm_knowledge.core.domain.document import Document
+ from gnosisllm_knowledge.core.domain.result import IndexResult
+ from gnosisllm_knowledge.core.events.types import SitemapDiscoveryEvent, UrlBatchProcessedEvent
+ from gnosisllm_knowledge.core.streaming.pipeline import PipelineConfig
  from gnosisllm_knowledge.loaders.base import BaseLoader
+ from gnosisllm_knowledge.loaders.sitemap_streaming import StreamingSitemapDiscoverer

  # XML namespace for sitemaps
  SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
@@ -283,3 +288,126 @@ class SitemapLoader(BaseLoader):
          except re.error:
              # Invalid regex, try substring match
              return pattern in url
+
+     async def load_streaming_with_indexing(
+         self,
+         source: str,
+         index_callback: Callable[[list[Document]], Awaitable[IndexResult]],
+         url_batch_size: int = 50,
+         doc_batch_size: int = 100,
+         config: PipelineConfig | None = None,
+         **options: Any,
+     ) -> IndexResult:
+         """Load sitemap with streaming URL discovery and progressive indexing.
+
+         This method:
+         1. Discovers URLs in batches (not all at once)
+         2. Fetches content for each URL batch
+         3. Indexes documents immediately after fetching
+         4. Moves to next batch only after indexing completes
+
+         Memory usage is bounded by:
+         - url_batch_size * avg_url_length (URL strings)
+         - doc_batch_size * avg_doc_size (document content)
+         - fetch_concurrency * avg_page_size (in-flight fetches)
+
+         Args:
+             source: Sitemap URL.
+             index_callback: Called with each batch of documents to index.
+             url_batch_size: URLs to process per iteration.
+             doc_batch_size: Documents per index batch.
+             config: Pipeline configuration.
+             **options: Additional loader options.
+
+         Returns:
+             Aggregated IndexResult.
+         """
+         config = config or PipelineConfig(url_batch_size=url_batch_size)
+         discoverer = StreamingSitemapDiscoverer(config=config)
+
+         total_indexed = 0
+         total_failed = 0
+         all_errors: list[dict[str, Any]] = []
+         batch_index = 0
+         total_urls_processed = 0
+
+         async for url_batch in discoverer.discover_urls_streaming(
+             sitemap_url=source,
+             batch_size=url_batch_size,
+             max_urls=options.get("max_urls", 10000),
+             max_depth=options.get("max_depth", 3),
+             allowed_patterns=options.get("allowed_patterns", []),
+             blocked_patterns=options.get("blocked_patterns", []),
+         ):
+             # Fetch content for this batch of URLs
+             documents = await self._fetch_url_batch(url_batch, source, **options)
+             total_urls_processed += len(url_batch)
+
+             # Emit URL batch processed event
+             self._events.emit(
+                 UrlBatchProcessedEvent(
+                     batch_index=batch_index,
+                     urls_in_batch=len(url_batch),
+                     documents_created=len(documents),
+                     total_urls_processed=total_urls_processed,
+                 )
+             )
+
+             # Index in sub-batches
+             for i in range(0, len(documents), doc_batch_size):
+                 doc_batch = documents[i : i + doc_batch_size]
+                 result = await index_callback(doc_batch)
+                 total_indexed += result.indexed_count
+                 total_failed += result.failed_count
+                 if result.errors:
+                     all_errors.extend(result.errors)
+
+             # Memory freed: url_batch and documents go out of scope
+             self._sitemap_logger.info(
+                 f"Processed URL batch {batch_index}: {len(url_batch)} URLs, "
+                 f"{len(documents)} docs, {total_indexed} total indexed"
+             )
+             batch_index += 1
+
+         return IndexResult(
+             success=total_failed == 0,
+             indexed_count=total_indexed,
+             failed_count=total_failed,
+             errors=all_errors,
+         )
+
+     async def _fetch_url_batch(
+         self,
+         urls: list[str],
+         source: str,
+         **options: Any,
+     ) -> list[Document]:
+         """Fetch and chunk content for a batch of URLs.
+
+         Args:
+             urls: List of URLs to fetch.
+             source: Original source identifier.
+             **options: Loader options.
+
+         Returns:
+             List of Document objects from all URLs.
+         """
+         semaphore = asyncio.Semaphore(self._max_concurrent)
+
+         async def fetch_one(url: str) -> list[Document]:
+             async with semaphore:
+                 return await self._load_url(url, source, **options)
+
+         results = await asyncio.gather(
+             *[fetch_one(url) for url in urls],
+             return_exceptions=True,
+         )
+
+         documents: list[Document] = []
+         for i, result in enumerate(results):
+             if isinstance(result, Exception):
+                 self._sitemap_logger.error(f"Failed to fetch {urls[i]}: {result}")
+             elif isinstance(result, list):
+                 documents.extend(result)
+
+         return documents
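To tie the sitemap hunk together, here is a hedged end-to-end sketch of driving `SitemapLoader.load_streaming_with_indexing` with an index callback. Only the method signature, the `IndexResult` fields, and the `max_urls` option come from the diff above; the `SitemapLoader()` construction (its options are not shown in this diff) and the stub callback are placeholders for a real indexer such as the OpenSearch backend.

```python
import asyncio

from gnosisllm_knowledge.core.domain.document import Document
from gnosisllm_knowledge.core.domain.result import IndexResult
from gnosisllm_knowledge.loaders.sitemap import SitemapLoader


async def index_batch(docs: list[Document]) -> IndexResult:
    # Stand-in for a real bulk-index call; the fields mirror the IndexResult
    # constructed at the end of load_streaming_with_indexing above.
    return IndexResult(success=True, indexed_count=len(docs), failed_count=0, errors=[])


async def run() -> None:
    loader = SitemapLoader()  # constructor options omitted; not shown in this diff
    result = await loader.load_streaming_with_indexing(
        source="https://example.com/sitemap.xml",
        index_callback=index_batch,
        url_batch_size=50,
        doc_batch_size=100,
        max_urls=1000,  # forwarded through **options to the streaming discoverer
    )
    print(f"indexed={result.indexed_count} failed={result.failed_count}")


asyncio.run(run())
```

Each callback invocation sees at most `doc_batch_size` documents, which is what keeps peak memory bounded regardless of sitemap size.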