gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +502 -32
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  7. gnosisllm_knowledge/backends/memory/searcher.py +111 -10
  8. gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
  9. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  10. gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
  11. gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
  12. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  14. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  15. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  16. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  17. gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
  18. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  19. gnosisllm_knowledge/cli/app.py +436 -31
  20. gnosisllm_knowledge/cli/commands/agentic.py +26 -9
  21. gnosisllm_knowledge/cli/commands/load.py +169 -19
  22. gnosisllm_knowledge/cli/commands/memory.py +733 -0
  23. gnosisllm_knowledge/cli/commands/search.py +9 -10
  24. gnosisllm_knowledge/cli/commands/setup.py +49 -23
  25. gnosisllm_knowledge/cli/display/service.py +43 -0
  26. gnosisllm_knowledge/cli/utils/config.py +62 -4
  27. gnosisllm_knowledge/core/domain/__init__.py +54 -0
  28. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  29. gnosisllm_knowledge/core/domain/document.py +19 -19
  30. gnosisllm_knowledge/core/domain/memory.py +440 -0
  31. gnosisllm_knowledge/core/domain/result.py +11 -3
  32. gnosisllm_knowledge/core/domain/search.py +12 -25
  33. gnosisllm_knowledge/core/domain/source.py +11 -12
  34. gnosisllm_knowledge/core/events/__init__.py +8 -0
  35. gnosisllm_knowledge/core/events/types.py +198 -5
  36. gnosisllm_knowledge/core/exceptions.py +227 -0
  37. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  38. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  39. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  40. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  41. gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
  42. gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
  43. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  44. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  45. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  46. gnosisllm_knowledge/fetchers/config.py +27 -0
  47. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  48. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  49. gnosisllm_knowledge/loaders/__init__.py +5 -1
  50. gnosisllm_knowledge/loaders/base.py +3 -4
  51. gnosisllm_knowledge/loaders/discovery.py +338 -0
  52. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  53. gnosisllm_knowledge/loaders/factory.py +46 -0
  54. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  55. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  56. gnosisllm_knowledge/services/indexing.py +100 -93
  57. gnosisllm_knowledge/services/search.py +84 -31
  58. gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
  59. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
  60. gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  62. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
  63. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/loaders/factory.py

@@ -9,7 +9,11 @@ from typing import Any
  from gnosisllm_knowledge.core.events.emitter import EventEmitter
  from gnosisllm_knowledge.core.interfaces.chunker import ITextChunker
  from gnosisllm_knowledge.core.interfaces.fetcher import IContentFetcher
+ from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
+ from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
+ from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
  from gnosisllm_knowledge.loaders.base import BaseLoader
+ from gnosisllm_knowledge.loaders.discovery import DiscoveryLoader
  from gnosisllm_knowledge.loaders.sitemap import SitemapLoader
  from gnosisllm_knowledge.loaders.website import WebsiteLoader

@@ -20,6 +24,43 @@ LoaderCreator = Callable[
  ]


+ def _create_discovery_loader(
+     fetcher: IContentFetcher,
+     chunker: ITextChunker,
+     config: dict[str, Any] | None,
+     event_emitter: EventEmitter | None,
+ ) -> DiscoveryLoader:
+     """Factory function for creating DiscoveryLoader instances.
+
+     Creates a DiscoveryLoader with a NeoreaderDiscoveryClient. If the fetcher
+     is a NeoreaderContentFetcher, reuses its config to ensure consistency.
+     Otherwise, creates config from environment variables.
+
+     Args:
+         fetcher: Content fetcher for retrieving URL content.
+         chunker: Text chunker for splitting content.
+         config: Optional configuration dictionary.
+         event_emitter: Optional event emitter for progress events.
+
+     Returns:
+         Configured DiscoveryLoader instance.
+     """
+     # Get config from fetcher if it's NeoreaderContentFetcher, otherwise use env
+     if isinstance(fetcher, NeoreaderContentFetcher):
+         neoreader_config = fetcher.config
+     else:
+         neoreader_config = NeoreaderConfig.from_env()
+
+     discovery_client = NeoreaderDiscoveryClient(neoreader_config)
+     return DiscoveryLoader(
+         fetcher=fetcher,
+         chunker=chunker,
+         discovery_client=discovery_client,
+         config=config,
+         event_emitter=event_emitter,
+     )
+
+
  class LoaderFactory:
      """Factory for creating content loaders (Registry Pattern).

@@ -29,6 +70,7 @@ class LoaderFactory:
      Built-in loaders:
      - website: Single URL loading
      - sitemap: Sitemap XML with recursive discovery
+     - discovery: Website crawling via Neo Reader Discovery API

      Example:
          ```python
@@ -40,6 +82,9 @@ class LoaderFactory:
          # Explicit type
          loader = factory.create("sitemap", config={"max_urls": 500})

+         # Discovery loader for full website crawling
+         loader = factory.create("discovery", config={"max_depth": 3, "max_pages": 100})
+
          # Register custom loader
          factory.register("custom", MyCustomLoader)
          ```
@@ -76,6 +121,7 @@ class LoaderFactory:
          """Register built-in loader types."""
          self.register("website", lambda f, c, cfg, e: WebsiteLoader(f, c, cfg, e))
          self.register("sitemap", lambda f, c, cfg, e: SitemapLoader(f, c, cfg, e))
+         self.register("discovery", _create_discovery_loader)

      def register(self, name: str, creator: LoaderCreator) -> None:
          """Register a loader type.
gnosisllm_knowledge/loaders/sitemap.py

@@ -6,13 +6,18 @@ import asyncio
  import fnmatch
  import logging
  import re
+ from collections.abc import Awaitable, Callable
  from typing import Any
  from xml.etree import ElementTree

  import httpx

- from gnosisllm_knowledge.core.events.types import SitemapDiscoveryEvent
+ from gnosisllm_knowledge.core.domain.document import Document
+ from gnosisllm_knowledge.core.domain.result import IndexResult
+ from gnosisllm_knowledge.core.events.types import SitemapDiscoveryEvent, UrlBatchProcessedEvent
+ from gnosisllm_knowledge.core.streaming.pipeline import PipelineConfig
  from gnosisllm_knowledge.loaders.base import BaseLoader
+ from gnosisllm_knowledge.loaders.sitemap_streaming import StreamingSitemapDiscoverer

  # XML namespace for sitemaps
  SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
@@ -283,3 +288,126 @@ class SitemapLoader(BaseLoader):
          except re.error:
              # Invalid regex, try substring match
              return pattern in url
+
+     async def load_streaming_with_indexing(
+         self,
+         source: str,
+         index_callback: Callable[[list[Document]], Awaitable[IndexResult]],
+         url_batch_size: int = 50,
+         doc_batch_size: int = 100,
+         config: PipelineConfig | None = None,
+         **options: Any,
+     ) -> IndexResult:
+         """Load sitemap with streaming URL discovery and progressive indexing.
+
+         This method:
+         1. Discovers URLs in batches (not all at once)
+         2. Fetches content for each URL batch
+         3. Indexes documents immediately after fetching
+         4. Moves to next batch only after indexing completes
+
+         Memory usage is bounded by:
+         - url_batch_size * avg_url_length (URL strings)
+         - doc_batch_size * avg_doc_size (document content)
+         - fetch_concurrency * avg_page_size (in-flight fetches)
+
+         Args:
+             source: Sitemap URL.
+             index_callback: Called with each batch of documents to index.
+             url_batch_size: URLs to process per iteration.
+             doc_batch_size: Documents per index batch.
+             config: Pipeline configuration.
+             **options: Additional loader options.
+
+         Returns:
+             Aggregated IndexResult.
+         """
+         config = config or PipelineConfig(url_batch_size=url_batch_size)
+         discoverer = StreamingSitemapDiscoverer(config=config)
+
+         total_indexed = 0
+         total_failed = 0
+         all_errors: list[dict[str, Any]] = []
+         batch_index = 0
+         total_urls_processed = 0
+
+         async for url_batch in discoverer.discover_urls_streaming(
+             sitemap_url=source,
+             batch_size=url_batch_size,
+             max_urls=options.get("max_urls", 10000),
+             max_depth=options.get("max_depth", 3),
+             allowed_patterns=options.get("allowed_patterns", []),
+             blocked_patterns=options.get("blocked_patterns", []),
+         ):
+             # Fetch content for this batch of URLs
+             documents = await self._fetch_url_batch(url_batch, source, **options)
+             total_urls_processed += len(url_batch)
+
+             # Emit URL batch processed event
+             self._events.emit(
+                 UrlBatchProcessedEvent(
+                     batch_index=batch_index,
+                     urls_in_batch=len(url_batch),
+                     documents_created=len(documents),
+                     total_urls_processed=total_urls_processed,
+                 )
+             )
+
+             # Index in sub-batches
+             for i in range(0, len(documents), doc_batch_size):
+                 doc_batch = documents[i : i + doc_batch_size]
+                 result = await index_callback(doc_batch)
+                 total_indexed += result.indexed_count
+                 total_failed += result.failed_count
+                 if result.errors:
+                     all_errors.extend(result.errors)
+
+             # Memory freed: url_batch and documents go out of scope
+             self._sitemap_logger.info(
+                 f"Processed URL batch {batch_index}: {len(url_batch)} URLs, "
+                 f"{len(documents)} docs, {total_indexed} total indexed"
+             )
+             batch_index += 1
+
+         return IndexResult(
+             success=total_failed == 0,
+             indexed_count=total_indexed,
+             failed_count=total_failed,
+             errors=all_errors,
+         )
+
+     async def _fetch_url_batch(
+         self,
+         urls: list[str],
+         source: str,
+         **options: Any,
+     ) -> list[Document]:
+         """Fetch and chunk content for a batch of URLs.
+
+         Args:
+             urls: List of URLs to fetch.
+             source: Original source identifier.
+             **options: Loader options.
+
+         Returns:
+             List of Document objects from all URLs.
+         """
+         semaphore = asyncio.Semaphore(self._max_concurrent)
+
+         async def fetch_one(url: str) -> list[Document]:
+             async with semaphore:
+                 return await self._load_url(url, source, **options)
+
+         results = await asyncio.gather(
+             *[fetch_one(url) for url in urls],
+             return_exceptions=True,
+         )
+
+         documents: list[Document] = []
+         for i, result in enumerate(results):
+             if isinstance(result, Exception):
+                 self._sitemap_logger.error(f"Failed to fetch {urls[i]}: {result}")
+             elif isinstance(result, list):
+                 documents.extend(result)
+
+         return documents
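A minimal sketch of driving load_streaming_with_indexing end to end, assuming an already-constructed SitemapLoader and an indexer object with an async index_documents(list[Document]) -> IndexResult method (that indexer method name is an assumption; the callback only has to match the Callable[[list[Document]], Awaitable[IndexResult]] parameter shown above):

```python
from typing import Any

from gnosisllm_knowledge.core.domain.document import Document
from gnosisllm_knowledge.core.domain.result import IndexResult
from gnosisllm_knowledge.loaders.sitemap import SitemapLoader


async def index_sitemap(loader: SitemapLoader, indexer: Any) -> IndexResult:
    async def index_callback(documents: list[Document]) -> IndexResult:
        # Assumed indexer API; any coroutine returning an IndexResult works here.
        return await indexer.index_documents(documents)

    # URLs are discovered, fetched, and indexed batch by batch, so memory stays
    # bounded by url_batch_size / doc_batch_size rather than total sitemap size.
    return await loader.load_streaming_with_indexing(
        source="https://example.com/sitemap.xml",
        index_callback=index_callback,
        url_batch_size=50,
        doc_batch_size=100,
        max_urls=5000,                  # forwarded via **options to URL discovery
        allowed_patterns=["*/docs/*"],  # glob or regex, per _matches_pattern
    )
```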
gnosisllm_knowledge/loaders/sitemap_streaming.py (new file)

@@ -0,0 +1,258 @@
+ """Streaming sitemap discovery with bounded memory.
+
+ This module provides streaming URL discovery from sitemaps, yielding
+ batches of URLs as they're discovered rather than collecting all URLs
+ first. This enables immediate processing and keeps memory bounded.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import fnmatch
+ import logging
+ import re
+ from collections.abc import AsyncIterator
+ from typing import Any
+ from xml.etree import ElementTree
+
+ import httpx
+
+ from gnosisllm_knowledge.core.streaming.pipeline import BoundedQueue, PipelineConfig
+
+ # XML namespace for sitemaps
+ SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
+
+
+ class StreamingSitemapDiscoverer:
+     """Discovers sitemap URLs in a streaming fashion.
+
+     Instead of collecting all URLs before processing, this yields
+     batches of URLs as they're discovered, enabling immediate processing.
+
+     Key differences from SitemapLoader._get_urls():
+     - Yields batches instead of returning complete list
+     - Uses bounded queue for backpressure
+     - Memory usage is O(batch_size) not O(total_urls)
+
+     Example:
+         ```python
+         discoverer = StreamingSitemapDiscoverer()
+
+         async for url_batch in discoverer.discover_urls_streaming(
+             sitemap_url="https://example.com/sitemap.xml",
+             batch_size=50,
+             max_urls=1000,
+         ):
+             # Process batch immediately
+             for url in url_batch:
+                 await fetch_and_process(url)
+         ```
+     """
+
+     def __init__(
+         self,
+         config: PipelineConfig | None = None,
+     ) -> None:
+         """Initialize the streaming sitemap discoverer.
+
+         Args:
+             config: Pipeline configuration with batch sizes and concurrency.
+         """
+         self._config = config or PipelineConfig()
+         self._logger = logging.getLogger(__name__)
+
+     async def discover_urls_streaming(
+         self,
+         sitemap_url: str,
+         batch_size: int | None = None,
+         max_urls: int = 10000,
+         max_depth: int = 3,
+         allowed_patterns: list[str] | None = None,
+         blocked_patterns: list[str] | None = None,
+         **options: Any,
+     ) -> AsyncIterator[list[str]]:
+         """Yield batches of URLs as they're discovered.
+
+         Args:
+             sitemap_url: Root sitemap URL.
+             batch_size: URLs per batch (default from config).
+             max_urls: Maximum total URLs to discover.
+             max_depth: Maximum sitemap recursion depth.
+             allowed_patterns: URL patterns to include.
+             blocked_patterns: URL patterns to exclude.
+             **options: Additional options (unused, for compatibility).
+
+         Yields:
+             Lists of discovered URLs, batch_size at a time.
+         """
+         batch_size = batch_size or self._config.url_batch_size
+         allowed_patterns = allowed_patterns or []
+         blocked_patterns = blocked_patterns or []
+
+         # Use bounded queue for discovered URLs
+         # Queue size is 2x batch_size to allow producer to stay ahead
+         url_queue: BoundedQueue[str] = BoundedQueue(maxsize=batch_size * 2)
+
+         # Tracking state
+         discovered_count = 0
+         seen_urls: set[str] = set()
+
+         async def discover_recursive(url: str, depth: int) -> None:
+             """Recursively discover URLs, pushing to queue."""
+             nonlocal discovered_count
+
+             if depth > max_depth or discovered_count >= max_urls:
+                 return
+
+             content = await self._fetch_sitemap(url)
+             if not content:
+                 return
+
+             try:
+                 root = ElementTree.fromstring(content)
+             except ElementTree.ParseError as e:
+                 self._logger.error(f"Failed to parse sitemap {url}: {e}")
+                 return
+
+             # Check for sitemap index
+             sitemap_refs = root.findall(".//sm:sitemap/sm:loc", SITEMAP_NS)
+             if sitemap_refs:
+                 self._logger.info(
+                     f"Found sitemap index with {len(sitemap_refs)} sitemaps at depth {depth}"
+                 )
+                 # Process nested sitemaps (limited parallelism to avoid overwhelming)
+                 tasks = []
+                 for ref in sitemap_refs[:10]:  # Limit parallel sitemap fetches
+                     if ref.text and discovered_count < max_urls:
+                         tasks.append(discover_recursive(ref.text.strip(), depth + 1))
+                 await asyncio.gather(*tasks, return_exceptions=True)
+                 return
+
+             # Process URL entries
+             url_elements = root.findall(".//sm:url/sm:loc", SITEMAP_NS)
+             for url_elem in url_elements:
+                 if url_elem.text and discovered_count < max_urls:
+                     page_url = url_elem.text.strip()
+
+                     if page_url in seen_urls:
+                         continue
+
+                     if not self._should_include_url(
+                         page_url, allowed_patterns, blocked_patterns
+                     ):
+                         continue
+
+                     seen_urls.add(page_url)
+                     await url_queue.put(page_url)  # Backpressure if queue full
+                     discovered_count += 1
+
+                     if discovered_count % 100 == 0:
+                         self._logger.debug(f"Discovered {discovered_count} URLs so far")
+
+         async def discover_and_close() -> None:
+             """Run discovery and close queue when done."""
+             try:
+                 await discover_recursive(sitemap_url, depth=0)
+             except Exception as e:
+                 self._logger.error(f"Discovery error: {e}")
+             finally:
+                 url_queue.close()
+
+         # Start discovery in background task
+         discovery_task = asyncio.create_task(discover_and_close())
+
+         # Yield batches from queue
+         batch: list[str] = []
+         try:
+             async for url in url_queue:
+                 batch.append(url)
+                 if len(batch) >= batch_size:
+                     yield batch
+                     batch = []
+
+             # Yield remaining URLs
+             if batch:
+                 yield batch
+
+         finally:
+             # Ensure discovery task is complete or cancelled
+             if not discovery_task.done():
+                 discovery_task.cancel()
+                 try:
+                     await discovery_task
+                 except asyncio.CancelledError:
+                     pass
+
+     async def _fetch_sitemap(self, url: str) -> str | None:
+         """Fetch sitemap XML content.
+
+         Args:
+             url: The sitemap URL to fetch.
+
+         Returns:
+             Sitemap XML content or None if fetch failed.
+         """
+         try:
+             async with httpx.AsyncClient(
+                 timeout=self._config.fetch_timeout_seconds
+             ) as client:
+                 response = await client.get(
+                     url,
+                     headers={"Accept": "application/xml, text/xml, */*"},
+                     follow_redirects=True,
+                 )
+                 response.raise_for_status()
+                 return response.text
+         except Exception as e:
+             self._logger.error(f"Failed to fetch sitemap {url}: {e}")
+             return None
+
+     def _should_include_url(
+         self,
+         url: str,
+         allowed_patterns: list[str],
+         blocked_patterns: list[str],
+     ) -> bool:
+         """Check if a URL should be included based on patterns.
+
+         Args:
+             url: The URL to check.
+             allowed_patterns: Patterns that must match (if any).
+             blocked_patterns: Patterns that must not match.
+
+         Returns:
+             True if URL should be included.
+         """
+         # Check blocked patterns first
+         for pattern in blocked_patterns:
+             if self._matches_pattern(url, pattern):
+                 return False
+
+         # If allowed patterns specified, at least one must match
+         if allowed_patterns:
+             return any(self._matches_pattern(url, p) for p in allowed_patterns)
+
+         return True
+
+     def _matches_pattern(self, url: str, pattern: str) -> bool:
+         """Check if URL matches a pattern.
+
+         Supports both glob patterns (with *) and regex patterns.
+
+         Args:
+             url: The URL to check.
+             pattern: The pattern to match against.
+
+         Returns:
+             True if URL matches the pattern.
+         """
+         # Try fnmatch for glob patterns
+         if "*" in pattern or "?" in pattern:
+             return fnmatch.fnmatch(url, pattern)
+
+         # Try regex
+         try:
+             return bool(re.search(pattern, url))
+         except re.error:
+             # Invalid regex, try substring match
+             return pattern in url
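BoundedQueue and PipelineConfig come from core/streaming/pipeline.py, which this diff does not include. Based only on how the discoverer uses the queue above (put, close, async for, and a maxsize bound), a compatible queue could look roughly like the sketch below; this is an illustration of the backpressure pattern, not the package's actual implementation.

```python
from __future__ import annotations

import asyncio
from typing import Generic, TypeVar

T = TypeVar("T")


class BoundedQueue(Generic[T]):
    """Fixed-capacity async queue: put() suspends the producer when full
    (backpressure); iteration ends after close() once buffered items drain."""

    def __init__(self, maxsize: int) -> None:
        self._queue: asyncio.Queue = asyncio.Queue(maxsize=maxsize)
        self._closed = False

    async def put(self, item: T) -> None:
        # Awaits while the queue is at capacity, slowing the producer down.
        await self._queue.put(item)

    def close(self) -> None:
        # Producer signals that no further items will arrive.
        self._closed = True

    def __aiter__(self) -> BoundedQueue[T]:
        return self

    async def __anext__(self) -> T:
        # Drain buffered items; stop once the queue is closed and empty.
        while True:
            try:
                return self._queue.get_nowait()
            except asyncio.QueueEmpty:
                if self._closed:
                    raise StopAsyncIteration
                await asyncio.sleep(0.05)  # simple polling; the real class likely signals instead
```

The discoverer above is exactly this producer/consumer pair: discover_and_close() pushes URLs into the queue while the async for url in url_queue loop drains them into batches.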