gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +502 -32
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  7. gnosisllm_knowledge/backends/memory/searcher.py +111 -10
  8. gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
  9. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  10. gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
  11. gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
  12. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  14. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  15. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  16. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  17. gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
  18. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  19. gnosisllm_knowledge/cli/app.py +436 -31
  20. gnosisllm_knowledge/cli/commands/agentic.py +26 -9
  21. gnosisllm_knowledge/cli/commands/load.py +169 -19
  22. gnosisllm_knowledge/cli/commands/memory.py +733 -0
  23. gnosisllm_knowledge/cli/commands/search.py +9 -10
  24. gnosisllm_knowledge/cli/commands/setup.py +49 -23
  25. gnosisllm_knowledge/cli/display/service.py +43 -0
  26. gnosisllm_knowledge/cli/utils/config.py +62 -4
  27. gnosisllm_knowledge/core/domain/__init__.py +54 -0
  28. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  29. gnosisllm_knowledge/core/domain/document.py +19 -19
  30. gnosisllm_knowledge/core/domain/memory.py +440 -0
  31. gnosisllm_knowledge/core/domain/result.py +11 -3
  32. gnosisllm_knowledge/core/domain/search.py +12 -25
  33. gnosisllm_knowledge/core/domain/source.py +11 -12
  34. gnosisllm_knowledge/core/events/__init__.py +8 -0
  35. gnosisllm_knowledge/core/events/types.py +198 -5
  36. gnosisllm_knowledge/core/exceptions.py +227 -0
  37. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  38. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  39. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  40. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  41. gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
  42. gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
  43. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  44. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  45. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  46. gnosisllm_knowledge/fetchers/config.py +27 -0
  47. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  48. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  49. gnosisllm_knowledge/loaders/__init__.py +5 -1
  50. gnosisllm_knowledge/loaders/base.py +3 -4
  51. gnosisllm_knowledge/loaders/discovery.py +338 -0
  52. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  53. gnosisllm_knowledge/loaders/factory.py +46 -0
  54. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  55. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  56. gnosisllm_knowledge/services/indexing.py +100 -93
  57. gnosisllm_knowledge/services/search.py +84 -31
  58. gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
  59. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
  60. gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  62. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
  63. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/core/streaming/pipeline.py
@@ -0,0 +1,228 @@
+"""Bounded streaming pipeline with backpressure support.
+
+This module provides infrastructure for memory-efficient streaming pipelines
+with bounded queues that apply backpressure when downstream processing is slow.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from collections.abc import AsyncIterator
+from dataclasses import dataclass
+from typing import Generic, TypeVar
+
+T = TypeVar("T")
+
+
+@dataclass
+class PipelineConfig:
+    """Configuration for streaming pipeline stages.
+
+    Attributes:
+        url_batch_size: Number of URLs to discover before yielding a batch.
+        fetch_concurrency: Maximum parallel URL fetches.
+        fetch_queue_size: Maximum URLs waiting to be fetched.
+        index_batch_size: Documents per index batch.
+        index_queue_size: Maximum docs waiting to be indexed.
+        fetch_timeout_seconds: Timeout for each URL fetch.
+        index_timeout_seconds: Timeout for each index batch.
+    """
+
+    # URL discovery
+    url_batch_size: int = 100
+
+    # Content fetching
+    fetch_concurrency: int = 10
+    fetch_queue_size: int = 50
+
+    # Indexing
+    index_batch_size: int = 100
+    index_queue_size: int = 200
+
+    # Timeouts
+    fetch_timeout_seconds: float = 30.0
+    index_timeout_seconds: float = 60.0
+
+
+class BoundedQueue(Generic[T]):
+    """Async queue with bounded size and backpressure.
+
+    This queue provides backpressure: when full, put() blocks until space
+    is available. This prevents memory from growing unboundedly when
+    producers are faster than consumers.
+
+    Example:
+        ```python
+        queue: BoundedQueue[str] = BoundedQueue(maxsize=10)
+
+        # Producer task
+        async def producer():
+            for url in urls:
+                await queue.put(url)  # Blocks if queue is full
+            queue.close()
+
+        # Consumer task
+        async def consumer():
+            async for item in queue:
+                await process(item)
+        ```
+    """
+
+    def __init__(self, maxsize: int = 0) -> None:
+        """Initialize the bounded queue.
+
+        Args:
+            maxsize: Maximum queue size. 0 means unlimited (no backpressure).
+        """
+        self._queue: asyncio.Queue[T | None] = asyncio.Queue(maxsize=maxsize)
+        self._closed = False
+        self._consumer_count = 0
+
+    async def put(self, item: T) -> None:
+        """Put an item in the queue, blocking if full (backpressure).
+
+        Args:
+            item: The item to add to the queue.
+
+        Raises:
+            RuntimeError: If the queue has been closed.
+        """
+        if self._closed:
+            raise RuntimeError("Queue is closed")
+        await self._queue.put(item)
+
+    def put_nowait(self, item: T) -> None:
+        """Put an item without waiting (raises if full).
+
+        Args:
+            item: The item to add to the queue.
+
+        Raises:
+            RuntimeError: If the queue has been closed.
+            asyncio.QueueFull: If the queue is full.
+        """
+        if self._closed:
+            raise RuntimeError("Queue is closed")
+        self._queue.put_nowait(item)
+
+    async def get(self) -> T | None:
+        """Get an item from the queue.
+
+        Returns:
+            The next item, or None if queue is closed and empty.
+        """
+        item = await self._queue.get()
+        self._queue.task_done()
+        return item
+
+    def close(self) -> None:
+        """Signal that no more items will be added.
+
+        After closing, consumers will receive None when the queue
+        is empty, signaling them to stop.
+        """
+        if not self._closed:
+            self._closed = True
+            # Put sentinel to unblock any waiting consumers
+            try:
+                self._queue.put_nowait(None)
+            except asyncio.QueueFull:
+                # Queue is full, consumer will eventually get the items
+                pass
+
+    @property
+    def is_closed(self) -> bool:
+        """Check if the queue has been closed."""
+        return self._closed
+
+    def qsize(self) -> int:
+        """Return the current queue size."""
+        return self._queue.qsize()
+
+    def empty(self) -> bool:
+        """Return True if the queue is empty."""
+        return self._queue.empty()
+
+    def full(self) -> bool:
+        """Return True if the queue is full."""
+        return self._queue.full()
+
+    def __aiter__(self) -> AsyncIterator[T]:
+        """Return async iterator for consuming items."""
+        return self
+
+    async def __anext__(self) -> T:
+        """Get next item from queue.
+
+        Raises:
+            StopAsyncIteration: When queue is closed and empty.
+        """
+        item = await self.get()
+        if item is None:
+            raise StopAsyncIteration
+        return item
+
+
+class BatchCollector(Generic[T]):
+    """Collects items into batches of a specified size.
+
+    Useful for grouping streaming items into batches for
+    efficient bulk processing.
+
+    Example:
+        ```python
+        collector = BatchCollector[Document](batch_size=100)
+
+        async for doc in document_stream:
+            batch = collector.add(doc)
+            if batch:
+                await index_batch(batch)
+
+        # Flush remaining items
+        final_batch = collector.flush()
+        if final_batch:
+            await index_batch(final_batch)
+        ```
+    """
+
+    def __init__(self, batch_size: int) -> None:
+        """Initialize the batch collector.
+
+        Args:
+            batch_size: Number of items per batch.
+        """
+        self._batch_size = batch_size
+        self._buffer: list[T] = []
+
+    def add(self, item: T) -> list[T] | None:
+        """Add an item to the current batch.
+
+        Args:
+            item: The item to add.
+
+        Returns:
+            A complete batch if batch_size is reached, otherwise None.
+        """
+        self._buffer.append(item)
+        if len(self._buffer) >= self._batch_size:
+            batch = self._buffer
+            self._buffer = []
+            return batch
+        return None
+
+    def flush(self) -> list[T] | None:
+        """Flush any remaining items as a partial batch.

+        Returns:
+            The remaining items, or None if empty.
+        """
+        if self._buffer:
+            batch = self._buffer
+            self._buffer = []
+            return batch
+        return None
+
+    @property
+    def pending_count(self) -> int:
+        """Return the number of items waiting in the buffer."""
+        return len(self._buffer)
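For orientation, here is how the two primitives above are meant to compose. This is an editorial sketch, not code from the package: it uses only the BoundedQueue and BatchCollector APIs shown in the hunk, the module path from the file list, and two hypothetical helpers (fetch_url, index_batch) standing in for real fetch and index stages.

```python
import asyncio

from gnosisllm_knowledge.core.streaming.pipeline import BatchCollector, BoundedQueue


async def fetch_url(url: str) -> str:
    """Hypothetical fetch stage; simulates slow I/O."""
    await asyncio.sleep(0.01)
    return f"<content of {url}>"


async def index_batch(batch: list[str]) -> None:
    """Hypothetical index stage; bulk-writes one batch."""
    print(f"indexed {len(batch)} documents")


async def run_pipeline(urls: list[str]) -> None:
    queue: BoundedQueue[str] = BoundedQueue(maxsize=50)
    collector: BatchCollector[str] = BatchCollector(batch_size=100)

    async def producer() -> None:
        for url in urls:
            await queue.put(url)  # blocks while the queue is full: backpressure
        queue.close()  # sentinel lets the consumer loop terminate

    async def consumer() -> None:
        async for url in queue:  # ends when the queue is closed and drained
            batch = collector.add(await fetch_url(url))
            if batch:  # add() returns a full batch or None
                await index_batch(batch)
        final = collector.flush()  # partial batch left in the buffer
        if final:
            await index_batch(final)

    await asyncio.gather(producer(), consumer())


asyncio.run(run_pipeline([f"https://example.com/page/{i}" for i in range(250)]))
```

With 250 URLs and batch_size=100, the consumer indexes two full batches and flushes a final partial batch of 50. Bounding the queue at maxsize=50 is what keeps memory flat: a fast producer simply parks on put() until the fetch/index side catches up.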
gnosisllm_knowledge/fetchers/__init__.py
@@ -1,12 +1,20 @@
 """Content fetchers for retrieving content from URLs."""
 
+from gnosisllm_knowledge.core.exceptions import (
+    DiscoveryJobFailedError,
+    DiscoveryTimeoutError,
+)
 from gnosisllm_knowledge.fetchers.config import FetcherConfig, NeoreaderConfig
 from gnosisllm_knowledge.fetchers.http import HTTPContentFetcher
 from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
+from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
 
 __all__ = [
     "HTTPContentFetcher",
     "NeoreaderContentFetcher",
     "NeoreaderDiscoveryClient",
     "FetcherConfig",
     "NeoreaderConfig",
+    "DiscoveryTimeoutError",
+    "DiscoveryJobFailedError",
 ]
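The new exports make the discovery client and its failure modes importable from one place. A sketch of the intended error-handling surface; the discover() call and the no-argument constructor are hypothetical, since this diff shows only the re-exports, not the client's API:

```python
from gnosisllm_knowledge.fetchers import (
    DiscoveryJobFailedError,
    DiscoveryTimeoutError,
    NeoreaderDiscoveryClient,
)


async def discover_site(url: str) -> None:
    client = NeoreaderDiscoveryClient()  # hypothetical: real constructor args not shown here
    try:
        await client.discover(url)  # hypothetical method name, for illustration only
    except DiscoveryTimeoutError:
        print("discovery did not finish within the configured timeout")
    except DiscoveryJobFailedError as exc:
        print(f"discovery job failed: {exc}")
```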
gnosisllm_knowledge/fetchers/config.py
@@ -40,6 +40,11 @@ class NeoreaderConfig:
         remove_selector: CSS selector for elements to remove.
         with_images: Whether to include image references.
         with_links: Whether to include link references.
+        discovery_enabled: Whether discovery loader is enabled.
+        discovery_poll_interval: Interval between status polls in seconds.
+        discovery_timeout: Maximum time to wait for discovery completion in seconds.
+        discovery_max_depth: Default maximum crawl depth for discovery.
+        discovery_max_pages: Default maximum pages to discover.
     """
 
     host: str = "http://localhost:3000"
@@ -50,6 +55,13 @@ class NeoreaderConfig:
     with_images: bool = False
     with_links: bool = True
 
+    # Discovery settings
+    discovery_enabled: bool = True
+    discovery_poll_interval: float = 2.0
+    discovery_timeout: float = 600.0
+    discovery_max_depth: int = 3
+    discovery_max_pages: int = 100
+
     @classmethod
     def from_env(cls) -> NeoreaderConfig:
         """Create configuration from environment variables.
@@ -62,6 +74,11 @@ class NeoreaderConfig:
         - NEOREADER_REMOVE_SELECTOR: CSS selector for removal
         - NEOREADER_WITH_IMAGES: Include images (true/false)
         - NEOREADER_WITH_LINKS: Include links (true/false)
+        - NEOREADER_DISCOVERY_ENABLED: Enable discovery loader (true/false)
+        - NEOREADER_DISCOVERY_POLL_INTERVAL: Discovery poll interval in seconds
+        - NEOREADER_DISCOVERY_TIMEOUT: Discovery timeout in seconds
+        - NEOREADER_DISCOVERY_MAX_DEPTH: Default max crawl depth
+        - NEOREADER_DISCOVERY_MAX_PAGES: Default max pages to discover
 
         Returns:
             NeoreaderConfig populated from environment.
@@ -74,4 +91,14 @@ class NeoreaderConfig:
             remove_selector=os.getenv("NEOREADER_REMOVE_SELECTOR"),
             with_images=os.getenv("NEOREADER_WITH_IMAGES", "").lower() == "true",
             with_links=os.getenv("NEOREADER_WITH_LINKS", "true").lower() == "true",
+            discovery_enabled=os.getenv("NEOREADER_DISCOVERY_ENABLED", "true").lower()
+            == "true",
+            discovery_poll_interval=float(
+                os.getenv("NEOREADER_DISCOVERY_POLL_INTERVAL", "2.0")
+            ),
+            discovery_timeout=float(
+                os.getenv("NEOREADER_DISCOVERY_TIMEOUT", "600.0")
+            ),
+            discovery_max_depth=int(os.getenv("NEOREADER_DISCOVERY_MAX_DEPTH", "3")),
+            discovery_max_pages=int(os.getenv("NEOREADER_DISCOVERY_MAX_PAGES", "100")),
         )
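As a quick check of the parsing above: from_env() reads each variable with the string default shown and coerces it with int() or float(), while booleans compare the lowercased value against "true". A minimal sketch, using only the variable names documented in the hunk (NeoreaderConfig is re-exported from gnosisllm_knowledge.fetchers per the __init__ hunk earlier):

```python
import os

from gnosisllm_knowledge.fetchers import NeoreaderConfig

# Override a few discovery settings; unset variables fall back to the
# string defaults shown above ("true", "2.0", "600.0", "3", "100").
os.environ["NEOREADER_DISCOVERY_MAX_DEPTH"] = "5"
os.environ["NEOREADER_DISCOVERY_MAX_PAGES"] = "500"
os.environ.pop("NEOREADER_DISCOVERY_POLL_INTERVAL", None)

config = NeoreaderConfig.from_env()
assert config.discovery_max_depth == 5        # int("5")
assert config.discovery_max_pages == 500      # int("500")
assert config.discovery_poll_interval == 2.0  # float("2.0") default
assert config.discovery_enabled is True       # "true" default
```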
gnosisllm_knowledge/fetchers/neoreader.py
@@ -43,6 +43,15 @@ class NeoreaderContentFetcher:
         self._config = config or NeoreaderConfig.from_env()
         self._logger = logging.getLogger(__name__)
 
+    @property
+    def config(self) -> NeoreaderConfig:
+        """Expose configuration for reuse by discovery client.
+
+        Returns:
+            The Neo Reader configuration used by this fetcher.
+        """
+        return self._config
+
     async def fetch(self, url: str, **options: Any) -> FetchResult:
         """Fetch content from a URL using Neoreader.
 
@@ -181,7 +190,7 @@ class NeoreaderContentFetcher:
     def _extract_title(self, content: str) -> str | None:
         """Extract title from markdown content.
 
-        Looks for the first H1 heading in the markdown.
+        Looks for the first H1 heading in various formats.
 
         Args:
             content: Markdown content.
@@ -189,14 +198,33 @@ class NeoreaderContentFetcher:
         Returns:
             Title string or None.
         """
-        # Look for first H1 heading
         lines = content.split("\n")
+
+        # Look for ATX-style H1 heading (# Title)
         for line in lines:
             line = line.strip()
             if line.startswith("# "):
                 return line[2:].strip()
 
-        # Try regex for H1
+        # Look for "Title: ..." prefix format (common in Neoreader output)
+        for line in lines:
+            line = line.strip()
+            if line.startswith("Title:"):
+                title = line[6:].strip()
+                # Stop at "URL" or "Source" if present on same line
+                for stop in [" URL", " Source"]:
+                    if stop in title:
+                        title = title[:title.index(stop)]
+                return title.strip() if title else None
+
+        # Look for Setext-style H1 (Title followed by === line)
+        for i, line in enumerate(lines[:-1]):
+            line = line.strip()
+            next_line = lines[i + 1].strip() if i + 1 < len(lines) else ""
+            if line and next_line and all(c == "=" for c in next_line) and len(next_line) >= 3:
+                return line
+
+        # Try regex for ATX H1
         match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
         if match:
             return match.group(1).strip()
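The rewritten heuristic tries four strategies in order: ATX "# " headings, a "Title:" prefix with any trailing " URL"/" Source" segment trimmed, Setext-style "=" underlines, and a regex fallback. The checks below illustrate each branch; they call the private helper directly, purely for demonstration, and assume the no-argument constructor works (config falls back to from_env(), per the context lines above).

```python
from gnosisllm_knowledge.fetchers import NeoreaderContentFetcher

fetcher = NeoreaderContentFetcher()  # assumes default construction; config not shown in this diff

# ATX-style H1
assert fetcher._extract_title("# My Page\n\nBody") == "My Page"

# "Title: ..." prefix; text from " URL" onward is trimmed
assert fetcher._extract_title("Title: My Page URL https://example.com") == "My Page"

# Setext-style H1: a line underlined with three or more '='
assert fetcher._extract_title("My Page\n=======\n\nBody") == "My Page"

# No recognizable heading in any format
assert fetcher._extract_title("plain text only") is None
```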