gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- gnosisllm_knowledge/__init__.py +91 -39
- gnosisllm_knowledge/api/__init__.py +3 -2
- gnosisllm_knowledge/api/knowledge.py +502 -32
- gnosisllm_knowledge/api/memory.py +966 -0
- gnosisllm_knowledge/backends/__init__.py +14 -5
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +111 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
- gnosisllm_knowledge/backends/opensearch/config.py +49 -28
- gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
- gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
- gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
- gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
- gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
- gnosisllm_knowledge/cli/app.py +436 -31
- gnosisllm_knowledge/cli/commands/agentic.py +26 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +733 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +49 -23
- gnosisllm_knowledge/cli/display/service.py +43 -0
- gnosisllm_knowledge/cli/utils/config.py +62 -4
- gnosisllm_knowledge/core/domain/__init__.py +54 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +19 -19
- gnosisllm_knowledge/core/domain/memory.py +440 -0
- gnosisllm_knowledge/core/domain/result.py +11 -3
- gnosisllm_knowledge/core/domain/search.py +12 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +198 -5
- gnosisllm_knowledge/core/exceptions.py +227 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/memory.py +524 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
- gnosisllm_knowledge/core/streaming/__init__.py +36 -0
- gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/base.py +3 -4
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/loaders/sitemap.py +129 -1
- gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
- gnosisllm_knowledge/services/indexing.py +100 -93
- gnosisllm_knowledge/services/search.py +84 -31
- gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
- gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/core/streaming/pipeline.py
@@ -0,0 +1,228 @@
+"""Bounded streaming pipeline with backpressure support.
+
+This module provides infrastructure for memory-efficient streaming pipelines
+with bounded queues that apply backpressure when downstream processing is slow.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from collections.abc import AsyncIterator
+from dataclasses import dataclass
+from typing import Generic, TypeVar
+
+T = TypeVar("T")
+
+
+@dataclass
+class PipelineConfig:
+    """Configuration for streaming pipeline stages.
+
+    Attributes:
+        url_batch_size: Number of URLs to discover before yielding a batch.
+        fetch_concurrency: Maximum parallel URL fetches.
+        fetch_queue_size: Maximum URLs waiting to be fetched.
+        index_batch_size: Documents per index batch.
+        index_queue_size: Maximum docs waiting to be indexed.
+        fetch_timeout_seconds: Timeout for each URL fetch.
+        index_timeout_seconds: Timeout for each index batch.
+    """
+
+    # URL discovery
+    url_batch_size: int = 100
+
+    # Content fetching
+    fetch_concurrency: int = 10
+    fetch_queue_size: int = 50
+
+    # Indexing
+    index_batch_size: int = 100
+    index_queue_size: int = 200
+
+    # Timeouts
+    fetch_timeout_seconds: float = 30.0
+    index_timeout_seconds: float = 60.0
+
+
+class BoundedQueue(Generic[T]):
+    """Async queue with bounded size and backpressure.
+
+    This queue provides backpressure: when full, put() blocks until space
+    is available. This prevents memory from growing unboundedly when
+    producers are faster than consumers.
+
+    Example:
+        ```python
+        queue: BoundedQueue[str] = BoundedQueue(maxsize=10)
+
+        # Producer task
+        async def producer():
+            for url in urls:
+                await queue.put(url)  # Blocks if queue is full
+            queue.close()
+
+        # Consumer task
+        async def consumer():
+            async for item in queue:
+                await process(item)
+        ```
+    """
+
+    def __init__(self, maxsize: int = 0) -> None:
+        """Initialize the bounded queue.
+
+        Args:
+            maxsize: Maximum queue size. 0 means unlimited (no backpressure).
+        """
+        self._queue: asyncio.Queue[T | None] = asyncio.Queue(maxsize=maxsize)
+        self._closed = False
+        self._consumer_count = 0
+
+    async def put(self, item: T) -> None:
+        """Put an item in the queue, blocking if full (backpressure).
+
+        Args:
+            item: The item to add to the queue.
+
+        Raises:
+            RuntimeError: If the queue has been closed.
+        """
+        if self._closed:
+            raise RuntimeError("Queue is closed")
+        await self._queue.put(item)
+
+    def put_nowait(self, item: T) -> None:
+        """Put an item without waiting (raises if full).
+
+        Args:
+            item: The item to add to the queue.
+
+        Raises:
+            RuntimeError: If the queue has been closed.
+            asyncio.QueueFull: If the queue is full.
+        """
+        if self._closed:
+            raise RuntimeError("Queue is closed")
+        self._queue.put_nowait(item)
+
+    async def get(self) -> T | None:
+        """Get an item from the queue.
+
+        Returns:
+            The next item, or None if queue is closed and empty.
+        """
+        item = await self._queue.get()
+        self._queue.task_done()
+        return item
+
+    def close(self) -> None:
+        """Signal that no more items will be added.
+
+        After closing, consumers will receive None when the queue
+        is empty, signaling them to stop.
+        """
+        if not self._closed:
+            self._closed = True
+            # Put sentinel to unblock any waiting consumers
+            try:
+                self._queue.put_nowait(None)
+            except asyncio.QueueFull:
+                # Queue is full, consumer will eventually get the items
+                pass
+
+    @property
+    def is_closed(self) -> bool:
+        """Check if the queue has been closed."""
+        return self._closed
+
+    def qsize(self) -> int:
+        """Return the current queue size."""
+        return self._queue.qsize()
+
+    def empty(self) -> bool:
+        """Return True if the queue is empty."""
+        return self._queue.empty()
+
+    def full(self) -> bool:
+        """Return True if the queue is full."""
+        return self._queue.full()
+
+    def __aiter__(self) -> AsyncIterator[T]:
+        """Return async iterator for consuming items."""
+        return self
+
+    async def __anext__(self) -> T:
+        """Get next item from queue.
+
+        Raises:
+            StopAsyncIteration: When queue is closed and empty.
+        """
+        item = await self.get()
+        if item is None:
+            raise StopAsyncIteration
+        return item
+
+
+class BatchCollector(Generic[T]):
+    """Collects items into batches of a specified size.
+
+    Useful for grouping streaming items into batches for
+    efficient bulk processing.
+
+    Example:
+        ```python
+        collector = BatchCollector[Document](batch_size=100)
+
+        async for doc in document_stream:
+            batch = collector.add(doc)
+            if batch:
+                await index_batch(batch)
+
+        # Flush remaining items
+        final_batch = collector.flush()
+        if final_batch:
+            await index_batch(final_batch)
+        ```
+    """
+
+    def __init__(self, batch_size: int) -> None:
+        """Initialize the batch collector.
+
+        Args:
+            batch_size: Number of items per batch.
+        """
+        self._batch_size = batch_size
+        self._buffer: list[T] = []
+
+    def add(self, item: T) -> list[T] | None:
+        """Add an item to the current batch.
+
+        Args:
+            item: The item to add.
+
+        Returns:
+            A complete batch if batch_size is reached, otherwise None.
+        """
+        self._buffer.append(item)
+        if len(self._buffer) >= self._batch_size:
+            batch = self._buffer
+            self._buffer = []
+            return batch
+        return None
+
+    def flush(self) -> list[T] | None:
+        """Flush any remaining items as a partial batch.
+
+        Returns:
+            The remaining items, or None if empty.
+        """
+        if self._buffer:
+            batch = self._buffer
+            self._buffer = []
+            return batch
+        return None
+
+    @property
+    def pending_count(self) -> int:
+        """Return the number of items waiting in the buffer."""
+        return len(self._buffer)
gnosisllm_knowledge/fetchers/__init__.py
@@ -1,12 +1,20 @@
 """Content fetchers for retrieving content from URLs."""
 
+from gnosisllm_knowledge.core.exceptions import (
+    DiscoveryJobFailedError,
+    DiscoveryTimeoutError,
+)
 from gnosisllm_knowledge.fetchers.config import FetcherConfig, NeoreaderConfig
 from gnosisllm_knowledge.fetchers.http import HTTPContentFetcher
 from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
+from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
 
 __all__ = [
     "HTTPContentFetcher",
     "NeoreaderContentFetcher",
     "NeoreaderDiscoveryClient",
     "FetcherConfig",
     "NeoreaderConfig",
+    "DiscoveryTimeoutError",
+    "DiscoveryJobFailedError",
 ]
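For reference, these re-exports mean the new discovery surface is importable from the package root. A sketch of what this hunk enables (the constructor and methods of NeoreaderDiscoveryClient are not shown in this diff, so none are assumed here):

```python
# All three names below are new 0.4.0 re-exports, per the hunk above. The two
# exceptions live in gnosisllm_knowledge.core.exceptions and are re-exported
# so callers can catch them without a second import path.
from gnosisllm_knowledge.fetchers import (
    DiscoveryJobFailedError,
    DiscoveryTimeoutError,
    NeoreaderDiscoveryClient,
)
```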
gnosisllm_knowledge/fetchers/config.py
@@ -40,6 +40,11 @@ class NeoreaderConfig:
         remove_selector: CSS selector for elements to remove.
         with_images: Whether to include image references.
         with_links: Whether to include link references.
+        discovery_enabled: Whether discovery loader is enabled.
+        discovery_poll_interval: Interval between status polls in seconds.
+        discovery_timeout: Maximum time to wait for discovery completion in seconds.
+        discovery_max_depth: Default maximum crawl depth for discovery.
+        discovery_max_pages: Default maximum pages to discover.
     """
 
     host: str = "http://localhost:3000"
@@ -50,6 +55,13 @@ class NeoreaderConfig:
     with_images: bool = False
     with_links: bool = True
 
+    # Discovery settings
+    discovery_enabled: bool = True
+    discovery_poll_interval: float = 2.0
+    discovery_timeout: float = 600.0
+    discovery_max_depth: int = 3
+    discovery_max_pages: int = 100
+
     @classmethod
     def from_env(cls) -> NeoreaderConfig:
         """Create configuration from environment variables.
@@ -62,6 +74,11 @@ class NeoreaderConfig:
         - NEOREADER_REMOVE_SELECTOR: CSS selector for removal
         - NEOREADER_WITH_IMAGES: Include images (true/false)
         - NEOREADER_WITH_LINKS: Include links (true/false)
+        - NEOREADER_DISCOVERY_ENABLED: Enable discovery loader (true/false)
+        - NEOREADER_DISCOVERY_POLL_INTERVAL: Discovery poll interval in seconds
+        - NEOREADER_DISCOVERY_TIMEOUT: Discovery timeout in seconds
+        - NEOREADER_DISCOVERY_MAX_DEPTH: Default max crawl depth
+        - NEOREADER_DISCOVERY_MAX_PAGES: Default max pages to discover
 
         Returns:
             NeoreaderConfig populated from environment.
@@ -74,4 +91,14 @@ class NeoreaderConfig:
             remove_selector=os.getenv("NEOREADER_REMOVE_SELECTOR"),
             with_images=os.getenv("NEOREADER_WITH_IMAGES", "").lower() == "true",
             with_links=os.getenv("NEOREADER_WITH_LINKS", "true").lower() == "true",
+            discovery_enabled=os.getenv("NEOREADER_DISCOVERY_ENABLED", "true").lower()
+            == "true",
+            discovery_poll_interval=float(
+                os.getenv("NEOREADER_DISCOVERY_POLL_INTERVAL", "2.0")
+            ),
+            discovery_timeout=float(
+                os.getenv("NEOREADER_DISCOVERY_TIMEOUT", "600.0")
+            ),
+            discovery_max_depth=int(os.getenv("NEOREADER_DISCOVERY_MAX_DEPTH", "3")),
+            discovery_max_pages=int(os.getenv("NEOREADER_DISCOVERY_MAX_PAGES", "100")),
         )
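A minimal sketch of how the new discovery settings resolve, assuming only the environment variables and defaults listed in the hunks above; the two override values set here are illustrative:

```python
import os

from gnosisllm_knowledge.fetchers.config import NeoreaderConfig

# Override two settings; everything else falls back to the defaults above.
os.environ["NEOREADER_DISCOVERY_MAX_DEPTH"] = "2"
os.environ["NEOREADER_DISCOVERY_TIMEOUT"] = "120.0"

config = NeoreaderConfig.from_env()
assert config.discovery_enabled is True        # env default "true"
assert config.discovery_poll_interval == 2.0   # default
assert config.discovery_timeout == 120.0       # from the env override
assert config.discovery_max_depth == 2         # from the env override
assert config.discovery_max_pages == 100       # default
```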
gnosisllm_knowledge/fetchers/neoreader.py
@@ -43,6 +43,15 @@ class NeoreaderContentFetcher:
         self._config = config or NeoreaderConfig.from_env()
         self._logger = logging.getLogger(__name__)
 
+    @property
+    def config(self) -> NeoreaderConfig:
+        """Expose configuration for reuse by discovery client.
+
+        Returns:
+            The Neo Reader configuration used by this fetcher.
+        """
+        return self._config
+
     async def fetch(self, url: str, **options: Any) -> FetchResult:
         """Fetch content from a URL using Neoreader.
 
@@ -181,7 +190,7 @@ class NeoreaderContentFetcher:
     def _extract_title(self, content: str) -> str | None:
         """Extract title from markdown content.
 
-        Looks for the first H1 heading in
+        Looks for the first H1 heading in various formats.
 
         Args:
             content: Markdown content.
@@ -189,14 +198,33 @@ class NeoreaderContentFetcher:
         Returns:
             Title string or None.
         """
-        # Look for first H1 heading
         lines = content.split("\n")
+
+        # Look for ATX-style H1 heading (# Title)
         for line in lines:
             line = line.strip()
             if line.startswith("# "):
                 return line[2:].strip()
 
-        #
+        # Look for "Title: ..." prefix format (common in Neoreader output)
+        for line in lines:
+            line = line.strip()
+            if line.startswith("Title:"):
+                title = line[6:].strip()
+                # Stop at "URL" or "Source" if present on same line
+                for stop in [" URL", " Source"]:
+                    if stop in title:
+                        title = title[:title.index(stop)]
+                return title.strip() if title else None
+
+        # Look for Setext-style H1 (Title followed by === line)
+        for i, line in enumerate(lines[:-1]):
+            line = line.strip()
+            next_line = lines[i + 1].strip() if i + 1 < len(lines) else ""
+            if line and next_line and all(c == "=" for c in next_line) and len(next_line) >= 3:
+                return line
+
+        # Try regex for ATX H1
         match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
         if match:
             return match.group(1).strip()