gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/api/knowledge.py +233 -35
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +132 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
- gnosisllm_knowledge/backends/opensearch/config.py +7 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
- gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
- gnosisllm_knowledge/cli/app.py +58 -19
- gnosisllm_knowledge/cli/commands/agentic.py +15 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +10 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +25 -1
- gnosisllm_knowledge/cli/utils/config.py +4 -4
- gnosisllm_knowledge/core/domain/__init__.py +13 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +14 -19
- gnosisllm_knowledge/core/domain/search.py +10 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +122 -5
- gnosisllm_knowledge/core/exceptions.py +93 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/services/indexing.py +51 -21
- gnosisllm_knowledge/services/search.py +42 -28
- gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
- gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0
|
@@ -1,4 +1,11 @@
|
|
|
1
|
-
"""Agentic searcher protocol - Interface for AI-powered search operations.
|
|
1
|
+
"""Agentic searcher protocol - Interface for AI-powered search operations.
|
|
2
|
+
|
|
3
|
+
Note:
|
|
4
|
+
This library is tenant-agnostic. Multi-tenancy is achieved through index
|
|
5
|
+
isolation (e.g., `knowledge-{account_id}`). Agentic searcher implementations
|
|
6
|
+
should not include tenant filtering logic - callers should use tenant-specific
|
|
7
|
+
indices.
|
|
8
|
+
"""
|
|
2
9
|
|
|
3
10
|
from __future__ import annotations
|
|
4
11
|
|
|
@@ -15,6 +22,9 @@ if TYPE_CHECKING:
|
|
|
15
22
|
class IAgenticSearcher(Protocol):
|
|
16
23
|
"""Protocol for agentic search operations using AI agents.
|
|
17
24
|
|
|
25
|
+
This protocol is tenant-agnostic. Multi-tenancy is achieved through index
|
|
26
|
+
isolation by using tenant-specific index names.
|
|
27
|
+
|
|
18
28
|
Agentic searchers are responsible for:
|
|
19
29
|
- Understanding natural language queries
|
|
20
30
|
- Automatically constructing optimal search strategies
|
|
@@ -107,13 +117,11 @@ class IAgenticSearcher(Protocol):
|
|
|
107
117
|
|
|
108
118
|
async def list_conversations(
|
|
109
119
|
self,
|
|
110
|
-
account_id: str | None = None,
|
|
111
120
|
limit: int = 100,
|
|
112
121
|
) -> list[dict[str, Any]]:
|
|
113
122
|
"""List active conversations.
|
|
114
123
|
|
|
115
124
|
Args:
|
|
116
|
-
account_id: Filter by account (multi-tenant).
|
|
117
125
|
limit: Maximum number of conversations.
|
|
118
126
|
|
|
119
127
|
Returns:
|
|
@@ -1,4 +1,10 @@
|
|
|
1
|
-
"""Document indexer protocol - Interface Segregation Principle.
|
|
1
|
+
"""Document indexer protocol - Interface Segregation Principle.
|
|
2
|
+
|
|
3
|
+
Note:
|
|
4
|
+
This library is tenant-agnostic. Multi-tenancy is achieved through index
|
|
5
|
+
isolation (e.g., `knowledge-{account_id}`). Indexer implementations should
|
|
6
|
+
not include tenant filtering logic - callers should use tenant-specific indices.
|
|
7
|
+
"""
|
|
2
8
|
|
|
3
9
|
from __future__ import annotations
|
|
4
10
|
|
|
@@ -14,6 +20,9 @@ if TYPE_CHECKING:
|
|
|
14
20
|
class IDocumentIndexer(Protocol):
|
|
15
21
|
"""Protocol for indexing documents into a search backend.
|
|
16
22
|
|
|
23
|
+
This protocol is tenant-agnostic. Multi-tenancy is achieved through index
|
|
24
|
+
isolation by using tenant-specific index names.
|
|
25
|
+
|
|
17
26
|
Document indexers are responsible for:
|
|
18
27
|
- Generating embeddings for documents
|
|
19
28
|
- Storing documents in the search backend
|
|
@@ -1,4 +1,10 @@
|
|
|
1
|
-
"""Knowledge searcher protocol - Interface Segregation Principle.
|
|
1
|
+
"""Knowledge searcher protocol - Interface Segregation Principle.
|
|
2
|
+
|
|
3
|
+
Note:
|
|
4
|
+
This library is tenant-agnostic. Multi-tenancy is achieved through index
|
|
5
|
+
isolation (e.g., `knowledge-{account_id}`). Searcher implementations should
|
|
6
|
+
not include tenant filtering logic - callers should use tenant-specific indices.
|
|
7
|
+
"""
|
|
2
8
|
|
|
3
9
|
from __future__ import annotations
|
|
4
10
|
|
|
@@ -12,6 +18,9 @@ if TYPE_CHECKING:
|
|
|
12
18
|
class IKnowledgeSearcher(Protocol):
|
|
13
19
|
"""Protocol for searching documents in a search backend.
|
|
14
20
|
|
|
21
|
+
This protocol is tenant-agnostic. Multi-tenancy is achieved through index
|
|
22
|
+
isolation by using tenant-specific index names.
|
|
23
|
+
|
|
15
24
|
Knowledge searchers are responsible for:
|
|
16
25
|
- Executing different search modes (semantic, keyword, hybrid)
|
|
17
26
|
- Generating embeddings for queries
|
|
@@ -176,3 +185,23 @@ class IKnowledgeSearcher(Protocol):
|
|
|
176
185
|
List of SearchResults in same order as queries.
|
|
177
186
|
"""
|
|
178
187
|
...
|
|
188
|
+
|
|
189
|
+
async def count(
|
|
190
|
+
self,
|
|
191
|
+
index_name: str,
|
|
192
|
+
collection_id: str | None = None,
|
|
193
|
+
source_id: str | None = None,
|
|
194
|
+
) -> int:
|
|
195
|
+
"""Count documents in index with optional filters.
|
|
196
|
+
|
|
197
|
+
Uses native count API instead of search for efficiency.
|
|
198
|
+
|
|
199
|
+
Args:
|
|
200
|
+
index_name: Target index name.
|
|
201
|
+
collection_id: Filter by collection.
|
|
202
|
+
source_id: Filter by source.
|
|
203
|
+
|
|
204
|
+
Returns:
|
|
205
|
+
Document count.
|
|
206
|
+
"""
|
|
207
|
+
...
|
|
@@ -2,6 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
These protocols define contracts for streaming operations that process
|
|
4
4
|
data in bounded batches rather than loading everything into memory.
|
|
5
|
+
|
|
6
|
+
Note:
|
|
7
|
+
This library is tenant-agnostic. Multi-tenancy is achieved through index
|
|
8
|
+
isolation (e.g., `knowledge-{account_id}`). Streaming implementations should
|
|
9
|
+
not include tenant filtering logic - callers should use tenant-specific indices.
|
|
5
10
|
"""
|
|
6
11
|
|
|
7
12
|
from __future__ import annotations
|
|
@@ -97,6 +102,9 @@ class IStreamingLoader(Protocol):
|
|
|
97
102
|
class IStreamingPipeline(Protocol):
|
|
98
103
|
"""Protocol for streaming indexing pipelines.
|
|
99
104
|
|
|
105
|
+
This protocol is tenant-agnostic. Multi-tenancy is achieved through index
|
|
106
|
+
isolation by using tenant-specific index names.
|
|
107
|
+
|
|
100
108
|
Orchestrates the full streaming load -> index pipeline with
|
|
101
109
|
bounded memory guarantees.
|
|
102
110
|
"""
|
|
@@ -106,7 +114,6 @@ class IStreamingPipeline(Protocol):
|
|
|
106
114
|
source: str,
|
|
107
115
|
index_name: str,
|
|
108
116
|
*,
|
|
109
|
-
account_id: str | None = None,
|
|
110
117
|
collection_id: str | None = None,
|
|
111
118
|
source_id: str | None = None,
|
|
112
119
|
**options: Any,
|
|
@@ -115,9 +122,8 @@ class IStreamingPipeline(Protocol):
|
|
|
115
122
|
|
|
116
123
|
Args:
|
|
117
124
|
source: Sitemap URL.
|
|
118
|
-
index_name: Target OpenSearch index.
|
|
119
|
-
|
|
120
|
-
collection_id: Collection within account.
|
|
125
|
+
index_name: Target OpenSearch index (use tenant-specific name).
|
|
126
|
+
collection_id: Collection within the index.
|
|
121
127
|
source_id: Source identifier.
|
|
122
128
|
**options: Additional loader options.
|
|
123
129
|
|
|
@@ -1,12 +1,20 @@
|
|
|
1
1
|
"""Content fetchers for retrieving content from URLs."""
|
|
2
2
|
|
|
3
|
+
from gnosisllm_knowledge.core.exceptions import (
|
|
4
|
+
DiscoveryJobFailedError,
|
|
5
|
+
DiscoveryTimeoutError,
|
|
6
|
+
)
|
|
3
7
|
from gnosisllm_knowledge.fetchers.config import FetcherConfig, NeoreaderConfig
|
|
4
8
|
from gnosisllm_knowledge.fetchers.http import HTTPContentFetcher
|
|
5
9
|
from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
|
|
10
|
+
from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
|
|
6
11
|
|
|
7
12
|
__all__ = [
|
|
8
13
|
"HTTPContentFetcher",
|
|
9
14
|
"NeoreaderContentFetcher",
|
|
15
|
+
"NeoreaderDiscoveryClient",
|
|
10
16
|
"FetcherConfig",
|
|
11
17
|
"NeoreaderConfig",
|
|
18
|
+
"DiscoveryTimeoutError",
|
|
19
|
+
"DiscoveryJobFailedError",
|
|
12
20
|
]
|
|
@@ -40,6 +40,11 @@ class NeoreaderConfig:
|
|
|
40
40
|
remove_selector: CSS selector for elements to remove.
|
|
41
41
|
with_images: Whether to include image references.
|
|
42
42
|
with_links: Whether to include link references.
|
|
43
|
+
discovery_enabled: Whether discovery loader is enabled.
|
|
44
|
+
discovery_poll_interval: Interval between status polls in seconds.
|
|
45
|
+
discovery_timeout: Maximum time to wait for discovery completion in seconds.
|
|
46
|
+
discovery_max_depth: Default maximum crawl depth for discovery.
|
|
47
|
+
discovery_max_pages: Default maximum pages to discover.
|
|
43
48
|
"""
|
|
44
49
|
|
|
45
50
|
host: str = "http://localhost:3000"
|
|
@@ -50,6 +55,13 @@ class NeoreaderConfig:
|
|
|
50
55
|
with_images: bool = False
|
|
51
56
|
with_links: bool = True
|
|
52
57
|
|
|
58
|
+
# Discovery settings
|
|
59
|
+
discovery_enabled: bool = True
|
|
60
|
+
discovery_poll_interval: float = 2.0
|
|
61
|
+
discovery_timeout: float = 600.0
|
|
62
|
+
discovery_max_depth: int = 3
|
|
63
|
+
discovery_max_pages: int = 100
|
|
64
|
+
|
|
53
65
|
@classmethod
|
|
54
66
|
def from_env(cls) -> NeoreaderConfig:
|
|
55
67
|
"""Create configuration from environment variables.
|
|
@@ -62,6 +74,11 @@ class NeoreaderConfig:
|
|
|
62
74
|
- NEOREADER_REMOVE_SELECTOR: CSS selector for removal
|
|
63
75
|
- NEOREADER_WITH_IMAGES: Include images (true/false)
|
|
64
76
|
- NEOREADER_WITH_LINKS: Include links (true/false)
|
|
77
|
+
- NEOREADER_DISCOVERY_ENABLED: Enable discovery loader (true/false)
|
|
78
|
+
- NEOREADER_DISCOVERY_POLL_INTERVAL: Discovery poll interval in seconds
|
|
79
|
+
- NEOREADER_DISCOVERY_TIMEOUT: Discovery timeout in seconds
|
|
80
|
+
- NEOREADER_DISCOVERY_MAX_DEPTH: Default max crawl depth
|
|
81
|
+
- NEOREADER_DISCOVERY_MAX_PAGES: Default max pages to discover
|
|
65
82
|
|
|
66
83
|
Returns:
|
|
67
84
|
NeoreaderConfig populated from environment.
|
|
@@ -74,4 +91,14 @@ class NeoreaderConfig:
|
|
|
74
91
|
remove_selector=os.getenv("NEOREADER_REMOVE_SELECTOR"),
|
|
75
92
|
with_images=os.getenv("NEOREADER_WITH_IMAGES", "").lower() == "true",
|
|
76
93
|
with_links=os.getenv("NEOREADER_WITH_LINKS", "true").lower() == "true",
|
|
94
|
+
discovery_enabled=os.getenv("NEOREADER_DISCOVERY_ENABLED", "true").lower()
|
|
95
|
+
== "true",
|
|
96
|
+
discovery_poll_interval=float(
|
|
97
|
+
os.getenv("NEOREADER_DISCOVERY_POLL_INTERVAL", "2.0")
|
|
98
|
+
),
|
|
99
|
+
discovery_timeout=float(
|
|
100
|
+
os.getenv("NEOREADER_DISCOVERY_TIMEOUT", "600.0")
|
|
101
|
+
),
|
|
102
|
+
discovery_max_depth=int(os.getenv("NEOREADER_DISCOVERY_MAX_DEPTH", "3")),
|
|
103
|
+
discovery_max_pages=int(os.getenv("NEOREADER_DISCOVERY_MAX_PAGES", "100")),
|
|
77
104
|
)
|
|
@@ -43,6 +43,15 @@ class NeoreaderContentFetcher:
|
|
|
43
43
|
self._config = config or NeoreaderConfig.from_env()
|
|
44
44
|
self._logger = logging.getLogger(__name__)
|
|
45
45
|
|
|
46
|
+
@property
|
|
47
|
+
def config(self) -> NeoreaderConfig:
|
|
48
|
+
"""Expose configuration for reuse by discovery client.
|
|
49
|
+
|
|
50
|
+
Returns:
|
|
51
|
+
The Neo Reader configuration used by this fetcher.
|
|
52
|
+
"""
|
|
53
|
+
return self._config
|
|
54
|
+
|
|
46
55
|
async def fetch(self, url: str, **options: Any) -> FetchResult:
|
|
47
56
|
"""Fetch content from a URL using Neoreader.
|
|
48
57
|
|
|
@@ -181,7 +190,7 @@ class NeoreaderContentFetcher:
|
|
|
181
190
|
def _extract_title(self, content: str) -> str | None:
|
|
182
191
|
"""Extract title from markdown content.
|
|
183
192
|
|
|
184
|
-
Looks for the first H1 heading in
|
|
193
|
+
Looks for the first H1 heading in various formats.
|
|
185
194
|
|
|
186
195
|
Args:
|
|
187
196
|
content: Markdown content.
|
|
@@ -189,14 +198,33 @@ class NeoreaderContentFetcher:
|
|
|
189
198
|
Returns:
|
|
190
199
|
Title string or None.
|
|
191
200
|
"""
|
|
192
|
-
# Look for first H1 heading
|
|
193
201
|
lines = content.split("\n")
|
|
202
|
+
|
|
203
|
+
# Look for ATX-style H1 heading (# Title)
|
|
194
204
|
for line in lines:
|
|
195
205
|
line = line.strip()
|
|
196
206
|
if line.startswith("# "):
|
|
197
207
|
return line[2:].strip()
|
|
198
208
|
|
|
199
|
-
#
|
|
209
|
+
# Look for "Title: ..." prefix format (common in Neoreader output)
|
|
210
|
+
for line in lines:
|
|
211
|
+
line = line.strip()
|
|
212
|
+
if line.startswith("Title:"):
|
|
213
|
+
title = line[6:].strip()
|
|
214
|
+
# Stop at "URL" or "Source" if present on same line
|
|
215
|
+
for stop in [" URL", " Source"]:
|
|
216
|
+
if stop in title:
|
|
217
|
+
title = title[:title.index(stop)]
|
|
218
|
+
return title.strip() if title else None
|
|
219
|
+
|
|
220
|
+
# Look for Setext-style H1 (Title followed by === line)
|
|
221
|
+
for i, line in enumerate(lines[:-1]):
|
|
222
|
+
line = line.strip()
|
|
223
|
+
next_line = lines[i + 1].strip() if i + 1 < len(lines) else ""
|
|
224
|
+
if line and next_line and all(c == "=" for c in next_line) and len(next_line) >= 3:
|
|
225
|
+
return line
|
|
226
|
+
|
|
227
|
+
# Try regex for ATX H1
|
|
200
228
|
match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
|
|
201
229
|
if match:
|
|
202
230
|
return match.group(1).strip()
|