gnosisllm_knowledge-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/__init__.py +152 -0
- gnosisllm_knowledge/api/__init__.py +5 -0
- gnosisllm_knowledge/api/knowledge.py +548 -0
- gnosisllm_knowledge/backends/__init__.py +26 -0
- gnosisllm_knowledge/backends/memory/__init__.py +9 -0
- gnosisllm_knowledge/backends/memory/indexer.py +384 -0
- gnosisllm_knowledge/backends/memory/searcher.py +516 -0
- gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
- gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
- gnosisllm_knowledge/backends/opensearch/config.py +195 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
- gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
- gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
- gnosisllm_knowledge/chunking/__init__.py +9 -0
- gnosisllm_knowledge/chunking/fixed.py +138 -0
- gnosisllm_knowledge/chunking/sentence.py +239 -0
- gnosisllm_knowledge/cli/__init__.py +18 -0
- gnosisllm_knowledge/cli/app.py +509 -0
- gnosisllm_knowledge/cli/commands/__init__.py +7 -0
- gnosisllm_knowledge/cli/commands/agentic.py +529 -0
- gnosisllm_knowledge/cli/commands/load.py +369 -0
- gnosisllm_knowledge/cli/commands/search.py +440 -0
- gnosisllm_knowledge/cli/commands/setup.py +228 -0
- gnosisllm_knowledge/cli/display/__init__.py +5 -0
- gnosisllm_knowledge/cli/display/service.py +555 -0
- gnosisllm_knowledge/cli/utils/__init__.py +5 -0
- gnosisllm_knowledge/cli/utils/config.py +207 -0
- gnosisllm_knowledge/core/__init__.py +87 -0
- gnosisllm_knowledge/core/domain/__init__.py +43 -0
- gnosisllm_knowledge/core/domain/document.py +240 -0
- gnosisllm_knowledge/core/domain/result.py +176 -0
- gnosisllm_knowledge/core/domain/search.py +327 -0
- gnosisllm_knowledge/core/domain/source.py +139 -0
- gnosisllm_knowledge/core/events/__init__.py +23 -0
- gnosisllm_knowledge/core/events/emitter.py +216 -0
- gnosisllm_knowledge/core/events/types.py +226 -0
- gnosisllm_knowledge/core/exceptions.py +407 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
- gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
- gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
- gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
- gnosisllm_knowledge/core/interfaces/loader.py +102 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
- gnosisllm_knowledge/core/interfaces/setup.py +164 -0
- gnosisllm_knowledge/fetchers/__init__.py +12 -0
- gnosisllm_knowledge/fetchers/config.py +77 -0
- gnosisllm_knowledge/fetchers/http.py +167 -0
- gnosisllm_knowledge/fetchers/neoreader.py +204 -0
- gnosisllm_knowledge/loaders/__init__.py +13 -0
- gnosisllm_knowledge/loaders/base.py +399 -0
- gnosisllm_knowledge/loaders/factory.py +202 -0
- gnosisllm_knowledge/loaders/sitemap.py +285 -0
- gnosisllm_knowledge/loaders/website.py +57 -0
- gnosisllm_knowledge/py.typed +0 -0
- gnosisllm_knowledge/services/__init__.py +9 -0
- gnosisllm_knowledge/services/indexing.py +387 -0
- gnosisllm_knowledge/services/search.py +349 -0
- gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
- gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
- gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
gnosisllm_knowledge/loaders/sitemap.py

@@ -0,0 +1,285 @@

````python
"""Sitemap loader with recursive discovery and URL filtering."""

from __future__ import annotations

import asyncio
import fnmatch
import logging
import re
from typing import Any
from xml.etree import ElementTree

import httpx

from gnosisllm_knowledge.core.events.types import SitemapDiscoveryEvent
from gnosisllm_knowledge.loaders.base import BaseLoader

# XML namespace for sitemaps
SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}

# Default limits
DEFAULT_MAX_URLS = 1000
DEFAULT_MAX_DEPTH = 3


class SitemapLoader(BaseLoader):
    """Loader for sitemap XML files with recursive discovery.

    Features:
    - Recursive sitemap discovery (sitemap index files)
    - URL filtering with allow/block patterns
    - Configurable max URLs and depth limits
    - Parallel processing of nested sitemaps
    - Deduplication of discovered URLs

    Example:
        ```python
        loader = SitemapLoader(
            fetcher, chunker,
            config={
                "max_urls": 500,
                "max_depth": 2,
                "allowed_patterns": ["*/docs/*", "*/blog/*"],
                "blocked_patterns": ["*/admin/*"],
            }
        )
        result = await loader.load("https://example.com/sitemap.xml")
        ```
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialize the sitemap loader."""
        super().__init__(*args, **kwargs)
        self._sitemap_logger = logging.getLogger(f"{__name__}.SitemapLoader")

    @property
    def name(self) -> str:
        """Return the loader name."""
        return "sitemap"

    def supports(self, source: str) -> bool:
        """Check if this loader supports the given source.

        Supports URLs that look like sitemaps (contain 'sitemap' or end with .xml).

        Args:
            source: The source URL.

        Returns:
            True if this looks like a sitemap URL.
        """
        source_lower = source.lower()
        is_http = source_lower.startswith(("http://", "https://"))
        is_sitemap = "sitemap" in source_lower or source_lower.endswith(".xml")
        return is_http and is_sitemap

    async def _get_urls(self, source: str, **options: Any) -> list[str]:
        """Get list of URLs to process from the sitemap.

        Recursively discovers URLs from sitemap and sitemap index files.

        Args:
            source: The sitemap URL.
            **options: Loader-specific options:
                - max_urls: Maximum URLs to return
                - max_depth: Maximum recursion depth
                - allowed_patterns: URL patterns to include
                - blocked_patterns: URL patterns to exclude

        Returns:
            List of discovered and filtered URLs.
        """
        max_urls = options.get("max_urls", self._config.get("max_urls", DEFAULT_MAX_URLS))
        max_depth = options.get("max_depth", self._config.get("max_depth", DEFAULT_MAX_DEPTH))
        allowed_patterns = options.get(
            "allowed_patterns", self._config.get("allowed_patterns", [])
        )
        blocked_patterns = options.get(
            "blocked_patterns", self._config.get("blocked_patterns", [])
        )

        discovered_urls: set[str] = set()

        await self._discover_urls(
            sitemap_url=source,
            depth=0,
            max_depth=max_depth,
            max_urls=max_urls,
            discovered=discovered_urls,
            allowed_patterns=allowed_patterns,
            blocked_patterns=blocked_patterns,
        )

        # Convert to list and limit
        urls = list(discovered_urls)[:max_urls]
        self._sitemap_logger.info(f"Discovered {len(urls)} URLs from sitemap")

        return urls

    async def _discover_urls(
        self,
        sitemap_url: str,
        depth: int,
        max_depth: int,
        max_urls: int,
        discovered: set[str],
        allowed_patterns: list[str],
        blocked_patterns: list[str],
    ) -> None:
        """Recursively discover URLs from a sitemap.

        Args:
            sitemap_url: The sitemap URL to process.
            depth: Current recursion depth.
            max_depth: Maximum recursion depth.
            max_urls: Maximum URLs to discover.
            discovered: Set of already discovered URLs (modified in place).
            allowed_patterns: URL patterns to include.
            blocked_patterns: URL patterns to exclude.
        """
        if depth > max_depth:
            self._sitemap_logger.debug(f"Max depth {max_depth} reached, skipping {sitemap_url}")
            return

        if len(discovered) >= max_urls:
            self._sitemap_logger.debug(f"Max URLs {max_urls} reached")
            return

        try:
            content = await self._fetch_sitemap(sitemap_url)
            if not content:
                return

            root = ElementTree.fromstring(content)

            # Check if this is a sitemap index
            sitemap_refs = root.findall(".//sm:sitemap/sm:loc", SITEMAP_NS)
            if sitemap_refs:
                # This is a sitemap index - recursively process each sitemap
                self._sitemap_logger.info(
                    f"Found sitemap index with {len(sitemap_refs)} sitemaps at depth {depth}"
                )

                # Process nested sitemaps in parallel
                tasks = []
                for sitemap_ref in sitemap_refs:
                    if sitemap_ref.text and len(discovered) < max_urls:
                        tasks.append(
                            self._discover_urls(
                                sitemap_url=sitemap_ref.text.strip(),
                                depth=depth + 1,
                                max_depth=max_depth,
                                max_urls=max_urls,
                                discovered=discovered,
                                allowed_patterns=allowed_patterns,
                                blocked_patterns=blocked_patterns,
                            )
                        )

                if tasks:
                    await asyncio.gather(*tasks, return_exceptions=True)

                return

            # Process regular sitemap URLs
            url_elements = root.findall(".//sm:url/sm:loc", SITEMAP_NS)
            urls_added = 0

            for url_elem in url_elements:
                if url_elem.text and len(discovered) < max_urls:
                    url = url_elem.text.strip()

                    # Apply filters
                    if self._should_include_url(url, allowed_patterns, blocked_patterns):
                        if url not in discovered:
                            discovered.add(url)
                            urls_added += 1

            # Emit discovery event
            self._events.emit(
                SitemapDiscoveryEvent(
                    sitemap_url=sitemap_url,
                    urls_discovered=urls_added,
                    depth=depth,
                    total_urls=len(discovered),
                )
            )

            self._sitemap_logger.debug(
                f"Discovered {urls_added} URLs from {sitemap_url} at depth {depth}"
            )

        except Exception as e:
            self._sitemap_logger.error(f"Failed to process sitemap {sitemap_url}: {e}")

    async def _fetch_sitemap(self, url: str) -> str | None:
        """Fetch sitemap XML content.

        Args:
            url: The sitemap URL to fetch.

        Returns:
            Sitemap XML content or None if fetch failed.
        """
        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                response = await client.get(
                    url,
                    headers={"Accept": "application/xml, text/xml, */*"},
                    follow_redirects=True,
                )
                response.raise_for_status()
                return response.text
        except Exception as e:
            self._sitemap_logger.error(f"Failed to fetch sitemap {url}: {e}")
            return None

    def _should_include_url(
        self,
        url: str,
        allowed_patterns: list[str],
        blocked_patterns: list[str],
    ) -> bool:
        """Check if a URL should be included based on patterns.

        Args:
            url: The URL to check.
            allowed_patterns: Patterns that must match (if any).
            blocked_patterns: Patterns that must not match.

        Returns:
            True if URL should be included.
        """
        # Check blocked patterns first
        for pattern in blocked_patterns:
            if self._matches_pattern(url, pattern):
                return False

        # If allowed patterns specified, at least one must match
        if allowed_patterns:
            return any(self._matches_pattern(url, p) for p in allowed_patterns)

        return True

    def _matches_pattern(self, url: str, pattern: str) -> bool:
        """Check if URL matches a pattern.

        Supports both glob patterns (with *) and regex patterns.

        Args:
            url: The URL to check.
            pattern: The pattern to match against.

        Returns:
            True if URL matches the pattern.
        """
        # Try fnmatch for glob patterns
        if "*" in pattern or "?" in pattern:
            return fnmatch.fnmatch(url, pattern)

        # Try regex
        try:
            return bool(re.search(pattern, url))
        except re.error:
            # Invalid regex, try substring match
            return pattern in url
````
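The URL filtering above composes in a fixed order: blocked patterns veto a URL first, then at least one allowed pattern must match if any are configured, otherwise the URL is kept; patterns containing `*` or `?` are treated as globs via `fnmatch`, anything else is tried as a regex and falls back to a plain substring check. Below is a minimal standalone sketch of that precedence, restating the logic outside the package rather than calling its API:

```python
# Standalone restatement of SitemapLoader's filter precedence, for illustration only.
import fnmatch
import re


def matches(url: str, pattern: str) -> bool:
    # Globs go through fnmatch; otherwise try regex, then plain substring.
    if "*" in pattern or "?" in pattern:
        return fnmatch.fnmatch(url, pattern)
    try:
        return bool(re.search(pattern, url))
    except re.error:
        return pattern in url


def should_include(url: str, allowed: list[str], blocked: list[str]) -> bool:
    # Blocked patterns veto first; allowed patterns (if any) must then match.
    if any(matches(url, p) for p in blocked):
        return False
    if allowed:
        return any(matches(url, p) for p in allowed)
    return True


print(should_include("https://example.com/docs/intro", ["*/docs/*"], ["*/admin/*"]))   # True
print(should_include("https://example.com/admin/users", ["*/docs/*"], ["*/admin/*"]))  # False
print(should_include("https://example.com/about", [], []))                             # True
```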
gnosisllm_knowledge/loaders/website.py

@@ -0,0 +1,57 @@

````python
"""Website loader for single URL loading."""

from __future__ import annotations

from typing import Any

from gnosisllm_knowledge.loaders.base import BaseLoader


class WebsiteLoader(BaseLoader):
    """Loader for single website URLs.

    This is the simplest loader that handles loading content from
    a single URL. For loading multiple URLs from a sitemap, use
    SitemapLoader instead.

    Example:
        ```python
        loader = WebsiteLoader(fetcher, chunker)
        result = await loader.load("https://example.com/page")
        ```
    """

    @property
    def name(self) -> str:
        """Return the loader name."""
        return "website"

    def supports(self, source: str) -> bool:
        """Check if this loader supports the given source.

        Supports HTTP and HTTPS URLs that don't look like sitemaps.

        Args:
            source: The source URL.

        Returns:
            True if this is a regular website URL.
        """
        source_lower = source.lower()
        is_http = source_lower.startswith(("http://", "https://"))
        is_sitemap = "sitemap" in source_lower or source_lower.endswith(".xml")
        return is_http and not is_sitemap

    async def _get_urls(self, source: str, **options: Any) -> list[str]:
        """Get list of URLs to process.

        For website loader, this simply returns the source URL.

        Args:
            source: The source URL.
            **options: Loader-specific options (ignored).

        Returns:
            List containing just the source URL.
        """
        return [source]
````
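Taken together, the two `supports()` implementations partition HTTP(S) sources: a URL containing "sitemap" or ending in `.xml` is claimed by `SitemapLoader`, and any other HTTP(S) URL by `WebsiteLoader`; the actual selection logic lives in `loaders/factory.py`, which is not shown in this diff. A small sketch of that classification, using only the checks visible above:

```python
# Hypothetical helper mirroring the supports() checks of SitemapLoader and
# WebsiteLoader; not part of the package API.
def classify(source: str) -> str:
    s = source.lower()
    is_http = s.startswith(("http://", "https://"))
    is_sitemap = "sitemap" in s or s.endswith(".xml")
    if is_http and is_sitemap:
        return "sitemap"
    if is_http:
        return "website"
    return "unsupported"


for url in (
    "https://example.com/sitemap.xml",       # -> sitemap
    "https://example.com/sitemap_index.xml", # -> sitemap
    "https://example.com/blog/post",         # -> website
    "file:///tmp/page.html",                 # -> unsupported
):
    print(url, "->", classify(url))
```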
gnosisllm_knowledge/py.typed

File without changes
gnosisllm_knowledge/services/__init__.py

@@ -0,0 +1,9 @@

````python
"""Service layer for knowledge orchestration."""

from gnosisllm_knowledge.services.indexing import KnowledgeIndexingService
from gnosisllm_knowledge.services.search import KnowledgeSearchService

__all__ = [
    "KnowledgeIndexingService",
    "KnowledgeSearchService",
]
````