gnosisllm-knowledge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
gnosisllm_knowledge/loaders/sitemap.py
@@ -0,0 +1,285 @@
+"""Sitemap loader with recursive discovery and URL filtering."""
+
+from __future__ import annotations
+
+import asyncio
+import fnmatch
+import logging
+import re
+from typing import Any
+from xml.etree import ElementTree
+
+import httpx
+
+from gnosisllm_knowledge.core.events.types import SitemapDiscoveryEvent
+from gnosisllm_knowledge.loaders.base import BaseLoader
+
+# XML namespace for sitemaps
+SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
+
+# Default limits
+DEFAULT_MAX_URLS = 1000
+DEFAULT_MAX_DEPTH = 3
+
+
+class SitemapLoader(BaseLoader):
+    """Loader for sitemap XML files with recursive discovery.
+
+    Features:
+    - Recursive sitemap discovery (sitemap index files)
+    - URL filtering with allow/block patterns
+    - Configurable max URLs and depth limits
+    - Parallel processing of nested sitemaps
+    - Deduplication of discovered URLs
+
+    Example:
+        ```python
+        loader = SitemapLoader(
+            fetcher, chunker,
+            config={
+                "max_urls": 500,
+                "max_depth": 2,
+                "allowed_patterns": ["*/docs/*", "*/blog/*"],
+                "blocked_patterns": ["*/admin/*"],
+            }
+        )
+        result = await loader.load("https://example.com/sitemap.xml")
+        ```
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """Initialize the sitemap loader."""
+        super().__init__(*args, **kwargs)
+        self._sitemap_logger = logging.getLogger(f"{__name__}.SitemapLoader")
+
+    @property
+    def name(self) -> str:
+        """Return the loader name."""
+        return "sitemap"
+
+    def supports(self, source: str) -> bool:
+        """Check if this loader supports the given source.
+
+        Supports URLs that look like sitemaps (contain 'sitemap' or end with .xml).
+
+        Args:
+            source: The source URL.
+
+        Returns:
+            True if this looks like a sitemap URL.
+        """
+        source_lower = source.lower()
+        is_http = source_lower.startswith(("http://", "https://"))
+        is_sitemap = "sitemap" in source_lower or source_lower.endswith(".xml")
+        return is_http and is_sitemap
+
+    async def _get_urls(self, source: str, **options: Any) -> list[str]:
+        """Get list of URLs to process from the sitemap.
+
+        Recursively discovers URLs from sitemap and sitemap index files.
+
+        Args:
+            source: The sitemap URL.
+            **options: Loader-specific options:
+                - max_urls: Maximum URLs to return
+                - max_depth: Maximum recursion depth
+                - allowed_patterns: URL patterns to include
+                - blocked_patterns: URL patterns to exclude
+
+        Returns:
+            List of discovered and filtered URLs.
+        """
+        max_urls = options.get("max_urls", self._config.get("max_urls", DEFAULT_MAX_URLS))
+        max_depth = options.get("max_depth", self._config.get("max_depth", DEFAULT_MAX_DEPTH))
+        allowed_patterns = options.get(
+            "allowed_patterns", self._config.get("allowed_patterns", [])
+        )
+        blocked_patterns = options.get(
+            "blocked_patterns", self._config.get("blocked_patterns", [])
+        )
+
+        discovered_urls: set[str] = set()
+
+        await self._discover_urls(
+            sitemap_url=source,
+            depth=0,
+            max_depth=max_depth,
+            max_urls=max_urls,
+            discovered=discovered_urls,
+            allowed_patterns=allowed_patterns,
+            blocked_patterns=blocked_patterns,
+        )
+
+        # Convert to list and limit
+        urls = list(discovered_urls)[:max_urls]
+        self._sitemap_logger.info(f"Discovered {len(urls)} URLs from sitemap")
+
+        return urls
+
+    async def _discover_urls(
+        self,
+        sitemap_url: str,
+        depth: int,
+        max_depth: int,
+        max_urls: int,
+        discovered: set[str],
+        allowed_patterns: list[str],
+        blocked_patterns: list[str],
+    ) -> None:
+        """Recursively discover URLs from a sitemap.
+
+        Args:
+            sitemap_url: The sitemap URL to process.
+            depth: Current recursion depth.
+            max_depth: Maximum recursion depth.
+            max_urls: Maximum URLs to discover.
+            discovered: Set of already discovered URLs (modified in place).
+            allowed_patterns: URL patterns to include.
+            blocked_patterns: URL patterns to exclude.
+        """
+        if depth > max_depth:
+            self._sitemap_logger.debug(f"Max depth {max_depth} reached, skipping {sitemap_url}")
+            return
+
+        if len(discovered) >= max_urls:
+            self._sitemap_logger.debug(f"Max URLs {max_urls} reached")
+            return
+
+        try:
+            content = await self._fetch_sitemap(sitemap_url)
+            if not content:
+                return
+
+            root = ElementTree.fromstring(content)
+
+            # Check if this is a sitemap index
+            sitemap_refs = root.findall(".//sm:sitemap/sm:loc", SITEMAP_NS)
+            if sitemap_refs:
+                # This is a sitemap index - recursively process each sitemap
+                self._sitemap_logger.info(
+                    f"Found sitemap index with {len(sitemap_refs)} sitemaps at depth {depth}"
+                )
+
+                # Process nested sitemaps in parallel
+                tasks = []
+                for sitemap_ref in sitemap_refs:
+                    if sitemap_ref.text and len(discovered) < max_urls:
+                        tasks.append(
+                            self._discover_urls(
+                                sitemap_url=sitemap_ref.text.strip(),
+                                depth=depth + 1,
+                                max_depth=max_depth,
+                                max_urls=max_urls,
+                                discovered=discovered,
+                                allowed_patterns=allowed_patterns,
+                                blocked_patterns=blocked_patterns,
+                            )
+                        )
+
+                if tasks:
+                    await asyncio.gather(*tasks, return_exceptions=True)
+
+                return
+
+            # Process regular sitemap URLs
+            url_elements = root.findall(".//sm:url/sm:loc", SITEMAP_NS)
+            urls_added = 0
+
+            for url_elem in url_elements:
+                if url_elem.text and len(discovered) < max_urls:
+                    url = url_elem.text.strip()
+
+                    # Apply filters
+                    if self._should_include_url(url, allowed_patterns, blocked_patterns):
+                        if url not in discovered:
+                            discovered.add(url)
+                            urls_added += 1
+
+            # Emit discovery event
+            self._events.emit(
+                SitemapDiscoveryEvent(
+                    sitemap_url=sitemap_url,
+                    urls_discovered=urls_added,
+                    depth=depth,
+                    total_urls=len(discovered),
+                )
+            )
+
+            self._sitemap_logger.debug(
+                f"Discovered {urls_added} URLs from {sitemap_url} at depth {depth}"
+            )
+
+        except Exception as e:
+            self._sitemap_logger.error(f"Failed to process sitemap {sitemap_url}: {e}")
+
+    async def _fetch_sitemap(self, url: str) -> str | None:
+        """Fetch sitemap XML content.
+
+        Args:
+            url: The sitemap URL to fetch.
+
+        Returns:
+            Sitemap XML content or None if fetch failed.
+        """
+        try:
+            async with httpx.AsyncClient(timeout=30.0) as client:
+                response = await client.get(
+                    url,
+                    headers={"Accept": "application/xml, text/xml, */*"},
+                    follow_redirects=True,
+                )
+                response.raise_for_status()
+                return response.text
+        except Exception as e:
+            self._sitemap_logger.error(f"Failed to fetch sitemap {url}: {e}")
+            return None
+
+    def _should_include_url(
+        self,
+        url: str,
+        allowed_patterns: list[str],
+        blocked_patterns: list[str],
+    ) -> bool:
+        """Check if a URL should be included based on patterns.
+
+        Args:
+            url: The URL to check.
+            allowed_patterns: Patterns that must match (if any).
+            blocked_patterns: Patterns that must not match.
+
+        Returns:
+            True if URL should be included.
+        """
+        # Check blocked patterns first
+        for pattern in blocked_patterns:
+            if self._matches_pattern(url, pattern):
+                return False
+
+        # If allowed patterns specified, at least one must match
+        if allowed_patterns:
+            return any(self._matches_pattern(url, p) for p in allowed_patterns)
+
+        return True
+
+    def _matches_pattern(self, url: str, pattern: str) -> bool:
+        """Check if URL matches a pattern.
+
+        Supports both glob patterns (with *) and regex patterns.
+
+        Args:
+            url: The URL to check.
+            pattern: The pattern to match against.
+
+        Returns:
+            True if URL matches the pattern.
+        """
+        # Try fnmatch for glob patterns
+        if "*" in pattern or "?" in pattern:
+            return fnmatch.fnmatch(url, pattern)
+
+        # Try regex
+        try:
+            return bool(re.search(pattern, url))
+        except re.error:
+            # Invalid regex, try substring match
+            return pattern in url
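A note on the filtering logic in `_should_include_url` and `_matches_pattern` above: blocked patterns are checked first and always win; if any allowed patterns are configured, at least one must match; and a pattern containing `*` or `?` is treated as a glob, while anything else is tried as a regex with a plain substring test as the fallback for invalid regexes. A minimal standalone sketch of that precedence (the example URLs and patterns are invented for illustration):

```python
import fnmatch
import re


def matches(url: str, pattern: str) -> bool:
    # Wildcard characters mark the pattern as a glob.
    if "*" in pattern or "?" in pattern:
        return fnmatch.fnmatch(url, pattern)
    try:
        return bool(re.search(pattern, url))
    except re.error:
        # Invalid regex falls back to a substring test.
        return pattern in url


def should_include(url: str, allowed: list[str], blocked: list[str]) -> bool:
    # Blocked patterns take precedence over allowed ones.
    if any(matches(url, p) for p in blocked):
        return False
    return any(matches(url, p) for p in allowed) if allowed else True


assert should_include("https://example.com/docs/api", ["*/docs/*"], ["*/admin/*"])
assert not should_include("https://example.com/admin/users", ["*/docs/*"], ["*/admin/*"])
assert should_include("https://example.com/anything", [], [])  # No patterns: allow all
```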
gnosisllm_knowledge/loaders/website.py
@@ -0,0 +1,57 @@
+"""Website loader for single URL loading."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from gnosisllm_knowledge.loaders.base import BaseLoader
+
+
+class WebsiteLoader(BaseLoader):
+    """Loader for single website URLs.
+
+    This is the simplest loader that handles loading content from
+    a single URL. For loading multiple URLs from a sitemap, use
+    SitemapLoader instead.
+
+    Example:
+        ```python
+        loader = WebsiteLoader(fetcher, chunker)
+        result = await loader.load("https://example.com/page")
+        ```
+    """
+
+    @property
+    def name(self) -> str:
+        """Return the loader name."""
+        return "website"
+
+    def supports(self, source: str) -> bool:
+        """Check if this loader supports the given source.
+
+        Supports HTTP and HTTPS URLs that don't look like sitemaps.
+
+        Args:
+            source: The source URL.
+
+        Returns:
+            True if this is a regular website URL.
+        """
+        source_lower = source.lower()
+        is_http = source_lower.startswith(("http://", "https://"))
+        is_sitemap = "sitemap" in source_lower or source_lower.endswith(".xml")
+        return is_http and not is_sitemap
+
+    async def _get_urls(self, source: str, **options: Any) -> list[str]:
+        """Get list of URLs to process.
+
+        For website loader, this simply returns the source URL.
+
+        Args:
+            source: The source URL.
+            **options: Loader-specific options (ignored).
+
+        Returns:
+            List containing just the source URL.
+        """
+        return [source]
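The `supports` predicates of the two loaders partition HTTP(S) URLs between them: anything containing `sitemap` or ending in `.xml` goes to `SitemapLoader`, everything else to `WebsiteLoader`. A quick sketch of that routing heuristic (the URLs are made-up examples):

```python
def is_sitemap_url(source: str) -> bool:
    # Mirrors the supports() checks above: HTTP(S) plus a sitemap hint.
    s = source.lower()
    is_http = s.startswith(("http://", "https://"))
    return is_http and ("sitemap" in s or s.endswith(".xml"))


for url in (
    "https://example.com/sitemap.xml",  # -> sitemap (name and extension)
    "https://example.com/feed.xml",     # -> sitemap (any .xml matches)
    "https://example.com/page",         # -> website
):
    print(url, "->", "sitemap" if is_sitemap_url(url) else "website")
```

Note that the `.xml` check is deliberately broad, so a non-sitemap XML resource such as an RSS feed would also be routed to the sitemap loader.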
gnosisllm_knowledge/py.typed (file without changes)
gnosisllm_knowledge/services/__init__.py
@@ -0,0 +1,9 @@
+"""Service layer for knowledge orchestration."""
+
+from gnosisllm_knowledge.services.indexing import KnowledgeIndexingService
+from gnosisllm_knowledge.services.search import KnowledgeSearchService
+
+__all__ = [
+    "KnowledgeIndexingService",
+    "KnowledgeSearchService",
+]
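Per the `__all__` above, the two orchestration services are importable directly from the `services` subpackage; their constructor signatures live in the underlying `indexing` and `search` modules and are not shown here:

```python
from gnosisllm_knowledge.services import (
    KnowledgeIndexingService,
    KnowledgeSearchService,
)
```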