gnosisllm-knowledge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
gnosisllm_knowledge/core/interfaces/setup.py
@@ -0,0 +1,164 @@
+ """Setup adapter protocol - Interface for backend setup operations."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from enum import Enum
+ from typing import Any, Protocol, runtime_checkable
+
+
+ class HealthStatus(Enum):
+     """Health status enumeration."""
+
+     HEALTHY = "healthy"
+     DEGRADED = "degraded"
+     UNHEALTHY = "unhealthy"
+     UNKNOWN = "unknown"
+
+
+ @dataclass
+ class SetupResult:
+     """Result of a setup operation.
+
+     Attributes:
+         success: Whether setup completed successfully.
+         steps_completed: List of completed step descriptions.
+         errors: List of errors encountered.
+         warnings: List of warnings.
+         duration_ms: Duration in milliseconds.
+         data: Additional result data (e.g., model_id).
+     """
+
+     success: bool
+     steps_completed: list[str] | None = None
+     errors: list[str] | None = None
+     warnings: list[str] | None = None
+     duration_ms: float = 0.0
+     data: dict[str, Any] | None = None
+
+
+ @dataclass
+ class HealthReport:
+     """Comprehensive health report.
+
+     Attributes:
+         healthy: Overall health status.
+         status: HealthStatus enum value.
+         components: Component health details.
+         checked_at: When the check was performed.
+     """
+
+     healthy: bool
+     status: HealthStatus = HealthStatus.UNKNOWN
+     components: dict[str, Any] = field(default_factory=dict)
+     checked_at: datetime = field(default_factory=datetime.utcnow)
+
+
+ @dataclass
+ class DiagnosticReport:
+     """Diagnostic report with recommendations.
+
+     Attributes:
+         health: Health report.
+         issues: List of issues found.
+         warnings: List of warnings.
+         recommendations: List of recommendations.
+         cluster_info: Cluster-specific information.
+     """
+
+     health: HealthReport
+     issues: list[str] = field(default_factory=list)
+     warnings: list[str] = field(default_factory=list)
+     recommendations: list[str] = field(default_factory=list)
+     cluster_info: dict[str, Any] = field(default_factory=dict)
+
+
+ @runtime_checkable
+ class ISetupAdapter(Protocol):
+     """Protocol for backend setup operations.
+
+     Setup adapters are responsible for:
+     - Setting up the search backend (indices, pipelines, etc.)
+     - Health checking the backend
+     - Running diagnostics and providing recommendations
+     - Cleaning up resources
+
+     Implementations should handle all backend-specific setup
+     requirements transparently.
+     """
+
+     @property
+     def name(self) -> str:
+         """Human-readable backend name.
+
+         Returns:
+             Backend name (e.g., "OpenSearch", "Elasticsearch").
+         """
+         ...
+
+     async def health_check(self) -> bool:
+         """Quick health check.
+
+         Returns:
+             True if backend is healthy and responding.
+         """
+         ...
+
+     async def deep_health_check(self) -> HealthReport:
+         """Comprehensive health check with component status.
+
+         Checks all components (cluster, indices, pipelines, etc.)
+         and returns detailed health information.
+
+         Returns:
+             HealthReport with component-level health status.
+         """
+         ...
+
+     async def setup(self, **options: Any) -> SetupResult:
+         """Run complete setup.
+
+         Creates indices, pipelines, templates, and any other
+         required backend resources.
+
+         Args:
+             **options: Setup options like:
+                 - force: Recreate existing resources
+                 - skip_pipelines: Skip pipeline creation
+                 - index_prefix: Custom index prefix
+
+         Returns:
+             SetupResult with completion status.
+         """
+         ...
+
+     async def cleanup(self) -> SetupResult:
+         """Clean up all resources.
+
+         Removes all resources created by setup. Use with caution
+         as this will delete all data.
+
+         Returns:
+             SetupResult with cleanup status.
+         """
+         ...
+
+     async def diagnose(self) -> DiagnosticReport:
+         """Run diagnostics and return recommendations.
+
+         Analyzes the backend configuration and state,
+         identifies issues, and provides recommendations.
+
+         Returns:
+             DiagnosticReport with issues and recommendations.
+         """
+         ...
+
+     def get_setup_steps(self) -> list[tuple[str, str]]:
+         """Get list of setup steps.
+
+         Returns:
+             List of (step_name, step_description) tuples.
+         """
+         ...
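
Because ISetupAdapter is decorated with @runtime_checkable, conformance can be verified with isinstance(). A minimal sketch of a no-op adapter that satisfies the protocol; NoopSetupAdapter is hypothetical and not part of the package:

```python
import asyncio

from gnosisllm_knowledge.core.interfaces.setup import (
    DiagnosticReport,
    HealthReport,
    HealthStatus,
    ISetupAdapter,
    SetupResult,
)


class NoopSetupAdapter:
    """Hypothetical no-op adapter; illustrates the protocol shape only."""

    @property
    def name(self) -> str:
        return "Noop"

    async def health_check(self) -> bool:
        return True

    async def deep_health_check(self) -> HealthReport:
        return HealthReport(healthy=True, status=HealthStatus.HEALTHY)

    async def setup(self, **options) -> SetupResult:
        return SetupResult(success=True, steps_completed=["noop"])

    async def cleanup(self) -> SetupResult:
        return SetupResult(success=True)

    async def diagnose(self) -> DiagnosticReport:
        return DiagnosticReport(health=await self.deep_health_check())

    def get_setup_steps(self) -> list[tuple[str, str]]:
        return [("noop", "Do nothing")]


# isinstance() on a runtime-checkable protocol checks attribute presence,
# not signatures, so full conformance still relies on a type checker.
assert isinstance(NoopSetupAdapter(), ISetupAdapter)
print(asyncio.run(NoopSetupAdapter().setup(force=True)))
```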
gnosisllm_knowledge/fetchers/__init__.py
@@ -0,0 +1,12 @@
+ """Content fetchers for retrieving content from URLs."""
+
+ from gnosisllm_knowledge.fetchers.config import FetcherConfig, NeoreaderConfig
+ from gnosisllm_knowledge.fetchers.http import HTTPContentFetcher
+ from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
+
+ __all__ = [
+     "HTTPContentFetcher",
+     "NeoreaderContentFetcher",
+     "FetcherConfig",
+     "NeoreaderConfig",
+ ]
gnosisllm_knowledge/fetchers/config.py
@@ -0,0 +1,77 @@
+ """Fetcher configuration."""
+
+ from __future__ import annotations
+
+ import os
+ from dataclasses import dataclass, field
+
+
+ @dataclass
+ class FetcherConfig:
+     """Base configuration for content fetchers.
+
+     Attributes:
+         timeout: Request timeout in seconds.
+         user_agent: User-Agent header value.
+         headers: Additional HTTP headers.
+         max_retries: Maximum retry attempts.
+         retry_delay: Delay between retries in seconds.
+     """
+
+     timeout: float = 30.0
+     user_agent: str = "gnosisllm-knowledge/0.1.0"
+     headers: dict[str, str] = field(default_factory=dict)
+     max_retries: int = 3
+     retry_delay: float = 1.0
+
+
+ @dataclass
+ class NeoreaderConfig:
+     """Configuration for Neoreader content fetcher.
+
+     Neoreader is a service that converts web pages to clean markdown,
+     making content extraction easier for RAG systems.
+
+     Attributes:
+         host: Neoreader API host URL.
+         api_key: API key for authentication.
+         timeout: Request timeout in seconds.
+         target_selector: CSS selector for main content extraction.
+         remove_selector: CSS selector for elements to remove.
+         with_images: Whether to include image references.
+         with_links: Whether to include link references.
+     """
+
+     host: str = "http://localhost:3000"
+     api_key: str | None = None
+     timeout: float = 30.0
+     target_selector: str | None = None
+     remove_selector: str | None = None
+     with_images: bool = False
+     with_links: bool = True
+
+     @classmethod
+     def from_env(cls) -> NeoreaderConfig:
+         """Create configuration from environment variables.
+
+         Environment variables:
+         - NEOREADER_HOST: API host URL
+         - NEOREADER_API_KEY: API key
+         - NEOREADER_TIMEOUT: Request timeout
+         - NEOREADER_TARGET_SELECTOR: CSS selector for content
+         - NEOREADER_REMOVE_SELECTOR: CSS selector for removal
+         - NEOREADER_WITH_IMAGES: Include images (true/false)
+         - NEOREADER_WITH_LINKS: Include links (true/false)
+
+         Returns:
+             NeoreaderConfig populated from environment.
+         """
+         return cls(
+             host=os.getenv("NEOREADER_HOST", "http://localhost:3000"),
+             api_key=os.getenv("NEOREADER_API_KEY"),
+             timeout=float(os.getenv("NEOREADER_TIMEOUT", "30")),
+             target_selector=os.getenv("NEOREADER_TARGET_SELECTOR"),
+             remove_selector=os.getenv("NEOREADER_REMOVE_SELECTOR"),
+             with_images=os.getenv("NEOREADER_WITH_IMAGES", "").lower() == "true",
+             with_links=os.getenv("NEOREADER_WITH_LINKS", "true").lower() == "true",
+         )
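
A short sketch of how from_env() resolves values, using assumed example settings; any variable left unset falls back to the dataclass default:

```python
import os

from gnosisllm_knowledge.fetchers.config import NeoreaderConfig

# Assumed example values for illustration only.
os.environ["NEOREADER_HOST"] = "http://reader.internal:3000"
os.environ["NEOREADER_TIMEOUT"] = "60"
os.environ["NEOREADER_WITH_IMAGES"] = "true"

config = NeoreaderConfig.from_env()
assert config.host == "http://reader.internal:3000"
assert config.timeout == 60.0
assert config.with_images is True
# with_links defaults to "true" when NEOREADER_WITH_LINKS is unset.
assert config.with_links is True
```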
gnosisllm_knowledge/fetchers/http.py
@@ -0,0 +1,167 @@
+ """Generic HTTP content fetcher."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import logging
+ import re
+ from typing import Any
+
+ import httpx
+
+ from gnosisllm_knowledge.core.exceptions import FetchError, TimeoutError
+ from gnosisllm_knowledge.core.interfaces.fetcher import FetchResult
+ from gnosisllm_knowledge.fetchers.config import FetcherConfig
+
+
+ class HTTPContentFetcher:
+     """Generic HTTP content fetcher.
+
+     Fetches raw content from URLs using HTTP requests. For better
+     content extraction (converting HTML to markdown), use
+     NeoreaderContentFetcher instead.
+
+     Example:
+         ```python
+         fetcher = HTTPContentFetcher()
+         result = await fetcher.fetch("https://example.com/page")
+         print(result.content)
+         ```
+     """
+
+     def __init__(self, config: FetcherConfig | None = None) -> None:
+         """Initialize the fetcher.
+
+         Args:
+             config: Optional fetcher configuration.
+         """
+         self._config = config or FetcherConfig()
+         self._logger = logging.getLogger(__name__)
+
+     async def fetch(self, url: str, **options: Any) -> FetchResult:
+         """Fetch content from a URL.
+
+         Args:
+             url: The URL to fetch.
+             **options: Additional options:
+                 - timeout: Override default timeout
+                 - headers: Additional headers
+
+         Returns:
+             FetchResult with content and metadata.
+
+         Raises:
+             FetchError: If the fetch fails.
+             TimeoutError: If the request times out.
+         """
+         timeout = options.get("timeout", self._config.timeout)
+         extra_headers = options.get("headers", {})
+
+         headers = {
+             "User-Agent": self._config.user_agent,
+             **self._config.headers,
+             **extra_headers,
+         }
+
+         try:
+             async with httpx.AsyncClient(
+                 timeout=timeout,
+                 follow_redirects=True,
+             ) as client:
+                 response = await client.get(url, headers=headers)
+                 response.raise_for_status()
+
+                 content = response.text
+                 content_type = response.headers.get("content-type", "text/html")
+                 title = self._extract_title(content, content_type)
+
+                 return FetchResult(
+                     content=content,
+                     status_code=response.status_code,
+                     content_type=content_type,
+                     url=str(response.url),  # Final URL after redirects
+                     title=title,
+                     encoding=response.encoding,
+                     headers=dict(response.headers),
+                 )
+
+         except httpx.TimeoutException as e:
+             raise TimeoutError(
+                 f"Request timed out after {timeout}s",
+                 timeout=timeout,
+                 operation="fetch",
+                 cause=e,
+             ) from e
+         except httpx.HTTPStatusError as e:
+             raise FetchError(
+                 f"HTTP {e.response.status_code}",
+                 source=url,
+                 status_code=e.response.status_code,
+                 cause=e,
+             ) from e
+         except Exception as e:
+             raise FetchError(str(e), source=url, cause=e) from e
+
+     async def health_check(self) -> bool:
+         """Check if HTTP requests can be made.
+
+         Returns:
+             True (HTTP fetcher is always "healthy").
+         """
+         return True
+
+     async def fetch_batch(
+         self,
+         urls: list[str],
+         max_concurrent: int = 10,
+         **options: Any,
+     ) -> list[FetchResult | Exception]:
+         """Fetch multiple URLs concurrently.
+
+         Args:
+             urls: List of URLs to fetch.
+             max_concurrent: Maximum concurrent requests.
+             **options: Options passed to each fetch call.
+
+         Returns:
+             List of FetchResult objects or Exception for failed fetches.
+         """
+         semaphore = asyncio.Semaphore(max_concurrent)
+
+         async def fetch_with_limit(url: str) -> FetchResult | Exception:
+             async with semaphore:
+                 try:
+                     return await self.fetch(url, **options)
+                 except Exception as e:
+                     return e
+
+         results = await asyncio.gather(
+             *[fetch_with_limit(url) for url in urls],
+         )
+
+         return list(results)
+
+     def _extract_title(self, content: str, content_type: str) -> str | None:
+         """Extract title from content.
+
+         Args:
+             content: The fetched content.
+             content_type: Content MIME type.
+
+         Returns:
+             Extracted title or None.
+         """
+         if "html" not in content_type.lower():
+             return None
+
+         # Try to extract from <title> tag
+         title_match = re.search(r"<title[^>]*>([^<]+)</title>", content, re.IGNORECASE)
+         if title_match:
+             return title_match.group(1).strip()
+
+         # Try to extract from <h1> tag
+         h1_match = re.search(r"<h1[^>]*>([^<]+)</h1>", content, re.IGNORECASE)
+         if h1_match:
+             return h1_match.group(1).strip()
+
+         return None
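
Because fetch_batch() returns exceptions in-line rather than raising, callers separate successes from failures themselves. A minimal usage sketch (the URLs are placeholders):

```python
import asyncio

from gnosisllm_knowledge.fetchers.http import HTTPContentFetcher


async def main() -> None:
    fetcher = HTTPContentFetcher()
    urls = ["https://example.com/a", "https://example.com/b"]

    # Each entry is either a FetchResult or the Exception fetch() raised.
    results = await fetcher.fetch_batch(urls, max_concurrent=5)

    for url, result in zip(urls, results):
        if isinstance(result, Exception):
            print(f"{url}: failed ({result})")
        else:
            print(f"{url}: {result.status_code}, title={result.title!r}")


asyncio.run(main())
```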
gnosisllm_knowledge/fetchers/neoreader.py
@@ -0,0 +1,204 @@
+ """Neoreader content fetcher for clean markdown extraction."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import logging
+ import re
+ from typing import Any
+
+ import httpx
+
+ from gnosisllm_knowledge.core.exceptions import (
+     ConnectionError,
+     FetchError,
+     TimeoutError,
+ )
+ from gnosisllm_knowledge.core.interfaces.fetcher import FetchResult
+ from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
+
+
+ class NeoreaderContentFetcher:
+     """Content fetcher using Neoreader for clean markdown extraction.
+
+     Neoreader converts web pages to clean markdown, removing navigation,
+     ads, and other noise. This produces much better content for RAG
+     systems compared to raw HTML.
+
+     Example:
+         ```python
+         config = NeoreaderConfig.from_env()
+         fetcher = NeoreaderContentFetcher(config)
+         result = await fetcher.fetch("https://example.com/page")
+         print(result.content)  # Clean markdown
+         ```
+     """
+
+     def __init__(self, config: NeoreaderConfig | None = None) -> None:
+         """Initialize the fetcher.
+
+         Args:
+             config: Neoreader configuration. Uses environment variables if not provided.
+         """
+         self._config = config or NeoreaderConfig.from_env()
+         self._logger = logging.getLogger(__name__)
+
+     async def fetch(self, url: str, **options: Any) -> FetchResult:
+         """Fetch content from a URL using Neoreader.
+
+         Args:
+             url: The URL to fetch.
+             **options: Additional options:
+                 - target_selector: CSS selector for content
+                 - remove_selector: CSS selector for removal
+                 - timeout: Override default timeout
+
+         Returns:
+             FetchResult with markdown content.
+
+         Raises:
+             FetchError: If the fetch fails.
+             TimeoutError: If the request times out.
+             ConnectionError: If Neoreader is not available.
+         """
+         timeout = options.get("timeout", self._config.timeout)
+         target_selector = options.get("target_selector", self._config.target_selector)
+         remove_selector = options.get("remove_selector", self._config.remove_selector)
+
+         headers = {
+             "Accept": "text/markdown",
+             "X-Respond-With": "markdown",
+         }
+
+         if self._config.api_key:
+             headers["Authorization"] = f"Bearer {self._config.api_key}"
+
+         if target_selector:
+             headers["X-Target-Selector"] = target_selector
+         if remove_selector:
+             headers["X-Remove-Selector"] = remove_selector
+         if timeout:
+             headers["X-Timeout"] = str(int(timeout * 1000))
+
+         try:
+             async with httpx.AsyncClient(
+                 timeout=timeout,
+                 follow_redirects=True,
+             ) as client:
+                 response = await client.get(
+                     f"{self._config.host}/{url}",
+                     headers=headers,
+                 )
+                 response.raise_for_status()
+
+                 content = response.text
+                 title = self._extract_title(content)
+
+                 return FetchResult(
+                     content=content,
+                     status_code=response.status_code,
+                     content_type="text/markdown",
+                     url=url,
+                     title=title,
+                     encoding="utf-8",
+                     headers=dict(response.headers),
+                 )
+
+         except httpx.TimeoutException as e:
+             raise TimeoutError(
+                 f"Request timed out after {timeout}s",
+                 timeout=timeout,
+                 operation="fetch",
+                 cause=e,
+             ) from e
+         except httpx.ConnectError as e:
+             raise ConnectionError(
+                 f"Cannot connect to Neoreader at {self._config.host}",
+                 host=self._config.host,
+                 cause=e,
+             ) from e
+         except httpx.HTTPStatusError as e:
+             raise FetchError(
+                 f"HTTP {e.response.status_code}",
+                 source=url,
+                 status_code=e.response.status_code,
+                 cause=e,
+             ) from e
+         except Exception as e:
+             raise FetchError(str(e), source=url, cause=e) from e
+
+     async def health_check(self) -> bool:
+         """Check if Neoreader service is available.
+
+         Returns:
+             True if Neoreader is responding, False otherwise.
+         """
+         try:
+             async with httpx.AsyncClient(timeout=5.0) as client:
+                 # Try to reach the Neoreader health endpoint or root
+                 response = await client.get(f"{self._config.host}/health")
+                 return response.status_code < 500
+         except Exception:
+             try:
+                 # Fall back to the root endpoint
+                 async with httpx.AsyncClient(timeout=5.0) as client:
+                     response = await client.get(self._config.host)
+                     return response.status_code < 500
+             except Exception:
+                 return False
+
+     async def fetch_batch(
+         self,
+         urls: list[str],
+         max_concurrent: int = 10,
+         **options: Any,
+     ) -> list[FetchResult | Exception]:
+         """Fetch multiple URLs concurrently.
+
+         Args:
+             urls: List of URLs to fetch.
+             max_concurrent: Maximum concurrent requests.
+             **options: Options passed to each fetch call.
+
+         Returns:
+             List of FetchResult objects or Exception for failed fetches.
+         """
+         semaphore = asyncio.Semaphore(max_concurrent)
+
+         async def fetch_with_limit(url: str) -> FetchResult | Exception:
+             async with semaphore:
+                 try:
+                     return await self.fetch(url, **options)
+                 except Exception as e:
+                     return e
+
+         results = await asyncio.gather(
+             *[fetch_with_limit(url) for url in urls],
+         )
+
+         return list(results)
+
+     def _extract_title(self, content: str) -> str | None:
+         """Extract title from markdown content.
+
+         Looks for the first H1 heading in the markdown.
+
+         Args:
+             content: Markdown content.
+
+         Returns:
+             Title string or None.
+         """
+         # Look for first H1 heading
+         lines = content.split("\n")
+         for line in lines:
+             line = line.strip()
+             if line.startswith("# "):
+                 return line[2:].strip()
+
+         # Try regex for H1
+         match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
+         if match:
+             return match.group(1).strip()
+
+         return None
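
Per-call options override the configured selectors and are forwarded as X-Target-Selector / X-Remove-Selector headers. A usage sketch assuming a Neoreader instance at the default host; the URL and selectors are placeholders:

```python
import asyncio

from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher


async def main() -> None:
    config = NeoreaderConfig(host="http://localhost:3000")
    fetcher = NeoreaderContentFetcher(config)

    # health_check() probes /health, then the root endpoint as a fallback.
    if not await fetcher.health_check():
        print("Neoreader is not reachable")
        return

    result = await fetcher.fetch(
        "https://example.com/docs",
        target_selector="article",
        remove_selector="nav, footer",
    )
    print(result.title, len(result.content))


asyncio.run(main())
```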
gnosisllm_knowledge/loaders/__init__.py
@@ -0,0 +1,13 @@
+ """Content loaders for various source types."""
+
+ from gnosisllm_knowledge.loaders.base import BaseLoader
+ from gnosisllm_knowledge.loaders.factory import LoaderFactory
+ from gnosisllm_knowledge.loaders.sitemap import SitemapLoader
+ from gnosisllm_knowledge.loaders.website import WebsiteLoader
+
+ __all__ = [
+     "BaseLoader",
+     "LoaderFactory",
+     "WebsiteLoader",
+     "SitemapLoader",
+ ]