gnosisllm-knowledge 0.2.0__py3-none-any.whl
- gnosisllm_knowledge/__init__.py +152 -0
- gnosisllm_knowledge/api/__init__.py +5 -0
- gnosisllm_knowledge/api/knowledge.py +548 -0
- gnosisllm_knowledge/backends/__init__.py +26 -0
- gnosisllm_knowledge/backends/memory/__init__.py +9 -0
- gnosisllm_knowledge/backends/memory/indexer.py +384 -0
- gnosisllm_knowledge/backends/memory/searcher.py +516 -0
- gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
- gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
- gnosisllm_knowledge/backends/opensearch/config.py +195 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
- gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
- gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
- gnosisllm_knowledge/chunking/__init__.py +9 -0
- gnosisllm_knowledge/chunking/fixed.py +138 -0
- gnosisllm_knowledge/chunking/sentence.py +239 -0
- gnosisllm_knowledge/cli/__init__.py +18 -0
- gnosisllm_knowledge/cli/app.py +509 -0
- gnosisllm_knowledge/cli/commands/__init__.py +7 -0
- gnosisllm_knowledge/cli/commands/agentic.py +529 -0
- gnosisllm_knowledge/cli/commands/load.py +369 -0
- gnosisllm_knowledge/cli/commands/search.py +440 -0
- gnosisllm_knowledge/cli/commands/setup.py +228 -0
- gnosisllm_knowledge/cli/display/__init__.py +5 -0
- gnosisllm_knowledge/cli/display/service.py +555 -0
- gnosisllm_knowledge/cli/utils/__init__.py +5 -0
- gnosisllm_knowledge/cli/utils/config.py +207 -0
- gnosisllm_knowledge/core/__init__.py +87 -0
- gnosisllm_knowledge/core/domain/__init__.py +43 -0
- gnosisllm_knowledge/core/domain/document.py +240 -0
- gnosisllm_knowledge/core/domain/result.py +176 -0
- gnosisllm_knowledge/core/domain/search.py +327 -0
- gnosisllm_knowledge/core/domain/source.py +139 -0
- gnosisllm_knowledge/core/events/__init__.py +23 -0
- gnosisllm_knowledge/core/events/emitter.py +216 -0
- gnosisllm_knowledge/core/events/types.py +226 -0
- gnosisllm_knowledge/core/exceptions.py +407 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
- gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
- gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
- gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
- gnosisllm_knowledge/core/interfaces/loader.py +102 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
- gnosisllm_knowledge/core/interfaces/setup.py +164 -0
- gnosisllm_knowledge/fetchers/__init__.py +12 -0
- gnosisllm_knowledge/fetchers/config.py +77 -0
- gnosisllm_knowledge/fetchers/http.py +167 -0
- gnosisllm_knowledge/fetchers/neoreader.py +204 -0
- gnosisllm_knowledge/loaders/__init__.py +13 -0
- gnosisllm_knowledge/loaders/base.py +399 -0
- gnosisllm_knowledge/loaders/factory.py +202 -0
- gnosisllm_knowledge/loaders/sitemap.py +285 -0
- gnosisllm_knowledge/loaders/website.py +57 -0
- gnosisllm_knowledge/py.typed +0 -0
- gnosisllm_knowledge/services/__init__.py +9 -0
- gnosisllm_knowledge/services/indexing.py +387 -0
- gnosisllm_knowledge/services/search.py +349 -0
- gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
- gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
- gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
gnosisllm_knowledge/core/interfaces/setup.py
@@ -0,0 +1,164 @@
+"""Setup adapter protocol - Interface for backend setup operations."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any, Protocol, runtime_checkable
+
+
+class HealthStatus(Enum):
+    """Health status enumeration."""
+
+    HEALTHY = "healthy"
+    DEGRADED = "degraded"
+    UNHEALTHY = "unhealthy"
+    UNKNOWN = "unknown"
+
+
+@dataclass
+class SetupResult:
+    """Result of a setup operation.
+
+    Attributes:
+        success: Whether setup completed successfully.
+        steps_completed: List of completed step descriptions.
+        errors: List of errors encountered.
+        warnings: List of warnings.
+        duration_ms: Duration in milliseconds.
+        data: Additional result data (e.g., model_id).
+    """
+
+    success: bool
+    steps_completed: list[str] | None = None
+    errors: list[str] | None = None
+    warnings: list[str] | None = None
+    duration_ms: float = 0.0
+    data: dict[str, Any] | None = None
+
+
+@dataclass
+class HealthReport:
+    """Comprehensive health report.
+
+    Attributes:
+        healthy: Overall health status.
+        status: HealthStatus enum value.
+        components: Component health details.
+        checked_at: When the check was performed.
+    """
+
+    healthy: bool
+    status: HealthStatus = HealthStatus.UNKNOWN
+    components: dict[str, Any] = field(default_factory=dict)
+    checked_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+
+
+@dataclass
+class DiagnosticReport:
+    """Diagnostic report with recommendations.
+
+    Attributes:
+        health: Health report.
+        issues: List of issues found.
+        warnings: List of warnings.
+        recommendations: List of recommendations.
+        cluster_info: Cluster-specific information.
+    """
+
+    health: HealthReport
+    issues: list[str] = field(default_factory=list)
+    warnings: list[str] = field(default_factory=list)
+    recommendations: list[str] = field(default_factory=list)
+    cluster_info: dict[str, Any] = field(default_factory=dict)
+
+
+@runtime_checkable
+class ISetupAdapter(Protocol):
+    """Protocol for backend setup operations.
+
+    Setup adapters are responsible for:
+    - Setting up the search backend (indices, pipelines, etc.)
+    - Health checking the backend
+    - Running diagnostics and providing recommendations
+    - Cleaning up resources
+
+    Implementations should handle all backend-specific setup
+    requirements transparently.
+    """
+
+    @property
+    def name(self) -> str:
+        """Human-readable backend name.
+
+        Returns:
+            Backend name (e.g., "OpenSearch", "Elasticsearch").
+        """
+        ...
+
+    async def health_check(self) -> bool:
+        """Quick health check.
+
+        Returns:
+            True if backend is healthy and responding.
+        """
+        ...
+
+    async def deep_health_check(self) -> HealthReport:
+        """Comprehensive health check with component status.
+
+        Checks all components (cluster, indices, pipelines, etc.)
+        and returns detailed health information.
+
+        Returns:
+            HealthReport with component-level health status.
+        """
+        ...
+
+    async def setup(self, **options: Any) -> SetupResult:
+        """Run complete setup.
+
+        Creates indices, pipelines, templates, and any other
+        required backend resources.
+
+        Args:
+            **options: Setup options like:
+                - force: Recreate existing resources
+                - skip_pipelines: Skip pipeline creation
+                - index_prefix: Custom index prefix
+
+        Returns:
+            SetupResult with completion status.
+        """
+        ...
+
+    async def cleanup(self) -> SetupResult:
+        """Clean up all resources.
+
+        Removes all resources created by setup. Use with caution
+        as this will delete all data.
+
+        Returns:
+            SetupResult with cleanup status.
+        """
+        ...
+
+    async def diagnose(self) -> DiagnosticReport:
+        """Run diagnostics and return recommendations.
+
+        Analyzes the backend configuration and state,
+        identifies issues, and provides recommendations.
+
+        Returns:
+            DiagnosticReport with issues and recommendations.
+        """
+        ...
+
+    def get_setup_steps(self) -> list[tuple[str, str]]:
+        """Get list of setup steps.
+
+        Returns:
+            List of (step_name, step_description) tuples.
+        """
+        ...
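Because `ISetupAdapter` is declared `@runtime_checkable`, any class that structurally provides these members satisfies the protocol; no inheritance is needed. A minimal sketch, assuming the names above are importable from `gnosisllm_knowledge.core.interfaces.setup` (the `NullSetupAdapter` class itself is hypothetical, not part of the package):

```python
import asyncio

from gnosisllm_knowledge.core.interfaces.setup import (
    DiagnosticReport, HealthReport, HealthStatus, ISetupAdapter, SetupResult,
)


class NullSetupAdapter:
    """Hypothetical no-op adapter, only to illustrate the protocol shape."""

    @property
    def name(self) -> str:
        return "Null"

    async def health_check(self) -> bool:
        return True

    async def deep_health_check(self) -> HealthReport:
        return HealthReport(healthy=True, status=HealthStatus.HEALTHY)

    async def setup(self, **options) -> SetupResult:
        return SetupResult(success=True, steps_completed=["noop"])

    async def cleanup(self) -> SetupResult:
        return SetupResult(success=True)

    async def diagnose(self) -> DiagnosticReport:
        return DiagnosticReport(health=await self.deep_health_check())

    def get_setup_steps(self) -> list[tuple[str, str]]:
        return [("noop", "Do nothing")]


adapter = NullSetupAdapter()
# Structural check enabled by @runtime_checkable:
assert isinstance(adapter, ISetupAdapter)
print(asyncio.run(adapter.setup(force=True)).success)  # True
```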
gnosisllm_knowledge/fetchers/__init__.py
@@ -0,0 +1,12 @@
+"""Content fetchers for retrieving content from URLs."""
+
+from gnosisllm_knowledge.fetchers.config import FetcherConfig, NeoreaderConfig
+from gnosisllm_knowledge.fetchers.http import HTTPContentFetcher
+from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
+
+__all__ = [
+    "HTTPContentFetcher",
+    "NeoreaderContentFetcher",
+    "FetcherConfig",
+    "NeoreaderConfig",
+]
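Both fetchers expose the same `fetch` / `fetch_batch` / `health_check` surface, so a caller can choose one at startup. A minimal sketch of such a selection, assuming only the exports above (the fallback logic is illustrative, not package API):

```python
import asyncio

from gnosisllm_knowledge.fetchers import HTTPContentFetcher, NeoreaderContentFetcher


async def pick_fetcher():
    # Prefer Neoreader's clean-markdown extraction when the service
    # responds; otherwise fall back to the plain HTTP fetcher.
    neo = NeoreaderContentFetcher()
    if await neo.health_check():
        return neo
    return HTTPContentFetcher()


fetcher = asyncio.run(pick_fetcher())
```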
gnosisllm_knowledge/fetchers/config.py
@@ -0,0 +1,77 @@
+"""Fetcher configuration."""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass, field
+
+
+@dataclass
+class FetcherConfig:
+    """Base configuration for content fetchers.
+
+    Attributes:
+        timeout: Request timeout in seconds.
+        user_agent: User-Agent header value.
+        headers: Additional HTTP headers.
+        max_retries: Maximum retry attempts.
+        retry_delay: Delay between retries in seconds.
+    """
+
+    timeout: float = 30.0
+    user_agent: str = "gnosisllm-knowledge/0.2.0"
+    headers: dict[str, str] = field(default_factory=dict)
+    max_retries: int = 3
+    retry_delay: float = 1.0
+
+
+@dataclass
+class NeoreaderConfig:
+    """Configuration for Neoreader content fetcher.
+
+    Neoreader is a service that converts web pages to clean markdown,
+    making content extraction easier for RAG systems.
+
+    Attributes:
+        host: Neoreader API host URL.
+        api_key: API key for authentication.
+        timeout: Request timeout in seconds.
+        target_selector: CSS selector for main content extraction.
+        remove_selector: CSS selector for elements to remove.
+        with_images: Whether to include image references.
+        with_links: Whether to include link references.
+    """
+
+    host: str = "http://localhost:3000"
+    api_key: str | None = None
+    timeout: float = 30.0
+    target_selector: str | None = None
+    remove_selector: str | None = None
+    with_images: bool = False
+    with_links: bool = True
+
+    @classmethod
+    def from_env(cls) -> NeoreaderConfig:
+        """Create configuration from environment variables.
+
+        Environment variables:
+        - NEOREADER_HOST: API host URL
+        - NEOREADER_API_KEY: API key
+        - NEOREADER_TIMEOUT: Request timeout
+        - NEOREADER_TARGET_SELECTOR: CSS selector for content
+        - NEOREADER_REMOVE_SELECTOR: CSS selector for removal
+        - NEOREADER_WITH_IMAGES: Include images (true/false)
+        - NEOREADER_WITH_LINKS: Include links (true/false)
+
+        Returns:
+            NeoreaderConfig populated from environment.
+        """
+        return cls(
+            host=os.getenv("NEOREADER_HOST", "http://localhost:3000"),
+            api_key=os.getenv("NEOREADER_API_KEY"),
+            timeout=float(os.getenv("NEOREADER_TIMEOUT", "30")),
+            target_selector=os.getenv("NEOREADER_TARGET_SELECTOR"),
+            remove_selector=os.getenv("NEOREADER_REMOVE_SELECTOR"),
+            with_images=os.getenv("NEOREADER_WITH_IMAGES", "").lower() == "true",
+            with_links=os.getenv("NEOREADER_WITH_LINKS", "true").lower() == "true",
+        )
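A sketch of exercising `from_env` as implemented above; note that the boolean parsing accepts any casing of "true", that `with_images` defaults off, and that `with_links` defaults on. The host and values here are illustrative:

```python
import os

from gnosisllm_knowledge.fetchers.config import NeoreaderConfig

# Illustrative environment; in practice these come from the deployment.
os.environ["NEOREADER_HOST"] = "http://reader.internal:3000"
os.environ["NEOREADER_TIMEOUT"] = "10"
os.environ["NEOREADER_WITH_IMAGES"] = "True"   # case-insensitive parse -> True
os.environ["NEOREADER_WITH_LINKS"] = "false"

config = NeoreaderConfig.from_env()
assert config.host == "http://reader.internal:3000"
assert config.timeout == 10.0
assert config.with_images is True
assert config.with_links is False
```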
gnosisllm_knowledge/fetchers/http.py
@@ -0,0 +1,167 @@
+"""Generic HTTP content fetcher."""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import re
+from typing import Any
+
+import httpx
+
+from gnosisllm_knowledge.core.exceptions import FetchError, TimeoutError
+from gnosisllm_knowledge.core.interfaces.fetcher import FetchResult
+from gnosisllm_knowledge.fetchers.config import FetcherConfig
+
+
+class HTTPContentFetcher:
+    """Generic HTTP content fetcher.
+
+    Fetches raw content from URLs using HTTP requests. For better
+    content extraction (converting HTML to markdown), use
+    NeoreaderContentFetcher instead.
+
+    Example:
+        ```python
+        fetcher = HTTPContentFetcher()
+        result = await fetcher.fetch("https://example.com/page")
+        print(result.content)
+        ```
+    """
+
+    def __init__(self, config: FetcherConfig | None = None) -> None:
+        """Initialize the fetcher.
+
+        Args:
+            config: Optional fetcher configuration.
+        """
+        self._config = config or FetcherConfig()
+        self._logger = logging.getLogger(__name__)
+
+    async def fetch(self, url: str, **options: Any) -> FetchResult:
+        """Fetch content from a URL.
+
+        Args:
+            url: The URL to fetch.
+            **options: Additional options:
+                - timeout: Override default timeout
+                - headers: Additional headers
+
+        Returns:
+            FetchResult with content and metadata.
+
+        Raises:
+            FetchError: If the fetch fails.
+            TimeoutError: If the request times out.
+        """
+        timeout = options.get("timeout", self._config.timeout)
+        extra_headers = options.get("headers", {})
+
+        headers = {
+            "User-Agent": self._config.user_agent,
+            **self._config.headers,
+            **extra_headers,
+        }
+
+        try:
+            async with httpx.AsyncClient(
+                timeout=timeout,
+                follow_redirects=True,
+            ) as client:
+                response = await client.get(url, headers=headers)
+                response.raise_for_status()
+
+                content = response.text
+                content_type = response.headers.get("content-type", "text/html")
+                title = self._extract_title(content, content_type)
+
+                return FetchResult(
+                    content=content,
+                    status_code=response.status_code,
+                    content_type=content_type,
+                    url=str(response.url),  # Final URL after redirects
+                    title=title,
+                    encoding=response.encoding,
+                    headers=dict(response.headers),
+                )
+
+        except httpx.TimeoutException as e:
+            raise TimeoutError(
+                f"Request timed out after {timeout}s",
+                timeout=timeout,
+                operation="fetch",
+                cause=e,
+            ) from e
+        except httpx.HTTPStatusError as e:
+            raise FetchError(
+                f"HTTP {e.response.status_code}",
+                source=url,
+                status_code=e.response.status_code,
+                cause=e,
+            ) from e
+        except Exception as e:
+            raise FetchError(str(e), source=url, cause=e) from e
+
+    async def health_check(self) -> bool:
+        """Check if HTTP requests can be made.
+
+        Returns:
+            True (HTTP fetcher is always "healthy").
+        """
+        return True
+
+    async def fetch_batch(
+        self,
+        urls: list[str],
+        max_concurrent: int = 10,
+        **options: Any,
+    ) -> list[FetchResult | Exception]:
+        """Fetch multiple URLs concurrently.
+
+        Args:
+            urls: List of URLs to fetch.
+            max_concurrent: Maximum concurrent requests.
+            **options: Options passed to each fetch call.
+
+        Returns:
+            List of FetchResult objects or Exception for failed fetches.
+        """
+        semaphore = asyncio.Semaphore(max_concurrent)
+
+        async def fetch_with_limit(url: str) -> FetchResult | Exception:
+            async with semaphore:
+                try:
+                    return await self.fetch(url, **options)
+                except Exception as e:
+                    return e
+
+        results = await asyncio.gather(
+            *[fetch_with_limit(url) for url in urls],
+        )
+
+        return list(results)
+
+    def _extract_title(self, content: str, content_type: str) -> str | None:
+        """Extract title from content.
+
+        Args:
+            content: The fetched content.
+            content_type: Content MIME type.
+
+        Returns:
+            Extracted title or None.
+        """
+        if "html" not in content_type.lower():
+            return None
+
+        # Try to extract from <title> tag
+        title_match = re.search(r"<title[^>]*>([^<]+)</title>", content, re.IGNORECASE)
+        if title_match:
+            return title_match.group(1).strip()
+
+        # Try to extract from <h1> tag
+        h1_match = re.search(r"<h1[^>]*>([^<]+)</h1>", content, re.IGNORECASE)
+        if h1_match:
+            return h1_match.group(1).strip()
+
+        return None
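`fetch_batch` returns exceptions in-place rather than raising, so callers should partition results by type. A minimal usage sketch (the URLs are placeholders):

```python
import asyncio

from gnosisllm_knowledge.fetchers import HTTPContentFetcher


async def main() -> None:
    fetcher = HTTPContentFetcher()
    urls = ["https://example.com", "https://example.org"]
    # At most 5 requests in flight at once, per the semaphore in fetch_batch.
    results = await fetcher.fetch_batch(urls, max_concurrent=5)
    for url, result in zip(urls, results):
        if isinstance(result, Exception):
            print(f"{url}: failed: {result}")
        else:
            print(f"{url}: {result.status_code} {result.title!r}")


asyncio.run(main())
```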
gnosisllm_knowledge/fetchers/neoreader.py
@@ -0,0 +1,204 @@
+"""Neoreader content fetcher for clean markdown extraction."""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import re
+from typing import Any
+
+import httpx
+
+from gnosisllm_knowledge.core.exceptions import (
+    ConnectionError,
+    FetchError,
+    TimeoutError,
+)
+from gnosisllm_knowledge.core.interfaces.fetcher import FetchResult
+from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
+
+
+class NeoreaderContentFetcher:
+    """Content fetcher using Neoreader for clean markdown extraction.
+
+    Neoreader converts web pages to clean markdown, removing navigation,
+    ads, and other noise. This produces much better content for RAG
+    systems compared to raw HTML.
+
+    Example:
+        ```python
+        config = NeoreaderConfig.from_env()
+        fetcher = NeoreaderContentFetcher(config)
+        result = await fetcher.fetch("https://example.com/page")
+        print(result.content)  # Clean markdown
+        ```
+    """
+
+    def __init__(self, config: NeoreaderConfig | None = None) -> None:
+        """Initialize the fetcher.
+
+        Args:
+            config: Neoreader configuration. Uses environment variables if not provided.
+        """
+        self._config = config or NeoreaderConfig.from_env()
+        self._logger = logging.getLogger(__name__)
+
+    async def fetch(self, url: str, **options: Any) -> FetchResult:
+        """Fetch content from a URL using Neoreader.
+
+        Args:
+            url: The URL to fetch.
+            **options: Additional options:
+                - target_selector: CSS selector for content
+                - remove_selector: CSS selector for removal
+                - timeout: Override default timeout
+
+        Returns:
+            FetchResult with markdown content.
+
+        Raises:
+            FetchError: If the fetch fails.
+            TimeoutError: If the request times out.
+            ConnectionError: If Neoreader is not available.
+        """
+        timeout = options.get("timeout", self._config.timeout)
+        target_selector = options.get("target_selector", self._config.target_selector)
+        remove_selector = options.get("remove_selector", self._config.remove_selector)
+
+        headers = {
+            "Accept": "text/markdown",
+            "X-Respond-With": "markdown",
+        }
+
+        if self._config.api_key:
+            headers["Authorization"] = f"Bearer {self._config.api_key}"
+
+        if target_selector:
+            headers["X-Target-Selector"] = target_selector
+        if remove_selector:
+            headers["X-Remove-Selector"] = remove_selector
+        if timeout:
+            headers["X-Timeout"] = str(int(timeout * 1000))
+
+        try:
+            async with httpx.AsyncClient(
+                timeout=timeout,
+                follow_redirects=True,
+            ) as client:
+                response = await client.get(
+                    f"{self._config.host}/{url}",
+                    headers=headers,
+                )
+                response.raise_for_status()
+
+                content = response.text
+                title = self._extract_title(content)
+
+                return FetchResult(
+                    content=content,
+                    status_code=response.status_code,
+                    content_type="text/markdown",
+                    url=url,
+                    title=title,
+                    encoding="utf-8",
+                    headers=dict(response.headers),
+                )
+
+        except httpx.TimeoutException as e:
+            raise TimeoutError(
+                f"Request timed out after {timeout}s",
+                timeout=timeout,
+                operation="fetch",
+                cause=e,
+            ) from e
+        except httpx.ConnectError as e:
+            raise ConnectionError(
+                f"Cannot connect to Neoreader at {self._config.host}",
+                host=self._config.host,
+                cause=e,
+            ) from e
+        except httpx.HTTPStatusError as e:
+            raise FetchError(
+                f"HTTP {e.response.status_code}",
+                source=url,
+                status_code=e.response.status_code,
+                cause=e,
+            ) from e
+        except Exception as e:
+            raise FetchError(str(e), source=url, cause=e) from e
+
+    async def health_check(self) -> bool:
+        """Check if Neoreader service is available.
+
+        Returns:
+            True if Neoreader is responding, False otherwise.
+        """
+        try:
+            async with httpx.AsyncClient(timeout=5.0) as client:
+                # Try to reach the Neoreader health endpoint or root
+                response = await client.get(f"{self._config.host}/health")
+                return response.status_code < 500
+        except Exception:
+            try:
+                # Fallback to root endpoint
+                async with httpx.AsyncClient(timeout=5.0) as client:
+                    response = await client.get(self._config.host)
+                    return response.status_code < 500
+            except Exception:
+                return False
+
+    async def fetch_batch(
+        self,
+        urls: list[str],
+        max_concurrent: int = 10,
+        **options: Any,
+    ) -> list[FetchResult | Exception]:
+        """Fetch multiple URLs concurrently.
+
+        Args:
+            urls: List of URLs to fetch.
+            max_concurrent: Maximum concurrent requests.
+            **options: Options passed to each fetch call.
+
+        Returns:
+            List of FetchResult objects or Exception for failed fetches.
+        """
+        semaphore = asyncio.Semaphore(max_concurrent)
+
+        async def fetch_with_limit(url: str) -> FetchResult | Exception:
+            async with semaphore:
+                try:
+                    return await self.fetch(url, **options)
+                except Exception as e:
+                    return e
+
+        results = await asyncio.gather(
+            *[fetch_with_limit(url) for url in urls],
+        )
+
+        return list(results)
+
+    def _extract_title(self, content: str) -> str | None:
+        """Extract title from markdown content.
+
+        Looks for the first H1 heading in the markdown.
+
+        Args:
+            content: Markdown content.
+
+        Returns:
+            Title string or None.
+        """
+        # Look for first H1 heading
+        lines = content.split("\n")
+        for line in lines:
+            line = line.strip()
+            if line.startswith("# "):
+                return line[2:].strip()
+
+        # Try regex for H1
+        match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
+        if match:
+            return match.group(1).strip()
+
+        return None
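Per-call options are translated into Neoreader's header-based API (`X-Target-Selector`, `X-Remove-Selector`, `X-Timeout`), so selectors can vary per request without rebuilding the fetcher. A sketch with illustrative selectors and URL:

```python
import asyncio

from gnosisllm_knowledge.fetchers import NeoreaderContentFetcher
from gnosisllm_knowledge.fetchers.config import NeoreaderConfig


async def main() -> None:
    fetcher = NeoreaderContentFetcher(NeoreaderConfig(host="http://localhost:3000"))
    if not await fetcher.health_check():
        print("Neoreader is not reachable")
        return
    # Illustrative selectors; sent as X-Target-Selector / X-Remove-Selector.
    result = await fetcher.fetch(
        "https://example.com/docs/page",
        target_selector="main",
        remove_selector="nav, footer",
    )
    print(result.title)
    print(result.content[:200])  # clean markdown


asyncio.run(main())
```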
gnosisllm_knowledge/loaders/__init__.py
@@ -0,0 +1,13 @@
+"""Content loaders for various source types."""
+
+from gnosisllm_knowledge.loaders.base import BaseLoader
+from gnosisllm_knowledge.loaders.factory import LoaderFactory
+from gnosisllm_knowledge.loaders.sitemap import SitemapLoader
+from gnosisllm_knowledge.loaders.website import WebsiteLoader
+
+__all__ = [
+    "BaseLoader",
+    "LoaderFactory",
+    "WebsiteLoader",
+    "SitemapLoader",
+]