keenable-haystack 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keenable_haystack-0.1.0/.gitignore +10 -0
- keenable_haystack-0.1.0/LICENSE +21 -0
- keenable_haystack-0.1.0/PKG-INFO +94 -0
- keenable_haystack-0.1.0/README.md +71 -0
- keenable_haystack-0.1.0/haystack_integrations/components/fetchers/keenable/__init__.py +3 -0
- keenable_haystack-0.1.0/haystack_integrations/components/fetchers/keenable/fetcher.py +111 -0
- keenable_haystack-0.1.0/haystack_integrations/components/websearch/keenable/__init__.py +3 -0
- keenable_haystack-0.1.0/haystack_integrations/components/websearch/keenable/_client.py +232 -0
- keenable_haystack-0.1.0/haystack_integrations/components/websearch/keenable/web_search.py +156 -0
- keenable_haystack-0.1.0/pyproject.toml +62 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Keenable
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: keenable-haystack
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Keenable web-search and page-fetch components for Haystack. Keyless by default.
|
|
5
|
+
Project-URL: Homepage, https://keenable.ai
|
|
6
|
+
Project-URL: Documentation, https://docs.keenable.ai
|
|
7
|
+
Project-URL: Repository, https://github.com/keenableai/keenable-haystack
|
|
8
|
+
Author-email: Keenable <hello@keenable.ai>
|
|
9
|
+
Maintainer: keenableai
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: agents,fetch,haystack,haystack-ai,keenable,rag,web-search
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Requires-Dist: haystack-ai>=2.0.0
|
|
21
|
+
Requires-Dist: requests>=2.31
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# keenable-haystack
|
|
25
|
+
|
|
26
|
+
[Keenable](https://keenable.ai) web search + page fetch for
|
|
27
|
+
[Haystack](https://haystack.deepset.ai) 2.x, as two components:
|
|
28
|
+
|
|
29
|
+
- **`KeenableWebSearch`** — searches the web and returns `documents` +
|
|
30
|
+
`links`, the same output shape as Haystack's built-in `SerperDevWebSearch` /
|
|
31
|
+
`SearchApiWebSearch`, so it is drop-in for pipelines wired to those.
|
|
32
|
+
- **`KeenableFetcher`** — fetches a list of URLs and returns `documents` whose
|
|
33
|
+
content is the page's main text as markdown (Keenable extracts it server-side,
|
|
34
|
+
so you don't need a separate `LinkContentFetcher` + `HTMLToDocument` step).
|
|
35
|
+
|
|
36
|
+
**Keyless by default**: with no API key the keyless public endpoints are used.
|
|
37
|
+
Provide a key to use the authenticated endpoints (required for `mode="realtime"`
|
|
38
|
+
and for higher rate limits).
|
|
39
|
+
|
|
40
|
+
## Install
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install keenable-haystack
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Usage
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from haystack_integrations.components.websearch.keenable import KeenableWebSearch
|
|
50
|
+
from haystack_integrations.components.fetchers.keenable import KeenableFetcher
|
|
51
|
+
|
|
52
|
+
# No key -> keyless public endpoints. Set KEENABLE_API_KEY to lift limits.
|
|
53
|
+
websearch = KeenableWebSearch(top_k=5)
|
|
54
|
+
hits = websearch.run(query="latest developments in AI agents")
|
|
55
|
+
print(hits["links"])
|
|
56
|
+
|
|
57
|
+
fetcher = KeenableFetcher()
|
|
58
|
+
pages = fetcher.run(urls=hits["links"][:2])
|
|
59
|
+
print(pages["documents"][0].content)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
In a pipeline (drop-in for any web-search component):
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from haystack import Pipeline
|
|
66
|
+
from haystack.components.builders import PromptBuilder
|
|
67
|
+
|
|
68
|
+
pipe = Pipeline()
|
|
69
|
+
pipe.add_component("search", KeenableWebSearch(top_k=5))
|
|
70
|
+
pipe.add_component("prompt", PromptBuilder(template="Answer using:\n{{ documents }}"))
|
|
71
|
+
pipe.connect("search.documents", "prompt.documents")
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
`KeenableWebSearch.run` accepts optional per-query filters (`site`,
|
|
75
|
+
`published_after/before`, `acquired_after/before`, `mode`). There is no
|
|
76
|
+
`max_results`: the API returns a fixed-size result set; `top_k` (constructor)
|
|
77
|
+
trims it client-side.
|
|
78
|
+
|
|
79
|
+
## Configuration
|
|
80
|
+
|
|
81
|
+
- **API key (optional).** `api_key=Secret.from_token(...)` / the default
|
|
82
|
+
`Secret.from_env_var("KEENABLE_API_KEY", strict=False)`. Blank/unset → keyless
|
|
83
|
+
public endpoints. Serializes by env-var name, never the key value.
|
|
84
|
+
- **Endpoint (optional).** `KEENABLE_API_URL` overrides the base URL (HTTPS
|
|
85
|
+
enforced; plain `http` only for loopback). The endpoint is never a component
|
|
86
|
+
argument the model can set, so it cannot be used to redirect requests.
|
|
87
|
+
|
|
88
|
+
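`KeenableFetcher` rejects non-`http(s)` schemes and private/internal hosts (`localhost`, `metadata.google.internal`, and loopback/private/link-local/multicast/unspecified IP literals)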
|
|
89
|
+
client-side before sending, and (like `LinkContentFetcher`) skips failed URLs by
|
|
90
|
+
default — set `raise_on_failure=True` to surface errors instead.
|
|
91
|
+
|
|
92
|
+
## License
|
|
93
|
+
|
|
94
|
+
MIT © Keenable
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# keenable-haystack
|
|
2
|
+
|
|
3
|
+
[Keenable](https://keenable.ai) web search + page fetch for
|
|
4
|
+
[Haystack](https://haystack.deepset.ai) 2.x, as two components:
|
|
5
|
+
|
|
6
|
+
- **`KeenableWebSearch`** — searches the web and returns `documents` +
|
|
7
|
+
`links`, the same output shape as Haystack's built-in `SerperDevWebSearch` /
|
|
8
|
+
`SearchApiWebSearch`, so it is drop-in for pipelines wired to those.
|
|
9
|
+
- **`KeenableFetcher`** — fetches a list of URLs and returns `documents` whose
|
|
10
|
+
content is the page's main text as markdown (Keenable extracts it server-side,
|
|
11
|
+
so you don't need a separate `LinkContentFetcher` + `HTMLToDocument` step).
|
|
12
|
+
|
|
13
|
+
**Keyless by default**: with no API key the keyless public endpoints are used.
|
|
14
|
+
Provide a key to use the authenticated endpoints (required for `mode="realtime"`
|
|
15
|
+
and for higher rate limits).
|
|
16
|
+
|
|
17
|
+
## Install
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install keenable-haystack
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from haystack_integrations.components.websearch.keenable import KeenableWebSearch
|
|
27
|
+
from haystack_integrations.components.fetchers.keenable import KeenableFetcher
|
|
28
|
+
|
|
29
|
+
# No key -> keyless public endpoints. Set KEENABLE_API_KEY to lift limits.
|
|
30
|
+
websearch = KeenableWebSearch(top_k=5)
|
|
31
|
+
hits = websearch.run(query="latest developments in AI agents")
|
|
32
|
+
print(hits["links"])
|
|
33
|
+
|
|
34
|
+
fetcher = KeenableFetcher()
|
|
35
|
+
pages = fetcher.run(urls=hits["links"][:2])
|
|
36
|
+
print(pages["documents"][0].content)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
In a pipeline (drop-in for any web-search component):
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from haystack import Pipeline
|
|
43
|
+
from haystack.components.builders import PromptBuilder
|
|
44
|
+
|
|
45
|
+
pipe = Pipeline()
|
|
46
|
+
pipe.add_component("search", KeenableWebSearch(top_k=5))
|
|
47
|
+
pipe.add_component("prompt", PromptBuilder(template="Answer using:\n{{ documents }}"))
|
|
48
|
+
pipe.connect("search.documents", "prompt.documents")
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
`KeenableWebSearch.run` accepts optional per-query filters (`site`,
|
|
52
|
+
`published_after/before`, `acquired_after/before`, `mode`). There is no
|
|
53
|
+
`max_results`: the API returns a fixed-size result set; `top_k` (constructor)
|
|
54
|
+
trims it client-side.
|
|
55
|
+
|
|
56
|
+
## Configuration
|
|
57
|
+
|
|
58
|
+
- **API key (optional).** `api_key=Secret.from_token(...)` / the default
|
|
59
|
+
`Secret.from_env_var("KEENABLE_API_KEY", strict=False)`. Blank/unset → keyless
|
|
60
|
+
public endpoints. Serializes by env-var name, never the key value.
|
|
61
|
+
- **Endpoint (optional).** `KEENABLE_API_URL` overrides the base URL (HTTPS
|
|
62
|
+
enforced; plain `http` only for loopback). The endpoint is never a component
|
|
63
|
+
argument the model can set, so it cannot be used to redirect requests.
|
|
64
|
+
|
|
65
|
+
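`KeenableFetcher` rejects non-`http(s)` schemes and private/internal hosts (`localhost`, `metadata.google.internal`, and loopback/private/link-local/multicast/unspecified IP literals)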
|
|
66
|
+
client-side before sending, and (like `LinkContentFetcher`) skips failed URLs by
|
|
67
|
+
default — set `raise_on_failure=True` to surface errors instead.
|
|
68
|
+
|
|
69
|
+
## License
|
|
70
|
+
|
|
71
|
+
MIT © Keenable
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Keenable page-fetch component for Haystack."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any, Optional
|
|
7
|
+
|
|
8
|
+
from haystack import Document, component, default_from_dict, default_to_dict
|
|
9
|
+
from haystack.utils import Secret, deserialize_secrets_inplace
|
|
10
|
+
|
|
11
|
+
# The transport lives in the websearch leaf package so it is defined once; the
|
|
12
|
+
# fetcher reuses it (keyed/keyless selection, attribution headers, SSRF guard).
|
|
13
|
+
from haystack_integrations.components.websearch.keenable._client import (
|
|
14
|
+
KeenableError,
|
|
15
|
+
keenable_get,
|
|
16
|
+
normalize_key,
|
|
17
|
+
reject_private_fetch_target,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@component
class KeenableFetcher:
    """Fetches web pages via Keenable and returns their content as Documents.

    Given a list of URLs, returns one ``Document`` per successfully fetched page
    (``content`` is the page's main content as markdown; ``meta`` carries
    ``url``, ``title`` and any other fields the page exposes). Pairs with
    :class:`KeenableWebSearch` — discover URLs with search, then read full pages
    here. Unlike Haystack's ``LinkContentFetcher`` + ``HTMLToDocument`` two-step,
    Keenable returns clean extracted markdown, so this is a single component that
    returns Documents directly.

    Keyless by default (``/v1/fetch/public``); an ``api_key`` (or
    ``KEENABLE_API_KEY``) switches to ``/v1/fetch`` and lifts limits. Non-http(s)
    and private/internal URLs are rejected client-side before sending.

    ### Usage

    ```python
    from haystack_integrations.components.fetchers.keenable import KeenableFetcher

    fetcher = KeenableFetcher()
    result = fetcher.run(urls=["https://example.com/article"])
    print(result["documents"][0].content)
    ```
    """

    def __init__(
        self,
        *,
        api_key: Secret = Secret.from_env_var("KEENABLE_API_KEY", strict=False),  # noqa: B008
        raise_on_failure: bool = False,
        timeout: float = 30.0,
    ) -> None:
        """
        :param api_key: Keenable API key. Falls back to ``KEENABLE_API_KEY``; when
            absent (or blank) the keyless public endpoint is used.
        :param raise_on_failure: If ``True``, a failed fetch raises; if ``False``
            (default, matching ``LinkContentFetcher``), the URL is logged and
            skipped so one bad URL does not fail the whole batch.
        :param timeout: Per-request timeout in seconds.
        :raises ValueError: If ``timeout`` is not greater than zero.
        """
        if timeout <= 0:
            msg = f"timeout must be a positive number of seconds, got {timeout!r}"
            raise ValueError(msg)
        self.api_key = api_key
        self.raise_on_failure = raise_on_failure
        self.timeout = timeout

    def to_dict(self) -> dict[str, Any]:
        """Serialize this component's init parameters.

        :returns: The dictionary built by ``default_to_dict``; ``api_key`` is
            stored as ``Secret.to_dict()`` output rather than as the raw secret.
        """
        return default_to_dict(
            self,
            api_key=self.api_key.to_dict(),
            raise_on_failure=self.raise_on_failure,
            timeout=self.timeout,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "KeenableFetcher":
        """Rebuild a component from :meth:`to_dict` output.

        :param data: The serialized component.
        :returns: The deserialized component.
        """
        # A serialized component may carry no ``init_parameters`` at all (every
        # argument left at its default); only deserialize secrets when present
        # instead of raising ``KeyError``.
        init_params = data.get("init_parameters")
        if init_params:
            deserialize_secrets_inplace(init_params, keys=["api_key"])
        return default_from_dict(cls, data)

    def _fetch_one(self, url: str, api_key: Optional[str]) -> Document:
        """Fetch a single URL and wrap the response in a ``Document``.

        :param url: The page to fetch; must start with ``http://`` or ``https://``.
        :param api_key: Normalized API key, or ``None`` for the keyless endpoint.
        :returns: A ``Document`` whose ``content`` is the response's ``content``
            field (empty string when missing) and whose ``meta`` holds the
            remaining response fields.
        :raises KeenableError: If the URL is not http(s); errors raised by
            ``reject_private_fetch_target`` and ``keenable_get`` propagate.
        """
        if not url.lower().startswith(("http://", "https://")):
            msg = f"Refusing to fetch a non-http(s) URL: {url!r}"
            raise KeenableError(msg)
        reject_private_fetch_target(url)
        data = keenable_get("/v1/fetch/public", "/v1/fetch", {"url": url}, api_key, self.timeout)
        content = data.get("content") or ""
        # Keep the page body out of ``meta``: it already lives in ``content``,
        # and a ``content`` meta key would duplicate the whole page and shadow
        # the Document's own field when meta is flattened on serialization.
        meta = {key: value for key, value in data.items() if key != "content"}
        return Document(content=content, meta=meta)

    @component.output_types(documents=list[Document])
    def run(self, urls: list[str]) -> dict[str, Any]:
        """Fetch each URL and return the extracted pages as Documents.

        :param urls: The URLs to fetch.
        :returns: A dict with ``documents`` (``list[Document]``), one per page
            fetched successfully.
        :raises Exception: Re-raises the first fetch failure when
            ``raise_on_failure`` is ``True``.
        """
        # Resolve the key once per run; a blank value selects the keyless endpoint.
        api_key = normalize_key(self.api_key.resolve_value())
        documents: list[Document] = []
        for url in urls:
            try:
                documents.append(self._fetch_one(url, api_key))
            except Exception as e:  # noqa: BLE001 - contract: one bad URL must not fail the batch
                if self.raise_on_failure:
                    raise
                logger.warning("Keenable could not fetch %r; skipping (%s).", url, e)
        return {"documents": documents}
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""Shared transport for the Keenable Haystack components.
|
|
2
|
+
|
|
3
|
+
One place for the parts of the Keenable contract both components need: keyed vs.
|
|
4
|
+
keyless endpoint selection, the attribution headers, HTTPS-only base-URL
|
|
5
|
+
resolution, the client-side SSRF guard, and turning a non-2xx response into a
|
|
6
|
+
readable error. The endpoint is read from the environment and is never a
|
|
7
|
+
component argument the model/pipeline can set (an arbitrary base URL is an SSRF
|
|
8
|
+
foothold).
|
|
9
|
+
|
|
10
|
+
The fetcher component imports from this module too, so the transport lives in
|
|
11
|
+
exactly one place across both `haystack_integrations.components.websearch.keenable`
|
|
12
|
+
and `...fetchers.keenable`.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import ipaddress
|
|
18
|
+
import os
|
|
19
|
+
import socket
|
|
20
|
+
from importlib import metadata
|
|
21
|
+
from typing import Any
|
|
22
|
+
from urllib.parse import urlsplit
|
|
23
|
+
|
|
24
|
+
import requests
|
|
25
|
+
|
|
26
|
+
# Installed package version, used only to tag the User-Agent. When the
# distribution metadata is missing, fall back to a placeholder.
try:
    _VERSION = metadata.version("keenable-haystack")
except metadata.PackageNotFoundError:  # pragma: no cover - editable/source checkout
    _VERSION = "unknown"

# Endpoint configuration: the environment variable that may override the base
# URL, and the default used when it is unset or empty.
_BASE_URL_ENV = "KEENABLE_API_URL"
_DEFAULT_BASE_URL = "https://api.keenable.ai"

# The load-bearing attribution signal: the Keenable backend segments traffic by
# this header value (adoption dashboards).
_ATTRIBUTION_TITLE = "Haystack"

# Secondary attribution tag: a User-Agent identifying this integration and its
# version.
_USER_AGENT = "keenable-haystack/" + _VERSION
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class KeenableError(RuntimeError):
    """Raised for Keenable transport and API failures.

    The message is meant to be safe to show to a user.
    """
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def normalize_key(raw: str | None) -> str | None:
    """Return the stripped API key, or ``None`` when there is no usable key.

    A value that is not a string, or that is empty or whitespace-only, maps to
    ``None`` so that a blank key never selects the authenticated endpoint and
    the keyless free tier is used instead.

    :param raw: The resolved secret value, possibly ``None`` or blank.
    :returns: The key with surrounding whitespace removed, or ``None``.
    """
    if not isinstance(raw, str):
        return None
    stripped = raw.strip()
    return stripped if stripped else None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def resolve_base_url() -> str:
    """Resolve the API base URL from ``KEENABLE_API_URL`` and enforce HTTPS.

    The environment value (or the built-in default when it is unset or empty)
    has trailing slashes removed and is then validated:

    * plain ``http`` is accepted only for the explicit loopback hosts
      ``localhost``, ``127.0.0.1`` and ``::1`` (local-dev escape hatch);
    * any other scheme than ``https`` is rejected;
    * over ``https``, ``metadata.google.internal`` and IP-literal hosts that are
      loopback, private, link-local, multicast or unspecified are rejected.

    :returns: The validated base URL without a trailing slash.
    :raises KeenableError: If the URL is malformed, has no host, uses a
        disallowed scheme, or points at a private/internal address.
    """
    base = (os.environ.get(_BASE_URL_ENV) or _DEFAULT_BASE_URL).rstrip("/")
    invalid_msg = f"{_BASE_URL_ENV} must be an https:// URL with a host, got {base!r}"
    try:
        parsed = urlsplit(base)
        host = (parsed.hostname or "").rstrip(".")
    except ValueError as e:
        # urlsplit raises ValueError for a malformed authority (e.g. an
        # unbalanced "[" in a bracketed IPv6 host). Report it as the module's
        # own error type instead of leaking a bare ValueError to callers.
        raise KeenableError(invalid_msg) from e
    if not host:
        raise KeenableError(invalid_msg)
    # Local-dev escape hatch: plain http only to an explicit loopback host.
    if parsed.scheme == "http" and host in {"localhost", "127.0.0.1", "::1"}:
        return base
    if parsed.scheme != "https":
        raise KeenableError(invalid_msg)
    # Over https, refuse a base URL pointing at a private/internal destination —
    # a misconfigured KEENABLE_API_URL must never ship API keys to an internal
    # host.
    points_internal = host == "metadata.google.internal" or any(
        ip.is_loopback
        or ip.is_private
        or ip.is_link_local
        or ip.is_multicast
        or ip.is_unspecified
        for ip in _candidate_ips(host)
    )
    if points_internal:
        msg = f"{_BASE_URL_ENV} must not point at a private/internal address, got {base!r}"
        raise KeenableError(msg)
    return base
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _candidate_ips(host: str) -> list[ipaddress.IPv4Address | ipaddress.IPv6Address]:
    """Collect every IP address ``host`` could denote, without doing DNS.

    Two interpretations are tried and each success is appended in order:

    1. ``host`` as a standard literal accepted by :func:`ipaddress.ip_address`;
    2. ``host`` as any IPv4 spelling ``socket.inet_aton`` accepts — decimal
       (``2130706433``), hex (``0x7f000001``), octal (``0177.0.0.1``) and the
       short ``a.b`` / ``a.b.c`` forms — canonicalized to a dotted quad so a
       range check sees the true address.

    A plain dotted-quad literal satisfies both and therefore appears twice.

    :param host: Hostname or address text taken from a URL.
    :returns: The candidate addresses; empty when ``host`` is not numeric.
    """
    found: list[ipaddress.IPv4Address | ipaddress.IPv6Address] = []

    try:
        literal = ipaddress.ip_address(host)
    except ValueError:
        literal = None
    if literal is not None:
        found.append(literal)

    try:
        packed = socket.inet_aton(host)
    except OSError:
        # Not an IPv4 spelling inet_aton understands; nothing more to add.
        return found
    found.append(ipaddress.ip_address(socket.inet_ntoa(packed)))
    return found
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def reject_private_fetch_target(url: str) -> None:
    """Refuse obviously private/internal fetch targets before sending (SSRF).

    The backend enforces this server-side too, but a client-side guard avoids
    leaking an internal hostname in a request and is required by our integration
    contract. Hostnames that are not IP literals (and not a numeric IPv4 form)
    pass through; the backend's SSRF guard is the backstop for those.

    :param url: The URL the caller wants fetched.
    :raises KeenableError: If the URL is malformed, has no host, names
        ``localhost`` / ``metadata.google.internal``, or its host is an IP
        literal that is loopback, private, link-local, multicast or unspecified.
    """
    try:
        hostname = urlsplit(url).hostname
    except ValueError as e:
        # urlsplit raises ValueError for a malformed authority (e.g. an
        # unbalanced "[" in a bracketed IPv6 host). Report it through the
        # module's own error type rather than a bare ValueError.
        msg = f"Refusing to fetch a malformed URL: {url!r}"
        raise KeenableError(msg) from e
    # A trailing dot is the FQDN form of the same name (``localhost.`` ==
    # ``localhost``); strip it so it can't slip past the checks below.
    host = (hostname or "").strip().lower().rstrip(".")
    if not host:
        msg = f"Refusing to fetch a URL with no host: {url!r}"
        raise KeenableError(msg)
    if host in {"localhost", "metadata.google.internal"}:
        msg = f"Refusing to fetch a private/internal host: {host!r}"
        raise KeenableError(msg)
    for ip in _candidate_ips(host):
        # ``is_reserved`` is intentionally omitted: it flags non-routable but
        # harmless ranges (e.g. the 2001:db8::/32 documentation prefix). The
        # checks below are the ones that matter for SSRF.
        if (
            ip.is_loopback
            or ip.is_private
            or ip.is_link_local
            or ip.is_multicast
            or ip.is_unspecified
        ):
            msg = f"Refusing to fetch a private/internal address: {host!r}"
            raise KeenableError(msg)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _headers(api_key: str | None) -> dict[str, str]:
    """Build the request headers for a Keenable call.

    The attribution headers (``User-Agent`` and ``X-Keenable-Title``) are always
    present; ``X-API-Key`` is added only when a key is supplied.

    :param api_key: The normalized API key, or ``None`` for keyless requests.
    :returns: A new header dictionary.
    """
    auth = {"X-API-Key": api_key} if api_key else {}
    return {"User-Agent": _USER_AGENT, "X-Keenable-Title": _ATTRIBUTION_TITLE, **auth}
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _redact(text: str, api_key: str | None) -> str:
    """Replace every occurrence of the API key in ``text`` with ``***``.

    Used on text bound for an exception message or log. Server error bodies and
    transport-exception strings are attacker- or misconfiguration-influenced; a
    server that echoed the ``X-API-Key`` request header back would otherwise
    leak the key into ``KeenableError`` text, logs, and pipeline traces.

    :param text: The text to scrub.
    :param api_key: The key to remove; when falsy, ``text`` is returned as is.
    :returns: The scrubbed text.
    """
    if not api_key:
        return text
    # Splitting on the key and re-joining with the mask replaces each occurrence.
    return "***".join(text.split(api_key))
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _raise_for_status(response: requests.Response, api_key: str | None) -> None:
    """Map an error response from Keenable to a readable :class:`KeenableError`.

    Returns silently when ``response.ok`` is true. Otherwise the error detail is
    taken from the JSON body's ``message``, ``error`` or ``detail`` field (first
    truthy one), or from the stripped response text when the body is not JSON.

    :param response: The HTTP response to inspect.
    :param api_key: The key sent with the request, scrubbed from the detail.
    :raises KeenableError: If ``response.ok`` is false; the message carries a
        status-specific label plus at most 200 characters of redacted detail.
    """
    if response.ok:
        return
    detail = ""
    try:
        body = response.json()
        if isinstance(body, dict):
            detail = str(body.get("message") or body.get("error") or body.get("detail") or "")
    except ValueError:
        detail = (response.text or "").strip()
    # Redact BEFORE truncating: cutting first could slice an echoed key in half,
    # leaving a key prefix that no longer matches the full key and so would
    # survive redaction.
    detail = _redact(detail, api_key)[:200]
    label = {
        401: "Keenable authentication failed (401)",
        402: "Keenable: insufficient credits (402)",
        429: "Keenable rate limit exceeded (429)",
    }.get(response.status_code, f"Keenable API error ({response.status_code})")
    raise KeenableError(f"{label}: {detail}" if detail else label)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _decode(response: requests.Response, api_key: str | None) -> dict[str, Any]:
    """Validate a Keenable response and return its JSON object body.

    :param response: The HTTP response to decode.
    :param api_key: The key sent with the request, scrubbed from error text.
    :returns: The parsed JSON body, which must be a JSON object.
    :raises KeenableError: If the response is an error response, is not valid
        JSON, or decodes to something other than a ``dict``.
    """
    _raise_for_status(response, api_key)
    try:
        data = response.json()
    except ValueError as e:
        # Redact before truncating so a key straddling the 200-character cut
        # cannot leak as an unmatched prefix.
        snippet = _redact(response.text or "", api_key)[:200]
        msg = f"Keenable API returned a non-JSON response: {snippet!r}"
        raise KeenableError(msg) from e
    if not isinstance(data, dict):
        # Same ordering as above: scrub the full text first, then shorten it.
        msg = f"Unexpected response from the Keenable API: {_redact(repr(data), api_key)[:200]}"
        raise KeenableError(msg)
    return data
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _transport_error(e: Exception, api_key: str | None) -> KeenableError:
    """Build the :class:`KeenableError` for a failed transport attempt.

    The exception text is redacted defensively: a custom adapter or proxy
    middleware could place the key in the exception string, and it must not
    reach an exception message, logs, or pipeline tracing.

    :param e: The transport exception that was caught.
    :param api_key: The key sent with the request, scrubbed from the message.
    :returns: The error to raise (the caller chains it with ``from e``).
    """
    kind = type(e).__name__
    detail = _redact(str(e), api_key)
    return KeenableError(f"Could not reach the Keenable API: {kind}: {detail}")
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def keenable_post(
    public_path: str, keyed_path: str, payload: dict[str, Any], api_key: str | None, timeout: float
) -> dict[str, Any]:
    """POST ``payload`` as JSON to the keyed or keyless endpoint and return the body.

    :param public_path: Path used when no API key is supplied.
    :param keyed_path: Path used when an API key is supplied.
    :param payload: JSON-serializable request body.
    :param api_key: Normalized API key, or ``None`` for the keyless endpoint.
    :param timeout: Per-request timeout in seconds.
    :returns: The decoded JSON object from the response.
    :raises KeenableError: On a transport failure, or whatever
        ``resolve_base_url`` / ``_decode`` raise.
    """
    endpoint = resolve_base_url() + (keyed_path if api_key else public_path)
    request_headers = _headers(api_key)
    request_headers["Content-Type"] = "application/json"
    # NOTE(review): requests follows redirects by default and presumably forwards
    # custom headers such as X-API-Key to the redirect target — confirm whether
    # ``allow_redirects=False`` is wanted here.
    try:
        response = requests.post(endpoint, json=payload, headers=request_headers, timeout=timeout)
    except requests.RequestException as e:
        raise _transport_error(e, api_key) from e
    return _decode(response, api_key)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def keenable_get(
    public_path: str, keyed_path: str, params: dict[str, Any], api_key: str | None, timeout: float
) -> dict[str, Any]:
    """GET the keyed or keyless endpoint with query ``params`` and return the body.

    :param public_path: Path used when no API key is supplied.
    :param keyed_path: Path used when an API key is supplied.
    :param params: Query-string parameters.
    :param api_key: Normalized API key, or ``None`` for the keyless endpoint.
    :param timeout: Per-request timeout in seconds.
    :returns: The decoded JSON object from the response.
    :raises KeenableError: On a transport failure, or whatever
        ``resolve_base_url`` / ``_decode`` raise.
    """
    endpoint = resolve_base_url() + (keyed_path if api_key else public_path)
    request_headers = _headers(api_key)
    # NOTE(review): requests follows redirects by default and presumably forwards
    # custom headers such as X-API-Key to the redirect target — confirm whether
    # ``allow_redirects=False`` is wanted here.
    try:
        response = requests.get(endpoint, params=params, headers=request_headers, timeout=timeout)
    except requests.RequestException as e:
        raise _transport_error(e, api_key) from e
    return _decode(response, api_key)
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Keenable web-search component for Haystack."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
from haystack import Document, component, default_from_dict, default_to_dict
|
|
8
|
+
from haystack.utils import Secret, deserialize_secrets_inplace
|
|
9
|
+
|
|
10
|
+
from haystack_integrations.components.websearch.keenable._client import (
|
|
11
|
+
KeenableError,
|
|
12
|
+
keenable_post,
|
|
13
|
+
normalize_key,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@component
|
|
18
|
+
class KeenableWebSearch:
|
|
19
|
+
"""Searches the web with Keenable, a search engine built for AI agents.
|
|
20
|
+
|
|
21
|
+
Mirrors the output shape of Haystack's built-in web-search components
|
|
22
|
+
(``SerperDevWebSearch`` / ``SearchApiWebSearch``): ``run()`` returns
|
|
23
|
+
``documents`` (one ``Document`` per result, snippet as content, result
|
|
24
|
+
fields as ``meta``) and ``links`` (the result URLs), so it is drop-in for
|
|
25
|
+
pipelines wired to those.
|
|
26
|
+
|
|
27
|
+
Keyless by default: with no API key the keyless public endpoint
|
|
28
|
+
(``/v1/search/public``) is used. Provide an API key (the ``api_key`` argument
|
|
29
|
+
or the ``KEENABLE_API_KEY`` environment variable) to use the authenticated
|
|
30
|
+
endpoint (``/v1/search``), required for ``mode="realtime"`` and for higher
|
|
31
|
+
rate limits.
|
|
32
|
+
|
|
33
|
+
The API endpoint is read from ``KEENABLE_API_URL`` (HTTPS enforced), never a
|
|
34
|
+
``run`` argument, so the search cannot be redirected to an arbitrary host.
|
|
35
|
+
|
|
36
|
+
### Usage
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
from haystack_integrations.components.websearch.keenable import KeenableWebSearch
|
|
40
|
+
|
|
41
|
+
# No key -> keyless public endpoint. Set KEENABLE_API_KEY to lift limits.
|
|
42
|
+
websearch = KeenableWebSearch(top_k=5)
|
|
43
|
+
result = websearch.run(query="latest developments in AI agents")
|
|
44
|
+
print(result["documents"])
|
|
45
|
+
print(result["links"])
|
|
46
|
+
```
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
def __init__(
|
|
50
|
+
self,
|
|
51
|
+
*,
|
|
52
|
+
api_key: Secret = Secret.from_env_var("KEENABLE_API_KEY", strict=False), # noqa: B008
|
|
53
|
+
top_k: Optional[int] = None,
|
|
54
|
+
mode: str = "pro",
|
|
55
|
+
site: Optional[str] = None,
|
|
56
|
+
timeout: float = 30.0,
|
|
57
|
+
) -> None:
|
|
58
|
+
"""
|
|
59
|
+
:param api_key: Keenable API key. Falls back to ``KEENABLE_API_KEY``; when
|
|
60
|
+
absent (or blank) the keyless public endpoint is used.
|
|
61
|
+
:param top_k: Keep at most this many results (applied client-side; the API
|
|
62
|
+
returns a fixed-size set with no count parameter). ``None`` keeps all.
|
|
63
|
+
:param mode: Default search mode, ``"pro"`` (deeper) or ``"realtime"``
|
|
64
|
+
(low latency). ``"realtime"`` requires an API key. Overridable per run.
|
|
65
|
+
:param site: Default single-domain restriction, e.g. ``"github.com"``.
|
|
66
|
+
Overridable per run.
|
|
67
|
+
:param timeout: Per-request timeout in seconds.
|
|
68
|
+
"""
|
|
69
|
+
if timeout <= 0:
|
|
70
|
+
msg = f"timeout must be a positive number of seconds, got {timeout!r}"
|
|
71
|
+
raise ValueError(msg)
|
|
72
|
+
if top_k is not None and top_k < 1:
|
|
73
|
+
msg = f"top_k must be None or a positive integer, got {top_k!r}"
|
|
74
|
+
raise ValueError(msg)
|
|
75
|
+
self.api_key = api_key
|
|
76
|
+
self.top_k = top_k
|
|
77
|
+
self.mode = mode
|
|
78
|
+
self.site = site
|
|
79
|
+
self.timeout = timeout
|
|
80
|
+
|
|
81
|
+
def to_dict(self) -> dict[str, Any]:
|
|
82
|
+
return default_to_dict(
|
|
83
|
+
self,
|
|
84
|
+
api_key=self.api_key.to_dict(),
|
|
85
|
+
top_k=self.top_k,
|
|
86
|
+
mode=self.mode,
|
|
87
|
+
site=self.site,
|
|
88
|
+
timeout=self.timeout,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
@classmethod
|
|
92
|
+
def from_dict(cls, data: dict[str, Any]) -> "KeenableWebSearch":
|
|
93
|
+
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
|
|
94
|
+
return default_from_dict(cls, data)
|
|
95
|
+
|
|
96
|
+
@component.output_types(documents=list[Document], links=list[str])
def run(
    self,
    query: str,
    site: Optional[str] = None,
    published_after: Optional[str] = None,
    published_before: Optional[str] = None,
    acquired_after: Optional[str] = None,
    acquired_before: Optional[str] = None,
    mode: Optional[str] = None,
) -> dict[str, Any]:
    """Run a Keenable web search.

    :param query: The search query.
    :param site: Restrict results to a single domain (overrides the default).
    :param published_after: Only pages published on/after this date (YYYY-MM-DD).
    :param published_before: Only pages published on/before this date (YYYY-MM-DD).
    :param acquired_after: Only pages indexed on/after this date (YYYY-MM-DD).
    :param acquired_before: Only pages indexed on/before this date (YYYY-MM-DD).
    :param mode: Override the default search mode for this query.
    :returns: A dict with ``documents`` (``list[Document]``) and ``links``
        (``list[str]``). Each document's content is the result's description,
        falling back to its title, then to an empty string; the full result
        dict is copied into the document's ``meta``. ``links`` holds the
        non-empty ``url`` values of the kept results.
    :raises KeenableError: If ``realtime`` mode is requested without an API key,
        or if the response is not a mapping with a ``results`` list.
    """
    effective_mode = mode or self.mode
    api_key = normalize_key(self.api_key.resolve_value())
    if effective_mode == "realtime" and api_key is None:
        msg = "mode='realtime' requires an API key; it is not available on the keyless endpoint."
        raise KeenableError(msg)

    # Only filters that were actually supplied are sent; falsy values are omitted.
    optional_filters = (
        ("site", site or self.site),
        ("published_after", published_after),
        ("published_before", published_before),
        ("acquired_after", acquired_after),
        ("acquired_before", acquired_before),
    )
    payload: dict[str, Any] = {"query": query, "mode": effective_mode}
    payload.update({field: value for field, value in optional_filters if value})

    # NOTE(review): the two paths look like the keyless and the authenticated
    # endpoints, chosen by keenable_post from api_key -- confirm in _client.py.
    data = keenable_post("/v1/search/public", "/v1/search", payload, api_key, self.timeout)

    # Check the response shape before calling .get(): a non-mapping body must
    # surface as KeenableError, not as an AttributeError.
    results = data.get("results") if isinstance(data, dict) else None
    if not isinstance(results, list):
        msg = f"Unexpected response from the Keenable search API: {data!r}"
        raise KeenableError(msg)

    # Drop malformed entries BEFORE truncating so they do not consume top_k slots.
    hits = [result for result in results if isinstance(result, dict)]
    if self.top_k is not None:
        hits = hits[: self.top_k]

    documents: list[Document] = []
    links: list[str] = []
    for hit in hits:
        content = hit.get("description") or hit.get("title") or ""
        documents.append(Document(content=content, meta=dict(hit)))
        link = hit.get("url")
        if link:
            links.append(link)

    return {"documents": documents, "links": links}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "keenable-haystack"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Keenable web-search and page-fetch components for Haystack. Keyless by default."
|
|
9
|
+
authors = [{ name = "Keenable", email = "hello@keenable.ai" }]
|
|
10
|
+
maintainers = [{ name = "keenableai" }]
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
license = "MIT"
|
|
14
|
+
keywords = ["haystack", "haystack-ai", "keenable", "web-search", "fetch", "agents", "rag"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3.9",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"haystack-ai>=2.0.0",
|
|
25
|
+
"requests>=2.31",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.urls]
|
|
29
|
+
Homepage = "https://keenable.ai"
|
|
30
|
+
Documentation = "https://docs.keenable.ai"
|
|
31
|
+
Repository = "https://github.com/keenableai/keenable-haystack"
|
|
32
|
+
|
|
33
|
+
[dependency-groups]
|
|
34
|
+
dev = [
|
|
35
|
+
"pytest>=8.0",
|
|
36
|
+
"ruff>=0.11",
|
|
37
|
+
"mypy>=1.10",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
# PEP 420 namespace package: ship the whole `haystack_integrations` tree (no
|
|
41
|
+
# __init__.py at the namespace levels) so Keenable coexists with other
|
|
42
|
+
# haystack_integrations.* packages.
|
|
43
|
+
[tool.hatch.build.targets.sdist]
|
|
44
|
+
include = ["haystack_integrations/", "LICENSE", "README.md"]
|
|
45
|
+
|
|
46
|
+
[tool.hatch.build.targets.wheel]
|
|
47
|
+
packages = ["haystack_integrations"]
|
|
48
|
+
|
|
49
|
+
[tool.ruff]
|
|
50
|
+
line-length = 100
|
|
51
|
+
target-version = "py39"
|
|
52
|
+
|
|
53
|
+
# Two namespace-package leaves share the basename `keenable`, so mypy needs
|
|
54
|
+
# explicit package bases + namespace mode to map files to modules. python_version
|
|
55
|
+
# is the analysis target (mypy dropped 3.9 as a target); the package floor stays
|
|
56
|
+
# 3.9 via requires-python.
|
|
57
|
+
[tool.mypy]
|
|
58
|
+
ignore_missing_imports = true
|
|
59
|
+
python_version = "3.10"
|
|
60
|
+
namespace_packages = true
|
|
61
|
+
explicit_package_bases = true
|
|
62
|
+
mypy_path = "."
|