keenable-haystack 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ dist/
2
+ .venv/
3
+ __pycache__/
4
+ *.egg-info/
5
+ .pytest_cache/
6
+ .mypy_cache/
7
+ .ruff_cache/
8
+ *.pyc
9
+ tests/.red_team_scratch/
10
+ red_team_report.*
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Keenable
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,94 @@
1
+ Metadata-Version: 2.4
2
+ Name: keenable-haystack
3
+ Version: 0.1.0
4
+ Summary: Keenable web-search and page-fetch components for Haystack. Keyless by default.
5
+ Project-URL: Homepage, https://keenable.ai
6
+ Project-URL: Documentation, https://docs.keenable.ai
7
+ Project-URL: Repository, https://github.com/keenableai/keenable-haystack
8
+ Author-email: Keenable <hello@keenable.ai>
9
+ Maintainer: keenableai
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: agents,fetch,haystack,haystack-ai,keenable,rag,web-search
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Requires-Python: >=3.9
20
+ Requires-Dist: haystack-ai>=2.0.0
21
+ Requires-Dist: requests>=2.31
22
+ Description-Content-Type: text/markdown
23
+
24
+ # keenable-haystack
25
+
26
+ [Keenable](https://keenable.ai) web search + page fetch for
27
+ [Haystack](https://haystack.deepset.ai) 2.x, as two components:
28
+
29
+ - **`KeenableWebSearch`** — searches the web and returns `documents` +
30
+ `links`, the same output shape as Haystack's built-in `SerperDevWebSearch` /
31
+ `SearchApiWebSearch`, so it is drop-in for pipelines wired to those.
32
+ - **`KeenableFetcher`** — fetches a list of URLs and returns `documents` whose
33
+ content is the page's main text as markdown (Keenable extracts it server-side,
34
+ so you don't need a separate `LinkContentFetcher` + `HTMLToDocument` step).
35
+
36
+ **Keyless by default**: with no API key the keyless public endpoints are used.
37
+ Provide a key to use the authenticated endpoints (required for `mode="realtime"`
38
+ and for higher rate limits).
39
+
40
+ ## Install
41
+
42
+ ```bash
43
+ pip install keenable-haystack
44
+ ```
45
+
46
+ ## Usage
47
+
48
+ ```python
49
+ from haystack_integrations.components.websearch.keenable import KeenableWebSearch
50
+ from haystack_integrations.components.fetchers.keenable import KeenableFetcher
51
+
52
+ # No key -> keyless public endpoints. Set KEENABLE_API_KEY to lift limits.
53
+ websearch = KeenableWebSearch(top_k=5)
54
+ hits = websearch.run(query="latest developments in AI agents")
55
+ print(hits["links"])
56
+
57
+ fetcher = KeenableFetcher()
58
+ pages = fetcher.run(urls=hits["links"][:2])
59
+ print(pages["documents"][0].content)
60
+ ```
61
+
62
+ In a pipeline (drop-in for any web-search component):
63
+
64
+ ```python
65
+ from haystack import Pipeline
66
+ from haystack.components.builders import PromptBuilder
67
+
68
+ pipe = Pipeline()
69
+ pipe.add_component("search", KeenableWebSearch(top_k=5))
70
+ pipe.add_component("prompt", PromptBuilder(template="Answer using:\n{{ documents }}"))
71
+ pipe.connect("search.documents", "prompt.documents")
72
+ ```
73
+
74
+ `KeenableWebSearch.run` accepts optional per-query filters (`site`,
75
+ `published_after/before`, `acquired_after/before`, `mode`). There is no
76
+ `max_results`: the API returns a fixed-size result set; `top_k` (constructor)
77
+ trims it client-side.
78
+
79
+ ## Configuration
80
+
81
+ - **API key (optional).** `api_key=Secret.from_token(...)` / the default
82
+ `Secret.from_env_var("KEENABLE_API_KEY", strict=False)`. Blank/unset → keyless
83
+ public endpoints. Serializes by env-var name, never the key value.
84
+ - **Endpoint (optional).** `KEENABLE_API_URL` overrides the base URL (HTTPS
85
+ enforced; plain `http` only for loopback). The endpoint is never a component
86
+ argument the model can set, so it cannot be used to redirect requests.
87
+
88
+ `KeenableFetcher` rejects non-`http(s)` schemes and private/internal hosts
89
+ client-side before sending, and (like `LinkContentFetcher`) skips failed URLs by
90
+ default — set `raise_on_failure=True` to surface errors instead.
91
+
92
+ ## License
93
+
94
+ MIT © Keenable
@@ -0,0 +1,71 @@
1
+ # keenable-haystack
2
+
3
+ [Keenable](https://keenable.ai) web search + page fetch for
4
+ [Haystack](https://haystack.deepset.ai) 2.x, as two components:
5
+
6
+ - **`KeenableWebSearch`** — searches the web and returns `documents` +
7
+ `links`, the same output shape as Haystack's built-in `SerperDevWebSearch` /
8
+ `SearchApiWebSearch`, so it is drop-in for pipelines wired to those.
9
+ - **`KeenableFetcher`** — fetches a list of URLs and returns `documents` whose
10
+ content is the page's main text as markdown (Keenable extracts it server-side,
11
+ so you don't need a separate `LinkContentFetcher` + `HTMLToDocument` step).
12
+
13
+ **Keyless by default**: with no API key the keyless public endpoints are used.
14
+ Provide a key to use the authenticated endpoints (required for `mode="realtime"`
15
+ and for higher rate limits).
16
+
17
+ ## Install
18
+
19
+ ```bash
20
+ pip install keenable-haystack
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ ```python
26
+ from haystack_integrations.components.websearch.keenable import KeenableWebSearch
27
+ from haystack_integrations.components.fetchers.keenable import KeenableFetcher
28
+
29
+ # No key -> keyless public endpoints. Set KEENABLE_API_KEY to lift limits.
30
+ websearch = KeenableWebSearch(top_k=5)
31
+ hits = websearch.run(query="latest developments in AI agents")
32
+ print(hits["links"])
33
+
34
+ fetcher = KeenableFetcher()
35
+ pages = fetcher.run(urls=hits["links"][:2])
36
+ print(pages["documents"][0].content)
37
+ ```
38
+
39
+ In a pipeline (drop-in for any web-search component):
40
+
41
+ ```python
42
+ from haystack import Pipeline
43
+ from haystack.components.builders import PromptBuilder
44
+
45
+ pipe = Pipeline()
46
+ pipe.add_component("search", KeenableWebSearch(top_k=5))
47
+ pipe.add_component("prompt", PromptBuilder(template="Answer using:\n{{ documents }}"))
48
+ pipe.connect("search.documents", "prompt.documents")
49
+ ```
50
+
51
+ `KeenableWebSearch.run` accepts optional per-query filters (`site`,
52
+ `published_after/before`, `acquired_after/before`, `mode`). There is no
53
+ `max_results`: the API returns a fixed-size result set; `top_k` (constructor)
54
+ trims it client-side.
55
+
56
+ ## Configuration
57
+
58
+ - **API key (optional).** `api_key=Secret.from_token(...)` / the default
59
+ `Secret.from_env_var("KEENABLE_API_KEY", strict=False)`. Blank/unset → keyless
60
+ public endpoints. Serializes by env-var name, never the key value.
61
+ - **Endpoint (optional).** `KEENABLE_API_URL` overrides the base URL (HTTPS
62
+ enforced; plain `http` only for loopback). The endpoint is never a component
63
+ argument the model can set, so it cannot be used to redirect requests.
64
+
65
+ `KeenableFetcher` rejects non-`http(s)` schemes and private/internal hosts
66
+ client-side before sending, and (like `LinkContentFetcher`) skips failed URLs by
67
+ default — set `raise_on_failure=True` to surface errors instead.
68
+
69
+ ## License
70
+
71
+ MIT © Keenable
@@ -0,0 +1,3 @@
1
+ from haystack_integrations.components.fetchers.keenable.fetcher import KeenableFetcher
2
+
3
+ __all__ = ["KeenableFetcher"]
@@ -0,0 +1,111 @@
1
+ """Keenable page-fetch component for Haystack."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any, Optional
7
+
8
+ from haystack import Document, component, default_from_dict, default_to_dict
9
+ from haystack.utils import Secret, deserialize_secrets_inplace
10
+
11
+ # The transport lives in the websearch leaf package so it is defined once; the
12
+ # fetcher reuses it (keyed/keyless selection, attribution headers, SSRF guard).
13
+ from haystack_integrations.components.websearch.keenable._client import (
14
+ KeenableError,
15
+ keenable_get,
16
+ normalize_key,
17
+ reject_private_fetch_target,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@component
class KeenableFetcher:
    """Fetches web pages via Keenable and returns their content as Documents.

    Given a list of URLs, returns one ``Document`` per successfully fetched page
    (``content`` is the page's main content as markdown; ``meta`` carries
    ``url``, ``title`` and any other fields the page exposes). Pairs with
    :class:`KeenableWebSearch` — discover URLs with search, then read full pages
    here. Unlike Haystack's ``LinkContentFetcher`` + ``HTMLToDocument`` two-step,
    Keenable returns clean extracted markdown, so this is a single component that
    returns Documents directly.

    Keyless by default (``/v1/fetch/public``); an ``api_key`` (or
    ``KEENABLE_API_KEY``) switches to ``/v1/fetch`` and lifts limits. Non-http(s)
    and private/internal URLs are rejected client-side before sending.

    ### Usage

    ```python
    from haystack_integrations.components.fetchers.keenable import KeenableFetcher

    fetcher = KeenableFetcher()
    result = fetcher.run(urls=["https://example.com/article"])
    print(result["documents"][0].content)
    ```
    """

    def __init__(
        self,
        *,
        api_key: Secret = Secret.from_env_var("KEENABLE_API_KEY", strict=False),  # noqa: B008
        raise_on_failure: bool = False,
        timeout: float = 30.0,
    ) -> None:
        """
        :param api_key: Keenable API key. Falls back to ``KEENABLE_API_KEY``; when
            absent (or blank) the keyless public endpoint is used.
        :param raise_on_failure: If ``True``, a failed fetch raises; if ``False``
            (default, matching ``LinkContentFetcher``), the URL is logged and
            skipped so one bad URL does not fail the whole batch.
        :param timeout: Per-request timeout in seconds.
        :raises ValueError: If ``timeout`` is zero or negative.
        """
        if timeout <= 0:
            msg = f"timeout must be a positive number of seconds, got {timeout!r}"
            raise ValueError(msg)
        self.api_key = api_key
        self.raise_on_failure = raise_on_failure
        self.timeout = timeout

    def to_dict(self) -> dict[str, Any]:
        """Serialize the component's init parameters to a dictionary.

        The API key is stored through ``Secret.to_dict()`` rather than as a raw
        attribute value.
        """
        return default_to_dict(
            self,
            api_key=self.api_key.to_dict(),
            raise_on_failure=self.raise_on_failure,
            timeout=self.timeout,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "KeenableFetcher":
        """Rebuild the component from the dictionary produced by :meth:`to_dict`."""
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)

    def _fetch_one(self, url: str, api_key: Optional[str]) -> Document:
        """Fetch a single URL and wrap the response body in a ``Document``.

        :param url: The page to fetch; must be an ``http://`` or ``https://`` URL.
        :param api_key: Normalized API key, or ``None`` for the keyless endpoint.
        :returns: A ``Document`` whose ``content`` is the response's ``content``
            field (empty string when missing) and whose ``meta`` is a copy of the
            whole response body.
        :raises KeenableError: If the URL is not a string, is not http(s), points
            at a private/internal host, or the request fails.
        """
        # Fail with the module's own error type instead of an AttributeError from
        # ``.lower()`` when a non-string slips into the batch.
        if not isinstance(url, str):
            msg = f"Refusing to fetch a non-string URL: {url!r}"
            raise KeenableError(msg)
        if not url.lower().startswith(("http://", "https://")):
            msg = f"Refusing to fetch a non-http(s) URL: {url!r}"
            raise KeenableError(msg)
        reject_private_fetch_target(url)
        data = keenable_get("/v1/fetch/public", "/v1/fetch", {"url": url}, api_key, self.timeout)
        content = data.get("content") or ""
        return Document(content=content, meta=dict(data))

    @component.output_types(documents=list[Document])
    def run(self, urls: list[str]) -> dict[str, Any]:
        """Fetch each URL and return the extracted pages as Documents.

        :param urls: The URLs to fetch. A single bare string is treated as a
            one-element list.
        :returns: A dict with ``documents`` (``list[Document]``), one per page
            fetched successfully.
        """
        # A bare string would otherwise be iterated character by character, and
        # every "URL" would be rejected as non-http(s).
        if isinstance(urls, str):
            urls = [urls]
        api_key = normalize_key(self.api_key.resolve_value())
        documents: list[Document] = []
        for url in urls:
            try:
                documents.append(self._fetch_one(url, api_key))
            except Exception as e:  # noqa: BLE001 - contract: one bad URL must not fail the batch
                if self.raise_on_failure:
                    raise
                logger.warning("Keenable could not fetch %r; skipping (%s).", url, e)
        return {"documents": documents}
@@ -0,0 +1,3 @@
1
+ from haystack_integrations.components.websearch.keenable.web_search import KeenableWebSearch
2
+
3
+ __all__ = ["KeenableWebSearch"]
@@ -0,0 +1,232 @@
1
+ """Shared transport for the Keenable Haystack components.
2
+
3
+ One place for the parts of the Keenable contract both components need: keyed vs.
4
+ keyless endpoint selection, the attribution headers, HTTPS-only base-URL
5
+ resolution, the client-side SSRF guard, and turning a non-2xx response into a
6
+ readable error. The endpoint is read from the environment and is never a
7
+ component argument the model/pipeline can set (an arbitrary base URL is an SSRF
8
+ foothold).
9
+
10
+ The fetcher component imports from this module too, so the transport lives in
11
+ exactly one place across both `haystack_integrations.components.websearch.keenable`
12
+ and `...fetchers.keenable`.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import ipaddress
18
+ import os
19
+ import socket
20
+ from importlib import metadata
21
+ from typing import Any
22
+ from urllib.parse import urlsplit
23
+
24
+ import requests
25
+
26
+ try:
27
+ _VERSION = metadata.version("keenable-haystack")
28
+ except metadata.PackageNotFoundError: # pragma: no cover - editable/source checkout
29
+ _VERSION = "unknown"
30
+
31
+ # Tagged User-Agent so Keenable can attribute traffic from this integration.
32
+ _USER_AGENT = f"keenable-haystack/{_VERSION}"
33
+
34
+ # The load-bearing attribution signal: the Keenable backend segments traffic by
35
+ # this header (adoption dashboards). The User-Agent above is a secondary tag.
36
+ _ATTRIBUTION_TITLE = "Haystack"
37
+
38
+ _DEFAULT_BASE_URL = "https://api.keenable.ai"
39
+ _BASE_URL_ENV = "KEENABLE_API_URL"
40
+
41
+
42
class KeenableError(RuntimeError):
    """Raised for Keenable transport and API failures.

    The message it carries is meant to be safe to show to a user.
    """
44
+
45
+
46
def normalize_key(raw: str | None) -> str | None:
    """Collapse a missing or blank API key to ``None``.

    ``None`` selects the keyless free tier. Any value that is not a string, or
    that is empty after stripping surrounding whitespace, is treated as "no key"
    so an empty string never selects the authenticated endpoint.

    :param raw: The resolved key value, possibly ``None`` or blank.
    :returns: The stripped key, or ``None`` when there is no usable key.
    """
    if not isinstance(raw, str):
        return None
    stripped = raw.strip()
    return stripped if stripped else None
56
+
57
+
58
def resolve_base_url() -> str:
    """Resolve the API base URL from ``KEENABLE_API_URL`` and enforce HTTPS.

    Falls back to the default production endpoint when the variable is unset or
    empty. Trailing slashes are removed so a path can be appended directly.

    :returns: The validated base URL, without a trailing slash.
    :raises KeenableError: If the value cannot be parsed as a URL, has no host,
        uses a scheme other than ``https`` (plain ``http`` is accepted only for
        ``localhost``, ``127.0.0.1`` and ``::1``), or points over ``https`` at a
        private/internal address.
    """
    base = (os.environ.get(_BASE_URL_ENV) or _DEFAULT_BASE_URL).rstrip("/")
    # urlsplit raises a bare ValueError on malformed input (e.g. an unbalanced
    # IPv6 bracket); surface it as the module's own readable error type.
    try:
        parsed = urlsplit(base)
        hostname = parsed.hostname
    except ValueError as e:
        msg = f"{_BASE_URL_ENV} is not a valid URL, got {base!r}"
        raise KeenableError(msg) from e
    host = (hostname or "").rstrip(".")
    if not host:
        msg = f"{_BASE_URL_ENV} must be an https:// URL with a host, got {base!r}"
        raise KeenableError(msg)
    # Local-dev escape hatch: plain http only to an explicit loopback host.
    if parsed.scheme == "http" and host in {"localhost", "127.0.0.1", "::1"}:
        return base
    if parsed.scheme != "https":
        msg = f"{_BASE_URL_ENV} must be an https:// URL with a host, got {base!r}"
        raise KeenableError(msg)
    # Over https, refuse a base URL pointing at a private/internal destination —
    # a misconfigured KEENABLE_API_URL must never ship API keys to an internal
    # host. Only IP literals and numeric IPv4 forms are classified; no DNS lookup
    # is performed here.
    if host == "metadata.google.internal" or any(
        ip.is_loopback
        or ip.is_private
        or ip.is_link_local
        or ip.is_multicast
        or ip.is_unspecified
        for ip in _candidate_ips(host)
    ):
        msg = f"{_BASE_URL_ENV} must not point at a private/internal address, got {base!r}"
        raise KeenableError(msg)
    return base
86
+
87
+
88
+ def _candidate_ips(host: str) -> list[ipaddress.IPv4Address | ipaddress.IPv6Address]:
89
+ """Every IP address ``host`` could denote, without doing DNS.
90
+
91
+ Covers dotted/colon literals *and* the numeric IPv4 encodings that resolvers
92
+ accept but :func:`ipaddress.ip_address` rejects as strings — decimal
93
+ (``2130706433``), hex (``0x7f000001``), octal (``0177.0.0.1``) and short
94
+ ``a.b``/``a.b.c`` forms — all of which ``socket.inet_aton`` canonicalizes to a
95
+ real IPv4 so the private-range check below sees the true address.
96
+ """
97
+ candidates: list[ipaddress.IPv4Address | ipaddress.IPv6Address] = []
98
+ try:
99
+ candidates.append(ipaddress.ip_address(host))
100
+ except ValueError:
101
+ pass
102
+ try:
103
+ packed = socket.inet_aton(host)
104
+ except OSError:
105
+ pass
106
+ else:
107
+ candidates.append(ipaddress.ip_address(socket.inet_ntoa(packed)))
108
+ return candidates
109
+
110
+
111
def reject_private_fetch_target(url: str) -> None:
    """Refuse obviously private/internal fetch targets before sending (SSRF).

    The backend enforces this server-side too, but a client-side guard avoids
    leaking an internal hostname in a request and is required by our integration
    contract. Hostnames that are not IP literals (and not a numeric IPv4 form)
    pass through; the backend's SSRF guard is the backstop for those.

    :param url: The URL about to be fetched.
    :raises KeenableError: If the URL is malformed, has no host, names
        ``localhost`` (or any ``*.localhost`` name) or the cloud metadata host,
        or is an IP in a loopback, private, link-local, multicast or unspecified
        range.
    """
    # urlsplit raises a bare ValueError on malformed input (e.g. an unbalanced
    # IPv6 bracket); report it with the same error type as every other refusal.
    try:
        hostname = urlsplit(url).hostname
    except ValueError as e:
        msg = f"Refusing to fetch a malformed URL: {url!r}"
        raise KeenableError(msg) from e
    host = (hostname or "").strip().lower()
    # A trailing dot is the FQDN form of the same name (``localhost.`` ==
    # ``localhost``); strip it so it can't slip past the checks below.
    host = host.rstrip(".")
    if not host:
        msg = f"Refusing to fetch a URL with no host: {url!r}"
        raise KeenableError(msg)
    # RFC 6761 reserves the whole ``.localhost`` tree for loopback, so
    # ``anything.localhost`` is as internal as ``localhost`` itself.
    if host in {"localhost", "metadata.google.internal"} or host.endswith(".localhost"):
        msg = f"Refusing to fetch a private/internal host: {host!r}"
        raise KeenableError(msg)
    for ip in _candidate_ips(host):
        # ``is_reserved`` is intentionally omitted: it flags non-routable but
        # harmless ranges (e.g. the 2001:db8::/32 documentation prefix). The
        # checks below are the ones that matter for SSRF.
        if (
            ip.is_loopback
            or ip.is_private
            or ip.is_link_local
            or ip.is_multicast
            or ip.is_unspecified
        ):
            msg = f"Refusing to fetch a private/internal address: {host!r}"
            raise KeenableError(msg)
142
+
143
+
144
def _headers(api_key: str | None) -> dict[str, str]:
    """Build the request headers: attribution tags, plus the key when present.

    :param api_key: Normalized API key, or ``None`` for a keyless request.
    :returns: A fresh header dict; ``X-API-Key`` is included only for a truthy key.
    """
    auth = {"X-API-Key": api_key} if api_key else {}
    return {"User-Agent": _USER_AGENT, "X-Keenable-Title": _ATTRIBUTION_TITLE, **auth}
149
+
150
+
151
+ def _redact(text: str, api_key: str | None) -> str:
152
+ """Strip the API key from any text bound for an exception message or log.
153
+
154
+ Server error bodies and transport-exception strings are attacker- or
155
+ misconfiguration-influenced; a server that echoed the ``X-API-Key`` request
156
+ header back in its response would otherwise leak the key into our
157
+ ``KeenableError`` text, logs, and Haystack pipeline traces.
158
+ """
159
+ return text.replace(api_key, "***") if api_key else text
160
+
161
+
162
def _raise_for_status(response: requests.Response, api_key: str | None) -> None:
    """Map a non-2xx Keenable response to a readable :class:`KeenableError`.

    The error detail is taken from the JSON body's ``message``, ``error`` or
    ``detail`` field (first non-empty one), or from the raw text when the body is
    not JSON. It is redacted and capped at 200 characters.

    :param response: The HTTP response to inspect.
    :param api_key: The key sent with the request, used only for redaction.
    :raises KeenableError: If ``response.ok`` is false.
    """
    if response.ok:
        return
    detail = ""
    try:
        body = response.json()
        if isinstance(body, dict):
            detail = str(body.get("message") or body.get("error") or body.get("detail") or "")
    except ValueError:
        detail = (response.text or "").strip()
    # Redact BEFORE truncating: cutting first could slice through an echoed key
    # and leave an unredactable prefix of it in the message.
    detail = _redact(detail, api_key)[:200]
    label = {
        401: "Keenable authentication failed (401)",
        402: "Keenable: insufficient credits (402)",
        429: "Keenable rate limit exceeded (429)",
    }.get(response.status_code, f"Keenable API error ({response.status_code})")
    raise KeenableError(f"{label}: {detail}" if detail else label)
180
+
181
+
182
def _decode(response: requests.Response, api_key: str | None) -> dict[str, Any]:
    """Validate a Keenable response and return its JSON object body.

    :param response: The HTTP response to decode.
    :param api_key: The key sent with the request, used only for redaction.
    :returns: The parsed JSON body.
    :raises KeenableError: If the status is not OK, the body is not JSON, or the
        JSON value is not an object.
    """
    _raise_for_status(response, api_key)
    try:
        data = response.json()
    except ValueError as e:
        # Redact before truncating so a key straddling the 200-character cut
        # cannot leave a partial, unredacted prefix in the message.
        snippet = _redact(response.text or "", api_key)[:200]
        msg = f"Keenable API returned a non-JSON response: {snippet!r}"
        raise KeenableError(msg) from e
    if not isinstance(data, dict):
        msg = f"Unexpected response from the Keenable API: {_redact(repr(data), api_key)[:200]}"
        raise KeenableError(msg)
    return data
194
+
195
+
196
def _transport_error(e: Exception, api_key: str | None) -> KeenableError:
    """Turn a transport exception into a :class:`KeenableError` with the key masked.

    Standard ``requests`` exceptions never carry headers, but a custom adapter /
    proxy middleware could put one in the exception string, so the text is
    redacted defensively before it can reach a message, log, or pipeline trace.

    :param e: The exception raised while sending the request.
    :param api_key: The key sent with the request, used only for redaction.
    :returns: The error to raise (the caller chains it with ``from``).
    """
    kind = type(e).__name__
    detail = _redact(str(e), api_key)
    return KeenableError(f"Could not reach the Keenable API: {kind}: {detail}")
206
+
207
+
208
def keenable_post(
    public_path: str, keyed_path: str, payload: dict[str, Any], api_key: str | None, timeout: float
) -> dict[str, Any]:
    """POST ``payload`` as JSON to the keyed or keyless endpoint; return the body.

    :param public_path: Path used when there is no API key.
    :param keyed_path: Path used when an API key is present.
    :param payload: The JSON-serializable request body.
    :param api_key: Normalized API key, or ``None`` for a keyless request.
    :param timeout: Per-request timeout in seconds.
    :returns: The decoded JSON object from the response.
    :raises KeenableError: On a transport failure or an unusable response.
    """
    endpoint = resolve_base_url() + (keyed_path if api_key else public_path)
    request_headers = _headers(api_key)
    request_headers["Content-Type"] = "application/json"
    try:
        response = requests.post(endpoint, json=payload, headers=request_headers, timeout=timeout)
    except requests.RequestException as exc:
        raise _transport_error(exc, api_key) from exc
    return _decode(response, api_key)
220
+
221
+
222
def keenable_get(
    public_path: str, keyed_path: str, params: dict[str, Any], api_key: str | None, timeout: float
) -> dict[str, Any]:
    """GET the keyed or keyless endpoint with query ``params``; return the body.

    :param public_path: Path used when there is no API key.
    :param keyed_path: Path used when an API key is present.
    :param params: Query-string parameters.
    :param api_key: Normalized API key, or ``None`` for a keyless request.
    :param timeout: Per-request timeout in seconds.
    :returns: The decoded JSON object from the response.
    :raises KeenableError: On a transport failure or an unusable response.
    """
    endpoint = resolve_base_url() + (keyed_path if api_key else public_path)
    request_headers = _headers(api_key)
    try:
        response = requests.get(endpoint, params=params, headers=request_headers, timeout=timeout)
    except requests.RequestException as exc:
        raise _transport_error(exc, api_key) from exc
    return _decode(response, api_key)
@@ -0,0 +1,156 @@
1
+ """Keenable web-search component for Haystack."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Optional
6
+
7
+ from haystack import Document, component, default_from_dict, default_to_dict
8
+ from haystack.utils import Secret, deserialize_secrets_inplace
9
+
10
+ from haystack_integrations.components.websearch.keenable._client import (
11
+ KeenableError,
12
+ keenable_post,
13
+ normalize_key,
14
+ )
15
+
16
+
17
@component
class KeenableWebSearch:
    """Searches the web with Keenable, a search engine built for AI agents.

    Mirrors the output shape of Haystack's built-in web-search components
    (``SerperDevWebSearch`` / ``SearchApiWebSearch``): ``run()`` returns
    ``documents`` (one ``Document`` per result, snippet as content, result
    fields as ``meta``) and ``links`` (the result URLs), so it is drop-in for
    pipelines wired to those.

    Keyless by default: with no API key the keyless public endpoint
    (``/v1/search/public``) is used. Provide an API key (the ``api_key`` argument
    or the ``KEENABLE_API_KEY`` environment variable) to use the authenticated
    endpoint (``/v1/search``), required for ``mode="realtime"`` and for higher
    rate limits.

    The API endpoint is read from ``KEENABLE_API_URL`` (HTTPS enforced), never a
    ``run`` argument, so the search cannot be redirected to an arbitrary host.

    ### Usage

    ```python
    from haystack_integrations.components.websearch.keenable import KeenableWebSearch

    # No key -> keyless public endpoint. Set KEENABLE_API_KEY to lift limits.
    websearch = KeenableWebSearch(top_k=5)
    result = websearch.run(query="latest developments in AI agents")
    print(result["documents"])
    print(result["links"])
    ```
    """

    def __init__(
        self,
        *,
        api_key: Secret = Secret.from_env_var("KEENABLE_API_KEY", strict=False),  # noqa: B008
        top_k: Optional[int] = None,
        mode: str = "pro",
        site: Optional[str] = None,
        timeout: float = 30.0,
    ) -> None:
        """
        :param api_key: Keenable API key. Falls back to ``KEENABLE_API_KEY``; when
            absent (or blank) the keyless public endpoint is used.
        :param top_k: Keep at most this many results (applied client-side; the API
            returns a fixed-size set with no count parameter). ``None`` keeps all.
        :param mode: Default search mode, ``"pro"`` (deeper) or ``"realtime"``
            (low latency). ``"realtime"`` requires an API key. Overridable per run.
        :param site: Default single-domain restriction, e.g. ``"github.com"``.
            Overridable per run.
        :param timeout: Per-request timeout in seconds.
        :raises ValueError: If ``timeout`` is not positive, or ``top_k`` is given
            and is less than 1.
        """
        if timeout <= 0:
            msg = f"timeout must be a positive number of seconds, got {timeout!r}"
            raise ValueError(msg)
        if top_k is not None and top_k < 1:
            msg = f"top_k must be None or a positive integer, got {top_k!r}"
            raise ValueError(msg)
        self.api_key = api_key
        self.top_k = top_k
        self.mode = mode
        self.site = site
        self.timeout = timeout

    def to_dict(self) -> dict[str, Any]:
        """Serialize the component's init parameters to a dictionary.

        The API key is stored through ``Secret.to_dict()`` rather than as a raw
        attribute value.
        """
        return default_to_dict(
            self,
            api_key=self.api_key.to_dict(),
            top_k=self.top_k,
            mode=self.mode,
            site=self.site,
            timeout=self.timeout,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "KeenableWebSearch":
        """Rebuild the component from the dictionary produced by :meth:`to_dict`."""
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)

    @component.output_types(documents=list[Document], links=list[str])
    def run(
        self,
        query: str,
        site: Optional[str] = None,
        published_after: Optional[str] = None,
        published_before: Optional[str] = None,
        acquired_after: Optional[str] = None,
        acquired_before: Optional[str] = None,
        mode: Optional[str] = None,
    ) -> dict[str, Any]:
        """Run a Keenable web search.

        :param query: The search query.
        :param site: Restrict results to a single domain (overrides the default).
        :param published_after: Only pages published on/after this date (YYYY-MM-DD).
        :param published_before: Only pages published on/before this date (YYYY-MM-DD).
        :param acquired_after: Only pages indexed on/after this date (YYYY-MM-DD).
        :param acquired_before: Only pages indexed on/before this date (YYYY-MM-DD).
        :param mode: Override the default search mode for this query.
        :returns: A dict with ``documents`` (``list[Document]``) and ``links``
            (``list[str]``).
        :raises KeenableError: If ``realtime`` mode is requested without an API
            key, the request fails, or the response has no ``results`` list.
        """
        effective_mode = mode or self.mode
        api_key = normalize_key(self.api_key.resolve_value())
        if effective_mode == "realtime" and api_key is None:
            msg = "mode='realtime' requires an API key; it is not available on the keyless endpoint."
            raise KeenableError(msg)

        # Only filters that are actually set are sent to the API.
        payload: dict[str, Any] = {"query": query, "mode": effective_mode}
        for field, value in (
            ("site", site or self.site),
            ("published_after", published_after),
            ("published_before", published_before),
            ("acquired_after", acquired_after),
            ("acquired_before", acquired_before),
        ):
            if value:
                payload[field] = value

        data = keenable_post("/v1/search/public", "/v1/search", payload, api_key, self.timeout)
        results = data.get("results")
        if not isinstance(results, list):
            # Keep the message bounded and key-free, like the transport layer's
            # errors: the body is server-controlled and may be large or echo the
            # key. Redact first so truncation cannot split the key.
            snippet = repr(data)
            if api_key:
                snippet = snippet.replace(api_key, "***")
            msg = f"Unexpected response from the Keenable search API: {snippet[:200]}"
            raise KeenableError(msg)

        # Drop malformed entries before trimming so ``top_k`` counts usable
        # results only.
        hits = [result for result in results if isinstance(result, dict)]
        if self.top_k is not None:
            hits = hits[: self.top_k]

        documents: list[Document] = []
        links: list[str] = []
        for hit in hits:
            content = hit.get("description") or hit.get("title") or ""
            documents.append(Document(content=content, meta=dict(hit)))
            link = hit.get("url")
            # ``links`` is declared as list[str]; skip missing or non-string URLs.
            if isinstance(link, str) and link:
                links.append(link)

        return {"documents": documents, "links": links}
@@ -0,0 +1,62 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
[project]
name = "keenable-haystack"
version = "0.1.0"
description = "Keenable web-search and page-fetch components for Haystack. Keyless by default."
authors = [{ name = "Keenable", email = "hello@keenable.ai" }]
maintainers = [{ name = "keenableai" }]
requires-python = ">=3.9"
readme = "README.md"
# SPDX license expression (PEP 639). The legacy "License ::" trove classifier is
# intentionally not listed: it is deprecated once a license expression is set,
# and some build backends reject the combination.
license = "MIT"
keywords = ["haystack", "haystack-ai", "keenable", "web-search", "fetch", "agents", "rag"]
classifiers = [
    "Development Status :: 4 - Beta",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
]
dependencies = [
    "haystack-ai>=2.0.0",
    "requests>=2.31",
]
27
+
28
+ [project.urls]
29
+ Homepage = "https://keenable.ai"
30
+ Documentation = "https://docs.keenable.ai"
31
+ Repository = "https://github.com/keenableai/keenable-haystack"
32
+
33
+ [dependency-groups]
34
+ dev = [
35
+ "pytest>=8.0",
36
+ "ruff>=0.11",
37
+ "mypy>=1.10",
38
+ ]
39
+
40
+ # PEP 420 namespace package: ship the whole `haystack_integrations` tree (no
41
+ # __init__.py at the namespace levels) so Keenable coexists with other
42
+ # haystack_integrations.* packages.
43
+ [tool.hatch.build.targets.sdist]
44
+ include = ["haystack_integrations/", "LICENSE", "README.md"]
45
+
46
+ [tool.hatch.build.targets.wheel]
47
+ packages = ["haystack_integrations"]
48
+
49
+ [tool.ruff]
50
+ line-length = 100
51
+ target-version = "py39"
52
+
53
+ # Two namespace-package leaves share the basename `keenable`, so mypy needs
54
+ # explicit package bases + namespace mode to map files to modules. python_version
55
+ # is the analysis target (mypy dropped 3.9 as a target); the package floor stays
56
+ # 3.9 via requires-python.
57
+ [tool.mypy]
58
+ ignore_missing_imports = true
59
+ python_version = "3.10"
60
+ namespace_packages = true
61
+ explicit_package_bases = true
62
+ mypy_path = "."