websearch-kit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- websearch_kit/__init__.py +100 -0
- websearch_kit/_version.py +3 -0
- websearch_kit/assembly/__init__.py +33 -0
- websearch_kit/assembly/citations.py +70 -0
- websearch_kit/assembly/context_builder.py +90 -0
- websearch_kit/caching/__init__.py +89 -0
- websearch_kit/caching/keys.py +93 -0
- websearch_kit/caching/memory.py +121 -0
- websearch_kit/caching/sqlite_cache.py +244 -0
- websearch_kit/config.py +220 -0
- websearch_kit/errors.py +250 -0
- websearch_kit/expansion/__init__.py +104 -0
- websearch_kit/expansion/callback.py +89 -0
- websearch_kit/expansion/llm.py +153 -0
- websearch_kit/expansion/noop.py +45 -0
- websearch_kit/expansion/parsing.py +120 -0
- websearch_kit/extraction/__init__.py +24 -0
- websearch_kit/extraction/chain.py +179 -0
- websearch_kit/extraction/quality.py +184 -0
- websearch_kit/extraction/readability_extractor.py +66 -0
- websearch_kit/extraction/sanitize_text.py +146 -0
- websearch_kit/extraction/trafilatura_extractor.py +181 -0
- websearch_kit/extraction/types.py +58 -0
- websearch_kit/fetching/__init__.py +17 -0
- websearch_kit/fetching/fetcher.py +595 -0
- websearch_kit/fetching/policy.py +122 -0
- websearch_kit/fetching/robots.py +165 -0
- websearch_kit/fetching/user_agents.py +78 -0
- websearch_kit/grammar.py +178 -0
- websearch_kit/kit.py +487 -0
- websearch_kit/mcp/__init__.py +39 -0
- websearch_kit/mcp/__main__.py +61 -0
- websearch_kit/mcp/config_cli.py +150 -0
- websearch_kit/mcp/progress.py +123 -0
- websearch_kit/mcp/server.py +264 -0
- websearch_kit/mcp/tools.py +376 -0
- websearch_kit/models.py +291 -0
- websearch_kit/observability/__init__.py +21 -0
- websearch_kit/observability/events.py +113 -0
- websearch_kit/observability/logging.py +97 -0
- websearch_kit/owui/__init__.py +21 -0
- websearch_kit/owui/_compat.py +275 -0
- websearch_kit/owui/filter_adapter.py +604 -0
- websearch_kit/pipeline.py +985 -0
- websearch_kit/prompts.py +158 -0
- websearch_kit/protocols.py +116 -0
- websearch_kit/providers/__init__.py +149 -0
- websearch_kit/providers/base.py +252 -0
- websearch_kit/providers/brave.py +171 -0
- websearch_kit/providers/ddgs.py +153 -0
- websearch_kit/providers/exa.py +156 -0
- websearch_kit/providers/owui.py +183 -0
- websearch_kit/providers/searxng.py +167 -0
- websearch_kit/providers/serper.py +140 -0
- websearch_kit/providers/tavily.py +141 -0
- websearch_kit/py.typed +0 -0
- websearch_kit/ranking/__init__.py +28 -0
- websearch_kit/ranking/bm25.py +109 -0
- websearch_kit/ranking/budget.py +140 -0
- websearch_kit/resilience/__init__.py +24 -0
- websearch_kit/resilience/circuit.py +166 -0
- websearch_kit/resilience/deadline.py +88 -0
- websearch_kit/resilience/fallback.py +265 -0
- websearch_kit/resilience/health.py +141 -0
- websearch_kit/resilience/retry.py +108 -0
- websearch_kit/run.py +236 -0
- websearch_kit/security/__init__.py +15 -0
- websearch_kit/security/ranges.py +129 -0
- websearch_kit/security/sanitize.py +97 -0
- websearch_kit/security/url_guard.py +296 -0
- websearch_kit-0.1.0.dist-info/METADATA +190 -0
- websearch_kit-0.1.0.dist-info/RECORD +75 -0
- websearch_kit-0.1.0.dist-info/WHEEL +4 -0
- websearch_kit-0.1.0.dist-info/entry_points.txt +2 -0
- websearch_kit-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""websearch-kit — web search, fetch, and research pipeline for LLMs.
|
|
2
|
+
|
|
3
|
+
Public API surface (everything re-exported here is SemVer-protected; see
|
|
4
|
+
VERSIONING.md). Usable three ways:
|
|
5
|
+
|
|
6
|
+
* SDK: ``from websearch_kit import SearchKit``
|
|
7
|
+
* MCP server: ``websearch-kit-mcp`` (requires the ``[mcp]`` extra)
|
|
8
|
+
* Open WebUI: the single-file filter under ``adapters/owui/``
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import TYPE_CHECKING, Any
|
|
12
|
+
|
|
13
|
+
from ._version import __version__
|
|
14
|
+
from .config import WebSearchConfig
|
|
15
|
+
from .errors import (
|
|
16
|
+
CacheError,
|
|
17
|
+
ConfigError,
|
|
18
|
+
DeadlineExceededError,
|
|
19
|
+
ExpansionError,
|
|
20
|
+
ExtractionError,
|
|
21
|
+
FetchError,
|
|
22
|
+
GuardError,
|
|
23
|
+
MissingDependencyError,
|
|
24
|
+
ProviderError,
|
|
25
|
+
RobotsBlockedError,
|
|
26
|
+
SSRFBlockedError,
|
|
27
|
+
WebSearchKitError,
|
|
28
|
+
)
|
|
29
|
+
from .models import (
|
|
30
|
+
Degradation,
|
|
31
|
+
FetchOutcome,
|
|
32
|
+
PageContent,
|
|
33
|
+
ProgressEvent,
|
|
34
|
+
ResearchReport,
|
|
35
|
+
RunStats,
|
|
36
|
+
SearchResult,
|
|
37
|
+
Source,
|
|
38
|
+
Stage,
|
|
39
|
+
SystemHealth,
|
|
40
|
+
)
|
|
41
|
+
from .protocols import (
|
|
42
|
+
Cache,
|
|
43
|
+
CallbackSink,
|
|
44
|
+
ProgressSink,
|
|
45
|
+
ProviderCapabilities,
|
|
46
|
+
QueryExpander,
|
|
47
|
+
SearchProvider,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
if TYPE_CHECKING: # pragma: no cover - import surface for type checkers only
|
|
51
|
+
from .kit import SearchKit, SyncSearchKit
|
|
52
|
+
|
|
53
|
+
__all__ = [
|
|
54
|
+
"Cache",
|
|
55
|
+
"CacheError",
|
|
56
|
+
"CallbackSink",
|
|
57
|
+
"ConfigError",
|
|
58
|
+
"DeadlineExceededError",
|
|
59
|
+
"Degradation",
|
|
60
|
+
"ExpansionError",
|
|
61
|
+
"ExtractionError",
|
|
62
|
+
"FetchError",
|
|
63
|
+
"FetchOutcome",
|
|
64
|
+
"GuardError",
|
|
65
|
+
"MissingDependencyError",
|
|
66
|
+
"PageContent",
|
|
67
|
+
"ProgressEvent",
|
|
68
|
+
"ProgressSink",
|
|
69
|
+
"ProviderCapabilities",
|
|
70
|
+
"ProviderError",
|
|
71
|
+
"QueryExpander",
|
|
72
|
+
"ResearchReport",
|
|
73
|
+
"RobotsBlockedError",
|
|
74
|
+
"RunStats",
|
|
75
|
+
"SSRFBlockedError",
|
|
76
|
+
"SearchKit",
|
|
77
|
+
"SearchProvider",
|
|
78
|
+
"SearchResult",
|
|
79
|
+
"Source",
|
|
80
|
+
"Stage",
|
|
81
|
+
"SyncSearchKit",
|
|
82
|
+
"SystemHealth",
|
|
83
|
+
"WebSearchConfig",
|
|
84
|
+
"WebSearchKitError",
|
|
85
|
+
"__version__",
|
|
86
|
+
]
|
|
87
|
+
|
|
88
|
+
#: Engine names resolved lazily: importing ``kit`` pulls the extraction chain,
|
|
89
|
+
#: whose package import loads trafilatura (heavyweight). Deferring keeps
|
|
90
|
+
#: ``import websearch_kit`` cheap for dep-light consumers (models, config,
|
|
91
|
+
#: sanitize) while ``from websearch_kit import SearchKit`` still just works.
|
|
92
|
+
_LAZY_EXPORTS = frozenset({"SearchKit", "SyncSearchKit"})
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def __getattr__(name: str) -> Any:
    """Resolve the lazily exported engine classes on attribute access.

    Python invokes this module-level hook only after normal attribute lookup
    on the package has failed. Names listed in ``_LAZY_EXPORTS`` are fetched
    from the ``kit`` submodule, which is imported here instead of at package
    import time; any other name raises the standard ``AttributeError``.

    Args:
        name: The attribute being looked up on the package.

    Returns:
        The attribute of the same name from the ``kit`` submodule.

    Raises:
        AttributeError: If ``name`` is not one of the lazy exports.
    """
    if name not in _LAZY_EXPORTS:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Deferred on purpose: the import only runs for callers that actually
    # request one of the lazily exported names.
    from . import kit

    return getattr(kit, name)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Context assembly: numbered source blocks and their 1:1 citation list.
|
|
2
|
+
|
|
3
|
+
WHY a package: the pipeline's final products — the ``[N]``-numbered context
|
|
4
|
+
string handed to an LLM and the ``Source`` citations a UI maps those markers
|
|
5
|
+
back to — must be generated from the same records in the same order or the
|
|
6
|
+
model cites things the UI cannot resolve. ``citations`` owns the numbering,
|
|
7
|
+
``context_builder`` owns the (reference-verbatim) block formats, and both are
|
|
8
|
+
pure functions over :class:`SourceDraft` so the lockstep invariant is golden-
|
|
9
|
+
testable with no pipeline machinery.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from .citations import SourceDraft, number_sources
|
|
15
|
+
from .context_builder import (
|
|
16
|
+
SNIPPET_POOL_HEADER,
|
|
17
|
+
TRUNCATION_SUFFIX,
|
|
18
|
+
build_context,
|
|
19
|
+
render_pool_block,
|
|
20
|
+
render_source_block,
|
|
21
|
+
truncate_to,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"SNIPPET_POOL_HEADER",
|
|
26
|
+
"TRUNCATION_SUFFIX",
|
|
27
|
+
"SourceDraft",
|
|
28
|
+
"build_context",
|
|
29
|
+
"number_sources",
|
|
30
|
+
"render_pool_block",
|
|
31
|
+
"render_source_block",
|
|
32
|
+
"truncate_to",
|
|
33
|
+
]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Citation numbering: assembled source records -> public ``Source`` models.
|
|
2
|
+
|
|
3
|
+
WHY a dedicated module: the context block and the citation list MUST stay in
|
|
4
|
+
lockstep — every ``[N]`` marker the LLM can emit needs exactly one
|
|
5
|
+
:class:`~websearch_kit.models.Source` with the same ``n``, and the numbering
|
|
6
|
+
must be contiguous across the two segments (fetched sources first, then the
|
|
7
|
+
snippet pool). Splitting "what goes in" (the pipeline's ranked
|
|
8
|
+
:class:`SourceDraft` records) from "how it is numbered" (here) and "how it is
|
|
9
|
+
rendered" (``context_builder``) keeps that 1:1 invariant testable as a pure
|
|
10
|
+
function: same drafts in, same numbering out, no pipeline state involved.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from collections.abc import Sequence
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from typing import Literal
|
|
18
|
+
|
|
19
|
+
from ..models import Source
|
|
20
|
+
|
|
21
|
+
__all__ = ["SourceDraft", "number_sources"]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass(slots=True)
|
|
25
|
+
class SourceDraft:
|
|
26
|
+
"""One assembled source before numbering: the pipeline's working record.
|
|
27
|
+
|
|
28
|
+
Mutable on purpose — the ranking/budget stage truncates ``content`` and
|
|
29
|
+
assigns ``score`` in place. ``kind`` records *what the content is*
|
|
30
|
+
(``snippet_only`` when the search snippet substituted for a failed or
|
|
31
|
+
low-quality fetch), independent of *where* the draft renders (primary
|
|
32
|
+
block vs snippet pool) — that placement is positional, decided by which
|
|
33
|
+
sequence the draft is passed in.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
title: str
|
|
37
|
+
url: str
|
|
38
|
+
snippet: str
|
|
39
|
+
content: str
|
|
40
|
+
kind: Literal["fetched", "snippet_only"]
|
|
41
|
+
score: float | None = None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def number_sources(
    primary: Sequence[SourceDraft],
    pool: Sequence[SourceDraft],
) -> list[Source]:
    """Turn both draft segments into ``Source`` models numbered from 1.

    Drafts in ``primary`` are numbered first, in the order given; drafts in
    ``pool`` continue the same counter without a gap. Each resulting
    ``Source`` copies the draft's title, URL, snippet, kind and score, and
    records the length of the draft's content as ``content_chars`` (the
    content text itself is not copied).

    Args:
        primary: Drafts for the primary segment, already in final order.
        pool: Drafts for the snippet-pool segment, already in final order.

    Returns:
        One ``Source`` per draft, ordered primary-then-pool, with ``n``
        running ``1..len(primary) + len(pool)``.
    """
    # Concatenate first so a single counter spans the segment boundary.
    ordered = [*primary, *pool]
    return [
        Source(
            n=position,
            title=item.title,
            url=item.url,
            snippet=item.snippet,
            kind=item.kind,
            score=item.score,
            content_chars=len(item.content),
        )
        for position, item in enumerate(ordered, start=1)
    ]
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Context-block rendering: numbered ``--- [N] Title ---`` segments for the LLM.
|
|
2
|
+
|
|
3
|
+
WHY the formats are frozen: the answer prompt (``prompts.build_answer_prompt``)
|
|
4
|
+
instructs the model to cite with inline ``[N]`` markers and to fall back to the
|
|
5
|
+
``Summary (Snippet)`` line when ``Full Content`` is poor — so the literal field
|
|
6
|
+
labels in these blocks are part of the prompt contract, not cosmetics. All
|
|
7
|
+
three templates (fetched block, snippet-pool block, pool header) are ported
|
|
8
|
+
verbatim from the reference ``_process_results`` Phase C so the golden tests
|
|
9
|
+
pin byte-for-byte parity. Everything here is a pure string function; numbering
|
|
10
|
+
stays in lockstep with ``citations.number_sources`` because both iterate the
|
|
11
|
+
same ``(primary, pool)`` sequences in the same order.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from collections.abc import Sequence
|
|
17
|
+
|
|
18
|
+
from .citations import SourceDraft
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"SNIPPET_POOL_HEADER",
|
|
22
|
+
"TRUNCATION_SUFFIX",
|
|
23
|
+
"build_context",
|
|
24
|
+
"render_pool_block",
|
|
25
|
+
"render_source_block",
|
|
26
|
+
"truncate_to",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
#: Appended when a source's content is cut to its budget allocation (verbatim
|
|
30
|
+
#: reference suffix — the visible, in-context signal that text was dropped).
|
|
31
|
+
TRUNCATION_SUFFIX = "... [TRUNCATED]"
|
|
32
|
+
|
|
33
|
+
#: Separator announcing the snippet-only pool segment (verbatim reference).
|
|
34
|
+
SNIPPET_POOL_HEADER = "\n--- ADDITIONAL SOURCES (snippet only, same [N] citation format) ---"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def render_source_block(n: int, draft: SourceDraft) -> str:
    """Render one primary-segment block for citation number ``n``.

    The block is four newline-terminated parts: a ``--- [n] title ---``
    heading, the URL line, the ``Summary (Snippet)`` line, and a
    ``Full Content:`` label followed by the draft's content on the next line.

    Args:
        n: Citation number shown inside the square brackets.
        draft: Source whose title, url, snippet and content are rendered.

    Returns:
        The rendered block, ending in a newline.
    """
    heading = f"--- [{n}] {draft.title} ---\n"
    url_line = f"URL: {draft.url}\n"
    snippet_line = f"Summary (Snippet): {draft.snippet}\n"
    content_part = f"Full Content:\n{draft.content}\n"
    return heading + url_line + snippet_line + content_part
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def render_pool_block(n: int, draft: SourceDraft) -> str:
    """Render one snippet-pool block for citation number ``n``.

    Pool blocks show the snippet as their ``Content`` line and never read the
    draft's ``content`` field; the heading carries a ``(snippet only)`` tag.

    Args:
        n: Citation number shown inside the square brackets.
        draft: Source whose title, url and snippet are rendered.

    Returns:
        The rendered three-line block, ending in a newline.
    """
    heading = f"--- [{n}] {draft.title} (snippet only) ---\n"
    url_line = f"URL: {draft.url}\n"
    content_line = f"Content: {draft.snippet}\n"
    return heading + url_line + content_line
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def truncate_to(content: str, alloc: int) -> tuple[str, bool]:
    """Cut ``content`` to at most ``alloc`` characters, marking any cut.

    The truncation suffix is appended *after* the cut, so a truncated result
    is longer than ``alloc`` by the length of the suffix: the budget governs
    the content, the marker is overhead.

    A negative ``alloc`` is treated as zero. Without that clamp a negative
    slice bound would count from the end of the string and keep almost all of
    the content instead of none of it.

    Args:
        content: Text to fit into the allocation.
        alloc: Maximum number of content characters to keep.

    Returns:
        ``(text, truncated)`` — the possibly shortened text and whether a cut
        was made. Content that already fits is returned unchanged with
        ``False``.
    """
    # ``content[:-3]`` means "all but the last three characters", which is the
    # opposite of a tighter budget; clamp so a negative budget keeps nothing.
    limit = max(alloc, 0)
    if len(content) <= limit:
        return content, False
    return content[:limit] + TRUNCATION_SUFFIX, True
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def build_context(
    primary: Sequence[SourceDraft],
    pool: Sequence[SourceDraft],
) -> str:
    """Render both segments into one numbered context string.

    Primary drafts are rendered first, in the order given, numbered from 1.
    When ``pool`` is non-empty the pool header follows, then one snippet-only
    block per pool draft, with numbering continuing where the primary segment
    stopped. All pieces are joined with a single newline; since each rendered
    block already ends in a newline, consecutive blocks end up separated by a
    blank line.

    Args:
        primary: Drafts rendered as full source blocks.
        pool: Drafts rendered as snippet-only blocks after the header.

    Returns:
        The assembled context, or an empty string when both inputs are empty.
    """
    blocks = [
        render_source_block(number, item)
        for number, item in enumerate(primary, start=1)
    ]
    if pool:
        # Take the next number before the header is appended, so the header
        # itself never consumes a citation number.
        next_number = len(blocks) + 1
        blocks.append(SNIPPET_POOL_HEADER)
        blocks.extend(
            render_pool_block(number, item)
            for number, item in enumerate(pool, start=next_number)
        )
    return "\n".join(blocks)
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""Caching subsystem: backends, key builders, and the :func:`make_cache` factory.
|
|
2
|
+
|
|
3
|
+
WHY this package: the pipeline caches three things — provider searches, fetched
|
|
4
|
+
pages, and query expansions — each with its own TTL (see ``WebSearchConfig``).
|
|
5
|
+
All three flow through the single :class:`~websearch_kit.protocols.Cache`
|
|
6
|
+
protocol, so the choice of backend is a configuration detail resolved once, here,
|
|
7
|
+
by :func:`make_cache`. Centralizing construction keeps backend wiring (the sqlite
|
|
8
|
+
directory resolution, the fail-soft :class:`CacheGuard` wrapping) out of the
|
|
9
|
+
engine.
|
|
10
|
+
|
|
11
|
+
The factory always wraps a real backend in :class:`CacheGuard` so a backend fault
|
|
12
|
+
degrades to a cache miss instead of failing the run (no-fail-silent: the guard
|
|
13
|
+
logs and counts every swallowed error). ``cache_backend="none"`` returns ``None``
|
|
14
|
+
— the engine treats a ``None`` cache as "caching disabled" and skips lookups
|
|
15
|
+
entirely, which is cheaper than routing every call through a no-op object.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import os
|
|
21
|
+
import time
|
|
22
|
+
from collections.abc import Callable
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
from ..config import WebSearchConfig
|
|
26
|
+
from ..protocols import Cache
|
|
27
|
+
from .keys import content_key, expansion_key, search_key
|
|
28
|
+
from .memory import MemoryTTLCache
|
|
29
|
+
from .sqlite_cache import CacheGuard, SqliteCache
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
"CacheGuard",
|
|
33
|
+
"MemoryTTLCache",
|
|
34
|
+
"SqliteCache",
|
|
35
|
+
"content_key",
|
|
36
|
+
"default_cache_dir",
|
|
37
|
+
"expansion_key",
|
|
38
|
+
"make_cache",
|
|
39
|
+
"search_key",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def default_cache_dir() -> Path:
    """Return the default cache directory, following the XDG base-dir rules.

    ``$XDG_CACHE_HOME`` is used as the base only when it is set, non-empty and
    an absolute path; a relative or empty value is ignored. Otherwise the base
    is ``~/.cache``. The result is always the ``websearch-kit`` subdirectory
    of the chosen base. Only ``os`` and ``pathlib`` are involved, and the
    directory is not created here.

    Returns:
        Path of the ``websearch-kit`` cache directory.
    """
    configured = os.environ.get("XDG_CACHE_HOME")
    # Guard clause: a usable XDG value wins outright.
    if configured and os.path.isabs(configured):
        return Path(configured) / "websearch-kit"
    return Path.home() / ".cache" / "websearch-kit"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def make_cache(
|
|
62
|
+
config: WebSearchConfig,
|
|
63
|
+
clock: Callable[[], float] = time.monotonic,
|
|
64
|
+
) -> Cache | None:
|
|
65
|
+
"""Build the configured cache backend, or ``None`` when caching is disabled.
|
|
66
|
+
|
|
67
|
+
* ``"memory"`` -> :class:`MemoryTTLCache` (process-local, default).
|
|
68
|
+
* ``"sqlite"`` -> :class:`SqliteCache` rooted at ``config.cache_dir`` (or
|
|
69
|
+
:func:`default_cache_dir` when unset).
|
|
70
|
+
* ``"none"`` -> ``None`` (the engine skips caching entirely).
|
|
71
|
+
|
|
72
|
+
Any real backend is wrapped in :class:`CacheGuard` so backend faults become
|
|
73
|
+
logged, counted cache misses rather than run failures.
|
|
74
|
+
|
|
75
|
+
The ``clock`` is threaded into the memory backend (whose TTL uses a monotonic
|
|
76
|
+
source) so tests can drive expiry deterministically. The sqlite backend keeps
|
|
77
|
+
its own wall-clock default because its TTLs must survive process restarts.
|
|
78
|
+
"""
|
|
79
|
+
backend = config.cache_backend
|
|
80
|
+
if backend == "none":
|
|
81
|
+
return None
|
|
82
|
+
if backend == "memory":
|
|
83
|
+
return CacheGuard(MemoryTTLCache(max_entries=512, clock=clock))
|
|
84
|
+
if backend == "sqlite":
|
|
85
|
+
directory = Path(config.cache_dir) if config.cache_dir else default_cache_dir()
|
|
86
|
+
return CacheGuard(SqliteCache(directory=directory))
|
|
87
|
+
# CacheBackend is a closed Literal; an unhandled member is a programming
|
|
88
|
+
# error, not a runtime input — surface it loudly rather than fail silent.
|
|
89
|
+
raise AssertionError(f"unhandled cache backend: {backend!r}") # pragma: no cover
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Deterministic cache-key builders for the three cacheable pipeline stages.
|
|
2
|
+
|
|
3
|
+
WHY a dedicated module: cache correctness hinges on *stable, collision-resistant*
|
|
4
|
+
keys derived from exactly the inputs that change a result — and nothing else. A
|
|
5
|
+
key built ad-hoc at each call site drifts (parameter order, optional-None
|
|
6
|
+
handling, forgetting to normalize a URL) and silently poisons the cache with
|
|
7
|
+
stale or cross-contaminated entries. Centralizing the builders makes the keyspace
|
|
8
|
+
auditable and the canonicalization rules a single source of truth.
|
|
9
|
+
|
|
10
|
+
Design choices:
|
|
11
|
+
|
|
12
|
+
* **sha256 hex of a canonical string.** Keys are fixed-length, opaque, and safe
|
|
13
|
+
to use as a sqlite ``TEXT PRIMARY KEY`` regardless of how long/odd the inputs
|
|
14
|
+
are. sha256 is not used here for any security property — only for a uniform,
|
|
15
|
+
low-collision digest — so the lint that flags weak hashes does not apply.
|
|
16
|
+
* **Explicit stage prefixes** (``search:``/``content:``/``expand:``) keep the
|
|
17
|
+
three keyspaces disjoint inside a shared backend, so a content URL can never
|
|
18
|
+
alias a search query that happens to hash the same canonical bytes.
|
|
19
|
+
* **``content_key`` applies :func:`sanitize_url` itself.** Stripping tracking
|
|
20
|
+
parameters (utm_*, gclid, ...) before hashing is what makes two links to the
|
|
21
|
+
same page share a cache entry. Doing it *inside* the key builder means a caller
|
|
22
|
+
physically cannot forget to normalize first — the dedup win is structural, not
|
|
23
|
+
a convention someone must remember.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import hashlib
|
|
29
|
+
|
|
30
|
+
from ..security.sanitize import sanitize_url
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
"content_key",
|
|
34
|
+
"expansion_key",
|
|
35
|
+
"search_key",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
#: Field separator for the canonical string. A control character (unit
|
|
39
|
+
#: separator, 0x1f) cannot appear in a normal query/URL/prompt, so it cannot be
|
|
40
|
+
#: forged by input to merge two distinct tuples into one canonical string.
|
|
41
|
+
_SEP = "\x1f"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _digest(prefix: str, *parts: str) -> str:
    """Build a cache key: ``prefix`` followed by a sha256 hex digest.

    The parts are joined with the module's field separator, encoded as UTF-8
    and hashed; the prefix is prepended verbatim and is not part of the
    hashed bytes.

    Args:
        prefix: Keyspace tag placed in front of the digest.
        *parts: Already-canonicalized string fields, in a fixed order.

    Returns:
        The prefix immediately followed by 64 lowercase hex characters.
    """
    payload = _SEP.join(parts).encode("utf-8")
    return f"{prefix}{hashlib.sha256(payload).hexdigest()}"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def search_key(
    provider: str,
    query: str,
    count: int,
    lang: str | None,
    time_range: str | None,
) -> str:
    """Build the cache key for a single provider search call.

    All five arguments feed the key, in a fixed order: provider, query, the
    requested ``count`` (as its decimal string), language and time range. An
    unset filter — ``None`` or an empty string — is canonicalized to the empty
    string, so both spellings of "no filter" share one key.

    Args:
        provider: Name of the search provider.
        query: Query text, used as given.
        count: Number of results requested.
        lang: Optional language filter.
        time_range: Optional time-range filter.

    Returns:
        A key in the ``search:`` keyspace.
    """
    fields = (
        provider,
        query,
        str(count),
        lang or "",
        time_range or "",
    )
    return _digest("search:", *fields)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def content_key(sanitized_url: str) -> str:
    """Build the cache key for one fetched page from its normalized URL.

    Despite the parameter name, the argument is not assumed to be normalized
    already: it is always passed through :func:`sanitize_url` before hashing.
    Per this module's design notes, that is what lets URLs differing only in
    tracking parameters share a cache entry, and doing it here means no call
    site can skip the step.

    Args:
        sanitized_url: URL of the page; normalized again here regardless.

    Returns:
        A key in the ``content:`` keyspace.
    """
    normalized = sanitize_url(sanitized_url)
    return _digest("content:", normalized)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def expansion_key(prompt_text: str) -> str:
    """Build the cache key for a query-expansion result.

    The whole prompt string is the only input to the key, so any change to
    the prompt text yields a different cache entry.

    Args:
        prompt_text: Full prompt text that identifies the expansion.

    Returns:
        A key in the ``expand:`` keyspace.
    """
    keyspace = "expand:"
    return _digest(keyspace, prompt_text)
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""In-process TTL cache with LRU eviction — the zero-config default backend.
|
|
2
|
+
|
|
3
|
+
WHY in-memory by default: the common deployment is a single long-lived process
|
|
4
|
+
(an OWUI worker, an MCP server) where a process-local cache eliminates duplicate
|
|
5
|
+
provider calls and page fetches within a session at zero operational cost — no
|
|
6
|
+
file, no extra dependency. It is intentionally *not* shared across processes;
|
|
7
|
+
callers needing cross-process persistence opt into :class:`SqliteCache`.
|
|
8
|
+
|
|
9
|
+
Concurrency model: every mutation (and the lazy-expiry bookkeeping on ``get``)
|
|
10
|
+
runs under a single :class:`asyncio.Lock`. The cache is only safe within one
|
|
11
|
+
event loop — which is the contract, since the pipeline is async and single-loop.
|
|
12
|
+
The lock makes the read-modify-write on the backing :class:`OrderedDict` atomic
|
|
13
|
+
with respect to other awaiting tasks, so concurrent ``get``/``set`` from
|
|
14
|
+
``asyncio.gather`` can never observe or produce a half-updated structure.
|
|
15
|
+
|
|
16
|
+
Eviction policy:
|
|
17
|
+
|
|
18
|
+
* **TTL** — each entry carries an absolute monotonic expiry timestamp. Expired
|
|
19
|
+
entries are dropped *lazily* on ``get`` (the cheap, always-correct path) and
|
|
20
|
+
*opportunistically* swept on ``set`` only when the cache is over capacity
|
|
21
|
+
(amortizing the sweep cost instead of scanning on every write).
|
|
22
|
+
* **LRU** — recency is tracked by position in the ``OrderedDict``: a hit or a
|
|
23
|
+
write moves the key to the most-recently-used end. When the map exceeds
|
|
24
|
+
``max_entries`` after a write, the least-recently-used keys are popped from the
|
|
25
|
+
front until back within bounds.
|
|
26
|
+
|
|
27
|
+
The clock is injectable (default :func:`time.monotonic`) so tests drive TTL
|
|
28
|
+
expiry deterministically without sleeping. Monotonic time is used because TTL is
|
|
29
|
+
a duration; it must be immune to wall-clock jumps (NTP steps, DST).
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import asyncio
|
|
35
|
+
import time
|
|
36
|
+
from collections import OrderedDict
|
|
37
|
+
from collections.abc import Callable
|
|
38
|
+
from dataclasses import dataclass
|
|
39
|
+
from typing import Any
|
|
40
|
+
|
|
41
|
+
__all__ = ["MemoryTTLCache"]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass(slots=True)
|
|
45
|
+
class _Entry:
|
|
46
|
+
"""A cached value plus its absolute monotonic expiry timestamp."""
|
|
47
|
+
|
|
48
|
+
value: Any
|
|
49
|
+
expires_at: float
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class MemoryTTLCache:
|
|
53
|
+
"""Async, LRU-bounded, per-entry-TTL cache implementing :class:`protocols.Cache`.
|
|
54
|
+
|
|
55
|
+
Args:
|
|
56
|
+
max_entries: Hard cap on retained entries; LRU keys are evicted past it.
|
|
57
|
+
clock: Monotonic time source returning seconds; injected for tests.
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
def __init__(
|
|
61
|
+
self,
|
|
62
|
+
max_entries: int = 512,
|
|
63
|
+
clock: Callable[[], float] = time.monotonic,
|
|
64
|
+
) -> None:
|
|
65
|
+
self._max_entries = max_entries
|
|
66
|
+
self._clock = clock
|
|
67
|
+
# Insertion/access order IS the LRU order: front = least recently used.
|
|
68
|
+
self._store: OrderedDict[str, _Entry] = OrderedDict()
|
|
69
|
+
self._lock = asyncio.Lock()
|
|
70
|
+
|
|
71
|
+
async def get(self, key: str) -> Any | None:
|
|
72
|
+
"""Return the live value for ``key``, or ``None`` if absent or expired.
|
|
73
|
+
|
|
74
|
+
A live hit is promoted to most-recently-used. An expired entry is dropped
|
|
75
|
+
lazily here (and reported as a miss) so stale data never escapes, even if
|
|
76
|
+
no write has triggered an opportunistic sweep.
|
|
77
|
+
"""
|
|
78
|
+
async with self._lock:
|
|
79
|
+
entry = self._store.get(key)
|
|
80
|
+
if entry is None:
|
|
81
|
+
return None
|
|
82
|
+
if self._is_expired(entry):
|
|
83
|
+
# Lazy expiry: drop the stale row and report a miss. Not
|
|
84
|
+
# fail-silent — an expired entry is definitionally not present.
|
|
85
|
+
del self._store[key]
|
|
86
|
+
return None
|
|
87
|
+
self._store.move_to_end(key)
|
|
88
|
+
return entry.value
|
|
89
|
+
|
|
90
|
+
async def set(self, key: str, value: Any, ttl: float) -> None:
|
|
91
|
+
"""Insert/replace ``key`` with ``value``, expiring ``ttl`` seconds hence.
|
|
92
|
+
|
|
93
|
+
The key becomes most-recently-used. When the store is over capacity after
|
|
94
|
+
the write, expired entries are swept first (reclaiming space without
|
|
95
|
+
discarding live data), then the least-recently-used keys are evicted until
|
|
96
|
+
back within ``max_entries``.
|
|
97
|
+
"""
|
|
98
|
+
async with self._lock:
|
|
99
|
+
expires_at = self._clock() + ttl
|
|
100
|
+
if key in self._store:
|
|
101
|
+
self._store.move_to_end(key)
|
|
102
|
+
self._store[key] = _Entry(value=value, expires_at=expires_at)
|
|
103
|
+
if len(self._store) > self._max_entries:
|
|
104
|
+
self._sweep_expired()
|
|
105
|
+
self._evict_lru()
|
|
106
|
+
|
|
107
|
+
def _is_expired(self, entry: _Entry) -> bool:
|
|
108
|
+
return entry.expires_at <= self._clock()
|
|
109
|
+
|
|
110
|
+
def _sweep_expired(self) -> None:
|
|
111
|
+
"""Drop every currently-expired entry. Caller must hold the lock."""
|
|
112
|
+
now = self._clock()
|
|
113
|
+
stale = [key for key, entry in self._store.items() if entry.expires_at <= now]
|
|
114
|
+
for key in stale:
|
|
115
|
+
del self._store[key]
|
|
116
|
+
|
|
117
|
+
def _evict_lru(self) -> None:
|
|
118
|
+
"""Pop least-recently-used keys until within capacity. Caller holds lock."""
|
|
119
|
+
while len(self._store) > self._max_entries:
|
|
120
|
+
# popitem(last=False) removes the front == least-recently-used entry.
|
|
121
|
+
self._store.popitem(last=False)
|