leancontext 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- leancontext/__init__.py +104 -0
- leancontext/cli.py +36 -0
- leancontext/core.py +214 -0
- leancontext/cost.py +104 -0
- leancontext/fidelity.py +108 -0
- leancontext/integrations/__init__.py +8 -0
- leancontext/integrations/_common.py +62 -0
- leancontext/integrations/anthropic_native.py +83 -0
- leancontext/integrations/clients.py +58 -0
- leancontext/integrations/decorator.py +103 -0
- leancontext/integrations/frameworks.py +58 -0
- leancontext/integrations/litellm.py +80 -0
- leancontext/integrations/mcp_server.py +64 -0
- leancontext/integrations/otel.py +78 -0
- leancontext/integrations/proxy.py +90 -0
- leancontext/messages.py +152 -0
- leancontext/paging.py +104 -0
- leancontext/py.typed +0 -0
- leancontext/reducers/__init__.py +36 -0
- leancontext/reducers/base.py +19 -0
- leancontext/reducers/diff.py +54 -0
- leancontext/reducers/html.py +64 -0
- leancontext/reducers/json_data.py +61 -0
- leancontext/reducers/logs.py +91 -0
- leancontext/reducers/stacktrace.py +59 -0
- leancontext/reducers/table.py +32 -0
- leancontext/tokens.py +79 -0
- leancontext-2.0.0.dist-info/METADATA +224 -0
- leancontext-2.0.0.dist-info/RECORD +32 -0
- leancontext-2.0.0.dist-info/WHEEL +4 -0
- leancontext-2.0.0.dist-info/entry_points.txt +2 -0
- leancontext-2.0.0.dist-info/licenses/LICENSE +190 -0
leancontext/messages.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Protocol-aware message reduction — the gateway/wire surface.
|
|
2
|
+
|
|
3
|
+
This is how LeanContext plugs into gateways (LiteLLM), SDK client wrappers, and proxies
|
|
4
|
+
*without* the structure-blindness that hurts wire-level compressors: the chat
|
|
5
|
+
protocols already tag tool outputs (OpenAI ``role="tool"``; Anthropic
|
|
6
|
+
``tool_result`` blocks), so we can find and reduce exactly those — and nothing
|
|
7
|
+
else. We never touch system/user/assistant instruction text. Fail-open throughout.
|
|
8
|
+
|
|
9
|
+
Cache-safety: reductions are deterministic and content-addressed, so the same tool
|
|
10
|
+
output always serialises to the same bytes → the provider prompt-cache keeps hitting.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
from .core import reduce_text
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def detect_format(messages: list) -> str:
    """Guess which chat protocol a message list follows.

    Scans the messages in order and answers on the first recognisable marker:
    a ``parts`` list means ``"gemini"``, a ``tool``/``function`` role means
    ``"openai"``, and a ``tool_result`` block inside a list-valued ``content``
    means ``"anthropic"``. Non-dict entries are ignored. When nothing matches,
    ``"openai"`` is returned as the default.
    """
    for message in messages:
        if isinstance(message, dict):
            if isinstance(message.get("parts"), list):
                return "gemini"
            if message.get("role") in ("tool", "function"):
                return "openai"
            blocks = message.get("content")
            if isinstance(blocks, list) and any(
                isinstance(item, dict) and item.get("type") == "tool_result"
                for item in blocks
            ):
                return "anthropic"
    return "openai"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _reduce_str(text: Any, opts: dict) -> Any:
    """Run ``reduce_text`` on a string and return the reduced text.

    ``opts`` is forwarded to ``reduce_text`` as keyword arguments. Anything
    that is not a ``str`` is handed back untouched.
    """
    if isinstance(text, str):
        return reduce_text(text, **opts).text
    return text
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# --- OpenAI / chat-completions format ----------------------------------------
|
|
44
|
+
|
|
45
|
+
def _reduce_openai_message(m: Any, opts: dict) -> Any:
    """Reduce the content of one OpenAI-style tool/function message.

    Only dict messages whose ``role`` is ``"tool"`` or ``"function"`` are
    considered; everything else is returned as-is. A string ``content`` is
    reduced directly, a list ``content`` is reduced part by part, and any
    other content type leaves the message untouched. The input dict is never
    mutated: a shallow copy carries the new content.
    """
    is_tool_output = isinstance(m, dict) and m.get("role") in ("tool", "function")
    if not is_tool_output:
        return m

    payload = m.get("content")
    if isinstance(payload, str):
        return {**m, "content": _reduce_str(payload, opts)}
    if isinstance(payload, list):
        return {**m, "content": [_reduce_openai_part(item, opts) for item in payload]}
    return m
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _reduce_openai_part(part: Any, opts: dict) -> Any:
    """Reduce one content part of an OpenAI-style tool message.

    Only dict parts typed ``"text"`` or ``"output_text"`` that carry a string
    ``text`` field are rewritten (as a shallow copy); every other part is
    returned unchanged.
    """
    if not isinstance(part, dict):
        return part
    if part.get("type") not in ("text", "output_text"):
        return part
    body = part.get("text")
    if not isinstance(body, str):
        return part
    return {**part, "text": _reduce_str(body, opts)}
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# --- Anthropic messages format -----------------------------------------------
|
|
73
|
+
|
|
74
|
+
def _reduce_anthropic_message(m: Any, opts: dict) -> Any:
    """Reduce the ``tool_result`` blocks of one Anthropic-style message.

    The message is returned as-is unless it is a dict whose ``content`` is a
    list. Inside that list, each ``tool_result`` block with a string content
    is reduced directly, and one with a list content is reduced text block by
    text block. All other blocks are carried over unchanged. If no
    ``tool_result`` block with str/list content was found, the original
    message object is returned; otherwise a shallow copy with the rebuilt
    block list is returned.
    """
    blocks = m.get("content") if isinstance(m, dict) else None
    if not isinstance(blocks, list):
        return m

    rebuilt = []
    touched = False
    for block in blocks:
        is_result = isinstance(block, dict) and block.get("type") == "tool_result"
        inner = block.get("content") if is_result else None
        if isinstance(inner, str):
            rebuilt.append({**block, "content": _reduce_str(inner, opts)})
            touched = True
        elif isinstance(inner, list):
            rebuilt.append(
                {**block, "content": [_reduce_anthropic_textblock(x, opts) for x in inner]}
            )
            touched = True
        else:
            rebuilt.append(block)

    return {**m, "content": rebuilt} if touched else m
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _reduce_anthropic_textblock(x: Any, opts: dict) -> Any:
    """Reduce one entry of a ``tool_result`` content list.

    Only dict entries typed ``"text"`` with a string ``text`` field are
    rewritten (as a shallow copy); anything else is returned unchanged.
    """
    if not isinstance(x, dict) or x.get("type") != "text":
        return x
    body = x.get("text")
    if not isinstance(body, str):
        return x
    return {**x, "text": _reduce_str(body, opts)}
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# --- Gemini format -----------------------------------------------------------
|
|
113
|
+
# Gemini uses `contents` -> `parts`, where a tool result is a `functionResponse`
|
|
114
|
+
# part whose `response` is a dict. We reduce the large string values inside that
|
|
115
|
+
# dict, keeping the dict shape Gemini requires. Typed SDK objects (non-dict)
|
|
116
|
+
# pass through untouched.
|
|
117
|
+
|
|
118
|
+
def _reduce_gemini_message(content: Any, opts: dict) -> Any:
    """Reduce the ``functionResponse`` parts of one Gemini-style content dict.

    The input is returned as-is unless it is a dict with a list under
    ``parts``. For every part that holds a dict ``functionResponse`` with a
    dict ``response``, the top-level string values of that response are
    reduced while its keys and non-string values are kept. Other parts pass
    through. If no such part exists the original object is returned;
    otherwise a shallow copy with the rebuilt parts is returned.
    """
    parts = content.get("parts") if isinstance(content, dict) else None
    if not isinstance(parts, list):
        return content

    rebuilt = []
    touched = False
    for part in parts:
        call_result = part.get("functionResponse") if isinstance(part, dict) else None
        payload = call_result.get("response") if isinstance(call_result, dict) else None
        if not isinstance(payload, dict):
            rebuilt.append(part)
            continue
        slim = {}
        for key, value in payload.items():
            slim[key] = _reduce_str(value, opts) if isinstance(value, str) else value
        rebuilt.append({**part, "functionResponse": {**call_result, "response": slim}})
        touched = True

    if touched:
        return {**content, "parts": rebuilt}
    return content
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# --- public ------------------------------------------------------------------
|
|
137
|
+
|
|
138
|
+
def reduce_messages(messages: Any, *, fmt: str = "auto", **opts) -> Any:
    """Build a new message list in which tool outputs have been reduced.

    ``fmt`` selects the protocol: ``"anthropic"``, ``"gemini"``, or anything
    else for the OpenAI shape; ``"auto"`` asks ``detect_format`` to decide.
    Remaining keyword arguments are passed along to the per-message reducers.
    A non-list ``messages`` value is returned unchanged. The input list itself
    is not modified; a new list is always returned for list input.
    """
    if not isinstance(messages, list):
        return messages

    protocol = fmt if fmt != "auto" else detect_format(messages)
    if protocol == "anthropic":
        reducer = _reduce_anthropic_message
    elif protocol == "gemini":
        reducer = _reduce_gemini_message
    else:
        reducer = _reduce_openai_message
    return [reducer(message, opts) for message in messages]
|
leancontext/paging.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Paging: drop aged tool outputs from the wire, keep them retrievable.
|
|
2
|
+
|
|
3
|
+
Reducing shrinks each payload; paging goes further by removing old payloads from
|
|
4
|
+
context once the agent has moved on. The output is replaced with a small reference
|
|
5
|
+
(a few tens of tokens) and the original is stored, so the agent can fetch it back
|
|
6
|
+
with the expand tool when it needs the detail again.
|
|
7
|
+
|
|
8
|
+
Refs are content hashes, so they're deterministic. The store is in-memory by
|
|
9
|
+
default, or disk-backed (set ``root``) for retrieval across processes.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
|
|
17
|
+
from .tokens import content_ref, count_tokens
|
|
18
|
+
|
|
19
|
+
REF_SCHEME = "lc"
|
|
20
|
+
_REF_RE = re.compile(r"lc://([0-9a-f]{6,40})")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ContentStore:
    """Map a content ref to the original content it was computed from.

    Entries live in an in-memory dict by default. When ``root`` is set, each
    entry is stored as the file ``<root>/<ref>.txt`` instead, and ``root`` is
    created on construction if it does not exist yet.
    """

    def __init__(self, root: str | None = None):
        self.root = root
        self._mem: dict[str, str] = {}
        if self.root:
            os.makedirs(self.root, exist_ok=True)

    @staticmethod
    def _is_plain_name(ref: str) -> bool:
        """Return True when ``ref`` is a single path component.

        Rejects empty or non-string refs, embedded NULs, any path separator
        and any drive prefix, so a ref can never address a file outside
        ``root`` once it is joined onto it.
        """
        if not isinstance(ref, str) or not ref or "\0" in ref:
            return False
        if "/" in ref or "\\" in ref or os.sep in ref:
            return False
        if os.altsep and os.altsep in ref:
            return False
        return not os.path.splitdrive(ref)[0]

    def _path(self, ref: str) -> str:
        """Return the file path used for ``ref`` in a disk-backed store."""
        return os.path.join(self.root, f"{ref}.txt")  # type: ignore[arg-type]

    def put(self, content: str) -> str:
        """Store ``content`` and return the ref produced by ``content_ref``."""
        ref = content_ref(content)
        if self.root:
            # newline="" turns off newline translation so the stored text is
            # exactly what was passed in (and still matches its ref).
            with open(self._path(ref), "w", encoding="utf-8", newline="") as fh:
                fh.write(content)
        else:
            self._mem[ref] = content
        return ref

    def get(self, ref: str) -> str | None:
        """Return the content stored under ``ref``, or ``None`` if absent.

        For a disk-backed store the ref may come from outside the process
        (for example from the expand tool), so refs that are not a plain file
        name are treated as unknown instead of being joined onto ``root``.
        """
        if self.root:
            if not self._is_plain_name(ref):
                return None
            try:
                # newline="" keeps "\r\n" / "\r" as written instead of
                # folding them to "\n" on read.
                with open(self._path(ref), encoding="utf-8", newline="") as fh:
                    return fh.read()
            except OSError:
                return None
        return self._mem.get(ref)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
_DEFAULT_STORE = ContentStore()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _normalize(ref: str) -> str:
    """Extract the bare id from a ref string.

    If ``_REF_RE`` finds an ``lc://<id>`` reference anywhere in ``ref``, the
    id part is returned; otherwise the input is returned with surrounding
    whitespace stripped.
    """
    found = _REF_RE.search(ref)
    if found is None:
        return ref.strip()
    return found.group(1)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def store(content: str, using: ContentStore | None = None) -> str:
    """Put ``content`` into a store and return the ref it was filed under.

    Uses ``using`` when given, otherwise the module-level default store.
    """
    backend = using if using else _DEFAULT_STORE
    return backend.put(content)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def expand(ref: str, using: ContentStore | None = None) -> str | None:
    """Look up the original content for ``ref``.

    ``ref`` is first passed through ``_normalize``, so both ``lc://<id>`` and
    a bare id are accepted. Uses ``using`` when given, otherwise the
    module-level default store. Returns whatever the store's ``get`` returns.
    """
    backend = using if using else _DEFAULT_STORE
    key = _normalize(ref)
    return backend.get(key)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def reference_line(content: str, summary: str | None = None,
                   using: ContentStore | None = None) -> str:
    """Store ``content`` and build the one-line reference that replaces it.

    The line carries the ``lc://`` ref, the token count of the original
    content as reported by ``count_tokens``, a hint to call
    ``leancontext_expand``, and — when ``summary`` is non-empty — the summary
    after an em dash.
    """
    ref = store(content, using=using)
    tokens = count_tokens(content)
    if summary:
        tail = f" — {summary}"
    else:
        tail = ""
    return f"[{REF_SCHEME}://{ref} · {tokens} tokens · call leancontext_expand to view{tail}]"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def page(content: str, *, summary: str | None = None,
         using: ContentStore | None = None) -> str:
    """Replace ``content`` with its expandable reference line.

    Thin keyword-only front end to ``reference_line``: the content is stored
    and the resulting reference line is returned.
    """
    line = reference_line(content, summary=summary, using=using)
    return line
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
#: Tool spec to expose ``expand`` to an agent (OpenAI/Anthropic/MCP-compatible shape).
|
|
88
|
+
EXPAND_TOOL_SPEC = {
|
|
89
|
+
"name": "leancontext_expand",
|
|
90
|
+
"description": (
|
|
91
|
+
"Retrieve the full original content for a LeanContext reference id "
|
|
92
|
+
"(format: lc://<id>) that was collapsed to save tokens."
|
|
93
|
+
),
|
|
94
|
+
"input_schema": {
|
|
95
|
+
"type": "object",
|
|
96
|
+
"properties": {
|
|
97
|
+
"ref": {
|
|
98
|
+
"type": "string",
|
|
99
|
+
"description": "The reference id, e.g. 'lc://a1b2c3d4' or the bare id.",
|
|
100
|
+
},
|
|
101
|
+
},
|
|
102
|
+
"required": ["ref"],
|
|
103
|
+
},
|
|
104
|
+
}
|
leancontext/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Typed reducers.
|
|
2
|
+
|
|
3
|
+
Each reducer module exposes a ``REDUCER`` (kind, detector, reduce function,
|
|
4
|
+
priority). ``REGISTRY`` is the ordered list the core iterates for detection and
|
|
5
|
+
dispatch, so adding a reducer means adding one module and listing it here.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .base import Reducer
|
|
9
|
+
from .diff import REDUCER as _diff
|
|
10
|
+
from .diff import reduce_diff
|
|
11
|
+
from .html import REDUCER as _html
|
|
12
|
+
from .html import reduce_html
|
|
13
|
+
from .json_data import REDUCER as _json
|
|
14
|
+
from .json_data import reduce_json
|
|
15
|
+
from .logs import REDUCER as _logs
|
|
16
|
+
from .logs import reduce_logs
|
|
17
|
+
from .stacktrace import REDUCER as _stacktrace
|
|
18
|
+
from .stacktrace import reduce_stacktrace
|
|
19
|
+
from .table import REDUCER as _table
|
|
20
|
+
from .table import reduce_table
|
|
21
|
+
|
|
22
|
+
# Detection runs in priority order (lowest first): json, stacktrace, diff, html, log, table.
|
|
23
|
+
REGISTRY: list[Reducer] = sorted(
|
|
24
|
+
[_json, _stacktrace, _diff, _html, _logs, _table], key=lambda r: r.priority
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"Reducer",
|
|
29
|
+
"REGISTRY",
|
|
30
|
+
"reduce_logs",
|
|
31
|
+
"reduce_json",
|
|
32
|
+
"reduce_diff",
|
|
33
|
+
"reduce_stacktrace",
|
|
34
|
+
"reduce_html",
|
|
35
|
+
"reduce_table",
|
|
36
|
+
]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""The shape every reducer registers with.
|
|
2
|
+
|
|
3
|
+
A reducer bundles three things: the kind name, a detector that says whether a
|
|
4
|
+
payload is this kind, and the reduce function. Detection priority is explicit
|
|
5
|
+
(lower runs first), so the order is clear and stable.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Callable
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class Reducer:
    """Immutable registration record for one reducer.

    Bundles the kind name, the detector, the reduce function and the explicit
    detection priority of a reducer.
    """

    # Name of the payload kind this reducer handles.
    kind: str
    # Detector: given the payload text, says whether it is of this kind.
    detect: Callable[[str], bool]
    # Reduce function: given the payload text, returns (reduced text, notes).
    reduce: Callable[[str], tuple[str, list[str]]]
    # Detection priority; lower values run first.
    priority: int
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Diff reducer.
|
|
2
|
+
|
|
3
|
+
Keeps every change line (``+``/``-``), hunk header (``@@``) and file header verbatim
|
|
4
|
+
— those are the signal. Collapses long runs of unchanged context lines to the first
|
|
5
|
+
and last line plus a count. Deterministic; value-preserving for all changes.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
from .base import Reducer
|
|
13
|
+
|
|
14
|
+
_DIFF_HUNK = re.compile(r"(?m)^@@ -\d+(?:,\d+)? \+\d+(?:,\d+)? @@")
|
|
15
|
+
|
|
16
|
+
_KEEP_PREFIXES = (
|
|
17
|
+
"+", "-", "@@", "diff ", "index ", "new file", "deleted file",
|
|
18
|
+
"rename ", "similarity ", "copy ", "old mode", "new mode",
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def reduce_diff(text: str) -> tuple[str, list[str]]:
    """Shorten a diff by collapsing long runs of unchanged lines.

    Every line that starts with one of ``_KEEP_PREFIXES`` is emitted as-is.
    Consecutive other lines form a run: runs of up to three lines are emitted
    whole, longer runs are replaced by their first line, a marker giving the
    number of omitted lines, and their last line.

    Returns the reduced text and a one-item list of notes with the line
    counts before and after.
    """
    source = text.splitlines()
    kept: list[str] = []
    pending: list[str] = []  # the run of non-kept lines currently being collected

    def emit_pending() -> None:
        run = len(pending)
        if run > 3:
            kept.extend((pending[0], f" ⟪… {run - 2} unchanged lines⟫", pending[-1]))
        else:
            kept.extend(pending)
        del pending[:]

    for entry in source:
        if not entry.startswith(_KEEP_PREFIXES):
            pending.append(entry)
            continue
        emit_pending()
        kept.append(entry)
    emit_pending()

    summary = (
        f"kept all change/header lines; collapsed unchanged context ({len(source)}→{len(kept)} lines)"
    )
    return "\n".join(kept), [summary]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _detect(text: str) -> bool:
    """Report whether ``text`` looks like a diff.

    True when the text (ignoring leading whitespace) starts with
    ``diff --git``, or when ``_DIFF_HUNK`` matches anywhere in it.
    """
    if text.lstrip().startswith("diff --git"):
        return True
    return _DIFF_HUNK.search(text) is not None
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
REDUCER = Reducer("diff", _detect, reduce_diff, priority=30)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""HTML reducer — for web-fetch / scraped tool outputs.
|
|
2
|
+
|
|
3
|
+
Strips tags, scripts, styles and collapses whitespace, keeping the visible text and
|
|
4
|
+
the links (URLs are signal, so they're preserved). Stdlib only, deterministic.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from html.parser import HTMLParser
|
|
11
|
+
|
|
12
|
+
from .base import Reducer
|
|
13
|
+
|
|
14
|
+
_SKIP = {"script", "style", "noscript", "svg", "head", "template"}


class _Extract(HTMLParser):
    """Collect visible text chunks and link targets from an HTML document.

    Text inside any element listed in ``_SKIP`` is ignored. ``parts`` holds
    the stripped, non-empty text chunks in document order and ``links`` holds
    every non-empty ``href`` of an ``<a>`` tag.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.parts: list[str] = []
        self.links: list[str] = []
        self._skip = 0  # nesting depth of currently open skipped elements
        self._open_heads = 0  # how much of ``_skip`` is due to open <head> tags

    def handle_starttag(self, tag, attrs):
        if tag == "body":
            # The </head> end tag is optional in HTML: a <body> start tag
            # closes any open <head>. Without this, a page that omits
            # </head> would have its whole body treated as skipped content.
            self._skip -= self._open_heads
            self._open_heads = 0
        if tag in _SKIP:
            self._skip += 1
            if tag == "head":
                self._open_heads += 1
        if tag == "a":
            for key, val in attrs:
                if key == "href" and val:
                    self.links.append(val)

    def handle_endtag(self, tag):
        if tag not in _SKIP or self._skip <= 0:
            return
        if tag == "head":
            if self._open_heads == 0:
                # This head was already closed implicitly by <body>.
                return
            self._open_heads -= 1
        self._skip -= 1

    def handle_data(self, data):
        if self._skip == 0:
            text = data.strip()
            if text:
                self.parts.append(text)


def reduce_html(text: str) -> tuple[str, list[str]]:
    """Reduce an HTML document to its visible text plus its links.

    The visible text chunks are joined with newlines and runs of spaces/tabs
    are collapsed to one space. Links are de-duplicated in first-seen order
    and, when present, appended as a final ``Links:`` line.

    Returns the reduced text and a one-item list of notes.
    """
    parser = _Extract()
    parser.feed(text)
    # feed() may hold back trailing text (e.g. text ending in a bare "&")
    # while waiting for more input; close() forces it to be processed.
    parser.close()
    body = re.sub(r"[ \t]+", " ", "\n".join(parser.parts)).strip()
    links = list(dict.fromkeys(parser.links))

    out = body
    if links:
        out += "\n\nLinks: " + " ".join(links)
    notes = [f"stripped HTML tags/scripts/styles; kept visible text + {len(links)} links"]
    return out, notes
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _detect(text: str) -> bool:
    """Report whether ``text`` looks like HTML.

    True when ``<!doctype html`` or ``<html`` appears (case-insensitively) in
    the first 512 characters after leading whitespace, or when the text
    starts with ``<`` and contains at least five ``</`` sequences.
    """
    trimmed = text.lstrip()
    opening = trimmed[:512].lower()
    for marker in ("<!doctype html", "<html"):
        if marker in opening:
            return True
    # "</" has no letters, so counting it needs no case folding.
    return trimmed.startswith("<") and text.count("</") >= 5
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
REDUCER = Reducer("html", _detect, reduce_html, priority=40)
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""JSON / RAG reducer.
|
|
2
|
+
|
|
3
|
+
The dominant waste in JSON tool output is *repeated keys*: a list of 200 records
|
|
4
|
+
re-states every field name 200 times. We factor the schema out once and emit the
|
|
5
|
+
values columnar. All values are preserved, so this is near-lossless.
|
|
6
|
+
|
|
7
|
+
Falls back to whitespace-stripped (minified) JSON when the payload isn't a record
|
|
8
|
+
list — still a real saving on pretty-printed output, with zero information loss.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from .base import Reducer
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _find_records(data: Any) -> list[dict] | None:
    """Locate a non-empty list of dicts at the top level or one level down.

    Returns ``data`` itself when it is such a list, otherwise the first value
    of a top-level dict that is such a list, otherwise ``None``.
    """
    if isinstance(data, list) and data and all(isinstance(x, dict) for x in data):
        return data
    if isinstance(data, dict):
        for value in data.values():
            if isinstance(value, list) and value and all(isinstance(x, dict) for x in value):
                return value
    return None


def _fmt(value: Any) -> str:
    """Render one cell: strings as-is, everything else as minified JSON."""
    if isinstance(value, str):
        return value
    return json.dumps(value, separators=(",", ":"), ensure_ascii=False)


def reduce_json(text: str) -> tuple[str, list[str]]:
    """Reduce a JSON payload to a columnar table or to minified JSON.

    When a list of at least three records is found, the union of the record
    keys (first-seen order) is written once as a ``fields:`` header and each
    record becomes one ``|``-separated row; a key missing from a record is
    rendered as an empty cell. If the records were found inside a top-level
    dict, the dict's other entries are emitted first as one ``meta:`` line of
    minified JSON so they are not dropped. Any other payload is returned as
    minified JSON.

    Returns the reduced text and a list of notes.

    Raises:
        ValueError: if ``text`` is not valid JSON (raised by ``json.loads``).
    """
    data = json.loads(text)
    records = _find_records(data)

    if records is None or len(records) < 3:
        compact = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
        return compact, ["minified json (indentation/whitespace removed, lossless)"]

    keys = list(dict.fromkeys(k for row in records for k in row))
    notes = [f"columnar: {len(records)} records × {len(keys)} fields, keys factored out once"]
    lines: list[str] = []

    if isinstance(data, dict):
        # The record list is only one entry of the wrapper object; keep the
        # remaining entries (totals, cursors, ...) instead of discarding them.
        siblings = {k: v for k, v in data.items() if v is not records}
        if siblings:
            lines.append(
                "meta: " + json.dumps(siblings, separators=(",", ":"), ensure_ascii=False)
            )
            notes.append(f"kept {len(siblings)} sibling top-level fields as minified json")

    lines.append("fields: " + " | ".join(keys))
    lines.extend(" | ".join(_fmt(row.get(k, "")) for k in keys) for row in records)
    return "\n".join(lines), notes
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _detect(text: str) -> bool:
    """Report whether ``text`` is a JSON object or array.

    The first non-whitespace character must be ``[`` or ``{`` and the whole
    text must parse with ``json.loads``; any parsing failure means False.
    """
    if not text.lstrip().startswith(("[", "{")):
        return False
    try:
        json.loads(text)
    except Exception:
        return False
    return True
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
REDUCER = Reducer("json", _detect, reduce_json, priority=10)
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Collapse repetitive log lines.
|
|
2
|
+
|
|
3
|
+
Near-identical lines collapse to one representative plus a count, while every
|
|
4
|
+
error/anomaly line and every one-off pattern is kept as-is.
|
|
5
|
+
|
|
6
|
+
To decide "near-identical", we mask the volatile parts of a line (timestamps, ips,
|
|
7
|
+
uuids, hex, numbers, quoted strings) into a template. Lines that share a template
|
|
8
|
+
are the same event with different values, so we keep one and count the rest.
|
|
9
|
+
Templates seen only once, or carrying a severity keyword, are kept verbatim, since
|
|
10
|
+
the rare line is usually the one that matters.
|
|
11
|
+
|
|
12
|
+
Deterministic: first-seen order is preserved, so the same input gives the same output.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
|
|
19
|
+
from .base import Reducer
|
|
20
|
+
|
|
21
|
+
_LOG_HINT = re.compile(
|
|
22
|
+
r"(?im)^\s*(?:\d{4}-\d{2}-\d{2}[T ]|\[?(?:INFO|DEBUG|WARN|WARNING|ERROR|FATAL|TRACE|CRITICAL)\b)"
|
|
23
|
+
)
|
|
24
|
+
_SEVERITY = re.compile(r"(?i)\b(ERROR|FATAL|CRITICAL|EXCEPTION|PANIC|TRACEBACK|WARN|WARNING)\b")
|
|
25
|
+
|
|
26
|
+
# Order matters: more specific patterns first so they win before the generic
|
|
27
|
+
# number mask consumes their digits.
|
|
28
|
+
_MASKS = (
|
|
29
|
+
(re.compile(r"\d{4}-\d{2}-\d{2}[T ][\d:.,]+(?:Z|[+-]\d{2}:?\d{2})?"), "§ts"),
|
|
30
|
+
(re.compile(r"\b\d{1,3}(?:\.\d{1,3}){3}\b"), "§ip"),
|
|
31
|
+
(re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"), "§uuid"),
|
|
32
|
+
(re.compile(r"0x[0-9a-fA-F]+"), "§hex"),
|
|
33
|
+
(re.compile(r'"[^"]*"'), "§s"),
|
|
34
|
+
(re.compile(r"\b\d+(?:\.\d+)?\b"), "§n"),
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _template(line: str) -> str:
    """Turn a log line into its template by masking volatile values.

    Applies every (pattern, placeholder) pair of ``_MASKS`` in order and
    returns the result with surrounding whitespace stripped.
    """
    masked = line
    for pattern, placeholder in _MASKS:
        masked = pattern.sub(placeholder, masked)
    return masked.strip()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def reduce_logs(text: str) -> tuple[str, list[str]]:
    """Collapse log lines that share a template into one line plus a count.

    Blank lines are dropped. The remaining lines are grouped by their
    ``_template``; groups are emitted in first-seen order, each represented
    by its first line:

    * a group with a single line is emitted as that line;
    * a repeated group in which any line matched ``_SEVERITY`` is emitted as
      the first line followed by ``⟪×N⟫``;
    * any other repeated group is emitted as the first line followed by
      ``⟪×N similar⟫``.

    Returns the reduced text and a one-item list of notes.
    """
    raw_lines = text.splitlines()
    # template -> [first line, occurrences, any line carried a severity word]
    buckets: dict[str, list] = {}

    for raw in raw_lines:
        if raw.strip() == "":
            continue
        severe = _SEVERITY.search(raw) is not None
        bucket = buckets.setdefault(_template(raw), [raw, 0, False])
        bucket[1] += 1
        bucket[2] = bucket[2] or severe

    rendered: list[str] = []
    verbatim = 0
    for first, occurrences, severe in buckets.values():
        if occurrences == 1:
            verbatim += 1
            rendered.append(first)
        elif severe:
            verbatim += 1
            rendered.append(f"{first} ⟪×{occurrences}⟫")
        else:
            rendered.append(f"{first} ⟪×{occurrences} similar⟫")

    notes = [
        f"{len(buckets)} unique patterns from {len(raw_lines)} lines; "
        f"{verbatim} anomaly/unique lines kept verbatim"
    ]
    return "\n".join(rendered), notes
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _detect(text: str) -> bool:
    """Report whether ``text`` looks like log output.

    Needs at least five lines, of which at least three — and at least 30% —
    must match ``_LOG_HINT`` at the start of the line.
    """
    rows = text.splitlines()
    total = len(rows)
    if total < 5:
        return False
    matched = len([row for row in rows if _LOG_HINT.match(row)])
    return matched >= max(3, total * 0.3)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
REDUCER = Reducer("log", _detect, reduce_logs, priority=50)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Stack-trace reducer.
|
|
2
|
+
|
|
3
|
+
The exception (last line) and the boundary frames are the signal; the deep middle
|
|
4
|
+
of the call stack is usually noise. Keeps the header, the first frame, the last two
|
|
5
|
+
frames, and the full exception/tail verbatim; collapses the middle with a count.
|
|
6
|
+
Raises on non-tracebacks → core falls back to passthrough (fail open).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from .base import Reducer
|
|
12
|
+
|
|
13
|
+
_KEEP_HEAD = 1
_KEEP_TAIL = 2


def _frame_trailer(frame: list[str]) -> list[str]:
    """Return the lines of ``frame`` from its first non-indented, non-blank line on.

    A frame starts at its ``File "..."`` line; indented lines after it belong
    to the frame. Anything from the first non-indented line onward is
    exception text (or the lead-in to a chained traceback), not frame detail.
    """
    for pos in range(1, len(frame)):
        if frame[pos].strip() and not frame[pos].startswith((" ", "\t")):
            return frame[pos:]
    return []


def reduce_stacktrace(text: str) -> tuple[str, list[str]]:
    """Shorten a Python traceback by hiding the middle of the call stack.

    The text is split into a header (everything before the first
    ``File "..."`` line) and one frame per ``File "..."`` line. With at most
    ``_KEEP_HEAD + _KEEP_TAIL + 1`` frames everything is kept. Otherwise the
    first ``_KEEP_HEAD`` and last ``_KEEP_TAIL`` frames are kept and the
    frames in between are replaced by ``stack frames hidden`` markers.
    Non-indented text that follows a hidden frame — such as the exception
    message of an earlier traceback in a chained error — is still emitted, so
    only frame detail is ever dropped.

    Returns the reduced text and a one-item list of notes stating how many
    frames were kept.

    Raises:
        ValueError: if the text contains no ``File "..."`` line.
    """
    lines = text.splitlines()
    file_idx = [i for i, ln in enumerate(lines) if ln.lstrip().startswith('File "')]
    if not file_idx:
        raise ValueError("not a python traceback")

    header = lines[: file_idx[0]]
    stops = file_idx[1:] + [len(lines)]
    frames = [lines[start:stop] for start, stop in zip(file_idx, stops)]
    total = len(frames)

    out = list(header)
    if total <= _KEEP_HEAD + _KEEP_TAIL + 1:
        for fr in frames:
            out.extend(fr)
        kept = total
    else:
        for fr in frames[:_KEEP_HEAD]:
            out.extend(fr)
        hidden = 0
        for fr in frames[_KEEP_HEAD:-_KEEP_TAIL]:
            hidden += 1
            trailer = _frame_trailer(fr)
            if trailer:
                # Flush the count so far, then keep the exception text that
                # sat between two tracebacks.
                out.append(f"  ⟪… {hidden} stack frames hidden⟫")
                out.extend(trailer)
                hidden = 0
        if hidden:
            out.append(f"  ⟪… {hidden} stack frames hidden⟫")
        # Each kept frame is emitted whole, so the last one brings the final
        # exception lines with it.
        for fr in frames[-_KEEP_TAIL:]:
            out.extend(fr)
        kept = _KEEP_HEAD + _KEEP_TAIL

    notes = [f"kept {kept} of {total} frames + exception"]
    return "\n".join(out), notes
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _detect(text: str) -> bool:
    """Report whether ``text`` contains a Python traceback banner."""
    banner = "Traceback (most recent call last)"
    return text.find(banner) != -1
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
REDUCER = Reducer("stacktrace", _detect, reduce_stacktrace, priority=20)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Whitespace-aligned table reducer.
|
|
2
|
+
|
|
3
|
+
Command-line tools (kubectl, docker, ps, ls -l, df) pad columns with runs of
|
|
4
|
+
spaces so they line up. That padding is pure tokens. We collapse each run of two
|
|
5
|
+
or more spaces to a single space and trim line ends. Every value is kept; only
|
|
6
|
+
the alignment is dropped, so this is lossless for the data.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
|
|
13
|
+
from .base import Reducer
|
|
14
|
+
|
|
15
|
+
_GAP = re.compile(r"[ \t]{2,}")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def reduce_table(text: str) -> tuple[str, list[str]]:
    """Collapse column padding in whitespace-aligned output.

    On each line, every run matched by ``_GAP`` is replaced with a single
    space and trailing whitespace is removed. Returns the rejoined text and a
    one-item list of notes.
    """
    squeezed: list[str] = []
    for row in text.splitlines():
        squeezed.append(_GAP.sub(" ", row).rstrip())
    return "\n".join(squeezed), ["collapsed column padding; values preserved"]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _detect(text: str) -> bool:
    """Report whether ``text`` looks like a whitespace-aligned table.

    Only non-blank lines count, and at least three are required. A line is
    columnar when ``_GAP`` matches it at least twice; the text is a table
    when at least three lines — and at least 60% of them — are columnar.
    """
    rows = [row for row in text.splitlines() if row.strip()]
    total = len(rows)
    if total < 3:
        return False
    # Two padded gaps on a line imply three or more columns.
    columnar = len([row for row in rows if len(_GAP.findall(row)) >= 2])
    return columnar >= max(3, total * 0.6)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
REDUCER = Reducer("table", _detect, reduce_table, priority=60)
|