brevix 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
brevix/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ """Brevix — compress LLM output safely."""
2
+
3
+ from brevix.compressor import Compressor, CompressionMode, CompressionResult
4
+ from brevix.accuracy_guard import AccuracyGuard, GuardResult
5
+ from brevix.stats import Stats
6
+ from brevix.adaptive import pick_mode, AdaptiveResult
7
+ from brevix.tokens import count_tokens, count_tokens_method
8
+ from brevix.install import install, list_targets, TARGETS
9
+
10
+ __version__ = "0.4.0"
11
+ __all__ = [
12
+ "Compressor",
13
+ "CompressionMode",
14
+ "CompressionResult",
15
+ "AccuracyGuard",
16
+ "GuardResult",
17
+ "Stats",
18
+ "pick_mode",
19
+ "AdaptiveResult",
20
+ "count_tokens",
21
+ "count_tokens_method",
22
+ "install",
23
+ "list_targets",
24
+ "TARGETS",
25
+ ]
@@ -0,0 +1,124 @@
1
+ """Accuracy Guard — semantic similarity check between original and compressed text.
2
+
3
+ Verifies that rule-based compression preserves meaning. Uses local
4
+ sentence-transformers (no API cost). Falls back to a content-word
5
+ containment metric tailored for compression (NOT Jaccard, which
6
+ structurally penalizes legitimate compression).
7
+
8
+ This is what separates safe production-grade compression from a blind
9
+ text-stripper: every output is scored against the original, and the
10
+ caller can choose to warn, fall back, or block when meaning would be lost.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import re
16
+ from dataclasses import dataclass
17
+ from typing import Optional
18
+
19
+
20
# Closed-class words that compression is allowed to drop without
# meaning loss. Excluded from the lexical similarity calculation.
# NOTE: entries containing an apostrophe (e.g. "i'd") never equal a token
# produced by AccuracyGuard._tokenize, because its pattern splits on every
# character outside [a-z0-9_].
_STOPWORDS = frozenset({
    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
    "am", "do", "does", "did", "have", "has", "had", "of", "to", "in",
    "on", "at", "for", "by", "with", "from", "as", "and", "or", "but",
    "so", "if", "then", "than", "that", "this", "these", "those",
    "it", "its", "i", "you", "he", "she", "we", "they", "them",
    "your", "my", "our", "his", "her", "their",
    "just", "really", "basically", "actually", "simply", "very",
    "quite", "perhaps", "maybe", "essentially", "literally",
    "however", "therefore", "thus", "hence", "moreover",
    "sure", "certainly", "course",  # "of" is already listed above
    "i'd", "i'll", "i've", "i'm", "we'll", "we're",
    "let", "let's",
})


@dataclass
class GuardResult:
    """Outcome of a single Accuracy Guard comparison."""

    similarity: float  # score produced by the check
    threshold: float   # minimum score required to pass
    passed: bool       # whether the check was considered successful
    method: str        # "semantic", "content-containment" or "empty"

    @property
    def warning(self) -> Optional[str]:
        """Return a human-readable warning, or ``None`` when the check passed."""
        if self.passed:
            return None
        return (
            f"Accuracy Guard: similarity {self.similarity:.2f} below threshold "
            f"{self.threshold:.2f} ({self.method}). Compression may have lost meaning."
        )


class AccuracyGuard:
    """Check whether compressed text preserves original meaning."""

    def __init__(self, threshold: float = 0.85, model_name: str = "all-MiniLM-L6-v2") -> None:
        """Store the pass threshold and the sentence-transformers model name.

        The model itself is loaded lazily on the first ``check`` call.
        """
        self.threshold = threshold
        self.model_name = model_name
        # None  -> load not attempted yet
        # False -> sentence-transformers is not importable (cached result)
        # other -> the loaded SentenceTransformer instance
        self._model = None

    def _load_model(self):
        """Return the cached embedding model, or ``False`` if unavailable.

        Only ``ImportError`` is treated as "unavailable"; the outcome is
        cached so the import is attempted at most once per instance.
        """
        if self._model is not None:
            return self._model
        try:
            from sentence_transformers import SentenceTransformer
            self._model = SentenceTransformer(self.model_name)
        except ImportError:
            self._model = False
        return self._model

    def check(self, original: str, compressed: str) -> GuardResult:
        """Score ``compressed`` against ``original`` and compare to the threshold.

        Blank inputs are handled without loading any model:

        * both texts blank -> similarity 1.0 (nothing could have been lost);
        * exactly one blank -> similarity 0.0, because either the whole
          original vanished or the output was produced from nothing.

        Otherwise the semantic model is used when it can be imported, and
        the lexical content-containment metric is used as the fallback.
        """
        original_blank = not original.strip()
        compressed_blank = not compressed.strip()
        if original_blank or compressed_blank:
            similarity = 1.0 if (original_blank and compressed_blank) else 0.0
            return GuardResult(
                similarity=similarity,
                threshold=self.threshold,
                passed=similarity >= self.threshold,
                method="empty",
            )

        model = self._load_model()
        if model:
            similarity = self._semantic_similarity(original, compressed, model)
            method = "semantic"
        else:
            similarity = self._content_containment(original, compressed)
            method = "content-containment"

        return GuardResult(
            similarity=similarity,
            threshold=self.threshold,
            passed=similarity >= self.threshold,
            method=method,
        )

    @staticmethod
    def _semantic_similarity(a: str, b: str, model) -> float:
        """Return the cosine similarity of the two embeddings, clamped to [0, 1]."""
        from sentence_transformers import util
        emb = model.encode([a, b], convert_to_tensor=True, show_progress_bar=False)
        score = util.cos_sim(emb[0], emb[1]).item()
        return float(max(0.0, min(1.0, score)))

    @staticmethod
    def _tokenize(text: str) -> list[str]:
        """Lower-case ``text`` and return its runs of ``[a-z0-9_]`` characters.

        NOTE: the pattern is ASCII-only, so letters outside a-z contribute
        no tokens.
        """
        return re.findall(r"[a-z0-9_]+", text.lower())

    @classmethod
    def _content_tokens(cls, text: str) -> set[str]:
        """Return the distinct tokens that are neither stopwords nor single characters."""
        return {t for t in cls._tokenize(text) if t not in _STOPWORDS and len(t) > 1}

    @classmethod
    def _content_containment(cls, original: str, compressed: str) -> float:
        """Fraction of original content words preserved in compressed text.

        Designed for compression: dropping stopwords/articles is expected and
        does NOT lower the score. Score drops only when meaningful content
        words disappear or new unrelated terms appear.

        The result is ``kept - 0.5 * spurious`` clamped to [0, 1], where
        ``kept`` is the share of original content tokens still present and
        ``spurious`` is the share of compressed content tokens that were not
        in the original.
        """
        orig_tokens = cls._content_tokens(original)
        comp_tokens = cls._content_tokens(compressed)
        if not orig_tokens:
            # Nothing meaningful to preserve: perfect if nothing was added,
            # otherwise a neutral 0.5.
            return 1.0 if not comp_tokens else 0.5
        kept = len(orig_tokens & comp_tokens) / len(orig_tokens)
        spurious = (
            len(comp_tokens - orig_tokens) / len(comp_tokens)
            if comp_tokens else 0.0
        )
        return max(0.0, min(1.0, kept - 0.5 * spurious))
brevix/adaptive.py ADDED
@@ -0,0 +1,37 @@
1
+ """Adaptive mode — auto-pick compression level per text characteristics.
2
+
3
+ Heuristic: pick the most aggressive mode that still passes Accuracy Guard.
4
+ If no mode passes, the result for the least aggressive mode (lite) is returned together with its failing guard result.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+
11
+ from brevix.accuracy_guard import AccuracyGuard, GuardResult
12
+ from brevix.compressor import Compressor, CompressionMode, CompressionResult
13
+
14
+
15
@dataclass
class AdaptiveResult:
    """What ``pick_mode`` settled on: the mode, its output and the guard verdict."""

    # Compression level whose output is carried in ``compression``.
    chosen_mode: CompressionMode
    # Result object produced by compressing the text at ``chosen_mode``.
    compression: CompressionResult
    # Guard verdict for that output; may be a failing one.
    guard: GuardResult
20
+
21
+
22
def pick_mode(text: str, threshold: float = 0.85, guard: AccuracyGuard | None = None) -> AdaptiveResult:
    """Choose the strongest compression level whose output the guard accepts.

    Levels are tried in the order ultra, full, lite and the first accepted
    attempt is returned. When every level is rejected, the lite attempt
    (the last one tried) is returned with its failing guard result.

    ``threshold`` is only used to build a guard when ``guard`` is not given.
    """
    checker = guard or AccuracyGuard(threshold=threshold)
    ladder = (CompressionMode.ULTRA, CompressionMode.FULL, CompressionMode.LITE)

    attempt: AdaptiveResult | None = None
    for level in ladder:
        squeezed = Compressor(level).compress(text)
        verdict = checker.check(text, squeezed.compressed)
        attempt = AdaptiveResult(chosen_mode=level, compression=squeezed, guard=verdict)
        if verdict.passed:
            break

    # The ladder is never empty, so at least one attempt was recorded.
    assert attempt is not None
    return attempt
brevix/cli.py ADDED
@@ -0,0 +1,243 @@
1
+ """Brevix CLI entrypoint."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ from brevix import (
10
+ Compressor,
11
+ CompressionMode,
12
+ AccuracyGuard,
13
+ Stats,
14
+ pick_mode,
15
+ count_tokens,
16
+ count_tokens_method,
17
+ install as install_target,
18
+ list_targets,
19
+ TARGETS,
20
+ __version__,
21
+ )
22
+ from brevix.file_compress import compress_file
23
+
24
+
25
def _cmd_compress(args: argparse.Namespace) -> int:
    """Handle ``brevix compress``.

    Reads the text from ``args.text`` (or stdin when it is ``"-"`` or empty),
    compresses it, prints the result to stdout and writes diagnostics to
    stderr.

    Returns:
        0 on success; 2 when ``--strict`` is set and the guard rejected the
        compression, in which case the original text is printed instead and
        nothing is recorded in the stats.
    """
    text = args.text
    if text == "-" or not text:
        text = sys.stdin.read()

    if args.mode == "auto":
        adaptive = pick_mode(text, threshold=args.threshold)
        result = adaptive.compression
        guard_result = adaptive.guard
        chosen = adaptive.chosen_mode
        if not guard_result.passed:
            if args.strict:
                sys.stderr.write(
                    f"[brevix] auto: no mode passed guard ({guard_result.similarity:.2f} < {args.threshold:.2f}). "
                    f"Emitting original.\n"
                )
                print(text)
                return 2
            # Same warning the explicit --guard path emits, so a rejected
            # compression is never emitted silently.
            sys.stderr.write(guard_result.warning + "\n")
        if args.verbose:
            sys.stderr.write(f"[brevix] auto picked mode={chosen.value}\n")
    else:
        chosen = CompressionMode(args.mode)
        result = Compressor(mode=chosen).compress(text)
        guard_result = None
        if args.guard:
            guard = AccuracyGuard(threshold=args.threshold)
            guard_result = guard.check(result.original, result.compressed)
            if not guard_result.passed:
                sys.stderr.write(guard_result.warning + "\n")
                if args.strict:
                    sys.stderr.write("Strict mode: emitting original instead.\n")
                    print(result.original)
                    return 2

    if not args.no_stats:
        Stats().record(
            mode=chosen.value,
            chars_saved=result.char_savings,
            tokens_saved=result.token_savings_estimate,
        )

    print(result.compressed)
    if args.verbose:
        orig_tok = count_tokens(result.original)
        comp_tok = count_tokens(result.compressed)
        method = count_tokens_method()
        sys.stderr.write(
            f"\n[brevix] mode={chosen.value} "
            f"chars: {len(result.original)}→{len(result.compressed)} "
            f"({result.char_savings_pct}% saved) "
            f"tokens ({method}): {orig_tok}→{comp_tok}\n"
        )
        if guard_result:
            sys.stderr.write(
                f"[brevix] guard: sim={guard_result.similarity:.3f} "
                f"({guard_result.method}) pass={guard_result.passed}\n"
            )
    return 0
82
+
83
+
84
def _cmd_compress_file(args: argparse.Namespace) -> int:
    """Handle ``brevix compress-file``.

    Delegates to ``compress_file`` and reports the outcome.

    Returns:
        0 when the file was compressed (or would be, with ``--dry-run``);
        1 when ``compress_file`` reported the file as skipped;
        2 when the file could not be accessed.
    """
    mode = CompressionMode(args.mode)
    try:
        result = compress_file(
            args.path,
            mode=mode,
            guard=not args.no_guard,
            threshold=args.threshold,
            dry_run=args.dry_run,
            force=args.force,
        )
    except OSError as exc:
        # OSError is the common base of FileNotFoundError, IsADirectoryError
        # and PermissionError, so every file-system failure becomes a clean
        # error message instead of a traceback.
        sys.stderr.write(f"Error: {exc}\n")
        return 2

    if result.skipped:
        sys.stderr.write(f"Skipped: {result.reason}\n")
        return 1

    suffix = " (dry-run)" if args.dry_run else ""
    print(
        f"Compressed {result.path}{suffix}: "
        f"{result.compression.char_savings} chars saved "
        f"({result.compression.char_savings_pct}%)"
    )
    if result.backup:
        print(f"Backup: {result.backup}")
    if result.guard:
        print(
            f"Guard: sim={result.guard.similarity:.3f} "
            f"({result.guard.method}) pass={result.guard.passed}"
        )
    return 0
117
+
118
+
119
def _cmd_stats(args: argparse.Namespace) -> int:
    """Handle ``brevix stats``: print the summary, or reset with ``--reset``.

    Returns 0 on success and 2 when producing the summary raises
    ``ValueError``.
    """
    tracker = Stats()

    if not args.reset:
        try:
            report = tracker.summary(since=args.since, real=args.real, share=args.share)
            print(report)
        except ValueError as err:
            sys.stderr.write(f"Error: {err}\n")
            return 2
        return 0

    tracker.reset()
    print("Stats reset.")
    return 0
131
+
132
+
133
def _cmd_check(args: argparse.Namespace) -> int:
    """Handle ``brevix check``: score two texts and print the verdict.

    Returns 0 when the guard passed, 1 otherwise.
    """
    outcome = AccuracyGuard(threshold=args.threshold).check(args.original, args.compressed)
    fields = [
        f"Similarity: {outcome.similarity:.4f}",
        f"Threshold: {outcome.threshold:.2f}",
        f"Passed: {outcome.passed}",
        f"Method: {outcome.method}",
    ]
    print(" ".join(fields))
    if outcome.passed:
        return 0
    return 1
143
+
144
+
145
def _cmd_count(args: argparse.Namespace) -> int:
    """Handle ``brevix count``: print the token and character count of the text.

    The text comes from ``args.text``, or from stdin when that is ``"-"``
    or empty. Always returns 0.
    """
    payload = args.text
    wants_stdin = payload == "-" or not payload
    if wants_stdin:
        payload = sys.stdin.read()

    total = count_tokens(payload)
    backend = count_tokens_method()
    print(f"{total} tokens ({backend}, {len(payload)} chars)")
    return 0
151
+
152
+
153
def _cmd_install(args: argparse.Namespace) -> int:
    """Handle ``brevix install``.

    ``--list`` prints the available targets. Otherwise the named target
    (or ``all``) is installed under ``--path``; ``--dry-run`` only announces
    what would be done.

    Returns 0 on success and 2 for a missing or unknown target.
    """
    if args.list:
        print(list_targets())
        return 0

    name = args.target
    if name is None:
        sys.stderr.write("Error: target required (or use --list).\n")
        return 2

    recognised = name == "all" or name in TARGETS
    if not recognised:
        sys.stderr.write(f"Error: unknown target '{name}'. Run `brevix install --list`.\n")
        return 2

    root = Path(args.path).resolve()
    if args.dry_run:
        print(f"[dry-run] Would install '{name}' into {root}.")
        return 0

    written = install_target(name, root)
    print(f"Brevix installed for target '{name}' in {root}:")
    for item in written:
        # Show paths relative to the project root when possible.
        try:
            print(f" + {item.relative_to(root)}")
        except ValueError:
            print(f" + {item}")
    return 0
178
+
179
+
180
def _build_parser() -> argparse.ArgumentParser:
    """Create the ``brevix`` argument parser with all of its sub-commands."""
    root = argparse.ArgumentParser(
        prog="brevix",
        description="Compress LLM output safely. Save tokens without breaking your code.",
    )
    root.add_argument("--version", action="version", version=f"brevix {__version__}")
    commands = root.add_subparsers(dest="command", required=True)

    # compress
    compress = commands.add_parser("compress", help="Compress text")
    compress.add_argument("text", nargs="?", default="-", help="Text to compress, or '-' for stdin")
    compress.add_argument("--mode", choices=["lite", "full", "ultra", "auto"], default="full")
    compress.add_argument("--guard", action="store_true", help="Enable Accuracy Guard")
    compress.add_argument("--strict", action="store_true", help="Fall back to original if guard fails")
    compress.add_argument("--threshold", type=float, default=0.85)
    compress.add_argument("--no-stats", action="store_true", help="Don't record to local stats")
    compress.add_argument("-v", "--verbose", action="store_true")
    compress.set_defaults(func=_cmd_compress)

    # compress-file
    compress_file_cmd = commands.add_parser(
        "compress-file", help="Compress a file in place (with .original backup)"
    )
    compress_file_cmd.add_argument("path")
    compress_file_cmd.add_argument("--mode", choices=["lite", "full", "ultra"], default="full")
    compress_file_cmd.add_argument("--threshold", type=float, default=0.85)
    compress_file_cmd.add_argument("--no-guard", action="store_true")
    compress_file_cmd.add_argument("--dry-run", action="store_true")
    compress_file_cmd.add_argument("--force", action="store_true", help="Overwrite even if guard fails")
    compress_file_cmd.set_defaults(func=_cmd_compress_file)

    # stats
    stats = commands.add_parser("stats", help="Show local stats")
    stats.add_argument("--reset", action="store_true")
    stats.add_argument("--since", default="all", help="Time window: 7d, 24h, 30m, all")
    stats.add_argument("--real", action="store_true", help="Parse real Claude Code session logs")
    stats.add_argument("--share", action="store_true", help="One-line tweet-ready output")
    stats.set_defaults(func=_cmd_stats)

    # check
    check = commands.add_parser("check", help="Check similarity between two texts")
    check.add_argument("original")
    check.add_argument("compressed")
    check.add_argument("--threshold", type=float, default=0.85)
    check.set_defaults(func=_cmd_check)

    # count
    count = commands.add_parser("count", help="Count tokens in text")
    count.add_argument("text", nargs="?", default="-")
    count.set_defaults(func=_cmd_count)

    # install
    install = commands.add_parser(
        "install",
        help="Install Brevix rules into a project for a specific LLM coding tool",
    )
    install.add_argument(
        "target",
        nargs="?",
        help="Target tool. Use --list to see all options.",
    )
    install.add_argument("--path", default=".", help="Project root (default: cwd)")
    install.add_argument("--list", action="store_true", help="List available targets")
    install.add_argument("--dry-run", action="store_true", help="Preview without writing")
    install.set_defaults(func=_cmd_install)

    return root


def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and run the selected sub-command.

    ``argv`` is handed to ``parse_args`` unchanged, so ``None`` means the
    process arguments. Returns the handler's exit code.
    """
    namespace = _build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)


if __name__ == "__main__":
    sys.exit(main())