contextops 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. contextops-0.1.0/LICENSE +21 -0
  2. contextops-0.1.0/PKG-INFO +272 -0
  3. contextops-0.1.0/README.md +238 -0
  4. contextops-0.1.0/contextops/__init__.py +3 -0
  5. contextops-0.1.0/contextops/analyzers/__init__.py +1 -0
  6. contextops-0.1.0/contextops/analyzers/density.py +146 -0
  7. contextops-0.1.0/contextops/analyzers/redundancy.py +362 -0
  8. contextops-0.1.0/contextops/analyzers/structure.py +123 -0
  9. contextops-0.1.0/contextops/analyzers/tokens.py +76 -0
  10. contextops-0.1.0/contextops/api/__init__.py +1 -0
  11. contextops-0.1.0/contextops/api/diff.py +124 -0
  12. contextops-0.1.0/contextops/api/inspect.py +52 -0
  13. contextops-0.1.0/contextops/api/stability.py +264 -0
  14. contextops-0.1.0/contextops/cli/__init__.py +1 -0
  15. contextops-0.1.0/contextops/cli/main.py +320 -0
  16. contextops-0.1.0/contextops/cli/renderer.py +424 -0
  17. contextops-0.1.0/contextops/core/__init__.py +1 -0
  18. contextops-0.1.0/contextops/core/config.py +61 -0
  19. contextops-0.1.0/contextops/core/engine.py +355 -0
  20. contextops-0.1.0/contextops/core/models.py +245 -0
  21. contextops-0.1.0/contextops/core/normalizer.py +187 -0
  22. contextops-0.1.0/contextops.egg-info/PKG-INFO +272 -0
  23. contextops-0.1.0/contextops.egg-info/SOURCES.txt +35 -0
  24. contextops-0.1.0/contextops.egg-info/dependency_links.txt +1 -0
  25. contextops-0.1.0/contextops.egg-info/entry_points.txt +2 -0
  26. contextops-0.1.0/contextops.egg-info/requires.txt +6 -0
  27. contextops-0.1.0/contextops.egg-info/top_level.txt +1 -0
  28. contextops-0.1.0/pyproject.toml +60 -0
  29. contextops-0.1.0/setup.cfg +4 -0
  30. contextops-0.1.0/tests/test_benchmarks.py +86 -0
  31. contextops-0.1.0/tests/test_chaos.py +199 -0
  32. contextops-0.1.0/tests/test_density.py +68 -0
  33. contextops-0.1.0/tests/test_diff.py +61 -0
  34. contextops-0.1.0/tests/test_redundancy.py +83 -0
  35. contextops-0.1.0/tests/test_schema.py +35 -0
  36. contextops-0.1.0/tests/test_signal_contract.py +294 -0
  37. contextops-0.1.0/tests/test_structure.py +39 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Abhijeet Baug
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,272 @@
1
+ Metadata-Version: 2.4
2
+ Name: contextops
3
+ Version: 0.1.0
4
+ Summary: Deterministic context linter for LLM applications — analyze, score, and optimize your LLM context payloads.
5
+ Author: Abhijeet Baug
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/Abhijeet777/contextops
8
+ Project-URL: Repository, https://github.com/Abhijeet777/contextops
9
+ Project-URL: Issues, https://github.com/Abhijeet777/contextops/issues
10
+ Project-URL: Documentation, https://github.com/Abhijeet777/contextops#readme
11
+ Project-URL: Changelog, https://github.com/Abhijeet777/contextops/blob/main/CHANGELOG.md
12
+ Keywords: llm,context,observability,rag,token,optimization,linter,ci,deterministic,prompt-engineering
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Programming Language :: Python :: 3.14
21
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Classifier: Topic :: Software Development :: Quality Assurance
23
+ Classifier: Topic :: Software Development :: Testing
24
+ Classifier: Typing :: Typed
25
+ Requires-Python: >=3.10
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: tiktoken>=0.5.0
29
+ Requires-Dist: click>=8.0.0
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest>=7.0; extra == "dev"
32
+ Requires-Dist: pytest-cov; extra == "dev"
33
+ Dynamic: license-file
34
+
35
+ # ContextOps
36
+
37
+ **The deterministic context linter for LLM applications.**
38
+
39
+ [![PyPI version](https://img.shields.io/pypi/v/contextops.svg)](https://pypi.org/project/contextops/)
40
+ [![Python](https://img.shields.io/pypi/pyversions/contextops.svg)](https://pypi.org/project/contextops/)
41
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
42
+ [![CI](https://img.shields.io/badge/CI-stable-brightgreen.svg)](STABILITY.md)
43
+
44
+ ContextOps analyzes the context fed into your LLM and tells you what's broken — redundant chunks, wasted tokens, structural imbalance — with a **deterministic 0–100 score** and actionable fixes.
45
+
46
+ Think of it as **ESLint for your LLM prompts**.
47
+
48
+ ---
49
+
50
+ ## Why ContextOps?
51
+
52
+ Most LLM applications blindly stuff context into the prompt window. This leads to:
53
+
54
+ - 💸 **Wasted spend** — paying for redundant tokens that don't improve output
55
+ - 🔁 **Silent regressions** — a "small RAG change" floods the context with duplicates
56
+ - 🏗️ **Structural drift** — retrieval chunks slowly dominate the entire prompt
57
+ - 🎯 **No visibility** — teams have no way to measure context quality in CI
58
+
59
+ ContextOps gives you that visibility. It runs in your CI pipeline, scores every context payload, and fails the build if quality degrades.
60
+
61
+ ---
62
+
63
+ ## Quick Start
64
+
65
+ ```bash
66
+ pip install contextops
67
+ ```
68
+
69
+ ### See it in action
70
+
71
+ ```bash
72
+ # Run the built-in demo — instant "wow moment"
73
+ contextops demo
74
+ ```
75
+
76
+ ### Analyze your own context
77
+
78
+ ```bash
79
+ # Full analysis with rich terminal output
80
+ contextops inspect context.json
81
+
82
+ # CI mode: fail if score drops below threshold
83
+ contextops check context.json --min-score 70
84
+
85
+ # Compare two snapshots for regressions
86
+ contextops diff before.json after.json
87
+
88
+ # JSON output for dashboards and automation
89
+ contextops inspect context.json --json-output
90
+ ```
91
+
92
+ ### Python API
93
+
94
+ ```python
95
+ from contextops.api.inspect import inspect_context
96
+
97
+ result = inspect_context({
98
+ "system": "You are a helpful assistant.",
99
+ "chunks": [
100
+ {"content": "Refund policy: 30 days...", "source": "docs/refund.md"},
101
+ {"content": "Refund policy: within 30 days...", "source": "docs/refund.md"},
102
+ ],
103
+ "memory": ["User asked about refunds before."],
104
+ })
105
+
106
+ print(f"Score: {result.score}/100")
107
+ print(f"Wasted tokens: {result.token_breakdown.wasted_tokens}")
108
+ for rec in result.recommendations:
109
+     print(f" → {rec.fix}")
110
+ ```
111
+
112
+ ---
113
+
114
+ ## What It Measures
115
+
116
+ ContextOps computes a **0–100 Context Score** from four independent penalty dimensions:
117
+
118
+ | Dimension | What It Detects | Max Penalty |
119
+ |---|---|---|
120
+ | **Redundancy** | Duplicate / near-duplicate chunks (N-gram + Jaccard) | 30 pts |
121
+ | **Density** | Wasted tokens from structural bloat | 30 pts |
122
+ | **Structure** | Imbalanced type distribution (e.g., retrieval > 70%) | 20 pts |
123
+ | **Concentration** | Source dominance or highly imbalanced chunk distribution | 20 pts |
124
+
125
+ ```
126
+ Context Score = 100 - (Redundancy + Density + Structure + Concentration)
127
+ ```
128
+
129
+ Every penalty maps to a **specific finding** with **token savings** and an **actionable fix**.
130
+
131
+ ---
132
+
133
+ ## CI / CD Integration
134
+
135
+ ### GitHub Actions
136
+
137
+ ```yaml
138
+ name: Context Quality Gate
139
+
140
+ on: [pull_request]
141
+
142
+ jobs:
143
+   context-check:
144
+     runs-on: ubuntu-latest
145
+     steps:
146
+       - uses: actions/checkout@v4
147
+       - uses: actions/setup-python@v5
148
+         with:
149
+           python-version: "3.12"
150
+
151
+       - run: pip install contextops
152
+
153
+       - name: Check context quality
154
+         run: contextops check prompts/context.json --min-score 75
155
+ ```
156
+
157
+ ### Exit Codes
158
+
159
+ | Code | Meaning |
160
+ |---|---|
161
+ | `0` | Score meets threshold — build passes |
162
+ | `1` | Score below threshold — build fails |
163
+
164
+ ### Regression Detection
165
+
166
+ ```bash
167
+ # Save a baseline
168
+ contextops inspect prompts/v1.json --json-output > baseline.json
169
+
170
+ # After changes, compare
171
+ contextops diff baseline.json prompts/v2.json
172
+ ```
173
+
174
+ ---
175
+
176
+ ## Context File Format
177
+
178
+ ContextOps accepts a JSON file with any combination of these keys:
179
+
180
+ ```json
181
+ {
182
+ "system": "Your system prompt here",
183
+ "messages": [
184
+ {"role": "user", "content": "User question"}
185
+ ],
186
+ "chunks": [
187
+ {"content": "Retrieved chunk text", "source": "docs/page.md"}
188
+ ],
189
+ "memory": [
190
+ "Previous conversation context"
191
+ ],
192
+ "tools": [
193
+ {"name": "search_api", "output": "Tool response text"}
194
+ ]
195
+ }
196
+ ```
197
+
198
+ It also accepts raw OpenAI message lists:
199
+
200
+ ```json
201
+ [
202
+ {"role": "system", "content": "You are helpful."},
203
+ {"role": "user", "content": "What is the refund policy?"}
204
+ ]
205
+ ```
206
+
207
+ ---
208
+
209
+ ## CLI Reference
210
+
211
+ | Command | Purpose |
212
+ |---|---|
213
+ | `contextops inspect <file>` | Analyze and display results |
214
+ | `contextops check <file> --min-score N` | CI gate with exit codes |
215
+ | `contextops demo` | Built-in demo context |
216
+ | `contextops stability <file>` | Deterministic stability report |
217
+ | `contextops diff <file_a> <file_b>` | Compare two snapshots |
218
+
219
+ ### Flags
220
+
221
+ | Flag | Commands | Purpose |
222
+ |---|---|---|
223
+ | `--json-output` | inspect, check | Machine-readable JSON output |
224
+ | `--min-score N` | check | Minimum passing score (0–100) |
225
+ | `--model <name>` | inspect, check | Target model for cost estimation |
226
+ | `--explain` | inspect, check | Show detailed penalty reasoning |
227
+ | `--config <file>` | inspect, check | Custom threshold config file |
228
+
229
+ ---
230
+
231
+ ## Design Principles
232
+
233
+ 1. **Deterministic** — Same input → same output. Always. No randomness, no embeddings, no LLM calls.
234
+ 2. **Explainable** — Every penalty maps to a real issue with a token count and a fix.
235
+ 3. **CI-native** — Designed for pipelines first. Exit codes, JSON output, threshold gating.
236
+ 4. **Zero network** — Runs entirely offline. No API keys, no external services.
237
+
238
+ ---
239
+
240
+ ## Stability Contract
241
+
242
+ ContextOps ships with a formal [Stability Contract](STABILITY.md) that guarantees:
243
+
244
+ - **Scoring determinism** — same input always produces the same score
245
+ - **Schema stability** — JSON output fields never change within a major version
246
+ - **Performance bounds** — sub-second for payloads up to 50,000 tokens
247
+ - **Semantic versioning** — scoring formula changes require a major version bump
248
+
249
+ This contract exists so teams can trust ContextOps in production CI pipelines.
250
+
251
+ ---
252
+
253
+ ## Development
254
+
255
+ ```bash
256
+ # Clone and install in dev mode
257
+ git clone https://github.com/Abhijeet777/contextops.git
258
+ cd contextops
259
+ pip install -e ".[dev]"
260
+
261
+ # Run tests
262
+ pytest
263
+
264
+ # Run chaos stress tests
265
+ pytest tests/test_chaos.py -v
266
+ ```
267
+
268
+ ---
269
+
270
+ ## License
271
+
272
+ [MIT](LICENSE)
@@ -0,0 +1,238 @@
1
+ # ContextOps
2
+
3
+ **The deterministic context linter for LLM applications.**
4
+
5
+ [![PyPI version](https://img.shields.io/pypi/v/contextops.svg)](https://pypi.org/project/contextops/)
6
+ [![Python](https://img.shields.io/pypi/pyversions/contextops.svg)](https://pypi.org/project/contextops/)
7
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
+ [![CI](https://img.shields.io/badge/CI-stable-brightgreen.svg)](STABILITY.md)
9
+
10
+ ContextOps analyzes the context fed into your LLM and tells you what's broken — redundant chunks, wasted tokens, structural imbalance — with a **deterministic 0–100 score** and actionable fixes.
11
+
12
+ Think of it as **ESLint for your LLM prompts**.
13
+
14
+ ---
15
+
16
+ ## Why ContextOps?
17
+
18
+ Most LLM applications blindly stuff context into the prompt window. This leads to:
19
+
20
+ - 💸 **Wasted spend** — paying for redundant tokens that don't improve output
21
+ - 🔁 **Silent regressions** — a "small RAG change" floods the context with duplicates
22
+ - 🏗️ **Structural drift** — retrieval chunks slowly dominate the entire prompt
23
+ - 🎯 **No visibility** — teams have no way to measure context quality in CI
24
+
25
+ ContextOps gives you that visibility. It runs in your CI pipeline, scores every context payload, and fails the build if quality degrades.
26
+
27
+ ---
28
+
29
+ ## Quick Start
30
+
31
+ ```bash
32
+ pip install contextops
33
+ ```
34
+
35
+ ### See it in action
36
+
37
+ ```bash
38
+ # Run the built-in demo — instant "wow moment"
39
+ contextops demo
40
+ ```
41
+
42
+ ### Analyze your own context
43
+
44
+ ```bash
45
+ # Full analysis with rich terminal output
46
+ contextops inspect context.json
47
+
48
+ # CI mode: fail if score drops below threshold
49
+ contextops check context.json --min-score 70
50
+
51
+ # Compare two snapshots for regressions
52
+ contextops diff before.json after.json
53
+
54
+ # JSON output for dashboards and automation
55
+ contextops inspect context.json --json-output
56
+ ```
57
+
58
+ ### Python API
59
+
60
+ ```python
61
+ from contextops.api.inspect import inspect_context
62
+
63
+ result = inspect_context({
64
+ "system": "You are a helpful assistant.",
65
+ "chunks": [
66
+ {"content": "Refund policy: 30 days...", "source": "docs/refund.md"},
67
+ {"content": "Refund policy: within 30 days...", "source": "docs/refund.md"},
68
+ ],
69
+ "memory": ["User asked about refunds before."],
70
+ })
71
+
72
+ print(f"Score: {result.score}/100")
73
+ print(f"Wasted tokens: {result.token_breakdown.wasted_tokens}")
74
+ for rec in result.recommendations:
75
+     print(f" → {rec.fix}")
76
+ ```
77
+
78
+ ---
79
+
80
+ ## What It Measures
81
+
82
+ ContextOps computes a **0–100 Context Score** from four independent penalty dimensions:
83
+
84
+ | Dimension | What It Detects | Max Penalty |
85
+ |---|---|---|
86
+ | **Redundancy** | Duplicate / near-duplicate chunks (N-gram + Jaccard) | 30 pts |
87
+ | **Density** | Wasted tokens from structural bloat | 30 pts |
88
+ | **Structure** | Imbalanced type distribution (e.g., retrieval > 70%) | 20 pts |
89
+ | **Concentration** | Source dominance or highly imbalanced chunk distribution | 20 pts |
90
+
91
+ ```
92
+ Context Score = 100 - (Redundancy + Density + Structure + Concentration)
93
+ ```
94
+
95
+ Every penalty maps to a **specific finding** with **token savings** and an **actionable fix**.
96
+
97
+ ---
98
+
99
+ ## CI / CD Integration
100
+
101
+ ### GitHub Actions
102
+
103
+ ```yaml
104
+ name: Context Quality Gate
105
+
106
+ on: [pull_request]
107
+
108
+ jobs:
109
+   context-check:
110
+     runs-on: ubuntu-latest
111
+     steps:
112
+       - uses: actions/checkout@v4
113
+       - uses: actions/setup-python@v5
114
+         with:
115
+           python-version: "3.12"
116
+
117
+       - run: pip install contextops
118
+
119
+       - name: Check context quality
120
+         run: contextops check prompts/context.json --min-score 75
121
+ ```
122
+
123
+ ### Exit Codes
124
+
125
+ | Code | Meaning |
126
+ |---|---|
127
+ | `0` | Score meets threshold — build passes |
128
+ | `1` | Score below threshold — build fails |
129
+
130
+ ### Regression Detection
131
+
132
+ ```bash
133
+ # Save a baseline
134
+ contextops inspect prompts/v1.json --json-output > baseline.json
135
+
136
+ # After changes, compare
137
+ contextops diff baseline.json prompts/v2.json
138
+ ```
139
+
140
+ ---
141
+
142
+ ## Context File Format
143
+
144
+ ContextOps accepts a JSON file with any combination of these keys:
145
+
146
+ ```json
147
+ {
148
+ "system": "Your system prompt here",
149
+ "messages": [
150
+ {"role": "user", "content": "User question"}
151
+ ],
152
+ "chunks": [
153
+ {"content": "Retrieved chunk text", "source": "docs/page.md"}
154
+ ],
155
+ "memory": [
156
+ "Previous conversation context"
157
+ ],
158
+ "tools": [
159
+ {"name": "search_api", "output": "Tool response text"}
160
+ ]
161
+ }
162
+ ```
163
+
164
+ It also accepts raw OpenAI message lists:
165
+
166
+ ```json
167
+ [
168
+ {"role": "system", "content": "You are helpful."},
169
+ {"role": "user", "content": "What is the refund policy?"}
170
+ ]
171
+ ```
172
+
173
+ ---
174
+
175
+ ## CLI Reference
176
+
177
+ | Command | Purpose |
178
+ |---|---|
179
+ | `contextops inspect <file>` | Analyze and display results |
180
+ | `contextops check <file> --min-score N` | CI gate with exit codes |
181
+ | `contextops demo` | Built-in demo context |
182
+ | `contextops stability <file>` | Deterministic stability report |
183
+ | `contextops diff <file_a> <file_b>` | Compare two snapshots |
184
+
185
+ ### Flags
186
+
187
+ | Flag | Commands | Purpose |
188
+ |---|---|---|
189
+ | `--json-output` | inspect, check | Machine-readable JSON output |
190
+ | `--min-score N` | check | Minimum passing score (0–100) |
191
+ | `--model <name>` | inspect, check | Target model for cost estimation |
192
+ | `--explain` | inspect, check | Show detailed penalty reasoning |
193
+ | `--config <file>` | inspect, check | Custom threshold config file |
194
+
195
+ ---
196
+
197
+ ## Design Principles
198
+
199
+ 1. **Deterministic** — Same input → same output. Always. No randomness, no embeddings, no LLM calls.
200
+ 2. **Explainable** — Every penalty maps to a real issue with a token count and a fix.
201
+ 3. **CI-native** — Designed for pipelines first. Exit codes, JSON output, threshold gating.
202
+ 4. **Zero network** — Runs entirely offline. No API keys, no external services.
203
+
204
+ ---
205
+
206
+ ## Stability Contract
207
+
208
+ ContextOps ships with a formal [Stability Contract](STABILITY.md) that guarantees:
209
+
210
+ - **Scoring determinism** — same input always produces the same score
211
+ - **Schema stability** — JSON output fields never change within a major version
212
+ - **Performance bounds** — sub-second for payloads up to 50,000 tokens
213
+ - **Semantic versioning** — scoring formula changes require a major version bump
214
+
215
+ This contract exists so teams can trust ContextOps in production CI pipelines.
216
+
217
+ ---
218
+
219
+ ## Development
220
+
221
+ ```bash
222
+ # Clone and install in dev mode
223
+ git clone https://github.com/Abhijeet777/contextops.git
224
+ cd contextops
225
+ pip install -e ".[dev]"
226
+
227
+ # Run tests
228
+ pytest
229
+
230
+ # Run chaos stress tests
231
+ pytest tests/test_chaos.py -v
232
+ ```
233
+
234
+ ---
235
+
236
+ ## License
237
+
238
+ [MIT](LICENSE)
@@ -0,0 +1,3 @@
1
+ """ContextOps — Context observability for LLM applications."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1 @@
1
+ # Analyzers subpackage
@@ -0,0 +1,146 @@
1
+ """
2
+ Density Analyzer — Phase 2.5 (Metric Orthogonalization).
3
+
4
+ Computes the structural Density Signal from raw context text.
5
+
6
+ Signal contract:
7
+ - Reads ONLY from raw ContextBundle item content strings.
8
+ - Does NOT read wasted_tokens, redundancy findings, or any other analyzer output.
9
+ - Is the sole authoritative input for density_penalty in the scoring engine.
10
+
11
+ Three orthogonal character buckets (exhaustive, non-overlapping):
12
+ payload_chars = alphanumeric (actual information)
13
+ syntax_chars = non-alphanum, non-whitespace (brackets, punctuation, markup)
14
+ whitespace_chars = whitespace (layout/formatting overhead)
15
+ total_chars = payload + syntax + whitespace (the three per-bucket ratios always sum to 1.0)
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import math
21
+ import re
22
+ from collections import Counter
23
+ from contextops.core.models import ContextBundle, DensitySignal
24
+
25
+
26
def normalize_density_input(text: str) -> list[str]:
    """
    Tokenize text into lowercase words for the density metrics.

    This is the single shared preprocessing step; keeping it in one place
    stops the individual metrics from drifting apart. The rules are frozen,
    so any change here must be mirrored in every caller:

        1. Lowercase the input.
        2. Turn every character outside a-z, 0-9 and whitespace into a space.
           Underscores and hyphens are therefore separators, which makes
           snake_case and kebab-case split into the same component words.
        3. Split on runs of whitespace.

    Returns:
        The list of word tokens (empty when the text has no a-z/0-9 content).
    """
    # The character class is spelled out (rather than using \w) so that
    # underscores count as punctuation instead of as part of a word.
    return re.sub(r'[^a-z0-9\s]', ' ', text.lower()).split()
42
+
43
+
44
+ def _calc_format_overhead(text: str) -> float:
45
+ """
46
+ Format Overhead (FO): Ratio of syntax chars to total chars.
47
+
48
+ FO = syntax_chars / total_chars
49
+ where syntax_chars = non-alphanumeric AND non-whitespace characters
50
+ (brackets, punctuation, markup, operators, etc.)
51
+
52
+ Range: 0.0 (no syntax overhead) → 1.0 (all syntax, no payload or whitespace).
53
+ Does NOT include whitespace — that is measured separately by WL.
54
+ """
55
+ total_chars = len(text)
56
+ if total_chars == 0:
57
+ return 0.0
58
+
59
+ syntax_chars = sum(1 for c in text if not c.isalnum() and not c.isspace())
60
+ return max(0.0, min(1.0, syntax_chars / total_chars))
61
+
62
+
63
+ def _calc_whitespace_waste(text: str) -> float:
64
+ """
65
+ Whitespace Waste (WL): Ratio of whitespace chars to total chars.
66
+
67
+ WL = whitespace_chars / total_chars
68
+ where whitespace_chars = space, tab, newline, carriage return, etc.
69
+
70
+ Range: 0.0 (no whitespace) → 1.0 (all whitespace).
71
+ Does NOT include syntax chars — that is measured separately by FO.
72
+ """
73
+ total_chars = len(text)
74
+ if total_chars == 0:
75
+ return 0.0
76
+
77
+ whitespace_chars = sum(1 for c in text if c.isspace())
78
+ return max(0.0, min(1.0, whitespace_chars / total_chars))
79
+
80
+
81
def _calc_entropy_compression(text: str) -> float:
    """
    Entropy Compression (EC): how repetitive the word distribution is.

        EC = 1 - (shannon_entropy / log2(unique_words))

    Dividing by log2(unique_words) normalizes the entropy to 0.0-1.0, so:
        high EC -> low entropy  -> a few words dominate (repetitive text)
        low EC  -> high entropy -> words are evenly spread (diverse text)

    Words come from ``normalize_density_input``.

    Returns:
        A value in [0.0, 1.0]: 0.0 when the text yields no words, 1.0 when
        it yields only one distinct word.
    """
    tokens = normalize_density_input(text)
    if not tokens:
        return 0.0

    frequencies = Counter(tokens)
    vocabulary_size = len(frequencies)
    if vocabulary_size < 2:
        # One distinct word: log2(1) == 0 would divide by zero below, and
        # the distribution is treated as maximally compressible.
        return 1.0

    token_total = len(tokens)

    # Shannon entropy, accumulated term by term (kept as an explicit loop so
    # the floating-point result does not depend on sum()'s implementation).
    entropy = 0.0
    for occurrences in frequencies.values():
        probability = occurrences / token_total
        entropy -= probability * math.log2(probability)

    normalized = entropy / math.log2(vocabulary_size)
    return max(0.0, min(1.0, 1.0 - normalized))
113
+
114
+
115
def compute_density_signal(bundle: ContextBundle) -> DensitySignal:
    """
    Build the structural Density Signal for a context bundle.

    Signal contract: only the raw ``item.content`` strings are read. Token
    counts, wasted-token figures and other analyzer outputs must NOT be
    consulted here.

    All item contents are joined with newlines and measured as one text:
        FO - format overhead      (weight 0.4)
        WL - whitespace waste     (weight 0.2)
        EC - entropy compression  (weight 0.4)

    Per the original calibration notes, these initial weights are meant to
    put typical clean context near 0.1-0.3 and heavily bloated context near
    0.6-0.9.

    Returns:
        A DensitySignal with each component and the weighted total rounded
        to three decimals; an all-zero signal when the bundle has no items
        or only blank content.
    """
    combined = "\n".join(entry.content for entry in bundle.items) if bundle.items else ""

    # Covers both "no items" and "items with only blank content".
    if not combined.strip():
        return DensitySignal(0.0, 0.0, 0.0, 0.0)

    format_overhead = _calc_format_overhead(combined)
    whitespace_waste = _calc_whitespace_waste(combined)
    entropy_compression = _calc_entropy_compression(combined)

    # Weighted blend: FO 0.4, WL 0.2, EC 0.4.
    blended = (0.4 * format_overhead) + (0.2 * whitespace_waste) + (0.4 * entropy_compression)

    return DensitySignal(
        format_overhead=round(format_overhead, 3),
        whitespace_waste=round(whitespace_waste, 3),
        entropy_compression=round(entropy_compression, 3),
        total_density_signal=round(blended, 3),
    )