contextops 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextops/__init__.py +3 -0
- contextops/analyzers/__init__.py +1 -0
- contextops/analyzers/density.py +146 -0
- contextops/analyzers/redundancy.py +362 -0
- contextops/analyzers/structure.py +123 -0
- contextops/analyzers/tokens.py +76 -0
- contextops/api/__init__.py +1 -0
- contextops/api/diff.py +124 -0
- contextops/api/inspect.py +52 -0
- contextops/api/stability.py +264 -0
- contextops/cli/__init__.py +1 -0
- contextops/cli/main.py +320 -0
- contextops/cli/renderer.py +424 -0
- contextops/core/__init__.py +1 -0
- contextops/core/config.py +61 -0
- contextops/core/engine.py +355 -0
- contextops/core/models.py +245 -0
- contextops/core/normalizer.py +187 -0
- contextops-0.1.0.dist-info/METADATA +272 -0
- contextops-0.1.0.dist-info/RECORD +24 -0
- contextops-0.1.0.dist-info/WHEEL +5 -0
- contextops-0.1.0.dist-info/entry_points.txt +2 -0
- contextops-0.1.0.dist-info/licenses/LICENSE +21 -0
- contextops-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Token Analyzer.
|
|
3
|
+
|
|
4
|
+
Uses tiktoken to count tokens per ContextItem and estimate costs.
|
|
5
|
+
Nothing fancy — just reliable counting and cost math.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import tiktoken
|
|
11
|
+
|
|
12
|
+
from contextops.core.models import ContextBundle, TokenBreakdown
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# ── Pricing per 1K input tokens (USD) — GPT-4o as default reference ─────
|
|
16
|
+
# Users can override this; these are just sensible defaults for estimation.
|
|
17
|
+
DEFAULT_COST_PER_1K_TOKENS: float = 0.005 # $5 per 1M input tokens
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def count_tokens(text: str, model: str = "gpt-4o") -> int:
    """
    Count tokens for a given text using tiktoken.

    The text is treated as plain data: substrings that happen to spell a
    special token (for example ``<|endoftext|>``) are tokenized as ordinary
    text instead of aborting the count.

    Args:
        text: The text to tokenize.
        model: The model name for the encoding. Defaults to gpt-4o.

    Returns:
        The number of tokens.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Fallback to cl100k_base (covers GPT-4, GPT-3.5, etc.)
        encoding = tiktoken.get_encoding("cl100k_base")

    # By default tiktoken raises ValueError when the input contains the literal
    # text of a special token. Context payloads are arbitrary user content, so
    # disable that check and count such text like any other.
    return len(encoding.encode(text, disallowed_special=()))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def analyze_tokens(
    bundle: ContextBundle,
    model: str = "gpt-4o",
    cost_per_1k: float = DEFAULT_COST_PER_1K_TOKENS,
) -> TokenBreakdown:
    """
    Tokenize every item of the bundle and summarize the result.

    Side effect: each item in ``bundle.items`` gets its ``token_count``
    attribute overwritten with the freshly computed count.

    Args:
        bundle: The context bundle whose items are counted.
        model: Model name forwarded to ``count_tokens`` for encoding selection.
        cost_per_1k: Price in USD per 1,000 input tokens.

    Returns:
        A TokenBreakdown holding the overall total, the per-type totals
        (keyed by ``item.type.value``) and the estimated cost.
    """
    per_type_totals: dict[str, int] = {}
    grand_total = 0

    for entry in bundle.items:
        n_tokens = count_tokens(entry.content, model=model)
        entry.token_count = n_tokens
        grand_total += n_tokens

        # Accumulate under the enum's string value so the mapping has plain keys.
        kind = entry.type.value
        if kind in per_type_totals:
            per_type_totals[kind] += n_tokens
        else:
            per_type_totals[kind] = n_tokens

    return TokenBreakdown(
        total_tokens=grand_total,
        by_type=per_type_totals,
        estimated_cost_usd=(grand_total / 1000) * cost_per_1k,
        wasted_tokens=0,  # filled in later by the engine after redundancy analysis
    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# API subpackage
|
contextops/api/diff.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ContextOps Diff API.
|
|
3
|
+
|
|
4
|
+
Computes a deterministic delta between two context payloads.
|
|
5
|
+
Provides the data model and logic for `contextops diff`.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
import re
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from contextops.api.inspect import inspect_context
|
|
16
|
+
from contextops.core.models import AnalysisResult, Recommendation
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class ContextDiffResult:
    """Outcome of comparing analysis result A against analysis result B."""

    # The two analysis results that were compared.
    result_a: AnalysisResult
    result_b: AnalysisResult

    # Scalar changes, each computed as (B - A).
    score_delta: int
    token_delta: int
    waste_delta: int
    cost_delta: float

    # Per-metric structure changes, each computed as (B - A), keyed by metric name.
    structure_delta: dict[str, float]

    # Recommendations only in A (resolved), only in B (new), or in both (persisting).
    resolved_recommendations: list[Recommendation] = field(default_factory=list)
    new_recommendations: list[Recommendation] = field(default_factory=list)
    persisting_recommendations: list[Recommendation] = field(default_factory=list)

    @property
    def net_impact(self) -> str:
        """Return "IMPROVEMENT", "DEGRADATION" or "NEUTRAL" from the sign of score_delta."""
        delta = self.score_delta
        if delta > 0:
            return "IMPROVEMENT"
        if delta < 0:
            return "DEGRADATION"
        return "NEUTRAL"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def get_recommendation_id(rec: Recommendation) -> str:
    """
    Derive a short, reproducible identifier from a recommendation's issue text.

    The set-based diff logic relies on this ID, so it must depend only on the
    issue string: the text is lowercased, trimmed, whitespace runs are collapsed
    to a single space, and the first 12 hex digits of its MD5 digest are returned.
    Do NOT use fuzzy matching or ML embeddings here.
    """
    issue_text = rec.issue.lower().strip()
    # Collapse every run of whitespace so formatting differences do not change the ID.
    canonical = re.sub(r'\s+', ' ', issue_text)
    digest = hashlib.md5(canonical.encode("utf-8"))
    return digest.hexdigest()[:12]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def diff_contexts(
    raw_input_a: str | list[dict[str, Any]] | dict[str, Any],
    raw_input_b: str | list[dict[str, Any]] | dict[str, Any],
) -> ContextDiffResult:
    """
    Analyze two raw context payloads and diff the resulting analyses.

    Payload A is inspected first, then payload B; both results are handed to
    ``diff_analysis_results``, which produces the returned ContextDiffResult.
    """
    return diff_analysis_results(
        inspect_context(raw_input_a),
        inspect_context(raw_input_b),
    )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def diff_analysis_results(result_a: AnalysisResult, result_b: AnalysisResult) -> ContextDiffResult:
    """
    Diff two already-computed AnalysisResult objects.

    All numeric and structure deltas are computed as (B - A). Recommendations
    are matched by ``get_recommendation_id`` and partitioned into resolved
    (only in A), new (only in B) and persisting (in both, taking B's object).
    Each list is ordered by descending impact_score, then descending
    token_savings, then issue text.
    """
    tokens_a = result_a.token_breakdown
    tokens_b = result_b.token_breakdown
    penalties_a = result_a.score_breakdown
    penalties_b = result_b.score_breakdown

    # Structure deltas, one entry per penalty component.
    structure_changes = {
        "redundancy": penalties_b.redundancy_penalty - penalties_a.redundancy_penalty,
        "density": penalties_b.density_penalty - penalties_a.density_penalty,
        "structure_imbalance": penalties_b.structure_penalty - penalties_a.structure_penalty,
        "concentration": penalties_b.concentration_penalty - penalties_a.concentration_penalty,
    }

    # Index recommendations by their stable ID; a later duplicate ID replaces an earlier one.
    recs_a = {get_recommendation_id(rec): rec for rec in result_a.recommendations}
    recs_b = {get_recommendation_id(rec): rec for rec in result_b.recommendations}

    def _rank(rec):
        # Highest impact first, then largest savings, then issue text as tie-breaker.
        return (-rec.impact_score, -rec.token_savings, rec.issue)

    gone = sorted(
        (rec for rec_id, rec in recs_a.items() if rec_id not in recs_b),
        key=_rank,
    )
    added = sorted(
        (rec for rec_id, rec in recs_b.items() if rec_id not in recs_a),
        key=_rank,
    )
    # For recommendations present on both sides, report B's (updated) version.
    kept = sorted(
        (rec for rec_id, rec in recs_b.items() if rec_id in recs_a),
        key=_rank,
    )

    return ContextDiffResult(
        result_a=result_a,
        result_b=result_b,
        score_delta=result_b.score - result_a.score,
        token_delta=tokens_b.total_tokens - tokens_a.total_tokens,
        waste_delta=tokens_b.wasted_tokens - tokens_a.wasted_tokens,
        cost_delta=tokens_b.estimated_cost_usd - tokens_a.estimated_cost_usd,
        structure_delta=structure_changes,
        resolved_recommendations=gone,
        new_recommendations=added,
        persisting_recommendations=kept,
    )
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ContextOps Programmatic API.
|
|
3
|
+
|
|
4
|
+
This is the primary interface for using ContextOps as a library.
|
|
5
|
+
Import and call `inspect_context()` with any supported input format.
|
|
6
|
+
|
|
7
|
+
Example:
|
|
8
|
+
import json
from contextops.api.inspect import inspect_context
|
|
9
|
+
|
|
10
|
+
result = inspect_context({
|
|
11
|
+
"system": "You are a helpful assistant.",
|
|
12
|
+
"chunks": ["chunk 1", "chunk 2"],
|
|
13
|
+
})
|
|
14
|
+
print(result.score)
|
|
15
|
+
print(json.dumps(result.to_dict(), indent=2))
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
from contextops.core.config import ContextOpsConfig
|
|
23
|
+
from contextops.core.engine import analyze
|
|
24
|
+
from contextops.core.models import AnalysisResult
|
|
25
|
+
from contextops.core.normalizer import normalize
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def inspect_context(
    raw_input: str | list[dict[str, Any]] | dict[str, Any],
    model: str = "gpt-4o",
    cost_per_1k: float = 0.005,
    config: ContextOpsConfig | None = None,
) -> AnalysisResult:
    """
    Normalize a raw LLM context and run the analysis engine on it.

    This is the main entry point for the ContextOps library.

    Args:
        raw_input: Raw LLM context in any supported format:
            - str: treated as a system prompt
            - list[dict]: OpenAI-style message list
            - dict: structured dict with system/messages/chunks/memory/tools
        model: Model name for tiktoken encoding.
        cost_per_1k: Cost per 1K input tokens in USD.
        config: Optional custom threshold configuration.

    Returns:
        AnalysisResult containing score, breakdown, findings, and recommendations.
    """
    return analyze(
        normalize(raw_input),
        model=model,
        cost_per_1k=cost_per_1k,
        config=config,
    )
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ContextOps Stability API.
|
|
3
|
+
|
|
4
|
+
Runs deterministic perturbations against a context bundle to verify the scoring
|
|
5
|
+
engine behaves logically. It tests properties and invariants rather than specific scores.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import copy
|
|
11
|
+
import random
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from contextops.core.engine import analyze
|
|
16
|
+
from contextops.core.models import ContextItem, ContextType, RedundancyClassification
|
|
17
|
+
from contextops.core.normalizer import normalize
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class InvariantResult:
    """Result of one stability invariant: its name, verdict, severity and diagnostics."""

    # Human-readable name of the invariant that was checked.
    name: str
    # True when the invariant held.
    passed: bool
    # Severity label; this module uses "critical" and "important".
    severity: str = "critical"
    # Free-form key/value details explaining the verdict (empty when none are recorded).
    diagnostic_info: dict[str, Any] = field(default_factory=dict)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class StabilityReport:
    """Complete stability report: baseline metrics plus every invariant check."""

    # Score, total tokens and wasted tokens of the unmodified (baseline) context.
    base_score: int = 0
    base_tokens: int = 0
    base_waste_tokens: int = 0
    # One entry per invariant that was evaluated.
    invariants: list[InvariantResult] = field(default_factory=list)

    @property
    def score_percentage(self) -> int:
        """
        Percentage of invariants that passed, rounded down to a whole number.

        Returns 0 when no invariants were recorded.
        """
        if not self.invariants:
            return 0
        passed = sum(1 for inv in self.invariants if inv.passed)
        # Integer arithmetic avoids float rounding artefacts: (29 / 100) * 100
        # evaluates to 28.999999999999996 and would be truncated to 28.
        return (passed * 100) // len(self.invariants)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Largest absolute score change tolerated when one item is split into two halves.
_SPLIT_TOLERANCE = 10

# Pairs of sentences that paraphrase each other with different wording.
# The redundancy analysis must not classify such a pair as REDUNDANT_CONTEXT.
_SEMANTIC_BLINDNESS_CASES = (
    (
        "The startup raised one million dollars.",
        "The company secured $1M in funding.",
    ),
    (
        "The API request timed out after thirty seconds.",
        "The endpoint exceeded its 30-second timeout threshold.",
    ),
    (
        "The quick brown fox jumps over the lazy dog.",
        "A fast dark-colored canine leaped above a resting dog.",
    ),
)


def _check_shuffle(base_bundle: Any, base_score: int) -> InvariantResult:
    """Shuffle Invariant: reordering the items must leave the score unchanged."""
    shuffled_bundle = copy.deepcopy(base_bundle)
    # Reverse-sorting by id yields a reproducible reordering without any RNG.
    shuffled_bundle.items = sorted(shuffled_bundle.items, key=lambda x: x.id, reverse=True)
    shuffle_result = analyze(shuffled_bundle)
    return InvariantResult(
        name="Shuffle Invariant",
        passed=(shuffle_result.score == base_score),
        severity="critical",
    )


def _check_duplicate_injection(base_bundle: Any, base_score: int) -> InvariantResult:
    """Duplicate Injection: appending an exact copy of an item must lower the score."""
    if not base_bundle.items:
        # Nothing to duplicate: the check is reported as passed with a note.
        return InvariantResult(
            name="Duplicate Injection",
            passed=True,
            severity="critical",
            diagnostic_info={"Note": "No items to duplicate"},
        )

    # Prefer an item returned by items_by_type(RETRIEVAL); otherwise use the first item.
    candidates = base_bundle.items_by_type(ContextType.RETRIEVAL)
    if not candidates:
        candidates = base_bundle.items

    dup_bundle = copy.deepcopy(base_bundle)
    clone = copy.deepcopy(candidates[0])
    clone.id = clone.id + "_dup"  # same content, distinct id
    dup_bundle.items.append(clone)

    dup_result = analyze(dup_bundle)
    score_delta = dup_result.score - base_score
    return InvariantResult(
        name="Duplicate Injection",
        passed=(dup_result.score < base_score),
        severity="critical",
        diagnostic_info={
            "Score Delta": f"{score_delta:+d}",
            "Expected Direction": "Decrease",
        },
    )


def _check_noise_injection(base_bundle: Any, base_score: int) -> InvariantResult:
    """Noise Injection: appending synthetic filler must not raise the score."""
    noise_bundle = copy.deepcopy(base_bundle)
    # 100 distinct placeholder words: TOKEN_0001 ... TOKEN_0100.
    noise_content = " ".join([f"TOKEN_{i:04d}" for i in range(1, 101)])
    noise_item = ContextItem(
        type=ContextType.RETRIEVAL,
        content=noise_content,
        source="synthetic_noise",
    )
    noise_bundle.items.append(noise_item)

    noise_result = analyze(noise_bundle)
    noise_score_delta = noise_result.score - base_score
    return InvariantResult(
        name="Noise Injection",
        passed=(noise_result.score <= base_score),
        severity="important",
        diagnostic_info={
            "Score Delta": f"{noise_score_delta:+d}",
            "Expected Direction": "<= 0",
        },
    )


def _check_chunk_split(base_bundle: Any, base_score: int) -> InvariantResult:
    """Chunk Split Invariant: halving the longest non-system item must keep the score within tolerance."""
    if not base_bundle.items:
        return InvariantResult(
            name="Chunk Split Invariant",
            passed=True,
            severity="important",
            diagnostic_info={"Note": "No items to split"},
        )

    split_bundle = copy.deepcopy(base_bundle)
    non_system_indices = [
        i for i, item in enumerate(split_bundle.items)
        if item.type != ContextType.SYSTEM
    ]
    if not non_system_indices:
        return InvariantResult(
            name="Chunk Split Invariant",
            passed=True,
            severity="important",
            diagnostic_info={"Note": "No non-system items to split"},
        )

    # Remove the longest non-system item (by character count) and re-add it as two halves.
    longest_idx = max(non_system_indices, key=lambda i: len(split_bundle.items[i].content))
    longest_item = split_bundle.items.pop(longest_idx)

    mid = len(longest_item.content) // 2
    first_half = ContextItem(
        type=longest_item.type, content=longest_item.content[:mid], source=longest_item.source
    )
    second_half = ContextItem(
        type=longest_item.type, content=longest_item.content[mid:], source=longest_item.source
    )
    split_bundle.items.append(first_half)
    split_bundle.items.append(second_half)

    split_result = analyze(split_bundle)
    split_delta = split_result.score - base_score
    return InvariantResult(
        name="Chunk Split Invariant",
        passed=(abs(split_delta) <= _SPLIT_TOLERANCE),
        severity="important",
        diagnostic_info={
            "Base Score": base_score,
            "Split Score": split_result.score,
            "Delta": f"{split_delta:+d}",
        },
    )


def _check_boilerplate(base_bundle: Any) -> InvariantResult:
    """Boilerplate Invariant: a repeated system prompt must be classified BOILERPLATE, not REDUNDANT_CONTEXT."""
    bp_bundle = copy.deepcopy(base_bundle)
    bp_item = ContextItem(
        type=ContextType.SYSTEM,
        content="You are a helpful assistant. Please follow all instructions carefully.",
        source="system",
    )
    bp_item_dup = copy.deepcopy(bp_item)
    bp_item_dup.id = bp_item.id + "_dup"

    bp_bundle.items.extend([bp_item, bp_item_dup])
    bp_result = analyze(bp_bundle)

    # Only look at findings that link the two injected items to each other.
    bp_ids = {bp_item.id, bp_item_dup.id}
    bp_pair_findings = [
        f for f in bp_result.redundancy_findings
        if f.item_a_id in bp_ids and f.item_b_id in bp_ids
    ]

    detected_bp = any(
        f.classification == RedundancyClassification.BOILERPLATE
        for f in bp_pair_findings
    )
    detected_redundant = any(
        f.classification == RedundancyClassification.REDUNDANT_CONTEXT
        for f in bp_pair_findings
    )

    return InvariantResult(
        name="Boilerplate Invariant",
        passed=(detected_bp and not detected_redundant),
        severity="critical",
        diagnostic_info={
            "Detected as BOILERPLATE": detected_bp,
            "Detected as REDUNDANT_CONTEXT": detected_redundant,
        },
    )


def _check_semantic_blindness(base_bundle: Any) -> InvariantResult:
    """Semantic Blindness Guard: paraphrased sentence pairs must not be flagged REDUNDANT_CONTEXT."""
    sb_passed = True
    sb_diagnostics = []

    for case_idx, (text1, text2) in enumerate(_SEMANTIC_BLINDNESS_CASES):
        sb_bundle = copy.deepcopy(base_bundle)
        item1 = ContextItem(type=ContextType.RETRIEVAL, content=text1, source=f"sb_a_{case_idx}")
        item2 = ContextItem(type=ContextType.RETRIEVAL, content=text2, source=f"sb_b_{case_idx}")
        sb_bundle.items.extend([item1, item2])

        sb_result = analyze(sb_bundle)

        # A finding counts only if it pairs the two injected items, in either order.
        has_redundancy = any(
            f.classification == RedundancyClassification.REDUNDANT_CONTEXT
            and ((f.item_a_id == item1.id and f.item_b_id == item2.id) or
                 (f.item_a_id == item2.id and f.item_b_id == item1.id))
            for f in sb_result.redundancy_findings
        )

        if has_redundancy:
            sb_passed = False
            sb_diagnostics.append(f"Case {case_idx+1} triggered redundancy")

    return InvariantResult(
        name="Semantic Blindness Guard",
        passed=sb_passed,
        severity="important",
        diagnostic_info={
            "Redundancy Detected": not sb_passed,
            "Details": ", ".join(sb_diagnostics) if sb_diagnostics else "Clean across all cases",
        },
    )


def run_stability_report(raw_input: str | list[dict] | dict) -> StabilityReport:
    """
    Run the formal sanity-check layer for the scoring engine.

    The input is normalized and analyzed once to obtain a baseline. Six
    deterministic mutations of the baseline bundle are then analyzed, each on
    its own deep copy, and the outcome of each is compared with the baseline.

    Args:
        raw_input: Raw context payload; passed unchanged to ``normalize``.

    Returns:
        A StabilityReport carrying the baseline score, total tokens and wasted
        tokens, plus one InvariantResult per check in this order: Shuffle
        Invariant, Duplicate Injection, Noise Injection, Chunk Split Invariant,
        Boilerplate Invariant, Semantic Blindness Guard.
    """
    base_bundle = normalize(raw_input)
    base_result = analyze(base_bundle)
    base_score = base_result.score

    # The list literal is evaluated top to bottom, so the checks run in report order.
    invariants = [
        _check_shuffle(base_bundle, base_score),
        _check_duplicate_injection(base_bundle, base_score),
        _check_noise_injection(base_bundle, base_score),
        _check_chunk_split(base_bundle, base_score),
        _check_boilerplate(base_bundle),
        _check_semantic_blindness(base_bundle),
    ]

    return StabilityReport(
        base_score=base_score,
        base_tokens=base_result.token_breakdown.total_tokens,
        base_waste_tokens=base_result.token_breakdown.wasted_tokens,
        invariants=invariants,
    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# CLI subpackage
|