livepilot 1.9.13 → 1.9.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +3 -3
- package/AGENTS.md +3 -3
- package/CHANGELOG.md +51 -0
- package/CONTRIBUTING.md +1 -1
- package/README.md +7 -7
- package/bin/livepilot.js +32 -8
- package/installer/install.js +21 -2
- package/livepilot/.Codex-plugin/plugin.json +2 -2
- package/livepilot/.claude-plugin/plugin.json +2 -2
- package/livepilot/agents/livepilot-producer/AGENT.md +243 -49
- package/livepilot/skills/livepilot-core/SKILL.md +81 -6
- package/livepilot/skills/livepilot-core/references/m4l-devices.md +2 -2
- package/livepilot/skills/livepilot-core/references/overview.md +3 -3
- package/livepilot/skills/livepilot-core/references/sound-design.md +3 -2
- package/livepilot/skills/livepilot-release/SKILL.md +13 -13
- package/m4l_device/LivePilot_Analyzer.amxd +0 -0
- package/m4l_device/livepilot_bridge.js +6 -3
- package/mcp_server/__init__.py +1 -1
- package/mcp_server/curves.py +11 -3
- package/mcp_server/evaluation/__init__.py +1 -0
- package/mcp_server/evaluation/fabric.py +575 -0
- package/mcp_server/evaluation/feature_extractors.py +84 -0
- package/mcp_server/evaluation/policy.py +67 -0
- package/mcp_server/evaluation/tools.py +53 -0
- package/mcp_server/memory/__init__.py +11 -2
- package/mcp_server/memory/anti_memory.py +78 -0
- package/mcp_server/memory/promotion.py +94 -0
- package/mcp_server/memory/session_memory.py +108 -0
- package/mcp_server/memory/taste_memory.py +158 -0
- package/mcp_server/memory/technique_store.py +2 -1
- package/mcp_server/memory/tools.py +112 -0
- package/mcp_server/mix_engine/__init__.py +1 -0
- package/mcp_server/mix_engine/critics.py +299 -0
- package/mcp_server/mix_engine/models.py +152 -0
- package/mcp_server/mix_engine/planner.py +103 -0
- package/mcp_server/mix_engine/state_builder.py +316 -0
- package/mcp_server/mix_engine/tools.py +214 -0
- package/mcp_server/performance_engine/__init__.py +1 -0
- package/mcp_server/performance_engine/models.py +148 -0
- package/mcp_server/performance_engine/planner.py +267 -0
- package/mcp_server/performance_engine/safety.py +162 -0
- package/mcp_server/performance_engine/tools.py +183 -0
- package/mcp_server/project_brain/__init__.py +6 -0
- package/mcp_server/project_brain/arrangement_graph.py +64 -0
- package/mcp_server/project_brain/automation_graph.py +72 -0
- package/mcp_server/project_brain/builder.py +123 -0
- package/mcp_server/project_brain/capability_graph.py +64 -0
- package/mcp_server/project_brain/models.py +282 -0
- package/mcp_server/project_brain/refresh.py +80 -0
- package/mcp_server/project_brain/role_graph.py +103 -0
- package/mcp_server/project_brain/session_graph.py +51 -0
- package/mcp_server/project_brain/tools.py +144 -0
- package/mcp_server/reference_engine/__init__.py +1 -0
- package/mcp_server/reference_engine/gap_analyzer.py +239 -0
- package/mcp_server/reference_engine/models.py +105 -0
- package/mcp_server/reference_engine/profile_builder.py +149 -0
- package/mcp_server/reference_engine/tactic_router.py +117 -0
- package/mcp_server/reference_engine/tools.py +235 -0
- package/mcp_server/runtime/__init__.py +1 -0
- package/mcp_server/runtime/action_ledger.py +117 -0
- package/mcp_server/runtime/action_ledger_models.py +84 -0
- package/mcp_server/runtime/action_tools.py +57 -0
- package/mcp_server/runtime/capability_state.py +218 -0
- package/mcp_server/runtime/safety_kernel.py +339 -0
- package/mcp_server/runtime/safety_tools.py +42 -0
- package/mcp_server/runtime/tools.py +64 -0
- package/mcp_server/server.py +23 -1
- package/mcp_server/sound_design/__init__.py +1 -0
- package/mcp_server/sound_design/critics.py +297 -0
- package/mcp_server/sound_design/models.py +147 -0
- package/mcp_server/sound_design/planner.py +104 -0
- package/mcp_server/sound_design/tools.py +297 -0
- package/mcp_server/tools/_agent_os_engine.py +947 -0
- package/mcp_server/tools/_composition_engine.py +1530 -0
- package/mcp_server/tools/_conductor.py +199 -0
- package/mcp_server/tools/_conductor_budgets.py +222 -0
- package/mcp_server/tools/_evaluation_contracts.py +91 -0
- package/mcp_server/tools/_form_engine.py +416 -0
- package/mcp_server/tools/_motif_engine.py +351 -0
- package/mcp_server/tools/_planner_engine.py +516 -0
- package/mcp_server/tools/_research_engine.py +542 -0
- package/mcp_server/tools/_research_provider.py +185 -0
- package/mcp_server/tools/_snapshot_normalizer.py +49 -0
- package/mcp_server/tools/agent_os.py +440 -0
- package/mcp_server/tools/analyzer.py +18 -0
- package/mcp_server/tools/automation.py +25 -10
- package/mcp_server/tools/composition.py +563 -0
- package/mcp_server/tools/motif.py +104 -0
- package/mcp_server/tools/planner.py +144 -0
- package/mcp_server/tools/research.py +223 -0
- package/mcp_server/tools/tracks.py +18 -3
- package/mcp_server/tools/transport.py +10 -2
- package/mcp_server/transition_engine/__init__.py +6 -0
- package/mcp_server/transition_engine/archetypes.py +167 -0
- package/mcp_server/transition_engine/critics.py +340 -0
- package/mcp_server/transition_engine/models.py +90 -0
- package/mcp_server/transition_engine/tools.py +291 -0
- package/mcp_server/translation_engine/__init__.py +5 -0
- package/mcp_server/translation_engine/critics.py +297 -0
- package/mcp_server/translation_engine/models.py +27 -0
- package/mcp_server/translation_engine/tools.py +74 -0
- package/package.json +2 -2
- package/remote_script/LivePilot/__init__.py +1 -1
- package/remote_script/LivePilot/arrangement.py +12 -2
- package/requirements.txt +1 -1
|
@@ -0,0 +1,575 @@
|
|
|
1
|
+
"""Evaluation Fabric — unified entry point for all engine evaluators.
|
|
2
|
+
|
|
3
|
+
Provides evaluate() as the single router, plus engine-specific evaluators:
|
|
4
|
+
- evaluate_sonic_move() — spectral before/after
|
|
5
|
+
- evaluate_composition_move() — issue list before/after
|
|
6
|
+
- evaluate_mix_move() — mix critic issues before/after
|
|
7
|
+
- evaluate_transition() — transition score before/after
|
|
8
|
+
- evaluate_translation() — translation report before/after
|
|
9
|
+
|
|
10
|
+
Uses feature_extractors for dimension extraction, policy for hard rules,
|
|
11
|
+
and the existing contracts from _evaluation_contracts.
|
|
12
|
+
|
|
13
|
+
Design: EVALUATION_FABRIC_V1.md
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
from ..tools._evaluation_contracts import (
|
|
21
|
+
EvaluationRequest,
|
|
22
|
+
EvaluationResult,
|
|
23
|
+
MEASURABLE_DIMENSIONS,
|
|
24
|
+
is_dimension_measurable,
|
|
25
|
+
)
|
|
26
|
+
from ..tools._snapshot_normalizer import normalize_sonic_snapshot
|
|
27
|
+
from .feature_extractors import extract_dimension_value, _clamp
|
|
28
|
+
from .policy import apply_hard_rules
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ── Sonic Evaluator ──────────────────────────────────────────────────
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def evaluate_sonic_move(
    request: EvaluationRequest,
    outcome_history: Optional[list[dict]] = None,
) -> EvaluationResult:
    """Evaluate a sonic move using the Evaluation Fabric.

    Normalizes before/after snapshots, extracts dimension values,
    computes score and goal progress, and applies hard rules.

    Args:
        request: EvaluationRequest with before/after snapshots,
            goal targets/protect in goal and protect dicts.
        outcome_history: optional list of past outcomes for taste fit.
            Currently unused — taste_fit is a fixed 0.0 placeholder in
            fabric v1; the parameter is kept for forward compatibility.

    Returns:
        EvaluationResult with score, keep_change, dimension_changes, etc.
    """
    # Normalize both snapshots into the canonical format so that
    # extract_dimension_value sees a consistent "spectrum" layout.
    before_norm = normalize_sonic_snapshot(request.before, source="before")
    after_norm = normalize_sonic_snapshot(request.after, source="after")

    targets = request.goal.get("targets", {})
    protect = request.protect

    notes: list[str] = []
    dimension_changes: dict[str, dict] = {}

    # Weighted sum of per-dimension deltas across the goal's target dimensions.
    total_goal_progress = 0.0
    measurable_count = 0

    for dim, weight in targets.items():
        before_val = extract_dimension_value(before_norm, dim) if before_norm else None
        after_val = extract_dimension_value(after_norm, dim) if after_norm else None

        if before_val is not None and after_val is not None:
            delta = after_val - before_val
            dimension_changes[dim] = {
                "before": round(before_val, 4),
                "after": round(after_val, 4),
                "delta": round(delta, 4),
            }
            total_goal_progress += delta * weight
            measurable_count += 1
        else:
            notes.append(f"{dim}: not measurable in Phase 1 (confidence=0.0)")

    # Protected dimensions: any drop contributes to collateral damage;
    # falling below the protect threshold — or dropping more than 0.15
    # absolute — counts as a hard protection violation.
    collateral_damage = 0.0
    protection_violated = False

    for dim, threshold in protect.items():
        before_val = extract_dimension_value(before_norm, dim) if before_norm else None
        after_val = extract_dimension_value(after_norm, dim) if after_norm else None

        if before_val is not None and after_val is not None:
            drop = before_val - after_val
            if drop > 0:
                collateral_damage = max(collateral_damage, drop)
                if after_val < threshold:
                    protection_violated = True
                    notes.append(
                        f"PROTECTED dimension '{dim}' at {after_val:.3f}, "
                        f"below threshold {threshold:.3f}"
                    )
                elif drop > 0.15:
                    protection_violated = True
                    notes.append(
                        f"PROTECTED dimension '{dim}' dropped by {drop:.3f} "
                        f"(absolute drop > 0.15)"
                    )

    # Composite score: blend of goal fit, average measured delta,
    # preservation of protected dimensions, and measurement confidence.
    measurable_delta = total_goal_progress / max(measurable_count, 1)
    goal_fit = _clamp(0.5 + total_goal_progress)
    measurable_component = _clamp(0.5 + measurable_delta)
    preservation = _clamp(1.0 - collateral_damage * 5)
    confidence = measurable_count / max(len(targets), 1)

    score = (
        0.30 * goal_fit
        + 0.25 * measurable_component
        + 0.15 * preservation
        + 0.10 * 0.0  # taste_fit: placeholder, no history in fabric v1
        + 0.10 * confidence
        + 0.10 * 1.0  # reversibility: 1.0 for undo-able moves
    )

    # Apply hard rules via policy engine
    keep_change, rule_failures = apply_hard_rules(
        goal_progress=total_goal_progress,
        collateral_damage=collateral_damage,
        protection_violated=protection_violated,
        measurable_count=measurable_count,
        score=score,
        target_count=len(targets),
    )
    notes.extend(rule_failures)

    # Both the keep and reject paths report "measured"; only a move with
    # zero measurable dimensions defers to agent judgment. (The previous
    # if/elif/else assigned the same value on two of its three arms.)
    decision_mode = "deferred" if measurable_count == 0 else "measured"

    return EvaluationResult(
        engine=request.engine or "sonic",
        score=round(score, 4),
        keep_change=keep_change,
        goal_progress=round(total_goal_progress, 4),
        collateral_damage=round(collateral_damage, 4),
        hard_rule_failures=rule_failures,
        dimension_changes=dimension_changes,
        notes=notes,
        decision_mode=decision_mode,
        memory_candidate=keep_change and measurable_count > 0,
    )
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# ── Composition Evaluator ────────────────────────────────────────────
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def evaluate_composition_move(
    before_issues: list[dict],
    after_issues: list[dict],
) -> EvaluationResult:
    """Evaluate a composition move by comparing issue lists.

    Wraps the existing composition evaluation logic using the
    canonical EvaluationResult contract.

    Args:
        before_issues: list of dicts with at least "severity" (float).
        after_issues: list of dicts with at least "severity" (float).

    Returns:
        EvaluationResult with score, keep_change, notes.
    """
    notes: list[str] = []

    n_before = len(before_issues)
    n_after = len(after_issues)
    issue_delta = n_before - n_after

    # Total severity before/after; a positive difference means improvement.
    sev_before = sum(issue.get("severity", 0.0) for issue in before_issues)
    sev_after = sum(issue.get("severity", 0.0) for issue in after_issues)
    severity_improvement = sev_before - sev_after

    if n_before > 0:
        improvement_ratio = severity_improvement / max(sev_before, 0.01)
    else:
        # Started clean: neutral if still clean, penalized if issues appeared.
        improvement_ratio = 0.0 if n_after == 0 else -0.5

    score = max(0.0, min(1.0, 0.5 + improvement_ratio * 0.5))

    keep_change = True

    # Hard-rule style checks: any regression, issue explosion,
    # or sub-threshold score rejects the change.
    if severity_improvement < 0:
        keep_change = False
        notes.append(
            f"WORSE: total severity increased by {-severity_improvement:.2f}"
        )

    if n_after > n_before + 1:
        keep_change = False
        notes.append(
            f"NEW ISSUES: {n_after - n_before} new issues introduced"
        )

    if score < 0.40:
        keep_change = False
        notes.append(f"SCORE: {score:.3f} below 0.40 threshold")

    if keep_change and severity_improvement > 0:
        notes.append(
            f"IMPROVED: resolved {issue_delta} issue(s), "
            f"severity reduced by {severity_improvement:.2f}"
        )

    hard_rule_failures = [
        note for note in notes
        if note.startswith(("WORSE:", "NEW ISSUES:", "SCORE:"))
    ]

    return EvaluationResult(
        engine="composition",
        score=round(score, 4),
        keep_change=keep_change,
        goal_progress=round(severity_improvement, 4),
        collateral_damage=0.0,
        hard_rule_failures=hard_rule_failures,
        dimension_changes={"issue_delta": issue_delta},
        notes=notes,
        decision_mode="measured",
        memory_candidate=keep_change and severity_improvement > 0,
    )
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# ── Mix Evaluator ───────────────────────────────────────────────────
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def evaluate_mix_move(
    before_issues: list[dict],
    after_issues: list[dict],
) -> EvaluationResult:
    """Evaluate a mix move by comparing mix critic issue lists.

    Scores masking reduction, punch change, headroom, stereo stability
    and applies hard rules from policy.py.

    Args:
        before_issues: list of MixIssue dicts (from run_all_mix_critics).
        after_issues: list of MixIssue dicts (from run_all_mix_critics).

    Returns:
        EvaluationResult with score, keep_change, dimension_changes.
    """
    notes: list[str] = []

    def _critic_severity(issues: list[dict], critic: str) -> float:
        # Total severity contributed by a single critic category.
        return sum(i.get("severity", 0.0) for i in issues if i.get("critic") == critic)

    # Map each tracked critic category to its reported dimension name;
    # a positive delta (before - after) always means improvement.
    critic_to_dimension = {
        "masking": "masking_reduction",
        "dynamics": "headroom_change",
        "stereo": "stereo_stability",
        "balance": "balance_change",
    }
    dimension_changes = {
        dim_name: round(
            _critic_severity(before_issues, critic)
            - _critic_severity(after_issues, critic),
            4,
        )
        for critic, dim_name in critic_to_dimension.items()
    }

    # Overall severity comparison across all critics.
    total_before = sum(i.get("severity", 0.0) for i in before_issues)
    total_after = sum(i.get("severity", 0.0) for i in after_issues)
    severity_improvement = total_before - total_after

    n_before = len(before_issues)
    n_after = len(after_issues)

    if total_before > 0:
        improvement_ratio = severity_improvement / max(total_before, 0.01)
    else:
        # Started clean: neutral if still clean, penalized if issues appeared.
        improvement_ratio = 0.0 if n_after == 0 else -0.5

    score = _clamp(0.5 + improvement_ratio * 0.5)

    # Hard-rule style checks
    keep_change = True

    if severity_improvement < 0:
        keep_change = False
        notes.append(
            f"WORSE: total mix severity increased by {-severity_improvement:.2f}"
        )

    if n_after > n_before + 2:
        keep_change = False
        notes.append(
            f"NEW ISSUES: {n_after - n_before} new mix issues introduced"
        )

    if score < 0.40:
        keep_change = False
        notes.append(f"SCORE: {score:.3f} below 0.40 threshold")

    if keep_change and severity_improvement > 0:
        notes.append(
            f"IMPROVED: mix severity reduced by {severity_improvement:.2f} "
            f"across {n_before - n_after} fewer issues"
        )

    hard_rule_failures = [
        note for note in notes
        if note.startswith(("WORSE:", "NEW ISSUES:", "SCORE:"))
    ]

    return EvaluationResult(
        engine="mix",
        score=round(score, 4),
        keep_change=keep_change,
        goal_progress=round(severity_improvement, 4),
        collateral_damage=0.0,
        hard_rule_failures=hard_rule_failures,
        dimension_changes=dimension_changes,
        notes=notes,
        decision_mode="measured",
        memory_candidate=keep_change and severity_improvement > 0,
    )
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
# ── Transition Evaluator ────────────────────────────────────────────
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def evaluate_transition(
    before_score: dict,
    after_score: dict,
) -> EvaluationResult:
    """Evaluate a transition move by comparing TransitionScore dicts.

    Compares boundary_clarity, payoff_strength, energy_redirection,
    and overall_quality before and after the move.

    Args:
        before_score: dict with transition quality metrics.
        after_score: dict with transition quality metrics.

    Returns:
        EvaluationResult with score, keep_change, dimension_changes.
    """
    notes: list[str] = []
    dimension_changes: dict[str, dict] = {}

    # Key transition dimensions to compare; skip any that either side lacks.
    tracked_dims = (
        "boundary_clarity",
        "payoff_strength",
        "energy_redirection",
        "overall_quality",
    )
    total_improvement = 0.0
    measured = 0

    for dim in tracked_dims:
        old_val = before_score.get(dim)
        new_val = after_score.get(dim)
        if old_val is None or new_val is None:
            continue
        change = new_val - old_val
        dimension_changes[dim] = {
            "before": round(old_val, 4),
            "after": round(new_val, 4),
            "delta": round(change, 4),
        }
        total_improvement += change
        measured += 1

    # Score centers at 0.5 and shifts by the average per-dimension change.
    avg_improvement = total_improvement / max(measured, 1)
    score = _clamp(0.5 + avg_improvement)

    keep_change = True

    if measured > 0 and total_improvement < 0:
        keep_change = False
        notes.append(
            f"WORSE: transition quality decreased by {-total_improvement:.3f}"
        )

    if score < 0.40:
        keep_change = False
        notes.append(f"SCORE: {score:.3f} below 0.40 threshold")

    if keep_change and total_improvement > 0:
        notes.append(
            f"IMPROVED: transition quality improved by {total_improvement:.3f} "
            f"across {measured} dimensions"
        )

    hard_rule_failures = [
        note for note in notes if note.startswith(("WORSE:", "SCORE:"))
    ]

    return EvaluationResult(
        engine="transition",
        score=round(score, 4),
        keep_change=keep_change,
        goal_progress=round(total_improvement, 4),
        collateral_damage=0.0,
        hard_rule_failures=hard_rule_failures,
        dimension_changes=dimension_changes,
        notes=notes,
        decision_mode="measured",
        memory_candidate=keep_change and total_improvement > 0,
    )
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
# ── Translation Evaluator ───────────────────────────────────────────
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def evaluate_translation(
    before_report: dict,
    after_report: dict,
) -> EvaluationResult:
    """Evaluate a translation move by comparing TranslationReport dicts.

    Compares robustness booleans (mono_safe, small_speaker_safe,
    low_end_stable, front_element_present) and harshness_risk.

    Args:
        before_report: dict from build_translation_report().to_dict().
        after_report: dict from build_translation_report().to_dict().

    Returns:
        EvaluationResult with score, keep_change, dimension_changes.
    """
    notes: list[str] = []
    dimension_changes: dict[str, dict] = {}

    # Boolean robustness flags — True is good, False is bad.
    flag_dims = (
        "mono_safe",
        "small_speaker_safe",
        "low_end_stable",
        "front_element_present",
    )
    improvements = 0
    regressions = 0

    for dim in flag_dims:
        old_flag = before_report.get(dim)
        new_flag = after_report.get(dim)
        if old_flag is None or new_flag is None:
            continue
        dimension_changes[dim] = {"before": old_flag, "after": new_flag}
        if new_flag and not old_flag:
            improvements += 1
        elif old_flag and not new_flag:
            regressions += 1

    # Harshness risk — lower is better, so (before - after) > 0 is improvement.
    harshness_before = before_report.get("harshness_risk", 0.0)
    harshness_after = after_report.get("harshness_risk", 0.0)
    harshness_delta = harshness_before - harshness_after
    dimension_changes["harshness_risk"] = {
        "before": round(harshness_before, 4),
        "after": round(harshness_after, 4),
        "delta": round(harshness_delta, 4),
    }

    # Overall robustness classification mapped to a numeric scale;
    # unknown labels fall back to the midpoint.
    robustness_map = {"robust": 1.0, "fragile": 0.5, "critical": 0.0}
    before_rob = robustness_map.get(before_report.get("overall_robustness", ""), 0.5)
    after_rob = robustness_map.get(after_report.get("overall_robustness", ""), 0.5)
    robustness_delta = after_rob - before_rob
    dimension_changes["overall_robustness"] = {
        "before": before_report.get("overall_robustness", "unknown"),
        "after": after_report.get("overall_robustness", "unknown"),
    }

    # Composite score: flag net change, harshness change, robustness change.
    flag_score = (improvements - regressions) / max(len(flag_dims), 1)
    score = _clamp(
        0.5
        + flag_score * 0.3
        + harshness_delta * 0.3
        + robustness_delta * 0.4
    )

    total_improvement = flag_score + harshness_delta + robustness_delta

    keep_change = True

    if regressions > improvements:
        keep_change = False
        notes.append(
            f"WORSE: {regressions} robustness flags regressed vs "
            f"{improvements} improved"
        )

    if after_rob < before_rob:
        keep_change = False
        notes.append(
            f"WORSE: overall robustness degraded from "
            f"{before_report.get('overall_robustness')} to "
            f"{after_report.get('overall_robustness')}"
        )

    if score < 0.40:
        keep_change = False
        notes.append(f"SCORE: {score:.3f} below 0.40 threshold")

    if keep_change and total_improvement > 0:
        notes.append(
            f"IMPROVED: {improvements} robustness flags improved, "
            f"harshness reduced by {harshness_delta:.3f}"
        )

    hard_rule_failures = [
        note for note in notes if note.startswith(("WORSE:", "SCORE:"))
    ]

    return EvaluationResult(
        engine="translation",
        score=round(score, 4),
        keep_change=keep_change,
        goal_progress=round(total_improvement, 4),
        collateral_damage=0.0,
        hard_rule_failures=hard_rule_failures,
        dimension_changes=dimension_changes,
        notes=notes,
        decision_mode="measured",
        memory_candidate=keep_change and total_improvement > 0,
    )
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
# ── Unified Entry Point ─────────────────────────────────────────────
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
def evaluate(request: EvaluationRequest) -> EvaluationResult:
    """Unified evaluation entry point — routes to engine-specific evaluator.

    Args:
        request: EvaluationRequest with engine field determining routing:
            - "sonic" -> evaluate_sonic_move
            - "composition" -> evaluate_composition_move
            - "mix" -> evaluate_mix_move
            - "transition" -> evaluate_transition
            - "translation" -> evaluate_translation

    Returns:
        EvaluationResult from the appropriate engine evaluator.
    """
    engine = (request.engine or "sonic").lower()

    if engine == "sonic":
        return evaluate_sonic_move(request)

    if engine in ("composition", "mix"):
        # Both issue-based evaluators take the raw issue lists from the
        # before/after payloads rather than the full request.
        issues_before = request.before.get("issues", [])
        issues_after = request.after.get("issues", [])
        if engine == "composition":
            return evaluate_composition_move(issues_before, issues_after)
        return evaluate_mix_move(issues_before, issues_after)

    if engine == "transition":
        return evaluate_transition(request.before, request.after)

    if engine == "translation":
        return evaluate_translation(request.before, request.after)

    # Unrecognized engine: keep the change but defer the decision.
    return EvaluationResult(
        engine=engine,
        score=0.0,
        keep_change=True,
        decision_mode="deferred",
        notes=[f"Unknown engine '{engine}' — deferring to agent judgment"],
    )
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Feature Extractors — derive measurable values from normalized snapshots.
|
|
2
|
+
|
|
3
|
+
Replicates the dimension-extraction logic from _agent_os_engine but operates
|
|
4
|
+
on the canonical normalized snapshot format (always has "spectrum" key).
|
|
5
|
+
|
|
6
|
+
All returned values are clamped to 0.0-1.0 for consistent scoring.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import math
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
from ..tools._evaluation_contracts import MEASURABLE_DIMENSIONS
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _clamp(value: float, lo: float = 0.0, hi: float = 1.0) -> float:
|
|
18
|
+
"""Clamp value to [lo, hi] range."""
|
|
19
|
+
return max(lo, min(hi, value))
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def extract_dimension_value(
    snapshot: dict,
    dimension: str,
) -> Optional[float]:
    """Extract a measurable dimension value from a normalized sonic snapshot.

    Args:
        snapshot: Normalized snapshot (from normalize_sonic_snapshot).
            Must have a "spectrum" key with band values.
        dimension: One of the MEASURABLE_DIMENSIONS (brightness, warmth,
            weight, clarity, density, energy, punch).

    Returns:
        Float in 0.0-1.0 for measurable dimensions, None otherwise.
    """
    if not snapshot or not isinstance(snapshot, dict):
        return None

    bands = snapshot.get("spectrum")
    if not bands:
        return None

    rms = snapshot.get("rms")
    peak = snapshot.get("peak")

    if dimension == "brightness":
        # Average of the high and presence bands.
        return _clamp((bands.get("high", 0) + bands.get("presence", 0)) / 2.0)

    if dimension == "warmth":
        return _clamp(bands.get("low_mid", 0))

    if dimension == "weight":
        # Average of the sub and low bands.
        return _clamp((bands.get("sub", 0) + bands.get("low", 0)) / 2.0)

    if dimension == "clarity":
        # Clarity is the inverse of low-mid buildup.
        return _clamp(1.0 - bands.get("low_mid", 0))

    if dimension == "density":
        # Spectral flatness: geometric mean over arithmetic mean of the
        # band values (floored at 1e-10 to keep log() defined).
        band_vals = [
            max(v, 1e-10)
            for v in bands.values()
            if isinstance(v, (int, float))
        ]
        if not band_vals:
            return None
        geo_mean = math.exp(sum(math.log(v) for v in band_vals) / len(band_vals))
        arith_mean = sum(band_vals) / len(band_vals)
        return _clamp(geo_mean / max(arith_mean, 1e-10))

    if dimension == "energy":
        return None if rms is None else _clamp(rms)

    if dimension == "punch":
        # Crest factor in dB, normalized against a 20 dB ceiling.
        if rms and peak and rms > 0:
            crest_db = 20.0 * math.log10(max(peak / rms, 1.0))
            return _clamp(crest_db / 20.0)
        return None

    # Unmeasurable dimension
    return None
|