iceni 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iceni/__init__.py +8 -0
- iceni/__main__.py +4 -0
- iceni/benchmark.py +371 -0
- iceni/calibration.py +87 -0
- iceni/cli.py +636 -0
- iceni/config.py +74 -0
- iceni/discovery.py +272 -0
- iceni/feedback.py +123 -0
- iceni/intent.py +44 -0
- iceni/mcp_server.py +263 -0
- iceni/packs/architecture-design.json +237 -0
- iceni/packs/business-strategy.json +258 -0
- iceni/packs/code-quality-plus.json +258 -0
- iceni/packs/code-quality.json +76 -0
- iceni/packs/daily-dev.json +48 -0
- iceni/packs/debugging.json +237 -0
- iceni/packs/decisions-planning.json +258 -0
- iceni/packs/devops-deploy.json +216 -0
- iceni/packs/document-data.json +258 -0
- iceni/packs/documentation.json +237 -0
- iceni/packs/email-comms.json +258 -0
- iceni/packs/git-workflow.json +237 -0
- iceni/packs/lang-tools.json +174 -0
- iceni/packs/learning-explain.json +258 -0
- iceni/packs/marketing-content.json +279 -0
- iceni/packs/refactoring.json +279 -0
- iceni/packs/research-analysis.json +279 -0
- iceni/packs/testing.json +279 -0
- iceni/packs/writing-pro.json +300 -0
- iceni/providers/__init__.py +2 -0
- iceni/providers/anthropic.py +31 -0
- iceni/providers/base.py +50 -0
- iceni/providers/openai_compat.py +34 -0
- iceni/sql/0001_init.sql +91 -0
- iceni/sql/0002_exec_feedback.sql +6 -0
- iceni/sql/0003_edit_signal.sql +5 -0
- iceni/store/__init__.py +1 -0
- iceni/store/aliases.py +284 -0
- iceni/store/db.py +34 -0
- iceni/trust/__init__.py +8 -0
- iceni/trust/identity.py +23 -0
- iceni/trust/keystore.py +26 -0
- iceni/trust/sign.py +38 -0
- iceni-0.1.0.dist-info/METADATA +292 -0
- iceni-0.1.0.dist-info/RECORD +48 -0
- iceni-0.1.0.dist-info/WHEEL +4 -0
- iceni-0.1.0.dist-info/entry_points.txt +2 -0
- iceni-0.1.0.dist-info/licenses/LICENSE +21 -0
iceni/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""ICENI — auto-discovered, cross-model-calibrated, self-evolving prompt aliases.
|
|
2
|
+
|
|
3
|
+
The load-bearing idea: the human-readable alias name is NOT the trust anchor.
|
|
4
|
+
A name resolves locally (petname) to a cryptographic identity, which signs a
|
|
5
|
+
content-addressed, model-agnostic intent that is rendered per model.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"
|
iceni/__main__.py
ADDED
iceni/benchmark.py
ADDED
|
@@ -0,0 +1,371 @@
|
|
|
1
|
+
"""The value-case benchmark (ChatGPT + Kimi's unanimous next step).
|
|
2
|
+
|
|
3
|
+
Compares, per task and per model, a natural-language BASELINE prompt against the
|
|
4
|
+
ICENI per-model calibrated render. Offline it measures input tokens, format fit,
|
|
5
|
+
and semantic preservation; with --execute it adds output tokens, cost, latency,
|
|
6
|
+
LLM-scored task quality, and a cost-per-quality breakdown. Prints an honest
|
|
7
|
+
verdict vs the agreed thresholds — including PIVOT if ICENI merely ties baseline.
|
|
8
|
+
|
|
9
|
+
Swarm projection (always offline): models how inter-agent routing tokens compare
|
|
10
|
+
when aliases compress the instruction set to ≈3 tokens vs full-prompt relay.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import statistics
|
|
16
|
+
import time
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
from . import calibration
|
|
21
|
+
from .intent import Intent
|
|
22
|
+
from .providers.base import ProviderUnavailable, get_provider
|
|
23
|
+
|
|
24
|
+
# $/Mtok — override per-task-file via "pricing". Verify at provider pricing pages.
|
|
25
|
+
# Kimi = moonshot-v1-32k (June 2026). Claude = sonnet-4-6. GPT = gpt-4o.
|
|
26
|
+
PRICING_DEFAULT = {
|
|
27
|
+
"claude": {"in": 3.0, "out": 15.0},
|
|
28
|
+
"gpt": {"in": 2.5, "out": 10.0},
|
|
29
|
+
"kimi": {"in": 1.5, "out": 7.5}, # moonshot-v1-32k — was $0.6/$2.5 (cheaper tier)
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
# Thresholds (GPT). Any one met => proceed.
|
|
33
|
+
THRESH = {"token_reduction": 0.20, "quality_gain": 0.15, "speed_gain": 0.25}
|
|
34
|
+
# V1 rubric weights (Kimi).
|
|
35
|
+
V1_WEIGHTS = {"functional": 0.40, "token": 0.30, "format": 0.15, "semantic": 0.15}
|
|
36
|
+
# Tokens for alias petname in inter-agent messages (e.g. "review" ≈ 1-3 tok).
|
|
37
|
+
ALIAS_ROUTING_TOKENS = 3
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def estimate_tokens(text: str) -> int:
    """Return a token-count estimate for *text*.

    Uses tiktoken's ``o200k_base`` encoding when tiktoken can be imported and
    the encoding loads; otherwise falls back to roughly four characters per
    token.  Empty text is zero tokens on both paths, so the estimate does not
    depend on which path happens to be available.
    """
    if not text:
        return 0
    try:
        import tiktoken

        return len(tiktoken.get_encoding("o200k_base").encode(text))
    except Exception:
        # Best-effort fallback: tiktoken may be missing or fail to load its
        # encoding.  Non-empty text is never reported as zero tokens.
        return max(1, round(len(text) / 4))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def format_score(prompt: str, model: str) -> float:
    """Score whether *prompt* has the shape the model family prefers.

    The family is found by case-insensitive substring match on *model*:
    claude expects both ``<`` and ``>``; gpt expects ``**`` or a leading
    ``-`` bullet; kimi expects neither ``**`` nor ``<``.  Returns 1.0 on a
    match, 0.0 on a miss, and 0.5 for an unrecognised family.
    """
    family = model.lower()
    if "claude" in family:
        has_tags = "<" in prompt and ">" in prompt
        return 1.0 if has_tags else 0.0
    if "gpt" in family:
        looks_markdown = "**" in prompt or prompt.lstrip().startswith("-")
        return 1.0 if looks_markdown else 0.0
    if "kimi" in family:
        has_markup = "**" in prompt or "<" in prompt
        return 0.0 if has_markup else 1.0
    return 0.5
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Loaded sentence-transformers models, keyed by model name.  The benchmark
# calls semantic_similarity once per (task, model) cell; constructing the
# encoder each time would repeat the model load for every cell.
_SENTENCE_MODELS: dict = {}


def semantic_similarity(a: str, b: str) -> float | None:
    """Return the cosine similarity between the embeddings of *a* and *b*.

    Embeddings come from the ``all-MiniLM-L6-v2`` sentence-transformers model,
    which is constructed on first use and reused afterwards.  Returns None
    when sentence-transformers is unavailable or anything in the embedding
    path fails; this metric is optional.
    """
    try:
        from sentence_transformers import SentenceTransformer, util

        name = "all-MiniLM-L6-v2"
        encoder = _SENTENCE_MODELS.get(name)
        if encoder is None:
            encoder = _SENTENCE_MODELS[name] = SentenceTransformer(name)
        emb = encoder.encode([a, b])
        return float(util.cos_sim(emb[0], emb[1]).item())
    except Exception:
        # Deliberate best-effort: a missing optional dependency or a failed
        # model load must not abort the benchmark.
        return None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _apply_input(prompt: str, text: str) -> str:
|
|
73
|
+
for ph in ("{{file}}", "{{code}}", "{{input}}", "{{context}}"):
|
|
74
|
+
if ph in prompt:
|
|
75
|
+
return prompt.replace(ph, text)
|
|
76
|
+
return prompt + "\n\n" + text
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class Cell:
    """Measurements for one (task, model) pair: baseline prompt vs ICENI render.

    The first four fields are always set.  The ``*_base`` / ``*_iceni``
    execution fields keep their defaults unless the benchmark runs in execute
    mode and a provider call succeeds.
    """

    # --- always measured (offline) ---
    model: str
    in_tok_base: int      # estimated tokens in the baseline prompt
    in_tok_iceni: int     # estimated tokens in the ICENI prompt
    fmt: float            # format_score of the ICENI prompt for this model
    semantic: float | None = None  # baseline-vs-ICENI similarity; None if unavailable
    input_tok: int = 0    # tokens in raw input alone (for swarm projection)

    # --- filled in by live execution ---
    out_tok_base: int = 0
    out_tok_iceni: int = 0
    cost_base: float = 0.0
    cost_iceni: float = 0.0
    latency_base: float = 0.0
    latency_iceni: float = 0.0
    quality_base: float | None = None   # judge score; None when not judged
    quality_iceni: float | None = None  # judge score; None when not judged
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@dataclass
class TaskResult:
    """All per-model cells recorded for one benchmark task."""

    name: str
    # One Cell per model; default_factory gives every result its own list.
    cells: list[Cell] = field(default_factory=list)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _intent_for(task: dict, resolver) -> Intent:
|
|
104
|
+
if "intent" in task:
|
|
105
|
+
return Intent.from_dict(task["intent"])
|
|
106
|
+
if "petname" in task and resolver is not None:
|
|
107
|
+
resolved = resolver(task["petname"])
|
|
108
|
+
if resolved:
|
|
109
|
+
return Intent.from_json(resolved[1]["intent_json"])
|
|
110
|
+
raise ValueError(f"task '{task.get('name')}' needs an 'intent' or resolvable 'petname'")
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _judge(provider, goal: str, output: str) -> float | None:
|
|
114
|
+
prompt = (f"Task: {goal}\n\nA model produced this output:\n---\n{output[:4000]}\n---\n"
|
|
115
|
+
"Score 0-100 how well the output accomplishes the task. Reply with ONLY the integer.")
|
|
116
|
+
try:
|
|
117
|
+
import re
|
|
118
|
+
txt = provider.complete(prompt).text
|
|
119
|
+
m = re.search(r"\d{1,3}", txt)
|
|
120
|
+
return float(m.group()) if m else None
|
|
121
|
+
except Exception:
|
|
122
|
+
return None
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _price_for(pricing: dict, model: str) -> dict:
    """Return the $/Mtok entry for *model*: exact key first, then family substring."""
    if model in pricing:
        return pricing[model]
    lowered = model.lower()
    for family, entry in pricing.items():
        # Same case-insensitive substring rule that format_score and
        # calibration.render use to pick a model family.
        if family and family.lower() in lowered:
            return entry
    return {"in": 0.0, "out": 0.0}


def run(tasks_path: str, models: list[str], cfg: dict, *, execute: bool = False,
        judge_model: str | None = None, resolver=None) -> tuple[list[TaskResult], dict]:
    """Run the benchmark described by the JSON file at *tasks_path*.

    For every task and every model, the ICENI render is compared with the
    task's baseline prompt (``"baseline"``, falling back to the intent goal).
    Offline, each cell records input-token estimates, the format score and
    semantic similarity.  With ``execute=True`` both prompts are also sent to
    the model's provider to record output tokens, cost, latency and, when a
    judge provider could be created from *judge_model*, a quality score.

    Args:
        tasks_path: JSON file with a ``"tasks"`` list and optional ``"pricing"``
            overrides, which are merged over ``PRICING_DEFAULT``.
        models: model names to benchmark.
        cfg: configuration passed through to ``get_provider``.
        execute: perform live provider calls.
        judge_model: model used to score outputs; ignored unless *execute*.
        resolver: optional callable that resolves a task's ``"petname"``.

    Returns:
        ``(results, meta)`` where ``meta`` has the keys ``"executed"``,
        ``"judged"``, ``"models"`` and ``"pricing"``.

    Raises:
        ValueError: a task has neither an inline intent nor a resolvable petname.
    """
    tasks_file = Path(tasks_path)
    data = json.loads(tasks_file.read_text(encoding="utf-8"))
    pricing = {**PRICING_DEFAULT, **data.get("pricing", {})}
    base_dir = tasks_file.parent
    judge_provider = None
    if execute and judge_model:
        try:
            judge_provider = get_provider(judge_model, cfg)
        except ProviderUnavailable:
            # No judge available: the run continues without quality scores.
            judge_provider = None

    results: list[TaskResult] = []
    for task in data.get("tasks", []):
        intent = _intent_for(task, resolver)
        baseline = task.get("baseline") or intent.goal

        # Resolve input: file takes priority, then inline "input" field.
        # Relative paths are resolved against the task file's directory; a
        # missing file is treated as "no input".
        inp = ""
        if task.get("input_file"):
            p = Path(task["input_file"])
            if not p.is_absolute():
                p = base_dir / p
            inp = p.read_text(encoding="utf-8") if p.exists() else ""
        elif task.get("input"):
            inp = task["input"]
        input_tok = estimate_tokens(inp) if inp else 0

        tr = TaskResult(name=task.get("name", intent.goal[:24]))
        for model in models:
            rendered = calibration.render(intent, model)
            # Score the render's shape BEFORE splicing in the raw input:
            # input text containing "<" or "**" (e.g. code) would otherwise
            # flip the score regardless of how the render is shaped.
            fmt = format_score(rendered, model)
            iceni_prompt = rendered
            base_prompt = baseline
            if inp:
                iceni_prompt = _apply_input(iceni_prompt, inp)
                base_prompt = _apply_input(base_prompt, inp)

            cell = Cell(
                model=model,
                in_tok_base=estimate_tokens(base_prompt),
                in_tok_iceni=estimate_tokens(iceni_prompt),
                fmt=fmt,
                semantic=semantic_similarity(base_prompt, iceni_prompt),
                input_tok=input_tok,
            )
            if execute:
                try:
                    prov = get_provider(model, cfg)
                    price = _price_for(pricing, model)
                    for variant, prompt in (("base", base_prompt), ("iceni", iceni_prompt)):
                        t0 = time.perf_counter()
                        comp = prov.complete(prompt)
                        dt = time.perf_counter() - t0
                        # Prefer provider-reported usage; estimate when it is absent.
                        out_tok = comp.tokens_out or estimate_tokens(comp.text)
                        in_tok = comp.tokens_in or estimate_tokens(prompt)
                        cost = (in_tok * price.get("in", 0.0)
                                + out_tok * price.get("out", 0.0)) / 1_000_000
                        q = _judge(judge_provider, intent.goal, comp.text) if judge_provider else None
                        if variant == "base":
                            cell.out_tok_base, cell.cost_base, cell.latency_base, cell.quality_base = out_tok, cost, dt, q
                        else:
                            cell.out_tok_iceni, cell.cost_iceni, cell.latency_iceni, cell.quality_iceni = out_tok, cost, dt, q
                except ProviderUnavailable:
                    # Keep the offline measurements for this cell; its
                    # execution fields stay at their defaults.
                    pass
            tr.cells.append(cell)
        results.append(tr)

    return results, {"executed": execute, "judged": judge_provider is not None,
                     "models": models, "pricing": pricing}
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _pct(base: float, new: float) -> float | None:
|
|
195
|
+
if not base:
|
|
196
|
+
return None
|
|
197
|
+
return (base - new) / base
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _swarm_projection(results: list[TaskResult], pricing: dict) -> list[str]:
|
|
201
|
+
"""
|
|
202
|
+
Project inter-agent routing token savings at swarm scale.
|
|
203
|
+
Traditional: each hop relays the full instruction+input.
|
|
204
|
+
ICENI: each hop sends alias name (≈3 tok) + raw input only.
|
|
205
|
+
"""
|
|
206
|
+
hop_data = []
|
|
207
|
+
for tr in results:
|
|
208
|
+
for c in tr.cells:
|
|
209
|
+
trad = c.in_tok_base
|
|
210
|
+
iceni = c.input_tok + ALIAS_ROUTING_TOKENS
|
|
211
|
+
if trad > 0 and trad > iceni:
|
|
212
|
+
hop_data.append((tr.name, c.model, trad, iceni,
|
|
213
|
+
pricing.get(c.model, {"in": 3.0})["in"]))
|
|
214
|
+
|
|
215
|
+
if not hop_data:
|
|
216
|
+
return []
|
|
217
|
+
|
|
218
|
+
avg_trad = statistics.mean(h[2] for h in hop_data)
|
|
219
|
+
avg_iceni = statistics.mean(h[3] for h in hop_data)
|
|
220
|
+
compression = (avg_trad - avg_iceni) / avg_trad if avg_trad else 0
|
|
221
|
+
|
|
222
|
+
lines = ["## Swarm Scale Projection ⚠ HYPOTHESIS — NOT YET MEASURED", ""]
|
|
223
|
+
lines.append(
|
|
224
|
+
f"> **Assumptions (unverified):** (1) agents relay intent to the next hop; "
|
|
225
|
+
f"(2) ICENI path sends alias name (≈{ALIAS_ROUTING_TOKENS} tok) + raw input only; "
|
|
226
|
+
f"(3) traditional path sends the full instruction set + input every hop. "
|
|
227
|
+
f"**This has not been tested in a real multi-agent setup.** "
|
|
228
|
+
f"The model below is a theoretical projection to motivate the multi-agent benchmark."
|
|
229
|
+
)
|
|
230
|
+
lines.append("")
|
|
231
|
+
lines.append(
|
|
232
|
+
f"Theoretical per-hop: traditional **{avg_trad:.0f} tokens** · "
|
|
233
|
+
f"ICENI **{avg_iceni:.0f} tokens** → **{compression*100:.0f}% routing compression**."
|
|
234
|
+
)
|
|
235
|
+
lines.append("")
|
|
236
|
+
lines.append("| agents | msgs/agent/day | trad routing tok (projected) | ICENI routing tok (projected) | projected savings |")
|
|
237
|
+
lines.append("|--:|--:|--:|--:|--:|")
|
|
238
|
+
for n_agents in (5, 10, 25, 50, 100):
|
|
239
|
+
for msgs_per_agent in (100, 1000):
|
|
240
|
+
trad_total = n_agents * msgs_per_agent * avg_trad
|
|
241
|
+
iceni_total = n_agents * msgs_per_agent * avg_iceni
|
|
242
|
+
pct = (trad_total - iceni_total) / trad_total if trad_total else 0
|
|
243
|
+
lines.append(
|
|
244
|
+
f"| {n_agents} | {msgs_per_agent} "
|
|
245
|
+
f"| {trad_total:,.0f} | {iceni_total:,.0f} | {pct*100:.0f}% |"
|
|
246
|
+
)
|
|
247
|
+
lines.append("")
|
|
248
|
+
lines.append(
|
|
249
|
+
"_To validate: build a multi-agent harness where Agent A passes intent to Agent B — _\n"
|
|
250
|
+
"_compare full-prompt relay vs alias-name relay. That data replaces this projection._"
|
|
251
|
+
)
|
|
252
|
+
return lines
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def render_report(results: list[TaskResult], meta: dict) -> str:
    """Render benchmark results as a Markdown report and return it as one string.

    Sections, in order: header, per-cell input-side table, (executed runs
    only) the live execution table and a per-model cost-per-quality table, the
    verdict, an honesty note, and the swarm projection when it has data.

    The verdict is PROCEED when any one threshold in ``THRESH`` is met: token
    or cost reduction, quality gain, or speed gain.  It is only issued for
    executed runs.

    Args:
        results: per-task results.
        meta: must provide ``"models"`` and ``"executed"``; ``"judged"`` and
            ``"pricing"`` are optional.
    """
    cells = [c for tr in results for c in tr.cells]
    pricing = meta.get("pricing", PRICING_DEFAULT)
    lines = ["# ICENI Benchmark Report", ""]
    lines.append(f"Tasks: {len(results)} · Models: {', '.join(meta['models'])} · "
                 f"Mode: {'EXECUTE' if meta['executed'] else 'offline (input-side only)'}"
                 + (" · judged" if meta.get("judged") else ""))
    lines.append("")

    # Per-task input-side table
    lines.append("## Input-side (offline-measurable)")
    lines.append("")
    lines.append("| task | model | base tok | iceni tok | Δtok | format | semantic |")
    lines.append("|---|---|--:|--:|--:|:--:|:--:|")
    for tr in results:
        for c in tr.cells:
            d = _pct(c.in_tok_base, c.in_tok_iceni)
            dstr = f"{d*100:+.0f}%" if d is not None else "n/a"
            sem = f"{c.semantic:.2f}" if c.semantic is not None else "—"
            lines.append(f"| {tr.name} | {c.model} | {c.in_tok_base} | {c.in_tok_iceni} | {dstr} "
                         f"| {'✓' if c.fmt >= 1 else '·'} | {sem} |")
    lines.append("")

    if meta["executed"]:
        lines.append("## Execution (live)")
        lines.append("")
        lines.append("| task | model | out base→iceni | cost base→iceni ($) | latency (s) | quality 0-100 |")
        lines.append("|---|---|--:|--:|--:|--:|")
        for tr in results:
            for c in tr.cells:
                q = (f"{c.quality_base:.0f}→{c.quality_iceni:.0f}"
                     if c.quality_base is not None and c.quality_iceni is not None else "—")
                lines.append(f"| {tr.name} | {c.model} | {c.out_tok_base}→{c.out_tok_iceni} "
                             f"| {c.cost_base:.5f}→{c.cost_iceni:.5f} "
                             f"| {c.latency_base:.2f}→{c.latency_iceni:.2f} | {q} |")
        lines.append("")

        # Cost-per-quality (GPT's key metric: pay less, get more).
        # A judge score of 0 is a real measurement, so quality is tested with
        # "is not None"; cost must be non-zero because it is the divisor.
        by_model: dict[str, list] = {}
        for c in cells:
            if c.quality_base is None or c.quality_iceni is None:
                continue
            if not c.cost_base or not c.cost_iceni:
                continue
            by_model.setdefault(c.model, []).append(
                (c.quality_base / c.cost_base, c.quality_iceni / c.cost_iceni))
        if by_model:
            lines.append("## Cost per Quality Point (GPT's metric)")
            lines.append("")
            lines.append("Quality points per dollar — higher = better value. "
                         "ICENI wins if it delivers more quality for the same cost even if tokens go up.")
            lines.append("")
            lines.append("| model | quality/$ base | quality/$ iceni | Δ |")
            lines.append("|---|--:|--:|--:|")
            for model, pairs in sorted(by_model.items()):
                avg_b = statistics.mean(b for b, _ in pairs)
                avg_i = statistics.mean(i for _, i in pairs)
                # _pct measures reduction; negate it so a positive Δ means
                # ICENI delivers more quality per dollar.
                delta = _pct(avg_b, avg_i)
                delta_str = f"{-delta*100:+.0f}%" if delta is not None else "n/a"
                lines.append(f"| {model} | {avg_b:,.0f} | {avg_i:,.0f} | {delta_str} |")
            lines.append("")

    # ---- aggregates + verdict ----
    in_deltas = [d for c in cells if (d := _pct(c.in_tok_base, c.in_tok_iceni)) is not None]
    fmt_ok = statistics.mean([c.fmt for c in cells]) if cells else 0.0
    sem_vals = [c.semantic for c in cells if c.semantic is not None]

    lines.append("## Verdict")
    lines.append("")
    avg_in_delta = statistics.mean(in_deltas) if in_deltas else None
    if avg_in_delta is not None:
        lines.append(f"- Input-token Δ (ICENI vs baseline): **{avg_in_delta*100:+.0f}%** "
                     f"(positive = ICENI uses fewer)")
    lines.append(f"- Format-appropriateness: **{fmt_ok*100:.0f}%** of renders match the model's preferred shape")
    if sem_vals:
        lines.append(f"- Semantic preservation (render↔baseline): **{statistics.mean(sem_vals):.2f}** "
                     f"(target ≥ 0.90)")

    proceed = False
    if meta["executed"]:
        cost_deltas = [d for c in cells if (d := _pct(c.cost_base, c.cost_iceni)) is not None]
        speed_deltas = [d for c in cells if (d := _pct(c.latency_base, c.latency_iceni)) is not None]
        # Relative quality gain needs a non-zero baseline score to divide by.
        quality_gains = [(c.quality_iceni - c.quality_base) / c.quality_base
                         for c in cells
                         if c.quality_base and c.quality_iceni is not None]
        if cost_deltas:
            lines.append(f"- Cost Δ: **{statistics.mean(cost_deltas)*100:+.0f}%**")
        if speed_deltas:
            lines.append(f"- Speed Δ: **{statistics.mean(speed_deltas)*100:+.0f}%**")
        if quality_gains:
            lines.append(f"- Quality Δ: **{statistics.mean(quality_gains)*100:+.0f}%** "
                         f"(positive = ICENI scores higher)")
        iceni_q = [c.quality_iceni for c in cells if c.quality_iceni is not None]
        if len(iceni_q) > 1:
            lines.append(f"- Cross-model quality spread (ICENI): **±{statistics.pstdev(iceni_q):.1f}** "
                         f"(lower = more consistent across models)")
        tok_hit = avg_in_delta is not None and avg_in_delta >= THRESH["token_reduction"]
        # Cost shares the token-reduction threshold ("token or cost").
        cost_hit = bool(cost_deltas and statistics.mean(cost_deltas) >= THRESH["token_reduction"])
        quality_hit = bool(quality_gains and statistics.mean(quality_gains) >= THRESH["quality_gain"])
        speed_hit = bool(speed_deltas and statistics.mean(speed_deltas) >= THRESH["speed_gain"])
        proceed = tok_hit or cost_hit or quality_hit or speed_hit
        lines.append("")
        lines.append(f"### → {'PROCEED' if proceed else 'WEAK / PIVOT'} "
                     f"(GPT thresholds: ≥{THRESH['token_reduction']*100:.0f}% token or cost, "
                     f"≥{THRESH['quality_gain']*100:.0f}% quality, "
                     f"or ≥{THRESH['speed_gain']*100:.0f}% speed)")
    else:
        lines.append("")
        lines.append("### → run with `--execute` (+ API keys) for the cost/quality/speed verdict")

    lines.append("")
    lines.append("_Honesty note: Claude XML renders cost more input tokens but may return higher quality. "
                 "The definitive metric is cost-per-quality-point, not raw token count._")
    lines.append("")

    # Swarm projection (always shown — pure math, no API calls)
    swarm_lines = _swarm_projection(results, pricing)
    if swarm_lines:
        lines.extend(swarm_lines)

    return "\n".join(lines)
|
iceni/calibration.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Render a model-agnostic Intent into a per-model prompt.
|
|
2
|
+
|
|
3
|
+
v0.1 uses a deterministic, offline template renderer keyed on model family and
|
|
4
|
+
the intent's own style_hints. This deliberately needs NO API call, so `compare`
|
|
5
|
+
and `run --preview` demonstrate cross-model calibration (value-case V1) even
|
|
6
|
+
without keys. v0.2 swaps in an LLM-backed calibrator (PromptBridge-style) behind
|
|
7
|
+
the same `render()` signature.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from .intent import Intent
|
|
12
|
+
|
|
13
|
+
KNOWN_MODELS = ("claude", "gpt", "kimi")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def render(intent: Intent, model: str) -> str:
    """Render *intent* as a prompt shaped for *model*'s family.

    The family is chosen by case-insensitive substring match on *model*,
    checked in the order claude, gpt, kimi.  Any other model gets the generic
    plain-text rendering.
    """
    family = model.lower()
    # Order matters: the first key found in the model name wins.
    renderers = (("claude", _claude), ("gpt", _gpt), ("kimi", _kimi))
    for key, renderer in renderers:
        if key in family:
            return renderer(intent)
    return _generic(intent)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _hint(intent: Intent, model_family: str) -> str:
|
|
28
|
+
return str(intent.style_hints.get(model_family, "")).strip()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _claude(i: Intent) -> str:
    """Render the intent as XML-style tagged sections, one per line.

    Sections appear in the order task, inputs, constraints, output_format,
    note; any section whose source data is empty is left out.  The output
    format is the ``"format"`` entry of ``outputs`` when truthy, otherwise the
    comma-joined output keys.
    """
    # Claude responds well to XML-structured instructions.
    blocks = [f"<task>{i.goal}</task>"]
    if i.inputs:
        items = "\n".join(f" <input>{x}</input>" for x in i.inputs)
        blocks.append(f"<inputs>\n{items}\n</inputs>")
    if i.constraints:
        rules = "\n".join(f" <constraint>{c}</constraint>" for c in i.constraints)
        blocks.append(f"<constraints>\n{rules}\n</constraints>")
    if i.outputs:
        fmt = i.outputs.get("format") or ", ".join(i.outputs.keys())
        blocks.append(f"<output_format>{fmt}</output_format>")
    note = _hint(i, "claude")
    if note:
        blocks.append(f"<note>{note}</note>")
    return "\n".join(blocks)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _gpt(i: Intent) -> str:
    """Render the intent as concise Markdown with bold labels and bullet lists.

    Sections appear in the order Task, Inputs, Requirements, Output, then the
    optional gpt style hint; empty sections are left out and the result is
    stripped of leading and trailing whitespace.
    """
    # GPT works well with concise markdown.
    out = [f"**Task:** {i.goal}", ""]
    if i.inputs:
        out.append("**Inputs:**")
        out.extend(f"- {x}" for x in i.inputs)
        out.append("")
    if i.constraints:
        out.append("**Requirements:**")
        out.extend(f"- {c}" for c in i.constraints)
        out.append("")
    if i.outputs:
        fmt = i.outputs.get("format") or ", ".join(i.outputs.keys())
        out.append(f"**Output:** {fmt}")
    extra = _hint(i, "gpt")
    if extra:
        out.extend(("", extra))
    return "\n".join(out).strip()
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _kimi(i: Intent) -> str:
    """Render the intent as one line of short imperative sentences.

    The goal is normalised to end in exactly one period.  Inputs and
    constraints are joined with "; ", and the optional kimi style hint is
    appended verbatim.  Sentences are separated by single spaces.
    """
    # Kimi: direct, compact imperative.
    sentences = [i.goal.rstrip(".") + "."]
    if i.inputs:
        sentences.append(f"Inputs: {'; '.join(i.inputs)}.")
    if i.constraints:
        sentences.append(f"Constraints: {'; '.join(i.constraints)}.")
    if i.outputs:
        fmt = i.outputs.get("format") or ", ".join(i.outputs.keys())
        sentences.append(f"Return: {fmt}.")
    extra = _hint(i, "kimi")
    if extra:
        sentences.append(extra)
    return " ".join(sentences)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _generic(i: Intent) -> str:
|
|
80
|
+
out = [i.goal]
|
|
81
|
+
if i.inputs:
|
|
82
|
+
out.append("Inputs: " + "; ".join(i.inputs))
|
|
83
|
+
if i.constraints:
|
|
84
|
+
out.append("Constraints: " + "; ".join(i.constraints))
|
|
85
|
+
if i.outputs:
|
|
86
|
+
out.append("Output: " + (i.outputs.get("format") or ", ".join(i.outputs.keys())))
|
|
87
|
+
return "\n".join(out)
|