simplicio-prompt 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +173 -0
- package/YOOL_TUPLE_HAMT.md +1149 -0
- package/adopters.md +24 -0
- package/benchmarks/generate_prompt_benchmark_pdf.py +355 -0
- package/benchmarks/generate_v2_benchmark_pdf.py +302 -0
- package/benchmarks/prompt_vs_normal.py +431 -0
- package/benchmarks/prompt_vs_normal_benchmark.pdf +124 -0
- package/benchmarks/prompt_vs_normal_results.md +148 -0
- package/benchmarks/v2_safe_speed_benchmark.pdf +118 -0
- package/benchmarks/v2_safe_speed_benchmark.py +626 -0
- package/benchmarks/v2_safe_speed_results.json +446 -0
- package/benchmarks/v2_safe_speed_results.md +96 -0
- package/docs/assets/simplicio-prompt-hero.png +0 -0
- package/docs/assets/yool-v2-safe-speed-infographic-en.png +0 -0
- package/docs/assets/yool-v2-safe-speed-infographic-pt.png +0 -0
- package/examples/node/build-catalog.mjs +70 -0
- package/examples/python/minimal_bus.py +134 -0
- package/examples/python/receipts.py +152 -0
- package/guardrails/cpu_throttle.py +119 -0
- package/guardrails/disk_gc.py +212 -0
- package/kernel/README.md +82 -0
- package/kernel/yool_tuple_kernel.py +1109 -0
- package/kernel-implementation-request.md +38 -0
- package/package.json +40 -0
- package/prompts/agent-runtime-execution-prompt.md +119 -0
- package/prompts/legacy-tuple-space-engine-prompt.md +36 -0
|
@@ -0,0 +1,626 @@
|
|
|
1
|
+
"""Benchmark V2 safe-speed runtime vs V1 and normal instructions.
|
|
2
|
+
|
|
3
|
+
The benchmark stays local and deterministic enough for documentation:
|
|
4
|
+
|
|
5
|
+
- normal instruction: flat planning, sequential/repeated work, no runtime guardrails;
|
|
6
|
+
- V1 runtime: lazy batch_spawn and fixed LaneWorkerPool fan-out;
|
|
7
|
+
- V2 runtime: V1 plus adaptive lanes, receipt/input cache, batching, provider
|
|
8
|
+
circuit breakers, local routing, and context compression.
|
|
9
|
+
|
|
10
|
+
It does not call hosted LLMs or external APIs.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import hashlib
|
|
17
|
+
import json
|
|
18
|
+
import math
|
|
19
|
+
import statistics
|
|
20
|
+
import sys
|
|
21
|
+
import time
|
|
22
|
+
import tracemalloc
|
|
23
|
+
from dataclasses import asdict, dataclass
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Any, Callable
|
|
26
|
+
|
|
27
|
+
# Directory two levels above this file. It is pushed to the front of sys.path
# so the ``kernel`` package imported just below resolves when this file is run
# directly as a script.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
|
|
29
|
+
|
|
30
|
+
from kernel.yool_tuple_kernel import ( # noqa: E402
|
|
31
|
+
CircuitOpenError,
|
|
32
|
+
LaneWorkerPool,
|
|
33
|
+
RuntimePolicy,
|
|
34
|
+
TupleSpace,
|
|
35
|
+
YoolTuple,
|
|
36
|
+
build_default_space,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# Default locations of the two generated report artefacts under ROOT.
OUT_JSON = ROOT.joinpath("benchmarks", "v2_safe_speed_results.json")
OUT_MD = ROOT.joinpath("benchmarks", "v2_safe_speed_results.md")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class ProfileResult:
    """One benchmark row: a single scenario executed under a single profile.

    ``scenario``, ``profile``, ``wall_ms``, ``tasks`` and ``peak_kb`` are
    required; the remaining counters default to zero (``notes`` to an empty
    string) so each workload only fills in the ones it measures.
    """

    scenario: str
    profile: str
    wall_ms: float
    tasks: int
    peak_kb: float
    provider_calls: int = 0
    cache_hits: int = 0
    blocked_calls: int = 0
    total_agents: int = 0
    virtual_agents: int = 0
    tokens: int = 0
    notes: str = ""

    @property
    def throughput_tasks_s(self) -> float:
        """Return tasks per second, or ``0.0`` when ``wall_ms`` is not positive."""
        return 0.0 if self.wall_ms <= 0 else self.tasks / (self.wall_ms / 1000)

    def to_dict(self) -> dict[str, Any]:
        """Return the dataclass fields as a dict plus ``throughput_tasks_s``.

        The derived throughput is appended after the declared fields.
        """
        return {**asdict(self), "throughput_tasks_s": self.throughput_tasks_s}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def measure(fn: Callable[[], ProfileResult]) -> ProfileResult:
    """Run ``fn`` under ``tracemalloc`` and record its peak traced memory.

    Args:
        fn: Zero-argument callable that performs the workload and returns its
            result row.

    Returns:
        The object returned by ``fn``, with ``peak_kb`` overwritten by the
        peak traced allocation size divided by 1024.

    Tracing is stopped on every exit path, including when ``fn`` raises.
    """
    tracemalloc.start()
    try:
        outcome = fn()
        # get_traced_memory() returns (current, peak); only the peak is kept.
        peak_bytes = tracemalloc.get_traced_memory()[1]
        outcome.peak_kb = peak_bytes / 1024
        return outcome
    finally:
        tracemalloc.stop()
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def estimate_tokens(text: str) -> int:
    """Estimate a token count as one token per four UTF-8 bytes, rounded up.

    The result is never below 1, so an empty string still counts as one token.
    """
    whole, remainder = divmod(len(text.encode("utf-8")), 4)
    rounded_up = whole + (1 if remainder else 0)
    return rounded_up if rounded_up > 1 else 1
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def simulated_work(index: int, sleep_ms: float) -> str:
    """Stand in for one unit of work: optionally sleep, then hash the index.

    Args:
        index: Task index; it determines the returned digest.
        sleep_ms: Delay in milliseconds; no sleep happens unless it is positive.

    Returns:
        The 16-character hex BLAKE2b digest (8 bytes) of ``"task:<index>"``.
    """
    if sleep_ms > 0:
        time.sleep(sleep_ms / 1000)
    hasher = hashlib.blake2b(digest_size=8)
    hasher.update(f"task:{index}".encode("utf-8"))
    return hasher.hexdigest()
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def normal_flat_scale(total_agents: int) -> ProfileResult:
    """Baseline for the scale scenario: eagerly build one dict per agent.

    Args:
        total_agents: Number of agent records to materialise in a flat list.

    Returns:
        A ``scale_representation`` row for the "normal instruction" profile;
        only the list construction is inside the timed region.
    """

    def run() -> ProfileResult:
        started = time.perf_counter()
        flat_agents = [
            {"id": agent_id, "lane": "flat"} for agent_id in range(total_agents)
        ]
        elapsed_ms = (time.perf_counter() - started) * 1000
        return ProfileResult(
            "scale_representation",
            "normal instruction",
            elapsed_ms,
            total_agents,
            0.0,  # peak_kb placeholder; measure() overwrites it
            total_agents=len(flat_agents),
            notes="flat list materialization",
        )

    return measure(run)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def lazy_scale(profile: str, target_agents: int, branching: int = 32) -> ProfileResult:
    """Measure a lazy ``batch_spawn`` that covers at least ``target_agents``.

    The tree depth is the smallest ``depth >= 1`` for which
    ``branching ** depth >= target_agents``. Only the ``batch_spawn`` call is
    inside the timed region.

    Args:
        profile: Profile label stored on the result row.
        target_agents: Minimum number of virtual agents the tree must cover.
        branching: Fan-out per level, forwarded to ``batch_spawn``.

    Returns:
        A ``scale_representation`` row whose ``tasks`` comes from the spawn
        receipt and whose agent counts come from the space snapshot.

    Raises:
        ValueError: If ``branching`` is below 2 while ``target_agents`` is
            larger than it, because the depth search could not be relied on
            to reach the target.
    """
    if branching < 2 and target_agents > branching:
        # A fan-out of 0 or 1 never grows the running product (and negative
        # values make it oscillate), so the loop below would spin forever.
        raise ValueError(
            f"branching must be >= 2 to reach {target_agents} agents, got {branching}"
        )

    depth = 1
    virtual = branching
    while virtual < target_agents:
        depth += 1
        virtual *= branching

    def run() -> ProfileResult:
        space, root = build_default_space()
        t0 = time.perf_counter()
        receipt = space.batch_spawn(
            root,
            "prompt_worker",
            depth=depth,
            branching=branching,
            compression_threshold=1024,
        )
        wall_ms = (time.perf_counter() - t0) * 1000
        snapshot = space.snapshot()
        return ProfileResult(
            scenario="scale_representation",
            profile=profile,
            wall_ms=wall_ms,
            tasks=receipt.virtual_agents,
            peak_kb=0.0,  # placeholder; measure() overwrites it
            total_agents=snapshot["total_agents"],
            virtual_agents=snapshot["virtual_agents"],
            notes=f"lazy batch_spawn depth={depth}, branching={branching}",
        )

    return measure(run)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def normal_sequential(tasks: int, sleep_ms: float) -> ProfileResult:
    """Baseline for active execution: run every task one after another.

    Args:
        tasks: Number of simulated work items.
        sleep_ms: Per-item delay forwarded to ``simulated_work``.

    Returns:
        An ``active_execution`` row for the "normal instruction" profile with
        one provider call and one agent counted per task.
    """

    def run() -> ProfileResult:
        started = time.perf_counter()
        for task_index in range(tasks):
            simulated_work(task_index, sleep_ms)
        elapsed_ms = (time.perf_counter() - started) * 1000
        return ProfileResult(
            "active_execution",
            "normal instruction",
            elapsed_ms,
            tasks,
            0.0,  # peak_kb placeholder; measure() overwrites it
            provider_calls=tasks,
            total_agents=tasks,
            notes="sequential execution",
        )

    return measure(run)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def lane_execution(
    profile: str,
    tasks: int,
    sleep_ms: float,
    *,
    max_lane_concurrency: int,
) -> ProfileResult:
    """Run ``tasks`` agents through the "exec" lane of a ``LaneWorkerPool``.

    Args:
        profile: Profile label stored on the result row.
        tasks: Number of ``prompt_worker`` agents spawned under the root.
        sleep_ms: Per-item delay forwarded to ``simulated_work``.
        max_lane_concurrency: Lane ceiling handed to ``RuntimePolicy``; the
            base ``lane_concurrency`` is always 32.

    Returns:
        An ``active_execution`` row. Agent spawning happens before the timer
        starts; pool construction and ``run_lane`` are inside it.
    """

    def run() -> ProfileResult:
        runtime_policy = RuntimePolicy(
            lane_concurrency=32,
            max_lane_concurrency=max_lane_concurrency,
            queue_maxsize=8192,
        )
        tuple_space = TupleSpace(policy=runtime_policy)
        kernel_root = YoolTuple("kernel_root", (0,), "root", "main", "benchmark")
        tuple_space.out_tuple(kernel_root)
        for task_index in range(tasks):
            tuple_space.spawn_agent(
                kernel_root, "prompt_worker", {"index": task_index, "lane": "exec"}
            )

        executed = 0

        def executor(tup: YoolTuple) -> str:
            # NOTE(review): if run_lane invokes this from several threads the
            # increment is not atomic — confirm against LaneWorkerPool.
            nonlocal executed
            executed += 1
            return simulated_work(int(tup.data["index"]), sleep_ms)

        started = time.perf_counter()
        LaneWorkerPool(tuple_space, policy=runtime_policy).run_lane(
            "exec", executor, use_cache=False
        )
        elapsed_ms = (time.perf_counter() - started) * 1000
        totals = tuple_space.snapshot()
        return ProfileResult(
            scenario="active_execution",
            profile=profile,
            wall_ms=elapsed_ms,
            tasks=tasks,
            peak_kb=0.0,  # placeholder; measure() overwrites it
            provider_calls=executed,
            total_agents=totals["total_agents"],
            notes=f"lane_concurrency=32, max_lane_concurrency={max_lane_concurrency}",
        )

    return measure(run)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def cache_workload(
    profile: str, tasks: int, unique_inputs: int, sleep_ms: float
) -> ProfileResult:
    """Execute ``tasks`` LLM-call tuples drawn from ``unique_inputs`` payloads.

    Input ids cycle as ``index % unique_inputs``. Only the "V2 safe-speed"
    profile passes ``use_cache=True`` to ``execute_tuple``; every other
    profile runs with the cache disabled.

    Args:
        profile: Profile label; also selects whether the cache is used.
        tasks: Number of tuples to execute.
        unique_inputs: Number of distinct ``input_id`` values.
        sleep_ms: Per-call delay forwarded to ``simulated_work``.

    Returns:
        A ``cache_dedupe`` row with the executor call count and the ``hits``
        counter read from the space snapshot's ``cache`` entry.
    """
    cache_enabled = profile == "V2 safe-speed"

    def run() -> ProfileResult:
        executed = 0
        tuple_space = TupleSpace(policy=RuntimePolicy(cache_ttl_s=3600))

        def executor(tup: YoolTuple) -> str:
            nonlocal executed
            executed += 1
            return simulated_work(int(tup.data["input_id"]), sleep_ms)

        started = time.perf_counter()
        for task_index in range(tasks):
            request = YoolTuple(
                "llm_call",
                (task_index,),
                "root",
                "llm",
                "benchmark",
                {"provider": "claude", "input_id": task_index % unique_inputs},
            )
            tuple_space.execute_tuple(request, executor, use_cache=cache_enabled)
        elapsed_ms = (time.perf_counter() - started) * 1000
        cache_stats = tuple_space.snapshot()["cache"]
        return ProfileResult(
            scenario="cache_dedupe",
            profile=profile,
            wall_ms=elapsed_ms,
            tasks=tasks,
            peak_kb=0.0,  # placeholder; measure() overwrites it
            provider_calls=executed,
            cache_hits=cache_stats["hits"],
            notes=f"{unique_inputs} unique inputs repeated across {tasks} tasks",
        )

    return measure(run)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def batching_workload(profile: str, tasks: int, batch_size: int) -> ProfileResult:
    """Compare one call per small task against batched lane execution.

    The simulated cost model is a fixed 1.0 ms per call plus 0.03 ms per item.
    The "V2 safe-speed" profile runs the items through
    ``LaneWorkerPool.run_lane_batched``; every other profile pays the per-call
    cost once per task.

    Args:
        profile: Profile label; also selects the batched or unbatched path.
        tasks: Number of small tasks.
        batch_size: Value passed as ``batch_small_task_size`` (batched path only).

    Returns:
        A ``small_task_batching`` row with the number of simulated calls made.
    """
    call_overhead_ms = 1.0
    item_cost_ms = 0.03

    def run_v1() -> ProfileResult:
        issued = 0
        started = time.perf_counter()
        for task_index in range(tasks):
            issued += 1
            time.sleep((call_overhead_ms + item_cost_ms) / 1000)
            simulated_work(task_index, 0)
        elapsed_ms = (time.perf_counter() - started) * 1000
        return ProfileResult(
            scenario="small_task_batching",
            profile=profile,
            wall_ms=elapsed_ms,
            tasks=tasks,
            peak_kb=0.0,  # placeholder; measure() overwrites it
            provider_calls=issued,
            notes="one provider-sized call per small task",
        )

    def run_v2() -> ProfileResult:
        issued = 0
        runtime_policy = RuntimePolicy(
            batch_small_task_size=batch_size, lane_concurrency=32
        )
        tuple_space = TupleSpace(policy=runtime_policy)
        kernel_root = YoolTuple("kernel_root", (0,), "root", "main", "benchmark")
        tuple_space.out_tuple(kernel_root)
        for task_index in range(tasks):
            tuple_space.spawn_agent(
                kernel_root, "small_task", {"index": task_index, "lane": "batch"}
            )

        def batch_executor(items: list[YoolTuple]) -> list[str]:
            # One fixed call overhead per batch, plus the per-item cost.
            nonlocal issued
            issued += 1
            time.sleep((call_overhead_ms + item_cost_ms * len(items)) / 1000)
            return [simulated_work(int(item.data["index"]), 0) for item in items]

        started = time.perf_counter()
        worker_pool = LaneWorkerPool(tuple_space, policy=runtime_policy)
        worker_pool.run_lane_batched("batch", batch_executor)
        elapsed_ms = (time.perf_counter() - started) * 1000
        return ProfileResult(
            scenario="small_task_batching",
            profile=profile,
            wall_ms=elapsed_ms,
            tasks=tasks,
            peak_kb=0.0,  # placeholder; measure() overwrites it
            provider_calls=issued,
            notes=f"batch_size={batch_size}",
        )

    if profile == "V2 safe-speed":
        return measure(run_v2)
    return measure(run_v1)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def circuit_breaker_workload(profile: str, tasks: int) -> ProfileResult:
    """Count provider attempts during a simulated outage.

    Every call fails. Profiles other than "V2 safe-speed" simply count
    ``retries + 1`` attempts per task. The "V2 safe-speed" profile routes each
    task through ``TupleSpace.call_with_backoff`` and counts how many tasks
    end in ``CircuitOpenError``.

    Args:
        profile: Profile label; also selects the guarded or unguarded path.
        tasks: Number of logical provider requests.

    Returns:
        A ``provider_failure_control`` row with attempt and blocked counts.
    """
    retries = 2

    def run_v1() -> ProfileResult:
        attempted = 0
        started = time.perf_counter()
        for _task in range(tasks):
            for _try in range(retries + 1):
                attempted += 1
        elapsed_ms = (time.perf_counter() - started) * 1000
        return ProfileResult(
            scenario="provider_failure_control",
            profile=profile,
            wall_ms=elapsed_ms,
            tasks=tasks,
            peak_kb=0.0,  # placeholder; measure() overwrites it
            provider_calls=attempted,
            blocked_calls=0,
            notes="no provider circuit breaker",
        )

    def run_v2() -> ProfileResult:
        attempted = 0
        rejected = 0
        tuple_space = TupleSpace(
            policy=RuntimePolicy(
                api_max_retries=retries,
                api_backoff_base_ms=1,
                circuit_failure_threshold=3,
                circuit_cooldown_s=60,
            )
        )

        def failing_call() -> str:
            nonlocal attempted
            attempted += 1
            raise TimeoutError("simulated provider outage")

        started = time.perf_counter()
        for _task in range(tasks):
            try:
                # The no-op sleep_fn keeps backoff delays out of the timing.
                tuple_space.call_with_backoff(
                    "llm-provider", failing_call, sleep_fn=lambda _s: None
                )
            except CircuitOpenError:
                rejected += 1
            except TimeoutError:
                # The simulated outage itself is expected; only breaker
                # rejections are counted.
                continue
        elapsed_ms = (time.perf_counter() - started) * 1000
        return ProfileResult(
            scenario="provider_failure_control",
            profile=profile,
            wall_ms=elapsed_ms,
            tasks=tasks,
            peak_kb=0.0,  # placeholder; measure() overwrites it
            provider_calls=attempted,
            blocked_calls=rejected,
            notes="breaker opens after 3 provider failures",
        )

    if profile == "V2 safe-speed":
        return measure(run_v2)
    return measure(run_v1)
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def context_compression_workload(profile: str, chars: int) -> ProfileResult:
    """Estimate the token size of an LLM-call payload with a large context.

    The context is ``chars`` repetitions of ``"A"``. For the "V2 safe-speed"
    profile ``TupleSpace.compress_context`` is applied to the tuple before its
    data is serialised; other profiles serialise the raw payload.

    Args:
        profile: Profile label; also selects whether compression is applied.
        chars: Length of the synthetic context string.

    Returns:
        A ``context_compression`` row whose ``tokens`` is the estimate for the
        JSON-serialised tuple data.
    """
    template = {
        "provider": "claude",
        "context": "A" * chars,
        "prompt": "Implement X",
    }
    compress = profile == "V2 safe-speed"

    def run() -> ProfileResult:
        # Copy the template so each run starts from an untouched payload.
        request = YoolTuple("llm_call", (0,), "root", "llm", "benchmark", dict(template))
        tuple_space = TupleSpace()
        started = time.perf_counter()
        if compress:
            tuple_space.compress_context(request)
        encoded = json.dumps(request.data, sort_keys=True, ensure_ascii=False)
        elapsed_ms = (time.perf_counter() - started) * 1000
        return ProfileResult(
            scenario="context_compression",
            profile=profile,
            wall_ms=elapsed_ms,
            tasks=1,
            peak_kb=0.0,  # placeholder; measure() overwrites it
            tokens=estimate_tokens(encoded),
            notes=f"{chars} char context",
        )

    return measure(run)
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def run_suite() -> dict[str, Any]:
    """Run every scenario for every applicable profile and summarise the rows.

    Rows are produced scenario by scenario, in the order normal, V1, V2 where
    a scenario has all three profiles.

    Returns:
        The payload built by ``summarise`` from the collected rows.
    """
    normal = "normal instruction"
    v1 = "V1 high-throughput"
    v2 = "V2 safe-speed"
    every_profile = (normal, v1, v2)

    rows = [
        normal_flat_scale(131_072),
        lazy_scale(v1, 1_048_576),
        lazy_scale(v2, 1_048_576),
        normal_sequential(1024, 5.0),
        lane_execution(v1, 1024, 5.0, max_lane_concurrency=32),
        lane_execution(v2, 1024, 5.0, max_lane_concurrency=64),
    ]
    rows.extend(cache_workload(name, 256, 64, 1.0) for name in every_profile)
    rows.extend(batching_workload(name, 512, 32) for name in every_profile)
    rows.extend(circuit_breaker_workload(name, 64) for name in every_profile)
    rows.extend(
        context_compression_workload(name, 20_000) for name in every_profile
    )
    return summarise(rows)
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def summarise(results: list[ProfileResult]) -> dict[str, Any]:
    """Build the report payload: metadata, raw rows and pairwise gains.

    Args:
        results: Measured rows; each fixed comparison below needs the rows for
            both of its (scenario, profile) pairs to be present.

    Returns:
        A dict with ``title``, ``run_date``, ``environment``, the serialised
        ``results``, the ``comparisons`` list and ``median_wall_ms``.

    Raises:
        KeyError: If a row required by one of the comparisons is missing.
    """
    normal = "normal instruction"
    v1 = "V1 high-throughput"
    v2 = "V2 safe-speed"
    # If a (scenario, profile) pair occurs more than once, the last row wins.
    row_for = {(row.scenario, row.profile): row for row in results}

    def gain(
        scenario: str,
        baseline: str,
        improved: str,
        *,
        value: str = "wall_ms",
        lower_is_better: bool = True,
    ) -> dict[str, Any]:
        """Compare one metric between two profiles of the same scenario."""
        base_row = row_for[(scenario, baseline)]
        new_row = row_for[(scenario, improved)]
        before = getattr(base_row, value)
        after = getattr(new_row, value)
        if lower_is_better:
            # A zero improved value has no finite ratio; report it as None.
            ratio = before / after if after else None
            delta = before - after
        else:
            ratio = after / before if before else None
            delta = after - before
        percent = (delta / before * 100) if before else 0.0
        return {
            "scenario": scenario,
            "baseline": baseline,
            "improved": improved,
            "metric": value,
            "baseline_value": before,
            "improved_value": after,
            "ratio": ratio,
            "percent": percent,
        }

    # (scenario, baseline, improved, metric) in report order.
    plan = (
        ("scale_representation", normal, v2, "wall_ms"),
        ("active_execution", normal, v1, "wall_ms"),
        ("active_execution", normal, v2, "wall_ms"),
        ("active_execution", v1, v2, "wall_ms"),
        ("cache_dedupe", normal, v2, "wall_ms"),
        ("cache_dedupe", normal, v2, "provider_calls"),
        ("cache_dedupe", v1, v2, "wall_ms"),
        ("cache_dedupe", v1, v2, "provider_calls"),
        ("small_task_batching", normal, v2, "wall_ms"),
        ("small_task_batching", normal, v2, "provider_calls"),
        ("small_task_batching", v1, v2, "wall_ms"),
        ("small_task_batching", v1, v2, "provider_calls"),
        ("provider_failure_control", normal, v2, "provider_calls"),
        ("provider_failure_control", v1, v2, "provider_calls"),
        ("context_compression", normal, v2, "tokens"),
        ("context_compression", v1, v2, "tokens"),
    )
    comparisons = [
        gain(scenario, baseline, improved, value=metric)
        for scenario, baseline, improved, metric in plan
    ]

    environment = {
        "python": sys.version.split()[0],
        "repository": "wesleysimplicio/simplicio-prompt",
        "branch": "codex/lane-concurrency-runtime",
        "v1_definition": "high-throughput runtime with fixed lane ceiling and safe-speed controls disabled",
        "v2_definition": "V1 plus cache, adaptive lanes, backoff, circuit breaker, batching, context compression, local routing, and idempotent speculation",
    }
    return {
        "title": "Yool Safe-Speed Benchmark V2",
        "run_date": "2026-05-21",
        "environment": environment,
        "results": [row.to_dict() for row in results],
        "comparisons": comparisons,
        "median_wall_ms": statistics.median(row.wall_ms for row in results),
    }
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
def write_markdown(payload: dict[str, Any], path: Path) -> None:
    """Render the benchmark payload as a Markdown report and write it to ``path``.

    The report has a fixed introduction, one table per scenario, a "Gains"
    table built from ``payload["comparisons"]``, and fixed interpretation and
    reproduction notes.

    Args:
        payload: Report payload with ``results`` (row dicts) and
            ``comparisons`` (gain dicts). ``title`` and ``run_date`` are used
            for the heading when present.
        path: Destination file; written as UTF-8, replacing existing content.

    Raises:
        KeyError: If ``results`` or ``comparisons`` is missing from ``payload``.
    """
    results = payload["results"]
    comparisons = payload["comparisons"]
    # Read the heading values from the payload so the Markdown cannot drift
    # from the JSON artefact; the literals are the fallback for payloads that
    # omit them.
    title = payload.get("title", "Yool Safe-Speed Benchmark V2")
    run_date = payload.get("run_date", "2026-05-21")

    def rows_for(scenario: str) -> list[dict[str, Any]]:
        return [item for item in results if item["scenario"] == scenario]

    lines = [
        f"# {title}",
        "",
        f"Run date: {run_date}",
        "",
        "This report compares three execution styles:",
        "",
        "- Normal instruction: generic prompt, flat or repeated work, no runtime guardrails.",
        "- V1 high-throughput: lazy `batch_spawn` and fixed `LaneWorkerPool` fan-out.",
        "- V2 safe-speed: V1 plus cache, adaptive lanes, backoff, provider circuit breaker, batching, context compression, local routing, and idempotent speculation.",
        "",
        "The benchmark is local. It does not call hosted LLMs or external APIs.",
        "",
    ]

    # (section heading, scenario key) in report order.
    sections = (
        ("Scale Representation", "scale_representation"),
        ("Active Execution", "active_execution"),
        ("Cache Dedupe", "cache_dedupe"),
        ("Small Task Batching", "small_task_batching"),
        ("Provider Failure Control", "provider_failure_control"),
        ("Context Compression", "context_compression"),
    )
    for heading, scenario in sections:
        lines.extend(_scenario_table(heading, rows_for(scenario)))

    lines.extend(["## Gains", ""])
    lines.append("| Scenario | Baseline | Improved | Metric | Ratio | Gain |")
    lines.append("|---|---|---|---|---:|---:|")
    for item in comparisons:
        # A ratio of None means the divisor was zero; show it as "n/a".
        ratio = "n/a" if item["ratio"] is None else f"{item['ratio']:.2f}x"
        row = {**item, "ratio_label": ratio}
        lines.append(
            "| {scenario} | {baseline} | {improved} | {metric} | {ratio_label} | {percent:.2f}% |".format(
                **row,
            )
        )

    lines.extend(
        [
            "",
            "## Interpretation",
            "",
            "- V2 keeps the V1 lazy million-agent scale model.",
            "- V2 improves active fan-out by allowing lanes to grow toward the configured ceiling when backlog is high.",
            "- Cache reduces repeated provider calls when the same `yool + data` appears again.",
            "- Batching turns many tiny provider/API-sized operations into fewer bounded calls.",
            "- Circuit breaker reduces hammering during provider outages, which is the anti-ban part of the speed model.",
            "- Context compression lowers token transfer before LLM calls while preserving a digest and preview.",
            "",
            "## Reproduce",
            "",
            "```bash",
            "python benchmarks/v2_safe_speed_benchmark.py --json-output benchmarks/v2_safe_speed_results.json --md-output benchmarks/v2_safe_speed_results.md",
            "python benchmarks/generate_v2_benchmark_pdf.py",
            "```",
            "",
        ]
    )
    path.write_text("\n".join(lines), encoding="utf-8")
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
def _scenario_table(title: str, rows: list[dict[str, Any]]) -> list[str]:
    """Return the Markdown lines for one scenario section.

    Args:
        title: Text of the ``##`` heading.
        rows: Serialised result rows; each must provide every key used by the
            row template (extra keys are ignored).

    Returns:
        Heading, blank line, table header, alignment row, one line per row and
        a trailing blank line.
    """
    row_template = "| {profile} | {tasks:,} | {wall_ms:.2f} | {throughput_tasks_s:.1f} | {peak_kb:.1f} | {provider_calls:,} | {cache_hits:,} | {blocked_calls:,} | {tokens:,} | {notes} |"
    table = [
        f"## {title}",
        "",
        "| Profile | Tasks | Wall ms | Throughput/s | Peak KiB | Provider calls | Cache hits | Blocked | Tokens | Notes |",
        "|---|---:|---:|---:|---:|---:|---:|---:|---:|---|",
    ]
    table.extend(row_template.format_map(entry) for entry in rows)
    table.append("")
    return table
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
def main() -> int:
    """Run the suite from the command line and write both report files.

    Options:
        --json-output: Path of the JSON report (defaults to ``OUT_JSON``).
        --md-output: Path of the Markdown report (defaults to ``OUT_MD``).
        --print-json: Print the JSON payload instead of the two output paths.

    Returns:
        ``0`` as the process exit status.
    """

    def render(report: dict[str, Any]) -> str:
        return json.dumps(report, indent=2, ensure_ascii=False)

    cli = argparse.ArgumentParser()
    cli.add_argument("--json-output", type=Path, default=OUT_JSON)
    cli.add_argument("--md-output", type=Path, default=OUT_MD)
    cli.add_argument("--print-json", action="store_true")
    options = cli.parse_args()

    report = run_suite()
    options.json_output.write_text(render(report), encoding="utf-8")
    write_markdown(report, options.md_output)

    if options.print_json:
        print(render(report))
        return 0
    # Default output: where the two artefacts were written.
    print(options.json_output)
    print(options.md_output)
    return 0
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
|