@miller-tech/uap 1.13.12 → 1.13.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/dist/.tsbuildinfo +1 -1
  2. package/dist/benchmarks/speculative-autotune.d.ts +46 -0
  3. package/dist/benchmarks/speculative-autotune.d.ts.map +1 -0
  4. package/dist/benchmarks/speculative-autotune.js +145 -0
  5. package/dist/benchmarks/speculative-autotune.js.map +1 -0
  6. package/dist/benchmarks/token-throughput.d.ts +46 -46
  7. package/dist/bin/cli.js +2 -0
  8. package/dist/bin/cli.js.map +1 -1
  9. package/dist/bin/llama-server-optimize.js +176 -0
  10. package/dist/bin/llama-server-optimize.js.map +1 -1
  11. package/dist/bin/policy.js +0 -0
  12. package/dist/cli/hooks.js +1 -0
  13. package/dist/cli/hooks.js.map +1 -1
  14. package/dist/cli/init.d.ts +1 -0
  15. package/dist/cli/init.d.ts.map +1 -1
  16. package/dist/cli/init.js +18 -0
  17. package/dist/cli/init.js.map +1 -1
  18. package/dist/cli/setup.d.ts +1 -0
  19. package/dist/cli/setup.d.ts.map +1 -1
  20. package/dist/cli/setup.js +1 -0
  21. package/dist/cli/setup.js.map +1 -1
  22. package/dist/cli/systemd-services.d.ts +12 -0
  23. package/dist/cli/systemd-services.d.ts.map +1 -0
  24. package/dist/cli/systemd-services.js +179 -0
  25. package/dist/cli/systemd-services.js.map +1 -0
  26. package/dist/models/types.d.ts +12 -12
  27. package/dist/policies/schemas/policy.d.ts +12 -12
  28. package/dist/types/config.d.ts +24 -24
  29. package/docs/deployment/QWEN35_LLAMA_CPP.md +49 -0
  30. package/docs/deployment/UAP_LLAMA_ANTHROPIC_PROXY_BOOTSTRAP.md +279 -0
  31. package/package.json +1 -1
  32. package/templates/hooks/loop-protection.sh +250 -0
  33. package/templates/hooks/post-compact.sh +14 -0
  34. package/templates/hooks/post-tool-use-edit-write.sh +15 -0
  35. package/templates/hooks/pre-compact.sh +9 -0
  36. package/templates/hooks/pre-tool-use-bash.sh +6 -0
  37. package/templates/hooks/pre-tool-use-edit-write.sh +10 -0
  38. package/templates/hooks/session-start.sh +64 -44
  39. package/templates/hooks/stop.sh +9 -0
  40. package/tools/agents/scripts/anthropic_proxy.py +716 -166
  41. package/tools/agents/tests/test_anthropic_proxy_streaming.py +51 -0
  42. package/tools/agents/scripts/__pycache__/anthropic_proxy.cpython-313.pyc +0 -0
  43. package/tools/agents/scripts/__pycache__/tool_call_wrapper.cpython-313.pyc +0 -0
@@ -1272,7 +1272,7 @@ export declare const ModelConfigSchema: z.ZodObject<{
1272
1272
  }, "strip", z.ZodTypeAny, {
1273
1273
  id: string;
1274
1274
  name: string;
1275
- provider: "custom" | "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama";
1275
+ provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
1276
1276
  apiModel: string;
1277
1277
  maxContextTokens: number;
1278
1278
  capabilities: string[];
@@ -1283,7 +1283,7 @@ export declare const ModelConfigSchema: z.ZodObject<{
1283
1283
  }, {
1284
1284
  id: string;
1285
1285
  name: string;
1286
- provider: "custom" | "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama";
1286
+ provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
1287
1287
  apiModel: string;
1288
1288
  endpoint?: string | undefined;
1289
1289
  apiKeyEnvVar?: string | undefined;
@@ -1303,12 +1303,12 @@ export declare const RoutingRuleSchema: z.ZodObject<{
1303
1303
  priority: number;
1304
1304
  complexity?: "low" | "medium" | "high" | "critical" | undefined;
1305
1305
  keywords?: string[] | undefined;
1306
- taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "review" | "documentation" | undefined;
1306
+ taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
1307
1307
  }, {
1308
1308
  targetRole: "planner" | "executor" | "reviewer" | "fallback";
1309
1309
  complexity?: "low" | "medium" | "high" | "critical" | undefined;
1310
1310
  keywords?: string[] | undefined;
1311
- taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "review" | "documentation" | undefined;
1311
+ taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
1312
1312
  priority?: number | undefined;
1313
1313
  }>;
1314
1314
  export declare const MultiModelSchema: z.ZodObject<{
@@ -1328,7 +1328,7 @@ export declare const MultiModelSchema: z.ZodObject<{
1328
1328
  }, "strip", z.ZodTypeAny, {
1329
1329
  id: string;
1330
1330
  name: string;
1331
- provider: "custom" | "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama";
1331
+ provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
1332
1332
  apiModel: string;
1333
1333
  maxContextTokens: number;
1334
1334
  capabilities: string[];
@@ -1340,7 +1340,7 @@ export declare const MultiModelSchema: z.ZodObject<{
1340
1340
  }, {
1341
1341
  id: string;
1342
1342
  name: string;
1343
- provider: "custom" | "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama";
1343
+ provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
1344
1344
  apiModel: string;
1345
1345
  endpoint?: string | undefined;
1346
1346
  apiKeyEnvVar?: string | undefined;
@@ -1377,12 +1377,12 @@ export declare const MultiModelSchema: z.ZodObject<{
1377
1377
  priority: number;
1378
1378
  complexity?: "low" | "medium" | "high" | "critical" | undefined;
1379
1379
  keywords?: string[] | undefined;
1380
- taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "review" | "documentation" | undefined;
1380
+ taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
1381
1381
  }, {
1382
1382
  targetRole: "planner" | "executor" | "reviewer" | "fallback";
1383
1383
  complexity?: "low" | "medium" | "high" | "critical" | undefined;
1384
1384
  keywords?: string[] | undefined;
1385
- taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "review" | "documentation" | undefined;
1385
+ taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
1386
1386
  priority?: number | undefined;
1387
1387
  }>, "many">>;
1388
1388
  costOptimization: z.ZodOptional<z.ZodObject<{
@@ -1443,7 +1443,7 @@ export declare const MultiModelSchema: z.ZodObject<{
1443
1443
  models: (string | {
1444
1444
  id: string;
1445
1445
  name: string;
1446
- provider: "custom" | "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama";
1446
+ provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
1447
1447
  apiModel: string;
1448
1448
  maxContextTokens: number;
1449
1449
  capabilities: string[];
@@ -1465,7 +1465,7 @@ export declare const MultiModelSchema: z.ZodObject<{
1465
1465
  priority: number;
1466
1466
  complexity?: "low" | "medium" | "high" | "critical" | undefined;
1467
1467
  keywords?: string[] | undefined;
1468
- taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "review" | "documentation" | undefined;
1468
+ taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
1469
1469
  }[] | undefined;
1470
1470
  costOptimization?: {
1471
1471
  enabled: boolean;
@@ -1492,7 +1492,7 @@ export declare const MultiModelSchema: z.ZodObject<{
1492
1492
  models?: (string | {
1493
1493
  id: string;
1494
1494
  name: string;
1495
- provider: "custom" | "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama";
1495
+ provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
1496
1496
  apiModel: string;
1497
1497
  endpoint?: string | undefined;
1498
1498
  apiKeyEnvVar?: string | undefined;
@@ -1512,7 +1512,7 @@ export declare const MultiModelSchema: z.ZodObject<{
1512
1512
  targetRole: "planner" | "executor" | "reviewer" | "fallback";
1513
1513
  complexity?: "low" | "medium" | "high" | "critical" | undefined;
1514
1514
  keywords?: string[] | undefined;
1515
- taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "review" | "documentation" | undefined;
1515
+ taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
1516
1516
  priority?: number | undefined;
1517
1517
  }[] | undefined;
1518
1518
  costOptimization?: {
@@ -2583,7 +2583,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
2583
2583
  }, "strip", z.ZodTypeAny, {
2584
2584
  id: string;
2585
2585
  name: string;
2586
- provider: "custom" | "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama";
2586
+ provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
2587
2587
  apiModel: string;
2588
2588
  maxContextTokens: number;
2589
2589
  capabilities: string[];
@@ -2595,7 +2595,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
2595
2595
  }, {
2596
2596
  id: string;
2597
2597
  name: string;
2598
- provider: "custom" | "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama";
2598
+ provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
2599
2599
  apiModel: string;
2600
2600
  endpoint?: string | undefined;
2601
2601
  apiKeyEnvVar?: string | undefined;
@@ -2632,12 +2632,12 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
2632
2632
  priority: number;
2633
2633
  complexity?: "low" | "medium" | "high" | "critical" | undefined;
2634
2634
  keywords?: string[] | undefined;
2635
- taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "review" | "documentation" | undefined;
2635
+ taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
2636
2636
  }, {
2637
2637
  targetRole: "planner" | "executor" | "reviewer" | "fallback";
2638
2638
  complexity?: "low" | "medium" | "high" | "critical" | undefined;
2639
2639
  keywords?: string[] | undefined;
2640
- taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "review" | "documentation" | undefined;
2640
+ taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
2641
2641
  priority?: number | undefined;
2642
2642
  }>, "many">>;
2643
2643
  costOptimization: z.ZodOptional<z.ZodObject<{
@@ -2698,7 +2698,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
2698
2698
  models: (string | {
2699
2699
  id: string;
2700
2700
  name: string;
2701
- provider: "custom" | "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama";
2701
+ provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
2702
2702
  apiModel: string;
2703
2703
  maxContextTokens: number;
2704
2704
  capabilities: string[];
@@ -2720,7 +2720,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
2720
2720
  priority: number;
2721
2721
  complexity?: "low" | "medium" | "high" | "critical" | undefined;
2722
2722
  keywords?: string[] | undefined;
2723
- taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "review" | "documentation" | undefined;
2723
+ taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
2724
2724
  }[] | undefined;
2725
2725
  costOptimization?: {
2726
2726
  enabled: boolean;
@@ -2747,7 +2747,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
2747
2747
  models?: (string | {
2748
2748
  id: string;
2749
2749
  name: string;
2750
- provider: "custom" | "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama";
2750
+ provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
2751
2751
  apiModel: string;
2752
2752
  endpoint?: string | undefined;
2753
2753
  apiKeyEnvVar?: string | undefined;
@@ -2767,7 +2767,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
2767
2767
  targetRole: "planner" | "executor" | "reviewer" | "fallback";
2768
2768
  complexity?: "low" | "medium" | "high" | "critical" | undefined;
2769
2769
  keywords?: string[] | undefined;
2770
- taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "review" | "documentation" | undefined;
2770
+ taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
2771
2771
  priority?: number | undefined;
2772
2772
  }[] | undefined;
2773
2773
  costOptimization?: {
@@ -3092,7 +3092,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
3092
3092
  models: (string | {
3093
3093
  id: string;
3094
3094
  name: string;
3095
- provider: "custom" | "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama";
3095
+ provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
3096
3096
  apiModel: string;
3097
3097
  maxContextTokens: number;
3098
3098
  capabilities: string[];
@@ -3114,7 +3114,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
3114
3114
  priority: number;
3115
3115
  complexity?: "low" | "medium" | "high" | "critical" | undefined;
3116
3116
  keywords?: string[] | undefined;
3117
- taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "review" | "documentation" | undefined;
3117
+ taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
3118
3118
  }[] | undefined;
3119
3119
  costOptimization?: {
3120
3120
  enabled: boolean;
@@ -3353,7 +3353,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
3353
3353
  models?: (string | {
3354
3354
  id: string;
3355
3355
  name: string;
3356
- provider: "custom" | "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama";
3356
+ provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
3357
3357
  apiModel: string;
3358
3358
  endpoint?: string | undefined;
3359
3359
  apiKeyEnvVar?: string | undefined;
@@ -3373,7 +3373,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
3373
3373
  targetRole: "planner" | "executor" | "reviewer" | "fallback";
3374
3374
  complexity?: "low" | "medium" | "high" | "critical" | undefined;
3375
3375
  keywords?: string[] | undefined;
3376
- taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "review" | "documentation" | undefined;
3376
+ taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
3377
3377
  priority?: number | undefined;
3378
3378
  }[] | undefined;
3379
3379
  costOptimization?: {
@@ -116,6 +116,50 @@ llama-server \
116
116
  | `--draft-max` | `16` | Max tokens to draft per iteration. Higher = more throughput, more VRAM. |
117
117
  | `--draft-p-min` | `0.75` | Minimum acceptance probability. Lower = more aggressive drafting. |
118
118
 
119
+ ## Extension Options for Speculative Decoding
120
+
121
+ ### Option 1: Adaptive Runtime Tuning (implemented)
122
+
123
+ Use acceptance and rollback rates to auto-adjust `draft-max`, `draft-min`, and `draft-p-min` over time.
124
+
125
+ - Best for immediate gains without kernel changes
126
+ - Reduces bad bursts when acceptance drops
127
+ - Increases burst length automatically during high-acceptance windows
128
+
129
+ Commands:
130
+
131
+ ```bash
132
+ # Tune once from observed metrics
133
+ llama-optimize spec-autotune --acceptance 0.71 --rollback 0.14 --profile throughput
134
+
135
+ # Compare static defaults vs adaptive tuning using deterministic simulation
136
+ llama-optimize spec-benchmark --profile throughput --trace mixed --steps 180
137
+
138
+ # Live benchmark active server and get tuned flag recommendation
139
+ llama-optimize spec-benchmark-live \
140
+ --endpoint http://127.0.0.1:8080/v1 \
141
+ --model qwen3.5-a3b-iq4xs \
142
+ --runs 5 --max-tokens 256 --profile throughput
143
+ ```
144
+
145
+ Recommended workflow:
146
+
147
+ 1. Run `spec-benchmark-live` with your current startup flags and note `Throughput`.
148
+ 2. Restart `llama-server` with the `Suggested params` flags.
149
+ 3. Re-run `spec-benchmark-live` with the same settings to measure actual gain.
150
+
151
+ ### Option 2: GPU Residency + Overlap
152
+
153
+ - Keep draft model and draft KV fully on GPU
154
+ - Preallocate buffers and overlap draft + verify passes with CUDA streams
155
+ - Improves p95 latency consistency on long runs
156
+
157
+ ### Option 3: GPU Checkpoint/Rollback
158
+
159
+ - Move speculative checkpoint snapshots from CPU RAM to GPU buffers
160
+ - Remove host-device copy overhead from rollback paths
161
+ - Highest upside, but requires deeper runtime changes
162
+
119
163
  ### Sampling
120
164
 
121
165
  | Flag | Value | Purpose |
@@ -177,6 +221,10 @@ All settings are via environment variables:
177
221
  | `PROXY_LOG_LEVEL` | `INFO` | Logging level (DEBUG/INFO/WARNING/ERROR) |
178
222
  | `PROXY_READ_TIMEOUT` | `600` | Read timeout (seconds) for LLM streaming |
179
223
  | `PROXY_MAX_CONNECTIONS` | `20` | Max concurrent upstream connections |
224
+ | `PROXY_STREAM_REASONING_FALLBACK` | `off` | Streaming behavior for reasoning-only empty turns (`off`, `sanitized`, `visible`) |
225
+ | `PROXY_STREAM_REASONING_MAX_CHARS` | `240` | Max fallback length when `PROXY_STREAM_REASONING_FALLBACK=sanitized` |
226
+
227
+ For agentic coding workloads, keep `PROXY_STREAM_REASONING_FALLBACK=off` (default) to avoid leaking malformed internal reasoning as user-visible output. Use `sanitized` only for debugging.
180
228
 
181
229
  ### Example: Custom upstream
182
230
 
@@ -339,3 +387,4 @@ Two possible causes:
339
387
  - `tools/agents/scripts/qwen_tool_call_test.py` - Test suite using OpenAI-compatible API
340
388
  - `src/cli/tool-calls.ts` - CLI command for template management
341
389
  - `src/bin/llama-server-optimize.ts` - llama-server startup optimizer
390
+ - `docs/deployment/UAP_LLAMA_ANTHROPIC_PROXY_BOOTSTRAP.md` - service bootstrap + ngram-cache A/B benchmarking
@@ -0,0 +1,279 @@
1
+ # UAP + llama.cpp + Anthropic Proxy Bootstrap
2
+
3
+ This guide captures the local continuity stack as a repeatable bootstrap:
4
+
5
+ - `uap-llama-server.service` (llama.cpp)
6
+ - `uap-anthropic-proxy.service` (Anthropic API compatibility)
7
+ - A/B benchmark workflow for speculative decoding with `ngram-cache`
8
+
9
+ It also documents the UAP-side support changes needed to keep llama.cpp speculative decoding stable in agentic workflows.
10
+
11
+ ## 1) Bootstrap services
12
+
13
+ Run:
14
+
15
+ ```bash
16
+ bash scripts/bootstrap/bootstrap-uap-llama-proxy-stack.sh
17
+ ```
18
+
19
+ This writes:
20
+
21
+ - `~/.config/uap/llama-server.env`
22
+ - `~/.config/uap/anthropic-proxy.env`
23
+ - `~/.config/systemd/user/uap-llama-server.service`
24
+ - `~/.config/systemd/user/uap-anthropic-proxy.service`
25
+
26
+ Then it enables and starts both user services.
27
+
28
+ ## 2) Key llama env knobs
29
+
30
+ Edit `~/.config/uap/llama-server.env` and restart the service:
31
+
32
+ ```bash
33
+ systemctl --user restart uap-llama-server.service
34
+ ```
35
+
36
+ Important variables:
37
+
38
+ - `LLAMA_SPEC_TYPE` (`none`, `ngram-cache`, etc.)
39
+ - `LLAMA_DRAFT_MAX`
40
+ - `LLAMA_DRAFT_MIN`
41
+ - `LLAMA_DRAFT_P_MIN`
42
+ - `LLAMA_EXTRA_ARGS` (optional additional startup flags)
43
+
44
+ ## 3) Key proxy env knobs
45
+
46
+ Edit `~/.config/uap/anthropic-proxy.env` and restart the proxy:
47
+
48
+ ```bash
49
+ systemctl --user restart uap-anthropic-proxy.service
50
+ ```
51
+
52
+ Important variables:
53
+
54
+ - `PROXY_PORT`
55
+ - `LLAMA_CPP_BASE`
56
+ - `PROXY_CONTEXT_WINDOW` (set to `262144` to match llama context)
57
+ - Loop/guardrail options (`PROXY_LOOP_BREAKER`, `PROXY_FORCED_THRESHOLD`, etc.)
58
+
59
+ ## 4) Run ngram-cache signal benchmark
60
+
61
+ Use the service-oriented A/B script:
62
+
63
+ ```bash
64
+ bash scripts/benchmarks/run-spec-ngram-service-ab.sh
65
+ ```
66
+
67
+ What it does:
68
+
69
+ 1. Stops managed `uap-llama-server.service` temporarily
70
+ 2. Runs transient systemd service benchmarks for:
71
+ - `spec-type=none`
72
+ - `spec-type=ngram-cache` (default draft params)
73
+ - `spec-type=ngram-cache` (tuned: `21/6/0.72`)
74
+ 3. Restores managed `uap-llama-server.service`
75
+ 4. Writes report artifacts under `benchmark-results/spec-ngram-ab-<timestamp>/`
76
+
77
+ Outputs:
78
+
79
+ - `report.json` machine-readable deltas
80
+ - `report.md` human-readable summary
81
+
82
+ ## 5) Run automatic draft-parameter sweep (Option 2)
83
+
84
+ Use this to search for the best local `ngram-cache` settings:
85
+
86
+ ```bash
87
+ bash scripts/benchmarks/run-spec-ngram-sweep.sh
88
+ ```
89
+
90
+ Useful overrides:
91
+
92
+ ```bash
93
+ RUNS=5 MAX_TOKENS=256 \
94
+ DRAFT_MAXS="16 18 20 22" \
95
+ DRAFT_MINS="3 4 5 6" \
96
+ DRAFT_P_MINS="0.70 0.72 0.75 0.78" \
97
+ bash scripts/benchmarks/run-spec-ngram-sweep.sh
98
+ ```
99
+
100
+ Outputs are written under `benchmark-results/spec-ngram-sweep-<timestamp>/`:
101
+
102
+ - `results.jsonl` one entry per candidate
103
+ - `summary.json` best candidate + stats
104
+ - `summary.md` top 5 table
105
+
106
+ ## 6) Profiles for agentic coding vs max speed
107
+
108
+ Use two explicit profiles depending on your goal.
109
+
110
+ ### A) Agentic coding continuity profile (recommended daily use)
111
+
112
+ This profile prioritizes long, coherent coding sessions and minimizes `find_slot` warnings.
113
+
114
+ `~/.config/uap/llama-server.env`:
115
+
116
+ ```env
117
+ LLAMA_CTX_SIZE=262144
118
+ LLAMA_SPEC_TYPE=ngram-cache
119
+ LLAMA_DRAFT_MAX=12
120
+ LLAMA_DRAFT_MIN=2
121
+ LLAMA_DRAFT_P_MIN=0.80
122
+ LLAMA_HYBRID_ROLLBACK_MODE=strict
123
+ ```
124
+
125
+ Apply it:
126
+
127
+ ```bash
128
+ systemctl --user restart uap-llama-server.service
129
+ ```
130
+
131
+ `~/.config/uap/anthropic-proxy.env`:
132
+
133
+ ```env
134
+ PROXY_CONTEXT_WINDOW=262144
135
+ PROXY_LOOP_BREAKER=on
136
+ PROXY_LOOP_WINDOW=6
137
+ PROXY_LOOP_REPEAT_THRESHOLD=10
138
+ PROXY_FORCED_THRESHOLD=18
139
+ PROXY_NO_PROGRESS_THRESHOLD=5
140
+ PROXY_CONTEXT_RELEASE_THRESHOLD=0.95
141
+ PROXY_GUARDRAIL_RETRY=on
142
+ ```
143
+
144
+ Apply it:
145
+
146
+ ```bash
147
+ systemctl --user restart uap-anthropic-proxy.service
148
+ ```
149
+
150
+ ### B) Max-throughput benchmark profile (where 220+ tok/s was observed)
151
+
152
+ The 220+ tok/s decode throughput observed in this session was achieved with:
153
+
154
+ - CUDA build: `/home/cogtek/llama.cpp/.worktrees/001-llama-spec-rollback-fix/build-cuda/bin/llama-server`
155
+ - GPU flags: `--device CUDA0 --n-gpu-layers all --flash-attn on`
156
+ - Speculative mode: `--spec-type ngram-cache`
157
+ - Rollback mode: `LLAMA_HYBRID_ROLLBACK_MODE=hybrid`
158
+ - Workload: repetitive pattern prompt, `n_predict=512`
159
+
160
+ Run command used for that profile:
161
+
162
+ ```bash
163
+ LLAMA_HYBRID_ROLLBACK_MODE=hybrid \
164
+ /home/cogtek/llama.cpp/.worktrees/001-llama-spec-rollback-fix/build-cuda/bin/llama-server \
165
+ -m "/home/cogtek/Downloads/Qwen3.5-35B-A3B-UD-IQ4_XS.gguf" \
166
+ --host 127.0.0.1 --port 18121 \
167
+ --ctx-size 16384 --parallel 1 --no-warmup \
168
+ --device CUDA0 --n-gpu-layers all --flash-attn on \
169
+ --spec-type ngram-cache
170
+ ```
171
+
172
+ Important: this max-speed profile is workload-sensitive and was measured on a pattern-heavy prompt. For real agentic coding, use Profile A.
173
+
174
+ ## 7) Validated A/B findings (2026-03-23)
175
+
176
+ Direct old-vs-new A/B was run against:
177
+
178
+ - old fast commit: `029edcafc` (first pushed fast state around 21:35)
179
+ - newer commit: `1f8225f8f`
180
+ - model: `Qwen3.5-35B-A3B-UD-IQ4_XS.gguf`
181
+ - speculative: `ngram-cache`, `draft-max=16`, `draft-min=3`, `draft-p-min=0.72`
182
+
183
+ Notes:
184
+
185
+ - Standalone launches at `ctx-size=262144` can fail GPU allocation on some runs for the old commit (`failed to allocate compute pp buffers`).
186
+ - For controlled apples-to-apples throughput comparison, A/B was run at `ctx-size=16384`.
187
+
188
+ Observed results (`/tmp/ab_matrix_ctx16_v2.json`):
189
+
190
+ | Path | Old `029edcafc` | New `1f8225f8f` | Delta (new vs old) |
191
+ | --------------- | --------------- | --------------- | ------------------- |
192
+ | Raw coding | 107.97 tok/s | 99.23 tok/s | -8.1% |
193
+ | Raw pattern | 158.71 tok/s | 105.75 tok/s | -33.4% |
194
+ | Proxy plain | 113.74 tok/s | 109.39 tok/s | -3.8% |
195
+ | Agentic tool 2nd turn | `tool_use` (stable) | `tool_use` (stable) | parity on control flow |
196
+
197
+ Behavioral observations:
198
+
199
+ - Newer commit emitted many `find_slot: non-consecutive token position` warnings in raw/proxy runs under the same speculative settings.
200
+ - Old commit produced materially cleaner logs and higher throughput in the same benchmark profile.
201
+ - Proxy continuity fixes improved agentic tool-loop stability and no longer force a premature stop in the tested loop.
202
+
203
+ Decision for throughput-sensitive testing:
204
+
205
+ - Prefer the old fast commit `029edcafc` profile for max-throughput benchmarking.
206
+ - Keep a separate continuity profile for long-context agentic coding if warning volume grows.
207
+
208
+ Additional 27B impact snapshot (`Qwen3.5-27B-IQ4_XS`, `ctx=262144`, q4 KV cache):
209
+
210
+ - no speculative: ~43 tok/s coding, ~41 tok/s pattern
211
+ - aggressive speculative (`16/3/0.72`): ~44 tok/s coding, ~102 tok/s pattern
212
+ - balanced speculative (`12/2/0.80`): ~43 tok/s coding, ~102 tok/s pattern
213
+
214
+ Interpretation:
215
+
216
+ - balanced profile is functionally safer for agentic sessions,
217
+ - aggressive profile can edge higher on some coding runs,
218
+ - both speculative profiles massively outperform no-spec on repetition-heavy drafts.
219
+
220
+ ## 8) Throughput interpretation and loop prevention
221
+
222
+ When reading llama logs, treat these as different metrics:
223
+
224
+ - `prompt eval time ... tokens per second` = prefill throughput
225
+ - `eval time ... tokens per second` = decode/completion throughput
226
+
227
+ In local continuity runs with large context, prompt throughput may exceed 2k tok/s while decode remains near 80-125 tok/s.
228
+
229
+ For default stability, use the guardrails from Profile A. If you hit active loop incidents, temporarily tighten to:
230
+
231
+ ```env
232
+ PROXY_LOOP_WINDOW=6
233
+ PROXY_LOOP_REPEAT_THRESHOLD=8
234
+ PROXY_FORCED_THRESHOLD=14
235
+ PROXY_NO_PROGRESS_THRESHOLD=4
236
+ PROXY_CONTEXT_RELEASE_THRESHOLD=0.90
237
+ ```
238
+
239
+ Then restart the proxy:
240
+
241
+ ```bash
242
+ systemctl --user restart uap-anthropic-proxy.service
243
+ ```
244
+
245
+ ## 9) UAP support changes required for reliable operation
246
+
247
+ The following UAP-side changes are part of the working stack and should be present:
248
+
249
+ 1. Session-scoped loop protection in Anthropic proxy (no cross-session contamination).
250
+ 2. Guardrail retry for unexpected text-only end-turn in active tool loops.
251
+ 3. Optional systemd scaffolding from CLI:
252
+ - `uap init --systemd-services`
253
+ - `uap setup --systemd-services`
254
+ 4. Dedicated launch scripts:
255
+ - `scripts/run-llama-server-continuity.sh`
256
+ - `scripts/run-anthropic-proxy-continuity.sh`
257
+
258
+ These changes ensure llama speculative behavior is evaluated in a stable proxy/control-plane environment.
259
+
260
+ ## 10) Check service health
261
+
262
+ ```bash
263
+ systemctl --user status uap-llama-server.service --no-pager
264
+ systemctl --user status uap-anthropic-proxy.service --no-pager
265
+ curl -sf http://127.0.0.1:8080/v1/models
266
+ curl -sf http://127.0.0.1:4000/health
267
+ ```
268
+
269
+ ## 11) References and credits
270
+
271
+ This implementation and tuning flow builds on prior llama.cpp and proxy work:
272
+
273
+ - llama.cpp speculative docs: `docs/speculative.md`
274
+ - llama.cpp hybrid rollout notes: `docs/development/speculative-hybrid-rollout.md`
275
+ - llama.cpp speculative lineage: #5479, #6828, #6848, #19164
276
+ - checkpoint/SWA context note:
277
+ - https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055
278
+
279
+ Thanks to ggml-org/llama.cpp maintainers and contributors for speculative, cache, and memory-path groundwork.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@miller-tech/uap",
3
- "version": "1.13.12",
3
+ "version": "1.13.14",
4
4
  "description": "Autonomous AI agent memory system with CLAUDE.md protocol enforcement",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",