@miller-tech/uap 1.13.12 → 1.13.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -1
- package/dist/benchmarks/speculative-autotune.d.ts +46 -0
- package/dist/benchmarks/speculative-autotune.d.ts.map +1 -0
- package/dist/benchmarks/speculative-autotune.js +145 -0
- package/dist/benchmarks/speculative-autotune.js.map +1 -0
- package/dist/benchmarks/token-throughput.d.ts +46 -46
- package/dist/bin/cli.js +2 -0
- package/dist/bin/cli.js.map +1 -1
- package/dist/bin/llama-server-optimize.js +176 -0
- package/dist/bin/llama-server-optimize.js.map +1 -1
- package/dist/bin/policy.js +0 -0
- package/dist/cli/hooks.js +1 -0
- package/dist/cli/hooks.js.map +1 -1
- package/dist/cli/init.d.ts +1 -0
- package/dist/cli/init.d.ts.map +1 -1
- package/dist/cli/init.js +18 -0
- package/dist/cli/init.js.map +1 -1
- package/dist/cli/setup.d.ts +1 -0
- package/dist/cli/setup.d.ts.map +1 -1
- package/dist/cli/setup.js +1 -0
- package/dist/cli/setup.js.map +1 -1
- package/dist/cli/systemd-services.d.ts +12 -0
- package/dist/cli/systemd-services.d.ts.map +1 -0
- package/dist/cli/systemd-services.js +179 -0
- package/dist/cli/systemd-services.js.map +1 -0
- package/dist/models/types.d.ts +12 -12
- package/dist/policies/schemas/policy.d.ts +12 -12
- package/dist/types/config.d.ts +24 -24
- package/docs/deployment/QWEN35_LLAMA_CPP.md +49 -0
- package/docs/deployment/UAP_LLAMA_ANTHROPIC_PROXY_BOOTSTRAP.md +279 -0
- package/package.json +1 -1
- package/templates/hooks/loop-protection.sh +250 -0
- package/templates/hooks/post-compact.sh +14 -0
- package/templates/hooks/post-tool-use-edit-write.sh +15 -0
- package/templates/hooks/pre-compact.sh +9 -0
- package/templates/hooks/pre-tool-use-bash.sh +6 -0
- package/templates/hooks/pre-tool-use-edit-write.sh +10 -0
- package/templates/hooks/session-start.sh +64 -44
- package/templates/hooks/stop.sh +9 -0
- package/tools/agents/scripts/anthropic_proxy.py +716 -166
- package/tools/agents/tests/test_anthropic_proxy_streaming.py +51 -0
- package/tools/agents/scripts/__pycache__/anthropic_proxy.cpython-313.pyc +0 -0
- package/tools/agents/scripts/__pycache__/tool_call_wrapper.cpython-313.pyc +0 -0
package/dist/types/config.d.ts
CHANGED
|
@@ -1272,7 +1272,7 @@ export declare const ModelConfigSchema: z.ZodObject<{
|
|
|
1272
1272
|
}, "strip", z.ZodTypeAny, {
|
|
1273
1273
|
id: string;
|
|
1274
1274
|
name: string;
|
|
1275
|
-
provider: "
|
|
1275
|
+
provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
|
|
1276
1276
|
apiModel: string;
|
|
1277
1277
|
maxContextTokens: number;
|
|
1278
1278
|
capabilities: string[];
|
|
@@ -1283,7 +1283,7 @@ export declare const ModelConfigSchema: z.ZodObject<{
|
|
|
1283
1283
|
}, {
|
|
1284
1284
|
id: string;
|
|
1285
1285
|
name: string;
|
|
1286
|
-
provider: "
|
|
1286
|
+
provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
|
|
1287
1287
|
apiModel: string;
|
|
1288
1288
|
endpoint?: string | undefined;
|
|
1289
1289
|
apiKeyEnvVar?: string | undefined;
|
|
@@ -1303,12 +1303,12 @@ export declare const RoutingRuleSchema: z.ZodObject<{
|
|
|
1303
1303
|
priority: number;
|
|
1304
1304
|
complexity?: "low" | "medium" | "high" | "critical" | undefined;
|
|
1305
1305
|
keywords?: string[] | undefined;
|
|
1306
|
-
taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "
|
|
1306
|
+
taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
|
|
1307
1307
|
}, {
|
|
1308
1308
|
targetRole: "planner" | "executor" | "reviewer" | "fallback";
|
|
1309
1309
|
complexity?: "low" | "medium" | "high" | "critical" | undefined;
|
|
1310
1310
|
keywords?: string[] | undefined;
|
|
1311
|
-
taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "
|
|
1311
|
+
taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
|
|
1312
1312
|
priority?: number | undefined;
|
|
1313
1313
|
}>;
|
|
1314
1314
|
export declare const MultiModelSchema: z.ZodObject<{
|
|
@@ -1328,7 +1328,7 @@ export declare const MultiModelSchema: z.ZodObject<{
|
|
|
1328
1328
|
}, "strip", z.ZodTypeAny, {
|
|
1329
1329
|
id: string;
|
|
1330
1330
|
name: string;
|
|
1331
|
-
provider: "
|
|
1331
|
+
provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
|
|
1332
1332
|
apiModel: string;
|
|
1333
1333
|
maxContextTokens: number;
|
|
1334
1334
|
capabilities: string[];
|
|
@@ -1340,7 +1340,7 @@ export declare const MultiModelSchema: z.ZodObject<{
|
|
|
1340
1340
|
}, {
|
|
1341
1341
|
id: string;
|
|
1342
1342
|
name: string;
|
|
1343
|
-
provider: "
|
|
1343
|
+
provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
|
|
1344
1344
|
apiModel: string;
|
|
1345
1345
|
endpoint?: string | undefined;
|
|
1346
1346
|
apiKeyEnvVar?: string | undefined;
|
|
@@ -1377,12 +1377,12 @@ export declare const MultiModelSchema: z.ZodObject<{
|
|
|
1377
1377
|
priority: number;
|
|
1378
1378
|
complexity?: "low" | "medium" | "high" | "critical" | undefined;
|
|
1379
1379
|
keywords?: string[] | undefined;
|
|
1380
|
-
taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "
|
|
1380
|
+
taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
|
|
1381
1381
|
}, {
|
|
1382
1382
|
targetRole: "planner" | "executor" | "reviewer" | "fallback";
|
|
1383
1383
|
complexity?: "low" | "medium" | "high" | "critical" | undefined;
|
|
1384
1384
|
keywords?: string[] | undefined;
|
|
1385
|
-
taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "
|
|
1385
|
+
taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
|
|
1386
1386
|
priority?: number | undefined;
|
|
1387
1387
|
}>, "many">>;
|
|
1388
1388
|
costOptimization: z.ZodOptional<z.ZodObject<{
|
|
@@ -1443,7 +1443,7 @@ export declare const MultiModelSchema: z.ZodObject<{
|
|
|
1443
1443
|
models: (string | {
|
|
1444
1444
|
id: string;
|
|
1445
1445
|
name: string;
|
|
1446
|
-
provider: "
|
|
1446
|
+
provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
|
|
1447
1447
|
apiModel: string;
|
|
1448
1448
|
maxContextTokens: number;
|
|
1449
1449
|
capabilities: string[];
|
|
@@ -1465,7 +1465,7 @@ export declare const MultiModelSchema: z.ZodObject<{
|
|
|
1465
1465
|
priority: number;
|
|
1466
1466
|
complexity?: "low" | "medium" | "high" | "critical" | undefined;
|
|
1467
1467
|
keywords?: string[] | undefined;
|
|
1468
|
-
taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "
|
|
1468
|
+
taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
|
|
1469
1469
|
}[] | undefined;
|
|
1470
1470
|
costOptimization?: {
|
|
1471
1471
|
enabled: boolean;
|
|
@@ -1492,7 +1492,7 @@ export declare const MultiModelSchema: z.ZodObject<{
|
|
|
1492
1492
|
models?: (string | {
|
|
1493
1493
|
id: string;
|
|
1494
1494
|
name: string;
|
|
1495
|
-
provider: "
|
|
1495
|
+
provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
|
|
1496
1496
|
apiModel: string;
|
|
1497
1497
|
endpoint?: string | undefined;
|
|
1498
1498
|
apiKeyEnvVar?: string | undefined;
|
|
@@ -1512,7 +1512,7 @@ export declare const MultiModelSchema: z.ZodObject<{
|
|
|
1512
1512
|
targetRole: "planner" | "executor" | "reviewer" | "fallback";
|
|
1513
1513
|
complexity?: "low" | "medium" | "high" | "critical" | undefined;
|
|
1514
1514
|
keywords?: string[] | undefined;
|
|
1515
|
-
taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "
|
|
1515
|
+
taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
|
|
1516
1516
|
priority?: number | undefined;
|
|
1517
1517
|
}[] | undefined;
|
|
1518
1518
|
costOptimization?: {
|
|
@@ -2583,7 +2583,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
|
|
|
2583
2583
|
}, "strip", z.ZodTypeAny, {
|
|
2584
2584
|
id: string;
|
|
2585
2585
|
name: string;
|
|
2586
|
-
provider: "
|
|
2586
|
+
provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
|
|
2587
2587
|
apiModel: string;
|
|
2588
2588
|
maxContextTokens: number;
|
|
2589
2589
|
capabilities: string[];
|
|
@@ -2595,7 +2595,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
|
|
|
2595
2595
|
}, {
|
|
2596
2596
|
id: string;
|
|
2597
2597
|
name: string;
|
|
2598
|
-
provider: "
|
|
2598
|
+
provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
|
|
2599
2599
|
apiModel: string;
|
|
2600
2600
|
endpoint?: string | undefined;
|
|
2601
2601
|
apiKeyEnvVar?: string | undefined;
|
|
@@ -2632,12 +2632,12 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
|
|
|
2632
2632
|
priority: number;
|
|
2633
2633
|
complexity?: "low" | "medium" | "high" | "critical" | undefined;
|
|
2634
2634
|
keywords?: string[] | undefined;
|
|
2635
|
-
taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "
|
|
2635
|
+
taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
|
|
2636
2636
|
}, {
|
|
2637
2637
|
targetRole: "planner" | "executor" | "reviewer" | "fallback";
|
|
2638
2638
|
complexity?: "low" | "medium" | "high" | "critical" | undefined;
|
|
2639
2639
|
keywords?: string[] | undefined;
|
|
2640
|
-
taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "
|
|
2640
|
+
taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
|
|
2641
2641
|
priority?: number | undefined;
|
|
2642
2642
|
}>, "many">>;
|
|
2643
2643
|
costOptimization: z.ZodOptional<z.ZodObject<{
|
|
@@ -2698,7 +2698,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
|
|
|
2698
2698
|
models: (string | {
|
|
2699
2699
|
id: string;
|
|
2700
2700
|
name: string;
|
|
2701
|
-
provider: "
|
|
2701
|
+
provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
|
|
2702
2702
|
apiModel: string;
|
|
2703
2703
|
maxContextTokens: number;
|
|
2704
2704
|
capabilities: string[];
|
|
@@ -2720,7 +2720,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
|
|
|
2720
2720
|
priority: number;
|
|
2721
2721
|
complexity?: "low" | "medium" | "high" | "critical" | undefined;
|
|
2722
2722
|
keywords?: string[] | undefined;
|
|
2723
|
-
taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "
|
|
2723
|
+
taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
|
|
2724
2724
|
}[] | undefined;
|
|
2725
2725
|
costOptimization?: {
|
|
2726
2726
|
enabled: boolean;
|
|
@@ -2747,7 +2747,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
|
|
|
2747
2747
|
models?: (string | {
|
|
2748
2748
|
id: string;
|
|
2749
2749
|
name: string;
|
|
2750
|
-
provider: "
|
|
2750
|
+
provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
|
|
2751
2751
|
apiModel: string;
|
|
2752
2752
|
endpoint?: string | undefined;
|
|
2753
2753
|
apiKeyEnvVar?: string | undefined;
|
|
@@ -2767,7 +2767,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
|
|
|
2767
2767
|
targetRole: "planner" | "executor" | "reviewer" | "fallback";
|
|
2768
2768
|
complexity?: "low" | "medium" | "high" | "critical" | undefined;
|
|
2769
2769
|
keywords?: string[] | undefined;
|
|
2770
|
-
taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "
|
|
2770
|
+
taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
|
|
2771
2771
|
priority?: number | undefined;
|
|
2772
2772
|
}[] | undefined;
|
|
2773
2773
|
costOptimization?: {
|
|
@@ -3092,7 +3092,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
|
|
|
3092
3092
|
models: (string | {
|
|
3093
3093
|
id: string;
|
|
3094
3094
|
name: string;
|
|
3095
|
-
provider: "
|
|
3095
|
+
provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
|
|
3096
3096
|
apiModel: string;
|
|
3097
3097
|
maxContextTokens: number;
|
|
3098
3098
|
capabilities: string[];
|
|
@@ -3114,7 +3114,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
|
|
|
3114
3114
|
priority: number;
|
|
3115
3115
|
complexity?: "low" | "medium" | "high" | "critical" | undefined;
|
|
3116
3116
|
keywords?: string[] | undefined;
|
|
3117
|
-
taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "
|
|
3117
|
+
taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
|
|
3118
3118
|
}[] | undefined;
|
|
3119
3119
|
costOptimization?: {
|
|
3120
3120
|
enabled: boolean;
|
|
@@ -3353,7 +3353,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
|
|
|
3353
3353
|
models?: (string | {
|
|
3354
3354
|
id: string;
|
|
3355
3355
|
name: string;
|
|
3356
|
-
provider: "
|
|
3356
|
+
provider: "anthropic" | "deepseek" | "openai" | "zhipu" | "ollama" | "custom";
|
|
3357
3357
|
apiModel: string;
|
|
3358
3358
|
endpoint?: string | undefined;
|
|
3359
3359
|
apiKeyEnvVar?: string | undefined;
|
|
@@ -3373,7 +3373,7 @@ export declare const AgentContextConfigSchema: z.ZodObject<{
|
|
|
3373
3373
|
targetRole: "planner" | "executor" | "reviewer" | "fallback";
|
|
3374
3374
|
complexity?: "low" | "medium" | "high" | "critical" | undefined;
|
|
3375
3375
|
keywords?: string[] | undefined;
|
|
3376
|
-
taskType?: "planning" | "coding" | "refactoring" | "bug-fix" | "
|
|
3376
|
+
taskType?: "planning" | "review" | "coding" | "refactoring" | "bug-fix" | "documentation" | undefined;
|
|
3377
3377
|
priority?: number | undefined;
|
|
3378
3378
|
}[] | undefined;
|
|
3379
3379
|
costOptimization?: {
|
|
@@ -116,6 +116,50 @@ llama-server \
|
|
|
116
116
|
| `--draft-max` | `16` | Max tokens to draft per iteration. Higher = more throughput, more VRAM. |
|
|
117
117
|
| `--draft-p-min` | `0.75` | Minimum acceptance probability. Lower = more aggressive drafting. |
|
|
118
118
|
|
|
119
|
+
## Extension Options for Speculative Decoding
|
|
120
|
+
|
|
121
|
+
### Option 1: Adaptive Runtime Tuning (implemented)
|
|
122
|
+
|
|
123
|
+
Use acceptance and rollback rates to auto-adjust `draft-max`, `draft-min`, and `draft-p-min` over time.
|
|
124
|
+
|
|
125
|
+
- Best for immediate gains without kernel changes
|
|
126
|
+
- Reduces bad bursts when acceptance drops
|
|
127
|
+
- Increases burst length automatically during high-acceptance windows
|
|
128
|
+
|
|
129
|
+
Commands:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
# Tune once from observed metrics
|
|
133
|
+
llama-optimize spec-autotune --acceptance 0.71 --rollback 0.14 --profile throughput
|
|
134
|
+
|
|
135
|
+
# Compare static defaults vs adaptive tuning using deterministic simulation
|
|
136
|
+
llama-optimize spec-benchmark --profile throughput --trace mixed --steps 180
|
|
137
|
+
|
|
138
|
+
# Live benchmark active server and get tuned flag recommendation
|
|
139
|
+
llama-optimize spec-benchmark-live \
|
|
140
|
+
--endpoint http://127.0.0.1:8080/v1 \
|
|
141
|
+
--model qwen3.5-a3b-iq4xs \
|
|
142
|
+
--runs 5 --max-tokens 256 --profile throughput
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Recommended workflow:
|
|
146
|
+
|
|
147
|
+
1. Run `spec-benchmark-live` with your current startup flags and note `Throughput`.
|
|
148
|
+
2. Restart `llama-server` with the `Suggested params` flags.
|
|
149
|
+
3. Re-run `spec-benchmark-live` with the same settings to measure actual gain.
|
|
150
|
+
|
|
151
|
+
### Option 2: GPU Residency + Overlap
|
|
152
|
+
|
|
153
|
+
- Keep draft model and draft KV fully on GPU
|
|
154
|
+
- Preallocate buffers and overlap draft + verify passes with CUDA streams
|
|
155
|
+
- Improves p95 latency consistency on long runs
|
|
156
|
+
|
|
157
|
+
### Option 3: GPU Checkpoint/Rollback
|
|
158
|
+
|
|
159
|
+
- Move speculative checkpoint snapshots from CPU RAM to GPU buffers
|
|
160
|
+
- Remove host-device copy overhead from rollback paths
|
|
161
|
+
- Highest upside, but requires deeper runtime changes
|
|
162
|
+
|
|
119
163
|
### Sampling
|
|
120
164
|
|
|
121
165
|
| Flag | Value | Purpose |
|
|
@@ -177,6 +221,10 @@ All settings are via environment variables:
|
|
|
177
221
|
| `PROXY_LOG_LEVEL` | `INFO` | Logging level (DEBUG/INFO/WARNING/ERROR) |
|
|
178
222
|
| `PROXY_READ_TIMEOUT` | `600` | Read timeout (seconds) for LLM streaming |
|
|
179
223
|
| `PROXY_MAX_CONNECTIONS` | `20` | Max concurrent upstream connections |
|
|
224
|
+
| `PROXY_STREAM_REASONING_FALLBACK` | `off` | Streaming behavior for reasoning-only empty turns (`off`, `sanitized`, `visible`) |
|
|
225
|
+
| `PROXY_STREAM_REASONING_MAX_CHARS` | `240` | Max fallback length when `PROXY_STREAM_REASONING_FALLBACK=sanitized` |
|
|
226
|
+
|
|
227
|
+
For agentic coding workloads, keep `PROXY_STREAM_REASONING_FALLBACK=off` (default) to avoid leaking malformed internal reasoning as user-visible output. Use `sanitized` only for debugging.
|
|
180
228
|
|
|
181
229
|
### Example: Custom upstream
|
|
182
230
|
|
|
@@ -339,3 +387,4 @@ Two possible causes:
|
|
|
339
387
|
- `tools/agents/scripts/qwen_tool_call_test.py` - Test suite using OpenAI-compatible API
|
|
340
388
|
- `src/cli/tool-calls.ts` - CLI command for template management
|
|
341
389
|
- `src/bin/llama-server-optimize.ts` - llama-server startup optimizer
|
|
390
|
+
- `docs/deployment/UAP_LLAMA_ANTHROPIC_PROXY_BOOTSTRAP.md` - service bootstrap + ngram-cache A/B benchmarking
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
# UAP + llama.cpp + Anthropic Proxy Bootstrap
|
|
2
|
+
|
|
3
|
+
This guide captures the local continuity stack as a repeatable bootstrap:
|
|
4
|
+
|
|
5
|
+
- `uap-llama-server.service` (llama.cpp)
|
|
6
|
+
- `uap-anthropic-proxy.service` (Anthropic API compatibility)
|
|
7
|
+
- A/B benchmark workflow for speculative decoding with `ngram-cache`
|
|
8
|
+
|
|
9
|
+
It also documents the UAP-side support changes needed to keep llama.cpp speculative decoding stable in agentic workflows.
|
|
10
|
+
|
|
11
|
+
## 1) Bootstrap services
|
|
12
|
+
|
|
13
|
+
Run:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
bash scripts/bootstrap/bootstrap-uap-llama-proxy-stack.sh
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
This writes:
|
|
20
|
+
|
|
21
|
+
- `~/.config/uap/llama-server.env`
|
|
22
|
+
- `~/.config/uap/anthropic-proxy.env`
|
|
23
|
+
- `~/.config/systemd/user/uap-llama-server.service`
|
|
24
|
+
- `~/.config/systemd/user/uap-anthropic-proxy.service`
|
|
25
|
+
|
|
26
|
+
Then it enables and starts both user services.
|
|
27
|
+
|
|
28
|
+
## 2) Key llama env knobs
|
|
29
|
+
|
|
30
|
+
Edit `~/.config/uap/llama-server.env` and restart service:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
systemctl --user restart uap-llama-server.service
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Important variables:
|
|
37
|
+
|
|
38
|
+
- `LLAMA_SPEC_TYPE` (`none`, `ngram-cache`, etc.)
|
|
39
|
+
- `LLAMA_DRAFT_MAX`
|
|
40
|
+
- `LLAMA_DRAFT_MIN`
|
|
41
|
+
- `LLAMA_DRAFT_P_MIN`
|
|
42
|
+
- `LLAMA_EXTRA_ARGS` (optional additional startup flags)
|
|
43
|
+
|
|
44
|
+
## 3) Key proxy env knobs
|
|
45
|
+
|
|
46
|
+
Edit `~/.config/uap/anthropic-proxy.env` and restart proxy:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
systemctl --user restart uap-anthropic-proxy.service
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Important variables:
|
|
53
|
+
|
|
54
|
+
- `PROXY_PORT`
|
|
55
|
+
- `LLAMA_CPP_BASE`
|
|
56
|
+
- `PROXY_CONTEXT_WINDOW` (set to `262144` to match llama context)
|
|
57
|
+
- Loop/guardrail options (`PROXY_LOOP_BREAKER`, `PROXY_FORCED_THRESHOLD`, etc.)
|
|
58
|
+
|
|
59
|
+
## 4) Run ngram-cache signal benchmark
|
|
60
|
+
|
|
61
|
+
Use the service-oriented A/B script:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
bash scripts/benchmarks/run-spec-ngram-service-ab.sh
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
What it does:
|
|
68
|
+
|
|
69
|
+
1. Stops managed `uap-llama-server.service` temporarily
|
|
70
|
+
2. Runs transient systemd service benchmarks for:
|
|
71
|
+
- `spec-type=none`
|
|
72
|
+
- `spec-type=ngram-cache` (default draft params)
|
|
73
|
+
- `spec-type=ngram-cache` (tuned: `draft-max/draft-min/draft-p-min` = `21/6/0.72`)
|
|
74
|
+
3. Restores managed `uap-llama-server.service`
|
|
75
|
+
4. Writes report artifacts under `benchmark-results/spec-ngram-ab-<timestamp>/`
|
|
76
|
+
|
|
77
|
+
Outputs:
|
|
78
|
+
|
|
79
|
+
- `report.json` machine-readable deltas
|
|
80
|
+
- `report.md` human-readable summary
|
|
81
|
+
|
|
82
|
+
## 5) Run automatic draft-parameter sweep (Option 2)
|
|
83
|
+
|
|
84
|
+
Use this to search for the best local `ngram-cache` settings:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
bash scripts/benchmarks/run-spec-ngram-sweep.sh
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Useful overrides:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
RUNS=5 MAX_TOKENS=256 \
|
|
94
|
+
DRAFT_MAXS="16 18 20 22" \
|
|
95
|
+
DRAFT_MINS="3 4 5 6" \
|
|
96
|
+
DRAFT_P_MINS="0.70 0.72 0.75 0.78" \
|
|
97
|
+
bash scripts/benchmarks/run-spec-ngram-sweep.sh
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Outputs are written under `benchmark-results/spec-ngram-sweep-<timestamp>/`:
|
|
101
|
+
|
|
102
|
+
- `results.jsonl` one entry per candidate
|
|
103
|
+
- `summary.json` best candidate + stats
|
|
104
|
+
- `summary.md` top 5 table
|
|
105
|
+
|
|
106
|
+
## 6) Profiles for agentic coding vs max speed
|
|
107
|
+
|
|
108
|
+
Use two explicit profiles depending on your goal.
|
|
109
|
+
|
|
110
|
+
### A) Agentic coding continuity profile (recommended daily use)
|
|
111
|
+
|
|
112
|
+
This profile prioritizes long, coherent coding sessions and minimizes `find_slot` warnings.
|
|
113
|
+
|
|
114
|
+
`~/.config/uap/llama-server.env`:
|
|
115
|
+
|
|
116
|
+
```env
|
|
117
|
+
LLAMA_CTX_SIZE=262144
|
|
118
|
+
LLAMA_SPEC_TYPE=ngram-cache
|
|
119
|
+
LLAMA_DRAFT_MAX=12
|
|
120
|
+
LLAMA_DRAFT_MIN=2
|
|
121
|
+
LLAMA_DRAFT_P_MIN=0.80
|
|
122
|
+
LLAMA_HYBRID_ROLLBACK_MODE=strict
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Apply it:
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
systemctl --user restart uap-llama-server.service
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
`~/.config/uap/anthropic-proxy.env`:
|
|
132
|
+
|
|
133
|
+
```env
|
|
134
|
+
PROXY_CONTEXT_WINDOW=262144
|
|
135
|
+
PROXY_LOOP_BREAKER=on
|
|
136
|
+
PROXY_LOOP_WINDOW=6
|
|
137
|
+
PROXY_LOOP_REPEAT_THRESHOLD=10
|
|
138
|
+
PROXY_FORCED_THRESHOLD=18
|
|
139
|
+
PROXY_NO_PROGRESS_THRESHOLD=5
|
|
140
|
+
PROXY_CONTEXT_RELEASE_THRESHOLD=0.95
|
|
141
|
+
PROXY_GUARDRAIL_RETRY=on
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Apply it:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
systemctl --user restart uap-anthropic-proxy.service
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### B) Max-throughput benchmark profile (where 220+ tok/s was observed)
|
|
151
|
+
|
|
152
|
+
The 220+ tok/s decode throughput observed in this session was achieved with:
|
|
153
|
+
|
|
154
|
+
- CUDA build: `/home/cogtek/llama.cpp/.worktrees/001-llama-spec-rollback-fix/build-cuda/bin/llama-server`
|
|
155
|
+
- GPU flags: `--device CUDA0 --n-gpu-layers all --flash-attn on`
|
|
156
|
+
- Speculative mode: `--spec-type ngram-cache`
|
|
157
|
+
- Rollback mode: `LLAMA_HYBRID_ROLLBACK_MODE=hybrid`
|
|
158
|
+
- Workload: repetitive pattern prompt, `n_predict=512`
|
|
159
|
+
|
|
160
|
+
Run command used for that profile:
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
LLAMA_HYBRID_ROLLBACK_MODE=hybrid \
|
|
164
|
+
/home/cogtek/llama.cpp/.worktrees/001-llama-spec-rollback-fix/build-cuda/bin/llama-server \
|
|
165
|
+
-m "/home/cogtek/Downloads/Qwen3.5-35B-A3B-UD-IQ4_XS.gguf" \
|
|
166
|
+
--host 127.0.0.1 --port 18121 \
|
|
167
|
+
--ctx-size 16384 --parallel 1 --no-warmup \
|
|
168
|
+
--device CUDA0 --n-gpu-layers all --flash-attn on \
|
|
169
|
+
--spec-type ngram-cache
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Important: this max-speed profile is workload-sensitive and was measured on a pattern-heavy prompt. For real agentic coding, use Profile A.
|
|
173
|
+
|
|
174
|
+
## 7) Validated A/B findings (2026-03-23)
|
|
175
|
+
|
|
176
|
+
Direct old-vs-new A/B was run against:
|
|
177
|
+
|
|
178
|
+
- old fast commit: `029edcafc` (first pushed fast state around 21:35)
|
|
179
|
+
- newer commit: `1f8225f8f`
|
|
180
|
+
- model: `Qwen3.5-35B-A3B-UD-IQ4_XS.gguf`
|
|
181
|
+
- speculative: `ngram-cache`, `draft-max=16`, `draft-min=3`, `draft-p-min=0.72`
|
|
182
|
+
|
|
183
|
+
Notes:
|
|
184
|
+
|
|
185
|
+
- Standalone launches at `ctx-size=262144` can fail GPU allocation on some runs for the old commit (`failed to allocate compute pp buffers`).
|
|
186
|
+
- For controlled apples-to-apples throughput comparison, A/B was run at `ctx-size=16384`.
|
|
187
|
+
|
|
188
|
+
Observed results (`/tmp/ab_matrix_ctx16_v2.json`):
|
|
189
|
+
|
|
190
|
+
| Path | Old `029edcafc` | New `1f8225f8f` | Delta (new vs old) |
|
|
191
|
+
| --------------- | --------------- | --------------- | ------------------- |
|
|
192
|
+
| Raw coding | 107.97 tok/s | 99.23 tok/s | -8.1% |
|
|
193
|
+
| Raw pattern | 158.71 tok/s | 105.75 tok/s | -33.4% |
|
|
194
|
+
| Proxy plain | 113.74 tok/s | 109.39 tok/s | -3.8% |
|
|
195
|
+
| Agentic tool 2nd turn | `tool_use` (stable) | `tool_use` (stable) | parity on control flow |
|
|
196
|
+
|
|
197
|
+
Behavioral observations:
|
|
198
|
+
|
|
199
|
+
- Newer commit emitted many `find_slot: non-consecutive token position` warnings in raw/proxy runs under the same speculative settings.
|
|
200
|
+
- Old commit produced materially cleaner logs and higher throughput in the same benchmark profile.
|
|
201
|
+
- Proxy continuity fixes improved agentic tool-loop stability and no longer force premature stop in the tested loop.
|
|
202
|
+
|
|
203
|
+
Decision for throughput-sensitive testing:
|
|
204
|
+
|
|
205
|
+
- Prefer old fast commit `029edcafc` profile for max-throughput benchmarking.
|
|
206
|
+
- Keep a separate continuity profile for long-context agentic coding if warning volume grows.
|
|
207
|
+
|
|
208
|
+
Additional 27B impact snapshot (`Qwen3.5-27B-IQ4_XS`, `ctx=262144`, q4 KV cache):
|
|
209
|
+
|
|
210
|
+
- no speculative: ~43 tok/s coding, ~41 tok/s pattern
|
|
211
|
+
- aggressive speculative (`draft-max/draft-min/draft-p-min` = `16/3/0.72`): ~44 tok/s coding, ~102 tok/s pattern
|
|
212
|
+
- balanced speculative (`12/2/0.80`): ~43 tok/s coding, ~102 tok/s pattern
|
|
213
|
+
|
|
214
|
+
Interpretation:
|
|
215
|
+
|
|
216
|
+
- balanced profile is functionally safer for agentic sessions,
|
|
217
|
+
- aggressive profile can edge higher on some coding runs,
|
|
218
|
+
- both speculative profiles massively outperform no-spec on repetition-heavy (pattern) prompts.
|
|
219
|
+
|
|
220
|
+
## 8) Throughput interpretation and loop prevention
|
|
221
|
+
|
|
222
|
+
When reading llama logs, treat these as different metrics:
|
|
223
|
+
|
|
224
|
+
- `prompt eval time ... tokens per second` = prefill throughput
|
|
225
|
+
- `eval time ... tokens per second` = decode/completion throughput
|
|
226
|
+
|
|
227
|
+
In local continuity runs with large context, prompt throughput may exceed 2k tok/s while decode remains near 80-125 tok/s.
|
|
228
|
+
|
|
229
|
+
For default stability, use the guardrails from Profile A. If you hit active loop incidents, temporarily tighten to:
|
|
230
|
+
|
|
231
|
+
```env
|
|
232
|
+
PROXY_LOOP_WINDOW=6
|
|
233
|
+
PROXY_LOOP_REPEAT_THRESHOLD=8
|
|
234
|
+
PROXY_FORCED_THRESHOLD=14
|
|
235
|
+
PROXY_NO_PROGRESS_THRESHOLD=4
|
|
236
|
+
PROXY_CONTEXT_RELEASE_THRESHOLD=0.90
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
Then restart proxy:
|
|
240
|
+
|
|
241
|
+
```bash
|
|
242
|
+
systemctl --user restart uap-anthropic-proxy.service
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
## 9) UAP support changes required for reliable operation
|
|
246
|
+
|
|
247
|
+
The following UAP-side changes are part of the working stack and should be present:
|
|
248
|
+
|
|
249
|
+
1. Session-scoped loop protection in Anthropic proxy (no cross-session contamination).
|
|
250
|
+
2. Guardrail retry for unexpected text-only end-turn in active tool loops.
|
|
251
|
+
3. Optional systemd scaffolding from CLI:
|
|
252
|
+
- `uap init --systemd-services`
|
|
253
|
+
- `uap setup --systemd-services`
|
|
254
|
+
4. Dedicated launch scripts:
|
|
255
|
+
- `scripts/run-llama-server-continuity.sh`
|
|
256
|
+
- `scripts/run-anthropic-proxy-continuity.sh`
|
|
257
|
+
|
|
258
|
+
These changes ensure llama speculative behavior is evaluated in a stable proxy/control-plane environment.
|
|
259
|
+
|
|
260
|
+
## 10) Check service health
|
|
261
|
+
|
|
262
|
+
```bash
|
|
263
|
+
systemctl --user status uap-llama-server.service --no-pager
|
|
264
|
+
systemctl --user status uap-anthropic-proxy.service --no-pager
|
|
265
|
+
curl -sf http://127.0.0.1:8080/v1/models
|
|
266
|
+
curl -sf http://127.0.0.1:4000/health
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
## 11) References and credits
|
|
270
|
+
|
|
271
|
+
This implementation and tuning flow builds on prior llama.cpp and proxy work:
|
|
272
|
+
|
|
273
|
+
- llama.cpp speculative docs: `docs/speculative.md`
|
|
274
|
+
- llama.cpp hybrid rollout notes: `docs/development/speculative-hybrid-rollout.md`
|
|
275
|
+
- llama.cpp speculative lineage: #5479, #6828, #6848, #19164
|
|
276
|
+
- checkpoint/SWA context note:
|
|
277
|
+
- https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055
|
|
278
|
+
|
|
279
|
+
Thanks to ggml-org/llama.cpp maintainers and contributors for speculative, cache, and memory-path groundwork.
|