@miller-tech/uap 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +888 -0
- package/dist/analyzers/index.d.ts +3 -0
- package/dist/analyzers/index.d.ts.map +1 -0
- package/dist/analyzers/index.js +684 -0
- package/dist/analyzers/index.js.map +1 -0
- package/dist/benchmarks/agents/naive-agent.d.ts +60 -0
- package/dist/benchmarks/agents/naive-agent.d.ts.map +1 -0
- package/dist/benchmarks/agents/naive-agent.js +144 -0
- package/dist/benchmarks/agents/naive-agent.js.map +1 -0
- package/dist/benchmarks/agents/uap-agent.d.ts +167 -0
- package/dist/benchmarks/agents/uap-agent.d.ts.map +1 -0
- package/dist/benchmarks/agents/uap-agent.js +437 -0
- package/dist/benchmarks/agents/uap-agent.js.map +1 -0
- package/dist/benchmarks/benchmark.d.ts +328 -0
- package/dist/benchmarks/benchmark.d.ts.map +1 -0
- package/dist/benchmarks/benchmark.js +112 -0
- package/dist/benchmarks/benchmark.js.map +1 -0
- package/dist/benchmarks/execution-verifier.d.ts +41 -0
- package/dist/benchmarks/execution-verifier.d.ts.map +1 -0
- package/dist/benchmarks/execution-verifier.js +340 -0
- package/dist/benchmarks/execution-verifier.js.map +1 -0
- package/dist/benchmarks/hierarchical-prompting.d.ts +37 -0
- package/dist/benchmarks/hierarchical-prompting.d.ts.map +1 -0
- package/dist/benchmarks/hierarchical-prompting.js +246 -0
- package/dist/benchmarks/hierarchical-prompting.js.map +1 -0
- package/dist/benchmarks/improved-benchmark.d.ts +89 -0
- package/dist/benchmarks/improved-benchmark.d.ts.map +1 -0
- package/dist/benchmarks/improved-benchmark.js +585 -0
- package/dist/benchmarks/improved-benchmark.js.map +1 -0
- package/dist/benchmarks/index.d.ts +11 -0
- package/dist/benchmarks/index.d.ts.map +1 -0
- package/dist/benchmarks/index.js +11 -0
- package/dist/benchmarks/index.js.map +1 -0
- package/dist/benchmarks/model-integration.d.ts +111 -0
- package/dist/benchmarks/model-integration.d.ts.map +1 -0
- package/dist/benchmarks/model-integration.js +904 -0
- package/dist/benchmarks/model-integration.js.map +1 -0
- package/dist/benchmarks/multi-turn-agent.d.ts +44 -0
- package/dist/benchmarks/multi-turn-agent.d.ts.map +1 -0
- package/dist/benchmarks/multi-turn-agent.js +254 -0
- package/dist/benchmarks/multi-turn-agent.js.map +1 -0
- package/dist/benchmarks/multi-turn-loop.d.ts +57 -0
- package/dist/benchmarks/multi-turn-loop.d.ts.map +1 -0
- package/dist/benchmarks/multi-turn-loop.js +167 -0
- package/dist/benchmarks/multi-turn-loop.js.map +1 -0
- package/dist/benchmarks/tasks.d.ts +19 -0
- package/dist/benchmarks/tasks.d.ts.map +1 -0
- package/dist/benchmarks/tasks.js +435 -0
- package/dist/benchmarks/tasks.js.map +1 -0
- package/dist/bin/cli.d.ts +3 -0
- package/dist/bin/cli.d.ts.map +1 -0
- package/dist/bin/cli.js +546 -0
- package/dist/bin/cli.js.map +1 -0
- package/dist/bin/llama-server-optimize.d.ts +18 -0
- package/dist/bin/llama-server-optimize.d.ts.map +1 -0
- package/dist/bin/llama-server-optimize.js +708 -0
- package/dist/bin/llama-server-optimize.js.map +1 -0
- package/dist/bin/policy.d.ts +3 -0
- package/dist/bin/policy.d.ts.map +1 -0
- package/dist/bin/policy.js +143 -0
- package/dist/bin/policy.js.map +1 -0
- package/dist/bin/tool-calls.d.ts +3 -0
- package/dist/bin/tool-calls.d.ts.map +1 -0
- package/dist/bin/tool-calls.js +4 -0
- package/dist/bin/tool-calls.js.map +1 -0
- package/dist/browser/index.d.ts +2 -0
- package/dist/browser/index.d.ts.map +1 -0
- package/dist/browser/index.js +2 -0
- package/dist/browser/index.js.map +1 -0
- package/dist/browser/web-browser.d.ts +30 -0
- package/dist/browser/web-browser.d.ts.map +1 -0
- package/dist/browser/web-browser.js +93 -0
- package/dist/browser/web-browser.js.map +1 -0
- package/dist/cli/agent.d.ts +20 -0
- package/dist/cli/agent.d.ts.map +1 -0
- package/dist/cli/agent.js +474 -0
- package/dist/cli/agent.js.map +1 -0
- package/dist/cli/analyze.d.ts +7 -0
- package/dist/cli/analyze.d.ts.map +1 -0
- package/dist/cli/analyze.js +103 -0
- package/dist/cli/analyze.js.map +1 -0
- package/dist/cli/completion-gates.d.ts +51 -0
- package/dist/cli/completion-gates.d.ts.map +1 -0
- package/dist/cli/completion-gates.js +201 -0
- package/dist/cli/completion-gates.js.map +1 -0
- package/dist/cli/compliance.d.ts +8 -0
- package/dist/cli/compliance.d.ts.map +1 -0
- package/dist/cli/compliance.js +509 -0
- package/dist/cli/compliance.js.map +1 -0
- package/dist/cli/coord.d.ts +7 -0
- package/dist/cli/coord.d.ts.map +1 -0
- package/dist/cli/coord.js +138 -0
- package/dist/cli/coord.js.map +1 -0
- package/dist/cli/dashboard.d.ts +21 -0
- package/dist/cli/dashboard.d.ts.map +1 -0
- package/dist/cli/dashboard.js +1508 -0
- package/dist/cli/dashboard.js.map +1 -0
- package/dist/cli/deploy.d.ts +19 -0
- package/dist/cli/deploy.d.ts.map +1 -0
- package/dist/cli/deploy.js +387 -0
- package/dist/cli/deploy.js.map +1 -0
- package/dist/cli/droids.d.ts +9 -0
- package/dist/cli/droids.d.ts.map +1 -0
- package/dist/cli/droids.js +227 -0
- package/dist/cli/droids.js.map +1 -0
- package/dist/cli/generate.d.ts +17 -0
- package/dist/cli/generate.d.ts.map +1 -0
- package/dist/cli/generate.js +432 -0
- package/dist/cli/generate.js.map +1 -0
- package/dist/cli/hooks.d.ts +9 -0
- package/dist/cli/hooks.d.ts.map +1 -0
- package/dist/cli/hooks.js +464 -0
- package/dist/cli/hooks.js.map +1 -0
- package/dist/cli/init.d.ts +12 -0
- package/dist/cli/init.d.ts.map +1 -0
- package/dist/cli/init.js +364 -0
- package/dist/cli/init.js.map +1 -0
- package/dist/cli/mcp-router.d.ts +16 -0
- package/dist/cli/mcp-router.d.ts.map +1 -0
- package/dist/cli/mcp-router.js +143 -0
- package/dist/cli/mcp-router.js.map +1 -0
- package/dist/cli/memory.d.ts +24 -0
- package/dist/cli/memory.d.ts.map +1 -0
- package/dist/cli/memory.js +885 -0
- package/dist/cli/memory.js.map +1 -0
- package/dist/cli/model.d.ts +15 -0
- package/dist/cli/model.d.ts.map +1 -0
- package/dist/cli/model.js +290 -0
- package/dist/cli/model.js.map +1 -0
- package/dist/cli/patterns.d.ts +26 -0
- package/dist/cli/patterns.d.ts.map +1 -0
- package/dist/cli/patterns.js +862 -0
- package/dist/cli/patterns.js.map +1 -0
- package/dist/cli/rtk-validation.d.ts +9 -0
- package/dist/cli/rtk-validation.d.ts.map +1 -0
- package/dist/cli/rtk-validation.js +9 -0
- package/dist/cli/rtk-validation.js.map +1 -0
- package/dist/cli/rtk.d.ts +34 -0
- package/dist/cli/rtk.d.ts.map +1 -0
- package/dist/cli/rtk.js +401 -0
- package/dist/cli/rtk.js.map +1 -0
- package/dist/cli/schema-diff.d.ts +7 -0
- package/dist/cli/schema-diff.d.ts.map +1 -0
- package/dist/cli/schema-diff.js +11 -0
- package/dist/cli/schema-diff.js.map +1 -0
- package/dist/cli/setup-mcp-router.d.ts +8 -0
- package/dist/cli/setup-mcp-router.d.ts.map +1 -0
- package/dist/cli/setup-mcp-router.js +163 -0
- package/dist/cli/setup-mcp-router.js.map +1 -0
- package/dist/cli/setup-wizard.d.ts +2 -0
- package/dist/cli/setup-wizard.d.ts.map +1 -0
- package/dist/cli/setup-wizard.js +806 -0
- package/dist/cli/setup-wizard.js.map +1 -0
- package/dist/cli/setup.d.ts +15 -0
- package/dist/cli/setup.d.ts.map +1 -0
- package/dist/cli/setup.js +154 -0
- package/dist/cli/setup.js.map +1 -0
- package/dist/cli/sync.d.ts +8 -0
- package/dist/cli/sync.d.ts.map +1 -0
- package/dist/cli/sync.js +395 -0
- package/dist/cli/sync.js.map +1 -0
- package/dist/cli/task.d.ts +33 -0
- package/dist/cli/task.d.ts.map +1 -0
- package/dist/cli/task.js +672 -0
- package/dist/cli/task.js.map +1 -0
- package/dist/cli/tool-calls.d.ts +20 -0
- package/dist/cli/tool-calls.d.ts.map +1 -0
- package/dist/cli/tool-calls.js +605 -0
- package/dist/cli/tool-calls.js.map +1 -0
- package/dist/cli/uap.d.ts +10 -0
- package/dist/cli/uap.d.ts.map +1 -0
- package/dist/cli/uap.js +398 -0
- package/dist/cli/uap.js.map +1 -0
- package/dist/cli/update.d.ts +10 -0
- package/dist/cli/update.d.ts.map +1 -0
- package/dist/cli/update.js +300 -0
- package/dist/cli/update.js.map +1 -0
- package/dist/cli/visualize.d.ts +77 -0
- package/dist/cli/visualize.d.ts.map +1 -0
- package/dist/cli/visualize.js +287 -0
- package/dist/cli/visualize.js.map +1 -0
- package/dist/cli/worktree.d.ts +9 -0
- package/dist/cli/worktree.d.ts.map +1 -0
- package/dist/cli/worktree.js +213 -0
- package/dist/cli/worktree.js.map +1 -0
- package/dist/coordination/adaptive-patterns.d.ts +65 -0
- package/dist/coordination/adaptive-patterns.d.ts.map +1 -0
- package/dist/coordination/adaptive-patterns.js +108 -0
- package/dist/coordination/adaptive-patterns.js.map +1 -0
- package/dist/coordination/auto-agent.d.ts +82 -0
- package/dist/coordination/auto-agent.d.ts.map +1 -0
- package/dist/coordination/auto-agent.js +145 -0
- package/dist/coordination/auto-agent.js.map +1 -0
- package/dist/coordination/capability-router.d.ts +79 -0
- package/dist/coordination/capability-router.d.ts.map +1 -0
- package/dist/coordination/capability-router.js +334 -0
- package/dist/coordination/capability-router.js.map +1 -0
- package/dist/coordination/database.d.ts +13 -0
- package/dist/coordination/database.d.ts.map +1 -0
- package/dist/coordination/database.js +136 -0
- package/dist/coordination/database.js.map +1 -0
- package/dist/coordination/deploy-batcher.d.ts +122 -0
- package/dist/coordination/deploy-batcher.d.ts.map +1 -0
- package/dist/coordination/deploy-batcher.js +718 -0
- package/dist/coordination/deploy-batcher.js.map +1 -0
- package/dist/coordination/droid-validator.d.ts +59 -0
- package/dist/coordination/droid-validator.d.ts.map +1 -0
- package/dist/coordination/droid-validator.js +142 -0
- package/dist/coordination/droid-validator.js.map +1 -0
- package/dist/coordination/index.d.ts +10 -0
- package/dist/coordination/index.d.ts.map +1 -0
- package/dist/coordination/index.js +10 -0
- package/dist/coordination/index.js.map +1 -0
- package/dist/coordination/pattern-router.d.ts +50 -0
- package/dist/coordination/pattern-router.d.ts.map +1 -0
- package/dist/coordination/pattern-router.js +118 -0
- package/dist/coordination/pattern-router.js.map +1 -0
- package/dist/coordination/service.d.ts +81 -0
- package/dist/coordination/service.d.ts.map +1 -0
- package/dist/coordination/service.js +619 -0
- package/dist/coordination/service.js.map +1 -0
- package/dist/coordination/worktree-enforcer.d.ts +22 -0
- package/dist/coordination/worktree-enforcer.d.ts.map +1 -0
- package/dist/coordination/worktree-enforcer.js +71 -0
- package/dist/coordination/worktree-enforcer.js.map +1 -0
- package/dist/generators/claude-md.d.ts +3 -0
- package/dist/generators/claude-md.d.ts.map +1 -0
- package/dist/generators/claude-md.js +1020 -0
- package/dist/generators/claude-md.js.map +1 -0
- package/dist/generators/template-loader.d.ts +105 -0
- package/dist/generators/template-loader.d.ts.map +1 -0
- package/dist/generators/template-loader.js +291 -0
- package/dist/generators/template-loader.js.map +1 -0
- package/dist/index.d.ts +49 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +63 -0
- package/dist/index.js.map +1 -0
- package/dist/mcp-router/config/parser.d.ts +9 -0
- package/dist/mcp-router/config/parser.d.ts.map +1 -0
- package/dist/mcp-router/config/parser.js +174 -0
- package/dist/mcp-router/config/parser.js.map +1 -0
- package/dist/mcp-router/executor/client.d.ts +31 -0
- package/dist/mcp-router/executor/client.d.ts.map +1 -0
- package/dist/mcp-router/executor/client.js +189 -0
- package/dist/mcp-router/executor/client.js.map +1 -0
- package/dist/mcp-router/index.d.ts +22 -0
- package/dist/mcp-router/index.d.ts.map +1 -0
- package/dist/mcp-router/index.js +18 -0
- package/dist/mcp-router/index.js.map +1 -0
- package/dist/mcp-router/output-compressor.d.ts +26 -0
- package/dist/mcp-router/output-compressor.d.ts.map +1 -0
- package/dist/mcp-router/output-compressor.js +236 -0
- package/dist/mcp-router/output-compressor.js.map +1 -0
- package/dist/mcp-router/search/fuzzy.d.ts +26 -0
- package/dist/mcp-router/search/fuzzy.d.ts.map +1 -0
- package/dist/mcp-router/search/fuzzy.js +94 -0
- package/dist/mcp-router/search/fuzzy.js.map +1 -0
- package/dist/mcp-router/server.d.ts +50 -0
- package/dist/mcp-router/server.d.ts.map +1 -0
- package/dist/mcp-router/server.js +229 -0
- package/dist/mcp-router/server.js.map +1 -0
- package/dist/mcp-router/session-stats.d.ts +37 -0
- package/dist/mcp-router/session-stats.d.ts.map +1 -0
- package/dist/mcp-router/session-stats.js +56 -0
- package/dist/mcp-router/session-stats.js.map +1 -0
- package/dist/mcp-router/tools/discover.d.ts +37 -0
- package/dist/mcp-router/tools/discover.d.ts.map +1 -0
- package/dist/mcp-router/tools/discover.js +65 -0
- package/dist/mcp-router/tools/discover.js.map +1 -0
- package/dist/mcp-router/tools/execute.d.ts +43 -0
- package/dist/mcp-router/tools/execute.d.ts.map +1 -0
- package/dist/mcp-router/tools/execute.js +144 -0
- package/dist/mcp-router/tools/execute.js.map +1 -0
- package/dist/mcp-router/types.d.ts +62 -0
- package/dist/mcp-router/types.d.ts.map +1 -0
- package/dist/mcp-router/types.js +6 -0
- package/dist/mcp-router/types.js.map +1 -0
- package/dist/memory/adaptive-context.d.ts +149 -0
- package/dist/memory/adaptive-context.d.ts.map +1 -0
- package/dist/memory/adaptive-context.js +1095 -0
- package/dist/memory/adaptive-context.js.map +1 -0
- package/dist/memory/agent-scoped-memory.d.ts +67 -0
- package/dist/memory/agent-scoped-memory.d.ts.map +1 -0
- package/dist/memory/agent-scoped-memory.js +126 -0
- package/dist/memory/agent-scoped-memory.js.map +1 -0
- package/dist/memory/ambiguity-detector.d.ts +54 -0
- package/dist/memory/ambiguity-detector.d.ts.map +1 -0
- package/dist/memory/ambiguity-detector.js +401 -0
- package/dist/memory/ambiguity-detector.js.map +1 -0
- package/dist/memory/backends/base.d.ts +18 -0
- package/dist/memory/backends/base.d.ts.map +1 -0
- package/dist/memory/backends/base.js +2 -0
- package/dist/memory/backends/base.js.map +1 -0
- package/dist/memory/backends/factory.d.ts +4 -0
- package/dist/memory/backends/factory.d.ts.map +1 -0
- package/dist/memory/backends/factory.js +53 -0
- package/dist/memory/backends/factory.js.map +1 -0
- package/dist/memory/backends/github.d.ts +27 -0
- package/dist/memory/backends/github.d.ts.map +1 -0
- package/dist/memory/backends/github.js +134 -0
- package/dist/memory/backends/github.js.map +1 -0
- package/dist/memory/backends/qdrant-cloud.d.ts +32 -0
- package/dist/memory/backends/qdrant-cloud.d.ts.map +1 -0
- package/dist/memory/backends/qdrant-cloud.js +167 -0
- package/dist/memory/backends/qdrant-cloud.js.map +1 -0
- package/dist/memory/context-compressor.d.ts +116 -0
- package/dist/memory/context-compressor.d.ts.map +1 -0
- package/dist/memory/context-compressor.js +430 -0
- package/dist/memory/context-compressor.js.map +1 -0
- package/dist/memory/context-pruner.d.ts +55 -0
- package/dist/memory/context-pruner.d.ts.map +1 -0
- package/dist/memory/context-pruner.js +85 -0
- package/dist/memory/context-pruner.js.map +1 -0
- package/dist/memory/correction-propagator.d.ts +44 -0
- package/dist/memory/correction-propagator.d.ts.map +1 -0
- package/dist/memory/correction-propagator.js +156 -0
- package/dist/memory/correction-propagator.js.map +1 -0
- package/dist/memory/daily-log.d.ts +67 -0
- package/dist/memory/daily-log.d.ts.map +1 -0
- package/dist/memory/daily-log.js +143 -0
- package/dist/memory/daily-log.js.map +1 -0
- package/dist/memory/dynamic-retrieval.d.ts +112 -0
- package/dist/memory/dynamic-retrieval.d.ts.map +1 -0
- package/dist/memory/dynamic-retrieval.js +908 -0
- package/dist/memory/dynamic-retrieval.js.map +1 -0
- package/dist/memory/embeddings.d.ts +172 -0
- package/dist/memory/embeddings.d.ts.map +1 -0
- package/dist/memory/embeddings.js +780 -0
- package/dist/memory/embeddings.js.map +1 -0
- package/dist/memory/generic-uap-patterns.d.ts +7 -0
- package/dist/memory/generic-uap-patterns.d.ts.map +1 -0
- package/dist/memory/generic-uap-patterns.js +43 -0
- package/dist/memory/generic-uap-patterns.js.map +1 -0
- package/dist/memory/hierarchical-memory.d.ts +141 -0
- package/dist/memory/hierarchical-memory.d.ts.map +1 -0
- package/dist/memory/hierarchical-memory.js +485 -0
- package/dist/memory/hierarchical-memory.js.map +1 -0
- package/dist/memory/knowledge-graph.d.ts +98 -0
- package/dist/memory/knowledge-graph.d.ts.map +1 -0
- package/dist/memory/knowledge-graph.js +275 -0
- package/dist/memory/knowledge-graph.js.map +1 -0
- package/dist/memory/memory-consolidator.d.ts +124 -0
- package/dist/memory/memory-consolidator.d.ts.map +1 -0
- package/dist/memory/memory-consolidator.js +514 -0
- package/dist/memory/memory-consolidator.js.map +1 -0
- package/dist/memory/memory-maintenance.d.ts +39 -0
- package/dist/memory/memory-maintenance.d.ts.map +1 -0
- package/dist/memory/memory-maintenance.js +336 -0
- package/dist/memory/memory-maintenance.js.map +1 -0
- package/dist/memory/model-router.d.ts +105 -0
- package/dist/memory/model-router.d.ts.map +1 -0
- package/dist/memory/model-router.js +474 -0
- package/dist/memory/model-router.js.map +1 -0
- package/dist/memory/multi-view-memory.d.ts +134 -0
- package/dist/memory/multi-view-memory.d.ts.map +1 -0
- package/dist/memory/multi-view-memory.js +430 -0
- package/dist/memory/multi-view-memory.js.map +1 -0
- package/dist/memory/predictive-memory.d.ts +79 -0
- package/dist/memory/predictive-memory.d.ts.map +1 -0
- package/dist/memory/predictive-memory.js +294 -0
- package/dist/memory/predictive-memory.js.map +1 -0
- package/dist/memory/prepopulate.d.ts +76 -0
- package/dist/memory/prepopulate.d.ts.map +1 -0
- package/dist/memory/prepopulate.js +832 -0
- package/dist/memory/prepopulate.js.map +1 -0
- package/dist/memory/semantic-compression.d.ts +77 -0
- package/dist/memory/semantic-compression.d.ts.map +1 -0
- package/dist/memory/semantic-compression.js +359 -0
- package/dist/memory/semantic-compression.js.map +1 -0
- package/dist/memory/serverless-qdrant.d.ts +102 -0
- package/dist/memory/serverless-qdrant.d.ts.map +1 -0
- package/dist/memory/serverless-qdrant.js +369 -0
- package/dist/memory/serverless-qdrant.js.map +1 -0
- package/dist/memory/short-term/factory.d.ts +26 -0
- package/dist/memory/short-term/factory.d.ts.map +1 -0
- package/dist/memory/short-term/factory.js +28 -0
- package/dist/memory/short-term/factory.js.map +1 -0
- package/dist/memory/short-term/indexeddb.d.ts +25 -0
- package/dist/memory/short-term/indexeddb.d.ts.map +1 -0
- package/dist/memory/short-term/indexeddb.js +64 -0
- package/dist/memory/short-term/indexeddb.js.map +1 -0
- package/dist/memory/short-term/schema.d.ts +6 -0
- package/dist/memory/short-term/schema.d.ts.map +1 -0
- package/dist/memory/short-term/schema.js +141 -0
- package/dist/memory/short-term/schema.js.map +1 -0
- package/dist/memory/short-term/sqlite.d.ts +64 -0
- package/dist/memory/short-term/sqlite.d.ts.map +1 -0
- package/dist/memory/short-term/sqlite.js +274 -0
- package/dist/memory/short-term/sqlite.js.map +1 -0
- package/dist/memory/speculative-cache.d.ts +111 -0
- package/dist/memory/speculative-cache.d.ts.map +1 -0
- package/dist/memory/speculative-cache.js +457 -0
- package/dist/memory/speculative-cache.js.map +1 -0
- package/dist/memory/task-classifier.d.ts +40 -0
- package/dist/memory/task-classifier.d.ts.map +1 -0
- package/dist/memory/task-classifier.js +342 -0
- package/dist/memory/task-classifier.js.map +1 -0
- package/dist/memory/terminal-bench-knowledge.d.ts +48 -0
- package/dist/memory/terminal-bench-knowledge.d.ts.map +1 -0
- package/dist/memory/terminal-bench-knowledge.js +622 -0
- package/dist/memory/terminal-bench-knowledge.js.map +1 -0
- package/dist/memory/write-gate.d.ts +39 -0
- package/dist/memory/write-gate.d.ts.map +1 -0
- package/dist/memory/write-gate.js +190 -0
- package/dist/memory/write-gate.js.map +1 -0
- package/dist/models/api-client.d.ts +46 -0
- package/dist/models/api-client.d.ts.map +1 -0
- package/dist/models/api-client.js +182 -0
- package/dist/models/api-client.js.map +1 -0
- package/dist/models/execution-profiles.d.ts +64 -0
- package/dist/models/execution-profiles.d.ts.map +1 -0
- package/dist/models/execution-profiles.js +403 -0
- package/dist/models/execution-profiles.js.map +1 -0
- package/dist/models/executor.d.ts +130 -0
- package/dist/models/executor.d.ts.map +1 -0
- package/dist/models/executor.js +382 -0
- package/dist/models/executor.js.map +1 -0
- package/dist/models/index.d.ts +19 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +23 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/plan-validator.d.ts +37 -0
- package/dist/models/plan-validator.d.ts.map +1 -0
- package/dist/models/plan-validator.js +179 -0
- package/dist/models/plan-validator.js.map +1 -0
- package/dist/models/planner.d.ts +73 -0
- package/dist/models/planner.d.ts.map +1 -0
- package/dist/models/planner.js +375 -0
- package/dist/models/planner.js.map +1 -0
- package/dist/models/router.d.ts +96 -0
- package/dist/models/router.d.ts.map +1 -0
- package/dist/models/router.js +523 -0
- package/dist/models/router.js.map +1 -0
- package/dist/models/types.d.ts +370 -0
- package/dist/models/types.d.ts.map +1 -0
- package/dist/models/types.js +232 -0
- package/dist/models/types.js.map +1 -0
- package/dist/models/unified-router.d.ts +152 -0
- package/dist/models/unified-router.d.ts.map +1 -0
- package/dist/models/unified-router.js +313 -0
- package/dist/models/unified-router.js.map +1 -0
- package/dist/policies/convert-policy-to-claude.d.ts +3 -0
- package/dist/policies/convert-policy-to-claude.d.ts.map +1 -0
- package/dist/policies/convert-policy-to-claude.js +87 -0
- package/dist/policies/convert-policy-to-claude.js.map +1 -0
- package/dist/policies/database-manager.d.ts +27 -0
- package/dist/policies/database-manager.d.ts.map +1 -0
- package/dist/policies/database-manager.js +198 -0
- package/dist/policies/database-manager.js.map +1 -0
- package/dist/policies/enforced-tool-router.d.ts +53 -0
- package/dist/policies/enforced-tool-router.d.ts.map +1 -0
- package/dist/policies/enforced-tool-router.js +80 -0
- package/dist/policies/enforced-tool-router.js.map +1 -0
- package/dist/policies/index.d.ts +10 -0
- package/dist/policies/index.d.ts.map +1 -0
- package/dist/policies/index.js +8 -0
- package/dist/policies/index.js.map +1 -0
- package/dist/policies/policy-gate.d.ts +59 -0
- package/dist/policies/policy-gate.d.ts.map +1 -0
- package/dist/policies/policy-gate.js +171 -0
- package/dist/policies/policy-gate.js.map +1 -0
- package/dist/policies/policy-memory.d.ts +18 -0
- package/dist/policies/policy-memory.d.ts.map +1 -0
- package/dist/policies/policy-memory.js +126 -0
- package/dist/policies/policy-memory.js.map +1 -0
- package/dist/policies/policy-tools.d.ts +11 -0
- package/dist/policies/policy-tools.d.ts.map +1 -0
- package/dist/policies/policy-tools.js +66 -0
- package/dist/policies/policy-tools.js.map +1 -0
- package/dist/policies/schemas/policy.d.ts +69 -0
- package/dist/policies/schemas/policy.d.ts.map +1 -0
- package/dist/policies/schemas/policy.js +31 -0
- package/dist/policies/schemas/policy.js.map +1 -0
- package/dist/tasks/coordination.d.ts +83 -0
- package/dist/tasks/coordination.d.ts.map +1 -0
- package/dist/tasks/coordination.js +291 -0
- package/dist/tasks/coordination.js.map +1 -0
- package/dist/tasks/database.d.ts +19 -0
- package/dist/tasks/database.d.ts.map +1 -0
- package/dist/tasks/database.js +149 -0
- package/dist/tasks/database.js.map +1 -0
- package/dist/tasks/decoder-gate.d.ts +64 -0
- package/dist/tasks/decoder-gate.d.ts.map +1 -0
- package/dist/tasks/decoder-gate.js +268 -0
- package/dist/tasks/decoder-gate.js.map +1 -0
- package/dist/tasks/index.d.ts +6 -0
- package/dist/tasks/index.d.ts.map +1 -0
- package/dist/tasks/index.js +6 -0
- package/dist/tasks/index.js.map +1 -0
- package/dist/tasks/service.d.ts +40 -0
- package/dist/tasks/service.d.ts.map +1 -0
- package/dist/tasks/service.js +671 -0
- package/dist/tasks/service.js.map +1 -0
- package/dist/tasks/types.d.ts +238 -0
- package/dist/tasks/types.d.ts.map +1 -0
- package/dist/tasks/types.js +74 -0
- package/dist/tasks/types.js.map +1 -0
- package/dist/telemetry/index.d.ts +2 -0
- package/dist/telemetry/index.d.ts.map +1 -0
- package/dist/telemetry/index.js +2 -0
- package/dist/telemetry/index.js.map +1 -0
- package/dist/telemetry/session-telemetry.d.ts +56 -0
- package/dist/telemetry/session-telemetry.d.ts.map +1 -0
- package/dist/telemetry/session-telemetry.js +807 -0
- package/dist/telemetry/session-telemetry.js.map +1 -0
- package/dist/types/analysis.d.ts +82 -0
- package/dist/types/analysis.d.ts.map +1 -0
- package/dist/types/analysis.js +2 -0
- package/dist/types/analysis.js.map +1 -0
- package/dist/types/config.d.ts +3324 -0
- package/dist/types/config.d.ts.map +1 -0
- package/dist/types/config.js +418 -0
- package/dist/types/config.js.map +1 -0
- package/dist/types/coordination.d.ts +240 -0
- package/dist/types/coordination.d.ts.map +1 -0
- package/dist/types/coordination.js +43 -0
- package/dist/types/coordination.js.map +1 -0
- package/dist/types/index.d.ts +4 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +4 -0
- package/dist/types/index.js.map +1 -0
- package/dist/uap-droids-strict.d.ts +59 -0
- package/dist/uap-droids-strict.d.ts.map +1 -0
- package/dist/uap-droids-strict.js +200 -0
- package/dist/uap-droids-strict.js.map +1 -0
- package/dist/utils/config-manager.d.ts +30 -0
- package/dist/utils/config-manager.d.ts.map +1 -0
- package/dist/utils/config-manager.js +41 -0
- package/dist/utils/config-manager.js.map +1 -0
- package/dist/utils/fetch-with-retry.d.ts +5 -0
- package/dist/utils/fetch-with-retry.d.ts.map +1 -0
- package/dist/utils/fetch-with-retry.js +61 -0
- package/dist/utils/fetch-with-retry.js.map +1 -0
- package/dist/utils/merge-claude-md.d.ts +28 -0
- package/dist/utils/merge-claude-md.d.ts.map +1 -0
- package/dist/utils/merge-claude-md.js +342 -0
- package/dist/utils/merge-claude-md.js.map +1 -0
- package/dist/utils/rate-limiter.d.ts +58 -0
- package/dist/utils/rate-limiter.d.ts.map +1 -0
- package/dist/utils/rate-limiter.js +100 -0
- package/dist/utils/rate-limiter.js.map +1 -0
- package/dist/utils/string-similarity.d.ts +37 -0
- package/dist/utils/string-similarity.d.ts.map +1 -0
- package/dist/utils/string-similarity.js +114 -0
- package/dist/utils/string-similarity.js.map +1 -0
- package/dist/utils/validate-json.d.ts +51 -0
- package/dist/utils/validate-json.d.ts.map +1 -0
- package/dist/utils/validate-json.js +94 -0
- package/dist/utils/validate-json.js.map +1 -0
- package/docs/INDEX.md +66 -0
- package/docs/architecture/MULTI_MODEL.md +224 -0
- package/docs/architecture/SYSTEM_ANALYSIS.md +1117 -0
- package/docs/architecture/UAP_COMPLIANCE.md +217 -0
- package/docs/architecture/UAP_PROTOCOL.md +339 -0
- package/docs/architecture/UAP_STRICT_DROIDS.md +172 -0
- package/docs/archive/BALLS_MODE_SELF_ANALYSIS.md +260 -0
- package/docs/archive/FAILING_TASKS_SOLUTION_PLAN.md +668 -0
- package/docs/archive/JINJA2-SYSTEM-MESSAGE-FIX.md +209 -0
- package/docs/archive/NPM-PUBLISH-V0.9.1.md +240 -0
- package/docs/archive/OPTIMIZATION_OPTIONS.md +334 -0
- package/docs/archive/SETUP_IMPROVEMENTS.md +213 -0
- package/docs/archive/UAP_GENERIC_OPTIMIZATION_PLAN.md +270 -0
- package/docs/archive/UAP_V103_PATTERN_DESIGN.md +315 -0
- package/docs/archive/UAP_V104_COMPLIANCE_DESIGN.md +223 -0
- package/docs/archive/changelog/2026-03-10_uap-100-compliance.md +77 -0
- package/docs/archive/changelog/2026-03-10_uap-full-system-verification.md +109 -0
- package/docs/benchmarks/ACCURACY_ANALYSIS.md +471 -0
- package/docs/benchmarks/TOKEN_OPTIMIZATION.md +572 -0
- package/docs/benchmarks/VALIDATION_PLAN.md +568 -0
- package/docs/benchmarks/VALIDATION_RESULTS.md +161 -0
- package/docs/deployment/DEPLOYMENT.md +895 -0
- package/docs/deployment/DEPLOYMENT_STRATEGIES.md +518 -0
- package/docs/deployment/DEPLOY_BATCHER_ANALYSIS.md +856 -0
- package/docs/deployment/DEPLOY_BATCHING.md +273 -0
- package/docs/deployment/DEPLOY_BUCKETING_ANALYSIS.md +420 -0
- package/docs/deployment/QWEN35_LLAMA_CPP.md +265 -0
- package/docs/getting-started/INTEGRATION.md +449 -0
- package/docs/getting-started/OVERVIEW.md +344 -0
- package/docs/getting-started/SETUP.md +203 -0
- package/docs/integrations/MCP_ROUTER_SETUP.md +445 -0
- package/docs/integrations/RTK_INTEGRATION.md +468 -0
- package/docs/operations/TROUBLESHOOTING.md +660 -0
- package/docs/reference/API_REFERENCE.md +903 -0
- package/docs/reference/FEATURES.md +472 -0
- package/docs/reference/HARNESS-MATRIX.md +318 -0
- package/docs/reference/UAP_CLI_REFERENCE.md +600 -0
- package/docs/research/BEHAVIORAL_PATTERNS.md +228 -0
- package/docs/research/DOMAIN_STRATEGIES.md +316 -0
- package/docs/research/MEMORY_SYSTEMS_COMPARISON.md +812 -0
- package/docs/research/PATTERN_ANALYSIS_2026-01-18.md +436 -0
- package/docs/research/PERFORMANCE_ANALYSIS_2026-01-18.md +209 -0
- package/docs/research/PERFORMANCE_TEST_PLAN.md +383 -0
- package/docs/research/TERMINAL_BENCH_LEARNINGS.md +217 -0
- package/package.json +113 -0
- package/scripts/README.md +161 -0
- package/templates/CLAUDE.template.md +10 -0
- package/templates/CLAUDE_ARCHITECTURE.template.md +103 -0
- package/templates/CLAUDE_CODING.template.md +127 -0
- package/templates/CLAUDE_DROIDS.template.md +109 -0
- package/templates/CLAUDE_MEMORY.template.md +131 -0
- package/templates/CLAUDE_WORKFLOWS.template.md +139 -0
- package/templates/PROJECT.template.md +209 -0
- package/templates/SCHEMA.md +57 -0
- package/templates/archive/CLAUDE.template.root-v6.md +534 -0
- package/templates/archive/CLAUDE.template.v6.md +534 -0
- package/templates/hooks/forgecode/pre-compact.sh +68 -0
- package/templates/hooks/forgecode/session-start.sh +169 -0
- package/templates/hooks/forgecode.plugin.sh +128 -0
- package/templates/hooks/pre-compact.sh +74 -0
- package/templates/hooks/session-start.sh +366 -0
- package/tools/agents/README.md +224 -0
- package/tools/agents/UAP/README.md +386 -0
- package/tools/agents/UAP/__init__.py +9 -0
- package/tools/agents/UAP/cli.py +901 -0
- package/tools/agents/UAP/compliance_verify.sh +108 -0
- package/tools/agents/UAP/full_verification.sh +126 -0
- package/tools/agents/UAP/version.py +32 -0
- package/tools/agents/benchmarks/benchmark_memory_systems.py +730 -0
- package/tools/agents/benchmarks/results/benchmark_20260106_064817.json +170 -0
- package/tools/agents/benchmarks/results/benchmark_20260106_064817.md +51 -0
- package/tools/agents/config/chat_template.jinja +77 -0
- package/tools/agents/config/tool-call-schema.json +19 -0
- package/tools/agents/config/tool-call.gbnf +58 -0
- package/tools/agents/docker/Dockerfile.python +52 -0
- package/tools/agents/docker/Dockerfile.ubuntu +55 -0
- package/tools/agents/docker-compose.qdrant.yml +24 -0
- package/tools/agents/install-opencode-local.sh.j2 +135 -0
- package/tools/agents/migrations/apply.py +256 -0
- package/tools/agents/opencode_uap_agent.py +1505 -0
- package/tools/agents/plugin/README.md +91 -0
- package/tools/agents/plugin/index.ts +46 -0
- package/tools/agents/plugin/pre-compact.sh +68 -0
- package/tools/agents/plugin/session-start.sh +175 -0
- package/tools/agents/plugin/uap-commands.ts +45 -0
- package/tools/agents/plugin/uap-droids.ts +54 -0
- package/tools/agents/plugin/uap-patterns.ts +54 -0
- package/tools/agents/plugin/uap-skills.ts +52 -0
- package/tools/agents/plugins/uap-enforce.ts +314 -0
- package/tools/agents/scripts/__pycache__/tool_call_wrapper.cpython-313.pyc +0 -0
- package/tools/agents/scripts/chat_template_verifier.py +343 -0
- package/tools/agents/scripts/fix-qwen-template.js +38 -0
- package/tools/agents/scripts/fix_qwen_chat_template.py +316 -0
- package/tools/agents/scripts/generate_lora_training_data.py +412 -0
- package/tools/agents/scripts/init_qdrant.py +151 -0
- package/tools/agents/scripts/memory_migration.py +560 -0
- package/tools/agents/scripts/migrate_memory_to_qdrant.py +110 -0
- package/tools/agents/scripts/prepare_lora.sh +512 -0
- package/tools/agents/scripts/query_memory.py +200 -0
- package/tools/agents/scripts/qwen-tool-call-test.js +38 -0
- package/tools/agents/scripts/qwen-tool-call-wrapper.js +38 -0
- package/tools/agents/scripts/qwen_tool_call_test.py +464 -0
- package/tools/agents/scripts/qwen_tool_call_wrapper.py +686 -0
- package/tools/agents/scripts/start-services.sh +96 -0
- package/tools/agents/scripts/tool-choice-proxy.cjs +296 -0
- package/tools/agents/scripts/tool_call_test.py +656 -0
- package/tools/agents/scripts/tool_call_wrapper.py +799 -0
- package/tools/agents/tests/test_uap_compliance.py +257 -0
- package/tools/agents/uap_agent.py +122 -0
- package/tools/agents/uap_agent_install.sh +12 -0
|
@@ -0,0 +1,471 @@
|
|
|
1
|
+
# UAP Benchmark Analysis: Accuracy Issues & Improvement Recommendations
|
|
2
|
+
|
|
3
|
+
**Date:** 2026-01-15
|
|
4
|
+
**Author:** Droid Analysis
|
|
5
|
+
|
|
6
|
+
## Executive Summary
|
|
7
|
+
|
|
8
|
+
Our internal UAP benchmark shows dramatically different results from Terminal-Bench 2.0:
|
|
9
|
+
|
|
10
|
+
| Model | Our Benchmark | Terminal-Bench 2.0 | Delta |
|
|
11
|
+
| --------------- | ------------- | ---------------------------------- | ------- |
|
|
12
|
+
| Claude Opus 4.5 | 100% | 63.1% (Droid), 52.1% (Claude Code) | +37-48% |
|
|
13
|
+
| GPT 5.2 Codex | 87.5% | 64.9% (Droid), 62.9% (Codex CLI) | +23-25% |
|
|
14
|
+
| GLM 4.7 | 75% | ~24.5% (GLM 4.6 baseline) | +50% |
|
|
15
|
+
|
|
16
|
+
**Conclusion:** Our benchmark is NOT accurate and significantly overestimates model capabilities.
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Root Cause Analysis
|
|
21
|
+
|
|
22
|
+
### 1. Task Complexity Gap (CRITICAL)
|
|
23
|
+
|
|
24
|
+
**Our Tasks:**
|
|
25
|
+
|
|
26
|
+
- Simple code generation (calculate average, fix off-by-one bug)
|
|
27
|
+
- Pattern implementation (singleton class)
|
|
28
|
+
- Refactoring (strategy pattern)
|
|
29
|
+
- Algorithm (Dijkstra's - well-documented)
|
|
30
|
+
|
|
31
|
+
**Terminal-Bench 2.0 Tasks (89 tasks):**
|
|
32
|
+
|
|
33
|
+
- Build Linux kernel from source
|
|
34
|
+
- Configure git web server with authentication
|
|
35
|
+
- Exploit CVE-2023-28432 (MinIO vulnerability)
|
|
36
|
+
- Train RL agents and text classifiers
|
|
37
|
+
- Resolve Conda environment dependency conflicts
|
|
38
|
+
- Scrub repository of secrets
|
|
39
|
+
- QEMU/KVM virtualization setup
|
|
40
|
+
- DNS server configuration
|
|
41
|
+
- Cron job debugging with malware detection
|
|
42
|
+
|
|
43
|
+
**Issue:** Our tasks are "textbook problems" with well-known solutions in training data. Terminal-Bench tasks require:
|
|
44
|
+
|
|
45
|
+
- Multi-step environment exploration
|
|
46
|
+
- Real system interaction (file I/O, network, processes)
|
|
47
|
+
- Domain-specific knowledge (security, ML, sysadmin)
|
|
48
|
+
- Error recovery and debugging
|
|
49
|
+
- Time-constrained execution (aggressive timeouts)
|
|
50
|
+
|
|
51
|
+
### 2. Evaluation Method Flaws
|
|
52
|
+
|
|
53
|
+
**Our Method:**
|
|
54
|
+
|
|
55
|
+
```typescript
|
|
56
|
+
// Simple pattern matching - 60% threshold
|
|
57
|
+
const matchRatio = result.matchedPatterns.length / task.expectedPatterns.length;
|
|
58
|
+
result.success = matchRatio >= 0.6;
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
**Problems:**
|
|
62
|
+
|
|
63
|
+
1. **Pattern matching != correctness** - Code can contain patterns but be wrong
|
|
64
|
+
2. **No execution verification** - We don't run the generated code
|
|
65
|
+
3. **No test suite** - Terminal-Bench has post-run tests for each task
|
|
66
|
+
4. **No environment interaction** - Real tasks need file I/O, network, shell commands
|
|
67
|
+
|
|
68
|
+
**Terminal-Bench Method:**
|
|
69
|
+
|
|
70
|
+
- Tasks run in Docker containers
|
|
71
|
+
- Time-boxed execution (aggressive timeouts)
|
|
72
|
+
- Success = ALL post-run tests pass
|
|
73
|
+
- Real environment interaction required
|
|
74
|
+
|
|
75
|
+
### 3. Memory Context Injection Issues
|
|
76
|
+
|
|
77
|
+
**Current Implementation:**
|
|
78
|
+
|
|
79
|
+
```typescript
|
|
80
|
+
const prompt = withMemory
|
|
81
|
+
? getUAPMemoryContext() + task.prompt // Just prepend context
|
|
82
|
+
: task.prompt;
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
**Problems:**
|
|
86
|
+
|
|
87
|
+
1. **Static context prepending** - Not dynamically relevant to task
|
|
88
|
+
2. **No semantic retrieval** - Doesn't query memory based on task content
|
|
89
|
+
3. **Context bloat** - 4158 chars of generic info, may hurt more than help
|
|
90
|
+
4. **No task-specific memories** - Doesn't retrieve relevant past experiences
|
|
91
|
+
|
|
92
|
+
### 4. Execution Environment Gap
|
|
93
|
+
|
|
94
|
+
**Our Benchmark:**
|
|
95
|
+
|
|
96
|
+
- `droid exec` with `--auto medium` (limited permissions)
|
|
97
|
+
- Single prompt completion (no multi-turn interaction)
|
|
98
|
+
- No file system access during execution
|
|
99
|
+
- No shell command execution
|
|
100
|
+
- No network access
|
|
101
|
+
|
|
102
|
+
**Terminal-Bench:**
|
|
103
|
+
|
|
104
|
+
- Full Docker container with root access
|
|
105
|
+
- Multi-turn agent loop (explore, act, verify)
|
|
106
|
+
- Real file system operations
|
|
107
|
+
- Shell command execution
|
|
108
|
+
- Network access for some tasks
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## Why Models Score Higher in Our Benchmark
|
|
113
|
+
|
|
114
|
+
1. **Training data advantage** - Our tasks (Dijkstra, singleton, etc.) are common in training data
|
|
115
|
+
2. **No verification** - Pattern matching doesn't catch bugs
|
|
116
|
+
3. **Single-shot completion** - No need for environment exploration
|
|
117
|
+
4. **No time pressure** - Our tasks take 20-200s, whereas Terminal-Bench enforces strict timeouts
|
|
118
|
+
5. **No real execution** - Generated code is never run
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## UAP Improvement Recommendations
|
|
123
|
+
|
|
124
|
+
### High Impact (Must Do)
|
|
125
|
+
|
|
126
|
+
#### 1. Implement Semantic Memory Retrieval
|
|
127
|
+
|
|
128
|
+
```typescript
|
|
129
|
+
// BEFORE: Static prepending
|
|
130
|
+
const prompt = memoryContext + task.prompt;
|
|
131
|
+
|
|
132
|
+
// AFTER: Semantic retrieval based on task content
|
|
133
|
+
async function getRelevantMemory(task: BenchmarkTaskDef): Promise<string> {
|
|
134
|
+
const keywords = extractKeywords(task.prompt);
|
|
135
|
+
const relevantLessons = await querySemanticMemory(keywords, {
|
|
136
|
+
minSimilarity: 0.7,
|
|
137
|
+
limit: 5,
|
|
138
|
+
types: ['lesson', 'gotcha', 'pattern'],
|
|
139
|
+
});
|
|
140
|
+
|
|
141
|
+
const taskCategory = classifyTask(task.prompt);
|
|
142
|
+
const categoryPatterns = await getPatternsByCategory(taskCategory);
|
|
143
|
+
|
|
144
|
+
return formatMemoryContext(relevantLessons, categoryPatterns);
|
|
145
|
+
}
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
#### 2. Add Task Classification & Routing
|
|
149
|
+
|
|
150
|
+
```typescript
|
|
151
|
+
interface TaskClassification {
|
|
152
|
+
category: 'sysadmin' | 'security' | 'ml' | 'debugging' | 'coding';
|
|
153
|
+
requiredCapabilities: string[];
|
|
154
|
+
suggestedDroid: string;
|
|
155
|
+
memoryQueryHints: string[];
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
function classifyTask(instruction: string): TaskClassification {
|
|
159
|
+
// Use keyword matching + LLM classification
|
|
160
|
+
// Route to specialized droids based on category
|
|
161
|
+
}
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
#### 3. Implement Real Execution Verification
|
|
165
|
+
|
|
166
|
+
```typescript
|
|
167
|
+
// Execute generated code in sandboxed environment
|
|
168
|
+
async function verifyCodeExecution(
|
|
169
|
+
code: string,
|
|
170
|
+
testCases: TestCase[]
|
|
171
|
+
): Promise<VerificationResult> {
|
|
172
|
+
const sandbox = await createSandbox();
|
|
173
|
+
try {
|
|
174
|
+
await sandbox.writeFile('solution.ts', code);
|
|
175
|
+
await sandbox.exec('npx tsc solution.ts');
|
|
176
|
+
|
|
177
|
+
for (const test of testCases) {
|
|
178
|
+
const result = await sandbox.exec(`node solution.js ${test.input}`);
|
|
179
|
+
if (result.stdout.trim() !== test.expectedOutput) {
|
|
180
|
+
return { success: false, failedTest: test };
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
return { success: true };
|
|
184
|
+
} finally {
|
|
185
|
+
await sandbox.cleanup();
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
#### 4. Add Multi-Turn Agent Loop
|
|
191
|
+
|
|
192
|
+
```typescript
|
|
193
|
+
// Enable iterative refinement like real Terminal-Bench agents
|
|
194
|
+
async function executeWithRetry(task: BenchmarkTaskDef, maxTurns: number = 5): Promise<TaskResult> {
|
|
195
|
+
let context = getInitialContext(task);
|
|
196
|
+
|
|
197
|
+
for (let turn = 0; turn < maxTurns; turn++) {
|
|
198
|
+
const response = await model.complete(context);
|
|
199
|
+
const verification = await verifyResponse(response, task);
|
|
200
|
+
|
|
201
|
+
if (verification.success) {
|
|
202
|
+
return { success: true, turns: turn + 1, response };
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
// Add error feedback for next turn
|
|
206
|
+
context += `\n\nPrevious attempt failed: ${verification.error}\nPlease fix and try again.`;
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
return { success: false, turns: maxTurns };
|
|
210
|
+
}
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
### Medium Impact (Should Do)
|
|
214
|
+
|
|
215
|
+
#### 5. Implement Hierarchical Prompting (from Droid #1)
|
|
216
|
+
|
|
217
|
+
```typescript
|
|
218
|
+
// Three-tier prompting hierarchy
|
|
219
|
+
interface PromptHierarchy {
|
|
220
|
+
toolDescriptions: string; // High-level capabilities
|
|
221
|
+
systemPrompt: string; // Behavioral guidelines
|
|
222
|
+
systemNotifications: string; // Time-sensitive context (injected at END for recency bias)
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
function buildPrompt(task: BenchmarkTaskDef, memory: string): string {
|
|
226
|
+
return `
|
|
227
|
+
${TOOL_DESCRIPTIONS}
|
|
228
|
+
|
|
229
|
+
${SYSTEM_PROMPT}
|
|
230
|
+
|
|
231
|
+
Task: ${task.prompt}
|
|
232
|
+
|
|
233
|
+
${memory}
|
|
234
|
+
|
|
235
|
+
${SYSTEM_NOTIFICATION} // Put critical reminders at END
|
|
236
|
+
`;
|
|
237
|
+
}
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
#### 6. Add Environment Bootstrap Phase
|
|
241
|
+
|
|
242
|
+
```typescript
|
|
243
|
+
// Gather system info before task execution (like Droid does)
|
|
244
|
+
async function bootstrapEnvironment(): Promise<EnvironmentContext> {
|
|
245
|
+
const sysInfo = await exec('uname -a && cat /etc/os-release');
|
|
246
|
+
const tools = await exec('which python python3 pip npm node go cargo');
|
|
247
|
+
const diskMem = await exec('df -h / && free -h');
|
|
248
|
+
const processes = await exec('ps aux | head -20');
|
|
249
|
+
const gitStatus = await exec('git status 2>/dev/null');
|
|
250
|
+
|
|
251
|
+
return {
|
|
252
|
+
system: sysInfo,
|
|
253
|
+
availableTools: tools,
|
|
254
|
+
resources: diskMem,
|
|
255
|
+
processes,
|
|
256
|
+
gitStatus,
|
|
257
|
+
};
|
|
258
|
+
}
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
#### 7. Store Task-Specific Learnings
|
|
262
|
+
|
|
263
|
+
```typescript
|
|
264
|
+
// After each task, store what worked/failed
|
|
265
|
+
async function storeTaskLearning(task: BenchmarkTaskDef, result: TaskResult): Promise<void> {
|
|
266
|
+
if (result.success) {
|
|
267
|
+
await storeMemory({
|
|
268
|
+
type: 'lesson',
|
|
269
|
+
content: `Task "${task.name}" succeeded with approach: ${summarizeApproach(result.response)}`,
|
|
270
|
+
tags: [task.category, task.difficulty],
|
|
271
|
+
importance: 8,
|
|
272
|
+
});
|
|
273
|
+
} else {
|
|
274
|
+
await storeMemory({
|
|
275
|
+
type: 'gotcha',
|
|
276
|
+
content: `Task "${task.name}" failed: ${result.error}. Avoid: ${summarizeFailure(result)}`,
|
|
277
|
+
tags: [task.category, 'failure'],
|
|
278
|
+
importance: 9,
|
|
279
|
+
});
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
### Lower Impact (Nice to Have)
|
|
285
|
+
|
|
286
|
+
#### 8. Add Speed Optimizations
|
|
287
|
+
|
|
288
|
+
- Track tool/command runtime, inject into context
|
|
289
|
+
- Use short default timeouts (30s); opt in to longer ones when needed
|
|
290
|
+
- Cache environment bootstrap info
|
|
291
|
+
- Parallelize independent operations
|
|
292
|
+
|
|
293
|
+
#### 9. Model-Specific Adaptations
|
|
294
|
+
|
|
295
|
+
```typescript
|
|
296
|
+
const MODEL_CONFIGS = {
|
|
297
|
+
'opus-4.5': {
|
|
298
|
+
fileEditFormat: 'FIND_AND_REPLACE',
|
|
299
|
+
pathStyle: 'absolute',
|
|
300
|
+
strengths: ['security', 'debugging', 'CVE exploitation'],
|
|
301
|
+
},
|
|
302
|
+
'gpt-5.2-codex': {
|
|
303
|
+
fileEditFormat: 'V4A_DIFF',
|
|
304
|
+
pathStyle: 'relative',
|
|
305
|
+
strengths: ['ML training', 'video editing'],
|
|
306
|
+
},
|
|
307
|
+
'glm-4.7': {
|
|
308
|
+
fileEditFormat: 'FIND_AND_REPLACE',
|
|
309
|
+
pathStyle: 'absolute',
|
|
310
|
+
strengths: ['speed', 'simple tasks'],
|
|
311
|
+
},
|
|
312
|
+
};
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
---
|
|
316
|
+
|
|
317
|
+
## Recommended Benchmark Redesign
|
|
318
|
+
|
|
319
|
+
### Phase 1: Add Real Tasks (Week 1)
|
|
320
|
+
|
|
321
|
+
1. Add 10+ Terminal-Bench-style tasks:
|
|
322
|
+
- File manipulation with verification
|
|
323
|
+
- Git operations with repo state verification
|
|
324
|
+
- Code refactoring with test execution
|
|
325
|
+
- Debugging tasks with error injection
|
|
326
|
+
- Configuration tasks with validation
|
|
327
|
+
|
|
328
|
+
### Phase 2: Add Execution Verification (Week 2)
|
|
329
|
+
|
|
330
|
+
1. Integrate Docker sandbox for code execution
|
|
331
|
+
2. Add test suites for each task
|
|
332
|
+
3. Implement timeout handling
|
|
333
|
+
4. Add resource monitoring (memory, CPU)
|
|
334
|
+
|
|
335
|
+
### Phase 3: Improve Memory System (Week 3)
|
|
336
|
+
|
|
337
|
+
1. Implement semantic memory retrieval
|
|
338
|
+
2. Add task classification & routing
|
|
339
|
+
3. Store task-specific learnings
|
|
340
|
+
4. Add pattern extraction from successes
|
|
341
|
+
|
|
342
|
+
### Phase 4: Multi-Turn Agent Loop (Week 4)
|
|
343
|
+
|
|
344
|
+
1. Enable iterative refinement
|
|
345
|
+
2. Add error feedback mechanism
|
|
346
|
+
3. Implement planning & progress tracking
|
|
347
|
+
4. Add environment exploration phase
|
|
348
|
+
|
|
349
|
+
---
|
|
350
|
+
|
|
351
|
+
## Expected Impact
|
|
352
|
+
|
|
353
|
+
After implementing these improvements:
|
|
354
|
+
|
|
355
|
+
| Model | Current → Expected | Limiting factor | Reasoning |
|
|
356
|
+
| --------------- | ------------ | --------------- | --------------------------------- |
|
|
357
|
+
| Claude Opus 4.5 | 100% → ~65% | Real difficulty | Harder tasks, real verification |
|
|
358
|
+
| GPT 5.2 Codex | 87.5% → ~60% | Real difficulty | Harder tasks, real verification |
|
|
359
|
+
| GLM 4.7 | 75% → ~35% | Base capability | Lower base, but memory helps more |
|
|
360
|
+
|
|
361
|
+
**With UAP Memory Improvements:**
|
|
362
|
+
|
|
363
|
+
| Model | Without UAP | With UAP | Expected Gain |
|
|
364
|
+
| --------------- | ----------- | -------- | ----------------------- |
|
|
365
|
+
| Claude Opus 4.5 | ~60% | ~68% | +8% (already high base) |
|
|
366
|
+
| GPT 5.2 Codex | ~55% | ~65% | +10% |
|
|
367
|
+
| GLM 4.7 | ~30% | ~45% | +15% (most benefit) |
|
|
368
|
+
|
|
369
|
+
---
|
|
370
|
+
|
|
371
|
+
## Conclusion
|
|
372
|
+
|
|
373
|
+
Our current benchmark significantly overestimates model performance due to:
|
|
374
|
+
|
|
375
|
+
1. Simple, well-known tasks vs real-world complexity
|
|
376
|
+
2. Pattern matching vs execution verification
|
|
377
|
+
3. Single-shot vs multi-turn interaction
|
|
378
|
+
4. No environment interaction
|
|
379
|
+
|
|
380
|
+
To make UAP truly effective for Terminal-Bench-style tasks, we need to:
|
|
381
|
+
|
|
382
|
+
1. Implement semantic memory retrieval
|
|
383
|
+
2. Add task classification & routing
|
|
384
|
+
3. Enable multi-turn agent loops
|
|
385
|
+
4. Add real execution verification
|
|
386
|
+
|
|
387
|
+
The current +12.5% improvement for GLM 4.7 with memory is likely real but understated - with proper memory retrieval, the benefit could be +15-20% for lower-capability models.
|
|
388
|
+
|
|
389
|
+
---
|
|
390
|
+
|
|
391
|
+
## Appendix: Specific Code Issues Found
|
|
392
|
+
|
|
393
|
+
### Issue 1: Qdrant Query Uses Dummy Embeddings
|
|
394
|
+
|
|
395
|
+
**File:** `src/memory/backends/qdrant-cloud.ts`
|
|
396
|
+
|
|
397
|
+
```typescript
|
|
398
|
+
async query(_queryText: string, limit = 10): Promise<MemoryEntry[]> {
|
|
399
|
+
// TODO: Generate embedding for query string
|
|
400
|
+
// For now, use dummy embedding - needs embedding service integration
|
|
401
|
+
const queryEmbedding = new Array(384).fill(0); // THIS IS BROKEN
|
|
402
|
+
|
|
403
|
+
const results = await this.client.search(this.collection, {
|
|
404
|
+
vector: queryEmbedding,
|
|
405
|
+
limit,
|
|
406
|
+
});
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
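**Impact:** Semantic memory retrieval is completely non-functional. The query text is ignored and every search uses the same all-zero vector, so the results are independent of the query and effectively arbitrary.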
|
|
410
|
+
|
|
411
|
+
**Fix Required:** Integrate embedding generation (OpenAI, Sentence Transformers, or local model).
|
|
412
|
+
|
|
413
|
+
### Issue 2: Memory Context Loading Is Static
|
|
414
|
+
|
|
415
|
+
**File:** `src/benchmarks/model-integration.ts`
|
|
416
|
+
|
|
417
|
+
```typescript
|
|
418
|
+
function loadUAPMemoryContext(): string {
|
|
419
|
+
// Extracts fixed sections from CLAUDE.md
|
|
420
|
+
// Does NOT query based on task content
|
|
421
|
+
// Uses hardcoded patterns, not semantic search
|
|
422
|
+
}
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
**Impact:** Memory context is identical for all tasks regardless of relevance.
|
|
426
|
+
|
|
427
|
+
### Issue 3: Short-Term Memory Query Is Keyword-Only
|
|
428
|
+
|
|
429
|
+
**File:** `src/memory/short-term/sqlite.ts`
|
|
430
|
+
|
|
431
|
+
```typescript
|
|
432
|
+
async query(searchTerm: string, limit = 10): Promise<ShortTermMemory[]> {
|
|
433
|
+
const stmt = this.db.prepare(`
|
|
434
|
+
SELECT ... FROM memories
|
|
435
|
+
WHERE project_id = ? AND content LIKE ? // Simple substring match
|
|
436
|
+
`);
|
|
437
|
+
return stmt.all(this.projectId, `%${searchTerm}%`, limit);
|
|
438
|
+
}
|
|
439
|
+
```
|
|
440
|
+
|
|
441
|
+
**Impact:** No semantic understanding. "authentication flow" won't match "login process".
|
|
442
|
+
|
|
443
|
+
### Issue 4: No Task-Specific Memory Retrieval
|
|
444
|
+
|
|
445
|
+
**Current flow:**
|
|
446
|
+
|
|
447
|
+
1. Load generic CLAUDE.md sections
|
|
448
|
+
2. Query short-term memory with fixed SQL
|
|
449
|
+
3. Prepend to prompt
|
|
450
|
+
|
|
451
|
+
**Required flow:**
|
|
452
|
+
|
|
453
|
+
1. Classify task type (sysadmin, security, ML, etc.)
|
|
454
|
+
2. Extract task keywords/entities
|
|
455
|
+
3. Query semantic memory with embeddings
|
|
456
|
+
4. Retrieve task-specific patterns and gotchas
|
|
457
|
+
5. Format context with recency bias (critical info at END)
|
|
458
|
+
6. Inject dynamically during multi-turn execution
|
|
459
|
+
|
|
460
|
+
---
|
|
461
|
+
|
|
462
|
+
## Implementation Priority
|
|
463
|
+
|
|
464
|
+
| Priority | Issue | Impact | Effort |
|
|
465
|
+
| -------- | ---------------------------------- | ---------------------- | ------ |
|
|
466
|
+
| P0 | Fix Qdrant embedding generation | Semantic search broken | Medium |
|
|
467
|
+
| P0 | Add task classification | Enable routing | Low |
|
|
468
|
+
| P1 | Implement dynamic memory retrieval | Context relevance | Medium |
|
|
469
|
+
| P1 | Add execution verification | Accuracy measurement | High |
|
|
470
|
+
| P2 | Multi-turn agent loop | Error recovery | High |
|
|
471
|
+
| P2 | Hierarchical prompting | Context optimization | Medium |
|