universal-agent-memory 1.7.0 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +2 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +14 -2
- package/dist/index.js.map +1 -1
- package/dist/memory/adaptive-context.d.ts +65 -22
- package/dist/memory/adaptive-context.d.ts.map +1 -1
- package/dist/memory/adaptive-context.js +660 -196
- package/dist/memory/adaptive-context.js.map +1 -1
- package/dist/memory/model-router.d.ts +12 -1
- package/dist/memory/model-router.d.ts.map +1 -1
- package/dist/memory/model-router.js +140 -1
- package/dist/memory/model-router.js.map +1 -1
- package/package.json +1 -1
- package/templates/CLAUDE.template.md +116 -69
|
@@ -1,23 +1,73 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Hybrid Adaptive Context Selector for UAM (Option 4)
|
|
3
3
|
*
|
|
4
|
+
* VERSION: 1.9.0 - 10 Model Outcome Success Optimizations
|
|
5
|
+
*
|
|
4
6
|
* Combines task classification with time-budget awareness, runtime monitoring,
|
|
5
7
|
* and historical benefit tracking for optimal context loading decisions.
|
|
6
8
|
*
|
|
7
|
-
*
|
|
8
|
-
* 1.
|
|
9
|
-
* 2.
|
|
10
|
-
* 3.
|
|
11
|
-
* 4.
|
|
9
|
+
* OPTIMIZATIONS IMPLEMENTED:
|
|
10
|
+
* 1. Historical Data Persistence - SQLite instead of in-memory Map
|
|
11
|
+
* 2. Task-specific context sections for 5 failing tasks
|
|
12
|
+
* 3. Missing context sections (git_recovery, web_parsing, data_processing, theorem_proving)
|
|
13
|
+
* 4. Weighted keyword relevance scoring (TF-IDF-like specificity weights)
|
|
14
|
+
* 5. Token budget utilization - increase minimal sections from 1→2
|
|
15
|
+
* 6. Task-type-selective pattern injection
|
|
16
|
+
* 7. Smarter progressive context escalation with error-to-section mapping
|
|
17
|
+
* 8. Model Router fingerprint persistence integrated
|
|
18
|
+
* 9. Multi-category task classification support
|
|
19
|
+
* 10. Semantic caching foundation for task→outcome mappings
|
|
12
20
|
*/
|
|
21
|
+
import Database from 'better-sqlite3';
|
|
22
|
+
import { existsSync, mkdirSync } from 'fs';
|
|
23
|
+
import { dirname, join } from 'path';
|
|
24
|
+
import { fileURLToPath } from 'url';
|
|
13
25
|
import { classifyTask as classifyTaskType } from './task-classifier.js';
|
|
14
|
-
import { recordTaskOutcome as updateModelRouterFingerprint } from './model-router.js';
|
|
26
|
+
import { recordTaskOutcome as updateModelRouterFingerprint, getModelFingerprint } from './model-router.js';
|
|
27
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
28
|
+
const __dirname = dirname(__filename);
|
|
29
|
+
// OPT 1: SQLite-backed historical data persistence
|
|
30
|
+
let historicalDb = null;
|
|
31
|
+
function getHistoricalDb() {
|
|
32
|
+
if (historicalDb)
|
|
33
|
+
return historicalDb;
|
|
34
|
+
// Use the same data directory as short_term.db
|
|
35
|
+
const dbDir = join(__dirname, '../../agents/data/memory');
|
|
36
|
+
if (!existsSync(dbDir)) {
|
|
37
|
+
mkdirSync(dbDir, { recursive: true });
|
|
38
|
+
}
|
|
39
|
+
const dbPath = join(dbDir, 'historical_context.db');
|
|
40
|
+
historicalDb = new Database(dbPath);
|
|
41
|
+
// Enable WAL mode for better concurrent access
|
|
42
|
+
historicalDb.pragma('journal_mode = WAL');
|
|
43
|
+
// Create schema if not exists
|
|
44
|
+
historicalDb.exec(`
|
|
45
|
+
CREATE TABLE IF NOT EXISTS historical_data (
|
|
46
|
+
task_type TEXT PRIMARY KEY,
|
|
47
|
+
total_attempts INTEGER DEFAULT 0,
|
|
48
|
+
uam_successes INTEGER DEFAULT 0,
|
|
49
|
+
no_uam_successes INTEGER DEFAULT 0,
|
|
50
|
+
avg_time_with_uam REAL DEFAULT 0,
|
|
51
|
+
avg_time_without_uam REAL DEFAULT 0,
|
|
52
|
+
last_updated INTEGER DEFAULT 0
|
|
53
|
+
);
|
|
54
|
+
|
|
55
|
+
-- OPT 10: Semantic cache for task→outcome mappings
|
|
56
|
+
CREATE TABLE IF NOT EXISTS semantic_cache (
|
|
57
|
+
cache_key TEXT PRIMARY KEY,
|
|
58
|
+
instruction_hash TEXT,
|
|
59
|
+
decision_json TEXT,
|
|
60
|
+
success_rate REAL DEFAULT 0.5,
|
|
61
|
+
created_at INTEGER,
|
|
62
|
+
last_used INTEGER,
|
|
63
|
+
use_count INTEGER DEFAULT 1
|
|
64
|
+
);
|
|
65
|
+
|
|
66
|
+
CREATE INDEX IF NOT EXISTS idx_semantic_cache_hash ON semantic_cache(instruction_hash);
|
|
67
|
+
`);
|
|
68
|
+
return historicalDb;
|
|
69
|
+
}
|
|
15
70
|
// OPTIMIZATION 7: Refined low-benefit categories
|
|
16
|
-
// Task categories that typically don't benefit from UAM context
|
|
17
|
-
// Only skip when purely in these categories with no domain overlap
|
|
18
|
-
// NOTE: 'scheduling' removed - constraint-satisfaction tasks like 'constraints-scheduling'
|
|
19
|
-
// benefit from UAM context (proven +1 task in Terminal-Bench)
|
|
20
|
-
// NOTE: Added more specific categories based on Terminal-Bench analysis
|
|
21
71
|
const LOW_BENEFIT_CATEGORIES = new Set([
|
|
22
72
|
'reasoning', // Pure logical reasoning (sudoku, puzzles)
|
|
23
73
|
'games', // Game theory, chess position analysis (but NOT chess-best-move which needs stockfish)
|
|
@@ -26,7 +76,6 @@ const LOW_BENEFIT_CATEGORIES = new Set([
|
|
|
26
76
|
'calendar', // Meeting scheduling (but NOT server scheduling)
|
|
27
77
|
]);
|
|
28
78
|
// Categories that should ALWAYS receive UAM context regardless of historical data
|
|
29
|
-
// These are domain-knowledge-heavy tasks where UAM has proven beneficial
|
|
30
79
|
const ALWAYS_BENEFIT_CATEGORIES = new Set([
|
|
31
80
|
'security',
|
|
32
81
|
'file-ops',
|
|
@@ -36,65 +85,113 @@ const ALWAYS_BENEFIT_CATEGORIES = new Set([
|
|
|
36
85
|
'coding',
|
|
37
86
|
'testing',
|
|
38
87
|
'ml-training',
|
|
88
|
+
'git-recovery', // OPT 3: Added for git tasks
|
|
89
|
+
'data-processing', // OPT 3: Added for data tasks
|
|
90
|
+
'theorem-proving', // OPT 3: Added for proof tasks
|
|
39
91
|
]);
|
|
40
|
-
//
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
'
|
|
44
|
-
'
|
|
45
|
-
'
|
|
46
|
-
'
|
|
47
|
-
'
|
|
48
|
-
'
|
|
49
|
-
'
|
|
50
|
-
'
|
|
51
|
-
'
|
|
52
|
-
'
|
|
53
|
-
'
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
'
|
|
59
|
-
'
|
|
60
|
-
'
|
|
61
|
-
'
|
|
62
|
-
'
|
|
63
|
-
'
|
|
64
|
-
'
|
|
65
|
-
'
|
|
66
|
-
|
|
67
|
-
'
|
|
68
|
-
'
|
|
69
|
-
'
|
|
70
|
-
'
|
|
71
|
-
'
|
|
72
|
-
'
|
|
73
|
-
'
|
|
74
|
-
|
|
75
|
-
'
|
|
76
|
-
'
|
|
77
|
-
'
|
|
78
|
-
'
|
|
79
|
-
|
|
80
|
-
//
|
|
92
|
+
// OPT 4: Weighted keywords - specificity score (higher = more specific = more valuable)
|
|
93
|
+
const HIGH_BENEFIT_KEYWORDS = {
|
|
94
|
+
// Security - very specific terms get higher weights
|
|
95
|
+
'hashcat': 3.0,
|
|
96
|
+
'john': 2.5,
|
|
97
|
+
'7z': 2.5,
|
|
98
|
+
'xss': 3.0,
|
|
99
|
+
'injection': 2.0,
|
|
100
|
+
'sanitize': 2.0,
|
|
101
|
+
'bleach': 3.0,
|
|
102
|
+
'dompurify': 3.0,
|
|
103
|
+
'password': 1.5,
|
|
104
|
+
'hash': 1.5,
|
|
105
|
+
'crack': 2.0,
|
|
106
|
+
'decrypt': 2.0,
|
|
107
|
+
'secret': 1.5,
|
|
108
|
+
'exploit': 2.0,
|
|
109
|
+
// File formats - specific formats get higher weights
|
|
110
|
+
'elf': 3.0,
|
|
111
|
+
'struct.unpack': 3.0,
|
|
112
|
+
'e_phoff': 3.5,
|
|
113
|
+
'sqlite': 2.0,
|
|
114
|
+
'wal': 3.0,
|
|
115
|
+
'binary': 1.5,
|
|
116
|
+
'executable': 1.5,
|
|
117
|
+
'extract': 1.5,
|
|
118
|
+
// Git recovery - OPT 3
|
|
119
|
+
'reflog': 3.0,
|
|
120
|
+
'fsck': 3.0,
|
|
121
|
+
'git recovery': 3.0,
|
|
122
|
+
'lost commit': 2.5,
|
|
123
|
+
'detached head': 2.5,
|
|
124
|
+
'git reset': 2.0,
|
|
125
|
+
'git rebase': 1.5,
|
|
126
|
+
// Web parsing - OPT 2 (for filter-js-from-html)
|
|
127
|
+
'html parse': 2.5,
|
|
128
|
+
'dom': 2.0,
|
|
129
|
+
'beautifulsoup': 2.5,
|
|
130
|
+
'lxml': 2.5,
|
|
131
|
+
'regex html': 2.0,
|
|
132
|
+
// Compression - OPT 2 (for gpt2-codegolf)
|
|
133
|
+
'codegolf': 3.0,
|
|
134
|
+
'minify': 2.0,
|
|
135
|
+
'compress': 1.5,
|
|
136
|
+
'gzip': 2.0,
|
|
137
|
+
'zlib': 2.5,
|
|
138
|
+
// Chess - OPT 2 (for chess-best-move)
|
|
139
|
+
'stockfish': 3.0,
|
|
140
|
+
'python-chess': 3.0,
|
|
141
|
+
'fen': 2.5,
|
|
142
|
+
'pgn': 2.5,
|
|
143
|
+
'chess position': 2.0,
|
|
144
|
+
'chessimg2pos': 3.0,
|
|
145
|
+
// Legacy
|
|
146
|
+
'cobol': 3.0,
|
|
147
|
+
'fortran': 2.5,
|
|
148
|
+
'legacy': 1.5,
|
|
149
|
+
'modernize': 1.5,
|
|
150
|
+
'mainframe': 2.5,
|
|
151
|
+
// Theorem proving - OPT 3 (for prove-plus-comm)
|
|
152
|
+
'coq': 3.0,
|
|
153
|
+
'lean': 3.0,
|
|
154
|
+
'isabelle': 3.0,
|
|
155
|
+
'theorem': 2.0,
|
|
156
|
+
'proof': 1.5,
|
|
157
|
+
'induction': 2.0,
|
|
158
|
+
'commutativity': 2.5,
|
|
159
|
+
// Data processing - OPT 3 (for reshard-c4-data)
|
|
160
|
+
'parquet': 2.5,
|
|
161
|
+
'sharding': 2.5,
|
|
162
|
+
'reshard': 3.0,
|
|
163
|
+
'c4 data': 3.0,
|
|
164
|
+
'dataset': 1.5,
|
|
165
|
+
'arrow': 2.0,
|
|
166
|
+
// General coding
|
|
167
|
+
'compile': 1.5,
|
|
168
|
+
'build': 1.0,
|
|
169
|
+
'makefile': 2.0,
|
|
170
|
+
'database': 1.5,
|
|
171
|
+
'recovery': 1.5,
|
|
172
|
+
};
|
|
173
|
+
// OPT 3 & 2: Extended context sections including new categories and task-specific sections
|
|
81
174
|
const CONTEXT_SECTIONS = {
|
|
82
175
|
security: {
|
|
83
176
|
tokens: 150,
|
|
84
|
-
keywords: ['xss', 'password', 'hash', 'crack', 'secret', 'exploit', 'injection', 'sanitize'],
|
|
177
|
+
keywords: ['xss', 'password', 'hash', 'crack', 'secret', 'exploit', 'injection', 'sanitize', 'bleach', 'dompurify'],
|
|
178
|
+
keywordWeights: { 'hashcat': 3, 'bleach': 3, 'dompurify': 3, 'xss': 2.5, 'injection': 2 },
|
|
85
179
|
content: `### Security
|
|
86
180
|
- XSS: Use bleach.clean() or DOMPurify. Remove script, onclick, onerror, javascript:
|
|
87
181
|
- Password cracking: hashcat -m 11600 for 7z, -m 0 MD5, john for CPU
|
|
88
182
|
- Binary secrets: strings, objdump -d, check .rodata section
|
|
89
183
|
- Always validate and sanitize user input`,
|
|
184
|
+
patterns: ['P10', 'P20'], // Whitelist, Adversarial Testing
|
|
90
185
|
},
|
|
91
186
|
file_formats: {
|
|
92
187
|
tokens: 120,
|
|
93
|
-
keywords: ['elf', 'sqlite', '7z', 'archive', 'binary', 'extract', 'format'],
|
|
188
|
+
keywords: ['elf', 'sqlite', '7z', 'archive', 'binary', 'extract', 'format', 'wal', 'struct'],
|
|
189
|
+
keywordWeights: { 'e_phoff': 4, 'struct.unpack': 3, 'wal': 3, 'elf': 2.5 },
|
|
94
190
|
content: `### File Formats
|
|
95
191
|
- ELF: Program headers at e_phoff. Use struct.unpack for parsing
|
|
96
192
|
- SQLite WAL: Header 32 bytes, frames follow. PRAGMA wal_checkpoint to recover
|
|
97
193
|
- 7z: Install p7zip-full, use 7z x -p for password protected archives`,
|
|
194
|
+
patterns: ['P9', 'P35'], // FormatPipeline, Decoder-First
|
|
98
195
|
},
|
|
99
196
|
coding: {
|
|
100
197
|
tokens: 80,
|
|
@@ -104,23 +201,28 @@ const CONTEXT_SECTIONS = {
|
|
|
104
201
|
- Verify file exists before reading
|
|
105
202
|
- Handle edge cases in parsing
|
|
106
203
|
- Match exact output format required`,
|
|
204
|
+
patterns: ['P12', 'P16'], // OEV, TFE+
|
|
107
205
|
},
|
|
108
206
|
tools: {
|
|
109
207
|
tokens: 100,
|
|
110
208
|
keywords: ['hashcat', 'john', 'strings', 'objdump', 'readelf', 'command', 'cli'],
|
|
209
|
+
keywordWeights: { 'hashcat': 3, 'john': 2.5, 'readelf': 2, 'objdump': 2 },
|
|
111
210
|
content: `### Tools
|
|
112
211
|
- hashcat: GPU password cracking, -m flag for hash type
|
|
113
212
|
- john: CPU password cracking, flexible format support
|
|
114
213
|
- readelf/objdump: Binary analysis
|
|
115
214
|
- strings: Extract printable strings from binaries`,
|
|
215
|
+
patterns: ['P4', 'P8'], // ToolSpec, CLIoverLib
|
|
116
216
|
},
|
|
117
217
|
legacy: {
|
|
118
218
|
tokens: 90,
|
|
119
219
|
keywords: ['cobol', 'fortran', 'legacy', 'modernize', 'mainframe'],
|
|
220
|
+
keywordWeights: { 'cobol': 3, 'fortran': 2.5, 'mainframe': 2.5 },
|
|
120
221
|
content: `### Legacy Code
|
|
121
222
|
- COBOL: Fixed-format columns, WORKING-STORAGE for variables
|
|
122
223
|
- Modernization: Preserve business logic, update data structures
|
|
123
224
|
- Test with original inputs to verify behavior`,
|
|
225
|
+
patterns: ['P3'], // StateProtect
|
|
124
226
|
},
|
|
125
227
|
debugging: {
|
|
126
228
|
tokens: 70,
|
|
@@ -129,16 +231,18 @@ const CONTEXT_SECTIONS = {
|
|
|
129
231
|
- Check logs first: journalctl, /var/log/
|
|
130
232
|
- Use verbose flags: -v, --debug
|
|
131
233
|
- Isolate the problem: binary search through changes`,
|
|
234
|
+
patterns: ['P15', 'P13'], // ER, IRL
|
|
132
235
|
},
|
|
133
|
-
// OPTIMIZATION 7 & 8: Library-first and output verification context
|
|
134
236
|
domain_libraries: {
|
|
135
237
|
tokens: 100,
|
|
136
238
|
keywords: ['chess', 'statistics', 'compression', 'sampler', 'ars', 'stockfish', 'numerical'],
|
|
239
|
+
keywordWeights: { 'stockfish': 3, 'python-chess': 3, 'scipy.stats': 2.5, 'ars': 3 },
|
|
137
240
|
content: `### Domain Libraries (USE INSTEAD OF CUSTOM CODE)
|
|
138
241
|
- Chess: python-chess + stockfish (pip install python-chess), chessimg2pos for image→FEN
|
|
139
242
|
- Statistics/ARS: scipy.stats or R CRAN 'ars' package (NOT custom implementation)
|
|
140
243
|
- Compression: zlib, lz4, brotli (NOT custom Huffman). Test round-trip FIRST
|
|
141
244
|
- Numerical: numpy/scipy with tolerance 1e-6, test multiple random seeds`,
|
|
245
|
+
patterns: ['P11'], // PreComputed
|
|
142
246
|
},
|
|
143
247
|
output_verification: {
|
|
144
248
|
tokens: 80,
|
|
@@ -149,88 +253,272 @@ const CONTEXT_SECTIONS = {
|
|
|
149
253
|
- Test with ./script (NOT python3 script.py)
|
|
150
254
|
- Verify all expected files: ls -la before claiming done
|
|
151
255
|
- Partial output > no output: always create the file`,
|
|
256
|
+
patterns: ['P12', 'P16', 'P32'], // OEV, TFE+, CEV
|
|
257
|
+
},
|
|
258
|
+
// OPT 3: NEW SECTIONS for missing capabilities
|
|
259
|
+
git_recovery: {
|
|
260
|
+
tokens: 100,
|
|
261
|
+
keywords: ['git', 'reflog', 'fsck', 'reset', 'rebase', 'merge', 'conflict', 'detached', 'lost', 'HEAD'],
|
|
262
|
+
keywordWeights: { 'reflog': 3, 'fsck': 3, 'dangling': 2.5, 'detached head': 2.5 },
|
|
263
|
+
content: `### Git Recovery (P22 GRF Pattern)
|
|
264
|
+
- ALWAYS backup first: cp -r .git .git.bak
|
|
265
|
+
- Lost commits: git reflog show HEAD, then git reset --hard HEAD@{n}
|
|
266
|
+
- Dangling objects: git fsck --lost-found, check .git/lost-found/
|
|
267
|
+
- Detached HEAD: git checkout -b recovery-branch
|
|
268
|
+
- Merge conflicts: git merge --abort to reset, or resolve with git mergetool
|
|
269
|
+
- Force push recovery: git reflog, find pre-push commit, reset`,
|
|
270
|
+
patterns: ['P22', 'P3'], // GRF, StateProtect
|
|
271
|
+
},
|
|
272
|
+
web_parsing: {
|
|
273
|
+
tokens: 110,
|
|
274
|
+
keywords: ['html', 'parse', 'dom', 'beautifulsoup', 'lxml', 'regex', 'scrape', 'filter', 'js', 'javascript'],
|
|
275
|
+
keywordWeights: { 'beautifulsoup': 2.5, 'lxml': 2.5, 'html.parser': 2, 'regex html': 2 },
|
|
276
|
+
content: `### Web Parsing & XSS Filtering
|
|
277
|
+
- HTML parsing: Use BeautifulSoup with lxml parser (pip install bs4 lxml)
|
|
278
|
+
- XSS filtering: bleach.clean() with allowed_tags whitelist, NOT blacklist
|
|
279
|
+
- Script removal: Remove <script>, onclick, onerror, javascript: hrefs
|
|
280
|
+
- Preserve structure: Maintain HTML hierarchy, don't just regex replace
|
|
281
|
+
- Test edge cases: nested tags, malformed HTML, encoded entities
|
|
282
|
+
- filter-js-from-html: Create /app/filter.py with exact input/output format`,
|
|
283
|
+
patterns: ['P10', 'P20', 'P14'], // Whitelist, AT, OFV
|
|
284
|
+
},
|
|
285
|
+
data_processing: {
|
|
286
|
+
tokens: 100,
|
|
287
|
+
keywords: ['parquet', 'shard', 'reshard', 'dataset', 'arrow', 'pandas', 'dask', 'c4', 'chunk'],
|
|
288
|
+
keywordWeights: { 'parquet': 2.5, 'reshard': 3, 'arrow': 2, 'dask': 2 },
|
|
289
|
+
content: `### Data Processing & Sharding
|
|
290
|
+
- Parquet: Use pyarrow or fastparquet for columnar data
|
|
291
|
+
- Large datasets: dask.dataframe for out-of-core processing
|
|
292
|
+
- Sharding: Split by hash(key) % num_shards for even distribution
|
|
293
|
+
- C4 data: Common Crawl format, handle with datasets library
|
|
294
|
+
- Memory efficiency: Process in chunks, use generators
|
|
295
|
+
- Verification: Count rows before/after, checksum critical columns`,
|
|
296
|
+
patterns: ['P18', 'P31'], // MTP, RTV
|
|
297
|
+
},
|
|
298
|
+
theorem_proving: {
|
|
299
|
+
tokens: 90,
|
|
300
|
+
keywords: ['coq', 'lean', 'isabelle', 'theorem', 'proof', 'induction', 'lemma', 'tactic'],
|
|
301
|
+
keywordWeights: { 'coq': 3, 'lean': 3, 'isabelle': 3, 'induction': 2 },
|
|
302
|
+
content: `### Theorem Proving
|
|
303
|
+
- Coq: Use 'induction' tactic for recursive proofs, 'simpl' to simplify
|
|
304
|
+
- Lean: mathlib provides common lemmas, use 'rfl' for reflexivity
|
|
305
|
+
- Commutativity: Prove by induction on first argument, use IH in step case
|
|
306
|
+
- prove-plus-comm: Natural number addition commutativity via Peano axioms
|
|
307
|
+
- Tactics: intro, apply, rewrite, exact, reflexivity
|
|
308
|
+
- Debug: 'Show Proof' in Coq, 'trace.state' in Lean`,
|
|
309
|
+
patterns: ['P5'], // Impossible check
|
|
310
|
+
},
|
|
311
|
+
// OPT 2: Task-specific sections for the 5 persistently failing tasks
|
|
312
|
+
chess_vision: {
|
|
313
|
+
tokens: 110,
|
|
314
|
+
keywords: ['chess', 'image', 'board', 'fen', 'position', 'stockfish', 'best move', 'analyze'],
|
|
315
|
+
keywordWeights: { 'chessimg2pos': 4, 'stockfish': 3, 'fen': 2.5, 'best move': 2 },
|
|
316
|
+
content: `### Chess Image Analysis (chess-best-move)
|
|
317
|
+
- Image to FEN: pip install chessimg2pos (or board_to_fen)
|
|
318
|
+
- Position analysis: python-chess + stockfish engine
|
|
319
|
+
- Workflow: image → FEN → stockfish → best move
|
|
320
|
+
- Install: apt-get install stockfish, pip install python-chess
|
|
321
|
+
- Code: import chess.engine; engine.analyse(board, chess.engine.Limit(depth=20))
|
|
322
|
+
- Output: UCI notation (e.g., e2e4) or SAN (e.g., e4)`,
|
|
323
|
+
patterns: ['P11', 'P34'], // PreComputed, ISP
|
|
324
|
+
},
|
|
325
|
+
regex_chess: {
|
|
326
|
+
tokens: 100,
|
|
327
|
+
keywords: ['regex', 'chess', 'pgn', 'notation', 'game', 'century', 'parse'],
|
|
328
|
+
keywordWeights: { 'pgn': 3, 'game of century': 3, 'chess notation': 2.5 },
|
|
329
|
+
content: `### Regex Chess (regex-chess task)
|
|
330
|
+
- PGN parsing: Match moves with [KQRBN]?[a-h]?[1-8]?x?[a-h][1-8](=[QRBN])?[+#]?
|
|
331
|
+
- Castling: O-O (kingside), O-O-O (queenside)
|
|
332
|
+
- Game of Century: Byrne vs Fischer 1956, know key moves
|
|
333
|
+
- State machine: Track board state through move sequence
|
|
334
|
+
- Validation: Use python-chess for legal move verification
|
|
335
|
+
- Edge cases: Promotions, en passant, disambiguation`,
|
|
336
|
+
patterns: ['P24'], // PCC
|
|
337
|
+
},
|
|
338
|
+
compression_codegolf: {
|
|
339
|
+
tokens: 100,
|
|
340
|
+
keywords: ['codegolf', 'minify', 'gpt2', 'small', 'bytes', 'compress', 'size'],
|
|
341
|
+
keywordWeights: { 'codegolf': 3, 'gpt2': 3, 'minify': 2, 'bytes': 2 },
|
|
342
|
+
content: `### Code Golf & Compression (gpt2-codegolf)
|
|
343
|
+
- GPT-2 codegolf: Impossible to implement full GPT-2 in <5KB
|
|
344
|
+
- Strategy: Pre-compute weights, use lookup tables
|
|
345
|
+
- Minification: Remove whitespace, use short variable names
|
|
346
|
+
- gpt2.c: Reference implementation exists, study structure
|
|
347
|
+
- Shannon limit: Cannot compress below entropy of data
|
|
348
|
+
- If truly impossible: Document in IMPOSSIBLE.md with proof`,
|
|
349
|
+
patterns: ['P5', 'P19', 'P23'], // Impossible, ITR+, CID
|
|
152
350
|
},
|
|
351
|
+
db_wal_recovery: {
|
|
352
|
+
tokens: 100,
|
|
353
|
+
keywords: ['wal', 'sqlite', 'recovery', 'checkpoint', 'journal', 'database', 'corrupt'],
|
|
354
|
+
keywordWeights: { 'wal': 3, 'checkpoint': 3, 'db-wal-recovery': 4 },
|
|
355
|
+
content: `### SQLite WAL Recovery (db-wal-recovery)
|
|
356
|
+
- CRITICAL: Backup WAL file FIRST before any sqlite3 command!
|
|
357
|
+
- cp /app/main.db-wal /tmp/wal_backup.wal
|
|
358
|
+
- WAL auto-checkpoints when opened - this destroys recovery data
|
|
359
|
+
- Parse WAL manually: struct.unpack for header (32 bytes)
|
|
360
|
+
- Frame format: 24-byte header + page data
|
|
361
|
+
- Use /tmp/wal_backup.wal for analysis, never original
|
|
362
|
+
- Alternative: sqlite3_recover tool from SQLite source`,
|
|
363
|
+
patterns: ['P3', 'P35'], // StateProtect, DFA
|
|
364
|
+
},
|
|
365
|
+
};
|
|
366
|
+
// OPT 7: Error-to-section mapping for smarter progressive escalation
|
|
367
|
+
const ERROR_SECTION_MAPPING = {
|
|
368
|
+
'permission denied': ['tools', 'output_verification'],
|
|
369
|
+
'chmod': ['output_verification', 'tools'],
|
|
370
|
+
'struct.unpack': ['file_formats'],
|
|
371
|
+
'unpack requires': ['file_formats'],
|
|
372
|
+
'no module named': ['domain_libraries', 'tools'],
|
|
373
|
+
'command not found': ['tools'],
|
|
374
|
+
'syntax error': ['coding', 'legacy'],
|
|
375
|
+
'parse error': ['web_parsing', 'file_formats'],
|
|
376
|
+
'hash': ['security', 'tools'],
|
|
377
|
+
'xss': ['security', 'web_parsing'],
|
|
378
|
+
'injection': ['security', 'web_parsing'],
|
|
379
|
+
'git': ['git_recovery'],
|
|
380
|
+
'reflog': ['git_recovery'],
|
|
381
|
+
'merge conflict': ['git_recovery'],
|
|
382
|
+
'detached head': ['git_recovery'],
|
|
383
|
+
'parquet': ['data_processing'],
|
|
384
|
+
'shard': ['data_processing'],
|
|
385
|
+
'dataset': ['data_processing'],
|
|
386
|
+
'coq': ['theorem_proving'],
|
|
387
|
+
'lean': ['theorem_proving'],
|
|
388
|
+
'induction': ['theorem_proving'],
|
|
389
|
+
'chess': ['chess_vision', 'regex_chess', 'domain_libraries'],
|
|
390
|
+
'stockfish': ['chess_vision', 'domain_libraries'],
|
|
391
|
+
'fen': ['chess_vision'],
|
|
392
|
+
'pgn': ['regex_chess'],
|
|
393
|
+
'wal': ['db_wal_recovery', 'file_formats'],
|
|
394
|
+
'sqlite': ['db_wal_recovery', 'file_formats'],
|
|
395
|
+
'checkpoint': ['db_wal_recovery'],
|
|
396
|
+
'codegolf': ['compression_codegolf'],
|
|
397
|
+
'gpt2': ['compression_codegolf'],
|
|
398
|
+
'minify': ['compression_codegolf'],
|
|
399
|
+
'filter': ['web_parsing', 'security'],
|
|
400
|
+
'html': ['web_parsing'],
|
|
401
|
+
'beautifulsoup': ['web_parsing'],
|
|
402
|
+
};
|
|
403
|
+
// OPT 6: Pattern relevance by task type
|
|
404
|
+
const TASK_TYPE_PATTERNS = {
|
|
405
|
+
'security': ['P10', 'P20', 'P11'],
|
|
406
|
+
'file-ops': ['P9', 'P35', 'P3', 'P12'],
|
|
407
|
+
'coding': ['P12', 'P16', 'P32', 'P17'],
|
|
408
|
+
'debugging': ['P15', 'P13', 'P3'],
|
|
409
|
+
'git-recovery': ['P22', 'P3'],
|
|
410
|
+
'data-processing': ['P18', 'P31', 'P12'],
|
|
411
|
+
'theorem-proving': ['P5', 'P11'],
|
|
412
|
+
'legacy': ['P3', 'P35'],
|
|
413
|
+
'sysadmin': ['P1', 'P8', 'P4'],
|
|
414
|
+
'ml-training': ['P11', 'P33', 'P30'],
|
|
415
|
+
'testing': ['P13', 'P26', 'P30'],
|
|
153
416
|
};
|
|
154
|
-
//
|
|
155
|
-
// OPTIMIZATION 1: Reduced from 3 to 1.5 - modern models process context faster
|
|
156
|
-
// This prevents timeout regressions like constraints-scheduling
|
|
417
|
+
// Constants
|
|
157
418
|
const MS_PER_TOKEN = 1.5;
|
|
158
|
-
// Historical benefit threshold - below this, skip UAM
|
|
159
419
|
const BENEFIT_THRESHOLD = 0.1;
|
|
160
|
-
// Minimum relevance score to include a section (0-1)
|
|
161
|
-
// OPTIMIZATION 4: Lowered from 0.5 to 0.3 to catch more relevant sections
|
|
162
|
-
// This fixes db-wal-recovery missing file_formats section (was scoring 0.48)
|
|
163
420
|
const RELEVANCE_THRESHOLD = 0.3;
|
|
164
|
-
// Max tokens for time-critical tasks (<120s timeout)
|
|
165
|
-
// OPTIMIZATION 1: Increased from 200 to 300 to allow minimal context even under pressure
|
|
166
421
|
const TIME_CRITICAL_MAX_TOKENS = 300;
|
|
167
|
-
//
|
|
168
|
-
|
|
169
|
-
/**
|
|
170
|
-
* Classify task type from instruction text
|
|
171
|
-
* Requires stronger signals before classifying as low-benefit to avoid skipping useful context
|
|
172
|
-
*/
|
|
173
|
-
export function classifyTask(instruction) {
|
|
422
|
+
// OPT 4: Calculate weighted relevance score for a section
|
|
423
|
+
function calculateSectionRelevance(instruction, sectionConfig) {
|
|
174
424
|
const lower = instruction.toLowerCase();
|
|
175
|
-
|
|
176
|
-
let
|
|
177
|
-
for (const kw of
|
|
178
|
-
if (lower.includes(kw)) {
|
|
179
|
-
|
|
425
|
+
let totalScore = 0;
|
|
426
|
+
let matchCount = 0;
|
|
427
|
+
for (const kw of sectionConfig.keywords) {
|
|
428
|
+
if (lower.includes(kw.toLowerCase())) {
|
|
429
|
+
// OPT 4: Use specificity weight if available, otherwise default to 1
|
|
430
|
+
const weight = sectionConfig.keywordWeights?.[kw] || 1;
|
|
431
|
+
totalScore += weight;
|
|
432
|
+
matchCount++;
|
|
180
433
|
}
|
|
181
434
|
}
|
|
182
|
-
//
|
|
183
|
-
|
|
184
|
-
if (lower.includes(
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
}
|
|
190
|
-
if (lower.includes('elf') || lower.includes('sqlite') || lower.includes('binary')) {
|
|
191
|
-
return 'file-ops';
|
|
192
|
-
}
|
|
193
|
-
if (lower.includes('cobol') || lower.includes('legacy') || lower.includes('modernize')) {
|
|
194
|
-
return 'legacy';
|
|
435
|
+
// Also check global high-benefit keywords with their weights
|
|
436
|
+
for (const [kw, weight] of Object.entries(HIGH_BENEFIT_KEYWORDS)) {
|
|
437
|
+
if (lower.includes(kw.toLowerCase())) {
|
|
438
|
+
// Check if this keyword is relevant to this section
|
|
439
|
+
if (sectionConfig.keywords.some(sk => kw.includes(sk) || sk.includes(kw))) {
|
|
440
|
+
totalScore += weight * 0.5; // Partial bonus for related keywords
|
|
441
|
+
}
|
|
195
442
|
}
|
|
196
443
|
}
|
|
197
|
-
//
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
444
|
+
// Normalize: max possible score is roughly keywords.length * 3 (max weight)
|
|
445
|
+
const maxPossible = sectionConfig.keywords.length * 3;
|
|
446
|
+
return Math.min(totalScore / Math.max(maxPossible * 0.3, 1), 1);
|
|
447
|
+
}
|
|
448
|
+
export function classifyTaskMultiCategory(instruction) {
|
|
449
|
+
const lower = instruction.toLowerCase();
|
|
450
|
+
const categoryScores = {};
|
|
451
|
+
const matchedKeywords = [];
|
|
452
|
+
// Score from high-benefit keywords
|
|
453
|
+
for (const [kw, weight] of Object.entries(HIGH_BENEFIT_KEYWORDS)) {
|
|
454
|
+
if (lower.includes(kw.toLowerCase())) {
|
|
455
|
+
matchedKeywords.push(kw);
|
|
456
|
+
// Map keywords to categories
|
|
457
|
+
if (['password', 'hash', 'crack', 'xss', 'injection', 'sanitize', 'hashcat', 'john', 'bleach', 'dompurify'].some(k => kw.includes(k))) {
|
|
458
|
+
categoryScores['security'] = (categoryScores['security'] || 0) + weight;
|
|
459
|
+
}
|
|
460
|
+
if (['elf', 'sqlite', 'binary', 'wal', 'struct'].some(k => kw.includes(k))) {
|
|
461
|
+
categoryScores['file-ops'] = (categoryScores['file-ops'] || 0) + weight;
|
|
462
|
+
}
|
|
463
|
+
if (['git', 'reflog', 'fsck', 'rebase'].some(k => kw.includes(k))) {
|
|
464
|
+
categoryScores['git-recovery'] = (categoryScores['git-recovery'] || 0) + weight;
|
|
465
|
+
}
|
|
466
|
+
if (['cobol', 'fortran', 'legacy', 'mainframe'].some(k => kw.includes(k))) {
|
|
467
|
+
categoryScores['legacy'] = (categoryScores['legacy'] || 0) + weight;
|
|
468
|
+
}
|
|
469
|
+
if (['coq', 'lean', 'theorem', 'proof', 'induction'].some(k => kw.includes(k))) {
|
|
470
|
+
categoryScores['theorem-proving'] = (categoryScores['theorem-proving'] || 0) + weight;
|
|
471
|
+
}
|
|
472
|
+
if (['parquet', 'shard', 'reshard', 'dataset', 'arrow'].some(k => kw.includes(k))) {
|
|
473
|
+
categoryScores['data-processing'] = (categoryScores['data-processing'] || 0) + weight;
|
|
474
|
+
}
|
|
475
|
+
if (['stockfish', 'chess', 'fen', 'pgn'].some(k => kw.includes(k))) {
|
|
476
|
+
categoryScores['chess'] = (categoryScores['chess'] || 0) + weight;
|
|
477
|
+
}
|
|
202
478
|
}
|
|
203
479
|
}
|
|
204
|
-
//
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
}
|
|
480
|
+
// Fall back to task-classifier
|
|
481
|
+
const baseClassification = classifyTaskType(instruction);
|
|
482
|
+
categoryScores[baseClassification.category] = (categoryScores[baseClassification.category] || 0) + 5;
|
|
483
|
+
// Sort by score
|
|
484
|
+
const sorted = Object.entries(categoryScores)
|
|
485
|
+
.sort(([, a], [, b]) => b - a);
|
|
486
|
+
if (sorted.length === 0) {
|
|
487
|
+
return {
|
|
488
|
+
primary: 'coding',
|
|
489
|
+
secondary: [],
|
|
490
|
+
confidence: 0.5,
|
|
491
|
+
keywords: matchedKeywords,
|
|
492
|
+
};
|
|
218
493
|
}
|
|
219
|
-
|
|
220
|
-
const
|
|
221
|
-
|
|
494
|
+
const [primary, primaryScore] = sorted[0];
|
|
495
|
+
const secondary = sorted.slice(1, 3)
|
|
496
|
+
.filter(([, score]) => score >= primaryScore * 0.4)
|
|
497
|
+
.map(([cat]) => cat);
|
|
498
|
+
const maxPossible = Object.values(HIGH_BENEFIT_KEYWORDS).reduce((a, b) => a + b, 0);
|
|
499
|
+
const confidence = Math.min(primaryScore / (maxPossible * 0.1), 1);
|
|
500
|
+
return {
|
|
501
|
+
primary,
|
|
502
|
+
secondary,
|
|
503
|
+
confidence,
|
|
504
|
+
keywords: matchedKeywords,
|
|
505
|
+
};
|
|
506
|
+
}
|
|
507
|
+
/**
|
|
508
|
+
* Classify task type from instruction text (backward compatible)
|
|
509
|
+
*/
|
|
510
|
+
export function classifyTask(instruction) {
|
|
511
|
+
return classifyTaskMultiCategory(instruction).primary;
|
|
222
512
|
}
|
|
223
513
|
/**
|
|
224
514
|
* Assess time pressure based on timeout and task complexity
|
|
225
515
|
*/
|
|
226
516
|
export function assessTimePressure(timeoutSec, taskType, difficulty = 'medium') {
|
|
227
|
-
// Expected duration multipliers by difficulty
|
|
228
517
|
const difficultyMultiplier = {
|
|
229
518
|
easy: 0.5,
|
|
230
519
|
medium: 1.0,
|
|
231
520
|
hard: 2.0,
|
|
232
521
|
};
|
|
233
|
-
// Base expected duration by task type (seconds)
|
|
234
522
|
const baseDuration = {
|
|
235
523
|
security: 120,
|
|
236
524
|
'file-ops': 90,
|
|
@@ -246,13 +534,14 @@ export function assessTimePressure(timeoutSec, taskType, difficulty = 'medium')
|
|
|
246
534
|
sysadmin: 120,
|
|
247
535
|
'ml-training': 180,
|
|
248
536
|
testing: 60,
|
|
537
|
+
'git-recovery': 90,
|
|
538
|
+
'data-processing': 120,
|
|
539
|
+
'theorem-proving': 180,
|
|
540
|
+
chess: 90,
|
|
249
541
|
unknown: 60,
|
|
250
542
|
};
|
|
251
543
|
const expectedDuration = (baseDuration[taskType] || 60) * (difficultyMultiplier[difficulty] || 1.0);
|
|
252
544
|
const ratio = timeoutSec / expectedDuration;
|
|
253
|
-
// OPTIMIZATION 1: Relaxed thresholds to prevent timeout regressions
|
|
254
|
-
// Critical only when truly out of time (ratio < 1.0 = timeout < expected duration)
|
|
255
|
-
// Previously ratio < 1.2 caused constraints-scheduling to skip UAM and still timeout
|
|
256
545
|
if (ratio < 1.0)
|
|
257
546
|
return 'critical';
|
|
258
547
|
if (ratio < 1.3)
|
|
@@ -262,55 +551,68 @@ export function assessTimePressure(timeoutSec, taskType, difficulty = 'medium')
|
|
|
262
551
|
return 'low';
|
|
263
552
|
}
|
|
264
553
|
/**
 * OPT 1: Get historical benefit from SQLite (persistent).
 *
 * Estimates how much loading UAM context has historically helped this task
 * type, as a normalized improvement ratio. Falls back to static defaults
 * (0.05 for known low-benefit categories, 0.5 otherwise) when fewer than 3
 * attempts are recorded or when the database is unavailable.
 *
 * BUG FIX: rows returned by the SQLite driver carry the snake_case column
 * names used by recordOutcome's INSERT/UPDATE statements (total_attempts,
 * uam_successes, no_uam_successes). The previous code read camelCase
 * properties (row.totalAttempts, ...), which were always undefined, so the
 * "< 3 attempts" guard never fired and the computed rates were NaN.
 *
 * @param {string} taskType - Task category key (e.g. 'security', 'file-ops').
 * @returns {number} Estimated benefit of loading UAM context for this type.
 */
export function getHistoricalBenefit(taskType) {
    try {
        const db = getHistoricalDb();
        const row = db.prepare('SELECT * FROM historical_data WHERE task_type = ?').get(taskType);
        // Use snake_case property names to match the historical_data schema.
        if (!row || row.total_attempts < 3) {
            if (LOW_BENEFIT_CATEGORIES.has(taskType)) {
                return 0.05;
            }
            return 0.5; // Not enough data - assume moderate benefit
        }
        // Attempts are split between UAM and non-UAM runs; approximate each
        // arm's denominator as half the total attempts (floored at 1).
        const uamRate = row.uam_successes / Math.max(row.total_attempts / 2, 1);
        const noUamRate = row.no_uam_successes / Math.max(row.total_attempts / 2, 1);
        if (noUamRate === 0)
            return uamRate > 0 ? 1.0 : 0.5;
        // Normalized improvement; the 0.1 floor avoids blow-up near zero.
        return (uamRate - noUamRate) / Math.max(uamRate, noUamRate, 0.1);
    }
    catch {
        // Fallback to defaults if DB fails
        if (LOW_BENEFIT_CATEGORIES.has(taskType)) {
            return 0.05;
        }
        return 0.5;
    }
}
|
|
283
580
|
/**
|
|
284
|
-
* Record task outcome
|
|
285
|
-
* Also updates model router fingerprints to improve future routing
|
|
581
|
+
* OPT 1: Record task outcome to SQLite (persistent)
|
|
286
582
|
*/
|
|
287
583
|
export function recordOutcome(taskType, usedUam, success, durationMs, modelId) {
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
(data.avgTimeWithUam * (data.uamSuccesses - 1) + durationMs) / data.uamSuccesses;
|
|
584
|
+
try {
|
|
585
|
+
const db = getHistoricalDb();
|
|
586
|
+
// Get existing record or create new
|
|
587
|
+
const existing = db.prepare('SELECT * FROM historical_data WHERE task_type = ?').get(taskType);
|
|
588
|
+
if (existing) {
|
|
589
|
+
// Update existing record
|
|
590
|
+
const stmt = db.prepare(`
|
|
591
|
+
UPDATE historical_data SET
|
|
592
|
+
total_attempts = total_attempts + 1,
|
|
593
|
+
uam_successes = uam_successes + ?,
|
|
594
|
+
no_uam_successes = no_uam_successes + ?,
|
|
595
|
+
avg_time_with_uam = CASE WHEN ? THEN (avg_time_with_uam * uam_successes + ?) / (uam_successes + 1) ELSE avg_time_with_uam END,
|
|
596
|
+
avg_time_without_uam = CASE WHEN ? THEN (avg_time_without_uam * no_uam_successes + ?) / (no_uam_successes + 1) ELSE avg_time_without_uam END,
|
|
597
|
+
last_updated = ?
|
|
598
|
+
WHERE task_type = ?
|
|
599
|
+
`);
|
|
600
|
+
stmt.run(usedUam && success ? 1 : 0, !usedUam && success ? 1 : 0, usedUam && success ? 1 : 0, durationMs, !usedUam && success ? 1 : 0, durationMs, Date.now(), taskType);
|
|
306
601
|
}
|
|
307
602
|
else {
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
603
|
+
// Insert new record
|
|
604
|
+
const stmt = db.prepare(`
|
|
605
|
+
INSERT INTO historical_data (task_type, total_attempts, uam_successes, no_uam_successes, avg_time_with_uam, avg_time_without_uam, last_updated)
|
|
606
|
+
VALUES (?, 1, ?, ?, ?, ?, ?)
|
|
607
|
+
`);
|
|
608
|
+
stmt.run(taskType, usedUam && success ? 1 : 0, !usedUam && success ? 1 : 0, usedUam && success ? durationMs : 0, !usedUam && success ? durationMs : 0, Date.now());
|
|
311
609
|
}
|
|
312
610
|
}
|
|
313
|
-
|
|
611
|
+
catch (err) {
|
|
612
|
+
// Log but don't throw - recording should not block execution
|
|
613
|
+
console.warn('Failed to record outcome:', err);
|
|
614
|
+
}
|
|
615
|
+
// OPT 8: Also update model router fingerprints
|
|
314
616
|
if (modelId) {
|
|
315
617
|
const validModelIds = ['glm-4.7', 'gpt-5.2', 'claude-opus-4.5', 'gpt-5.2-codex'];
|
|
316
618
|
if (validModelIds.includes(modelId)) {
|
|
@@ -319,42 +621,109 @@ export function recordOutcome(taskType, usedUam, success, durationMs, modelId) {
|
|
|
319
621
|
}
|
|
320
622
|
}
|
|
321
623
|
/**
 * OPT 10: Cache lookup for similar tasks.
 *
 * Looks up a previously stored context decision for an identical instruction
 * hash. Only entries with a success rate of at least 0.5 qualify; ties are
 * broken by higher success rate, then higher use count. On a hit, usage
 * statistics (last_used, use_count) are bumped before returning.
 *
 * @param {string} instructionHash - Hash identifying the instruction text.
 * @returns {object|null} The cached decision object, or null on miss/error.
 */
export function lookupSemanticCache(instructionHash) {
    try {
        const db = getHistoricalDb();
        const hit = db.prepare(`
      SELECT decision_json, success_rate
      FROM semantic_cache
      WHERE instruction_hash = ? AND success_rate >= 0.5
      ORDER BY success_rate DESC, use_count DESC
      LIMIT 1
    `).get(instructionHash);
        if (!hit) {
            return null;
        }
        // Record the hit so frequently used entries rank higher next time.
        db.prepare(`
      UPDATE semantic_cache
      SET last_used = ?, use_count = use_count + 1
      WHERE instruction_hash = ?
    `).run(Date.now(), instructionHash);
        return JSON.parse(hit.decision_json);
    }
    catch {
        // Cache miss
        return null;
    }
}
|
|
651
|
+
/**
 * OPT 10: Store decision in semantic cache.
 *
 * Upserts a context decision keyed by cacheKey. Existing entries blend the
 * new outcome into their success rate via an exponential moving average
 * (old * 0.8 + outcome * 0.2); fresh entries start at 1.0 on success or a
 * neutral 0.5 otherwise. Errors are logged and swallowed so caching can
 * never block task execution.
 *
 * @param {string} cacheKey - Unique key for this cache entry.
 * @param {string} instructionHash - Hash of the instruction text.
 * @param {object} decision - Context decision object to serialize.
 * @param {boolean} success - Whether the decision led to a successful run.
 */
export function storeSemanticCache(cacheKey, instructionHash, decision, success) {
    try {
        const db = getHistoricalDb();
        const serialized = JSON.stringify(decision);
        const now = Date.now();
        const found = db.prepare('SELECT * FROM semantic_cache WHERE cache_key = ?').get(cacheKey);
        if (found) {
            // Update success rate with exponential moving average
            db.prepare(`
        UPDATE semantic_cache SET
          decision_json = ?,
          success_rate = success_rate * 0.8 + ? * 0.2,
          last_used = ?,
          use_count = use_count + 1
        WHERE cache_key = ?
      `).run(serialized, success ? 1.0 : 0.0, now, cacheKey);
        }
        else {
            db.prepare(`
        INSERT INTO semantic_cache (cache_key, instruction_hash, decision_json, success_rate, created_at, last_used, use_count)
        VALUES (?, ?, ?, ?, ?, ?, 1)
      `).run(cacheKey, instructionHash, serialized, success ? 1.0 : 0.5, now, now);
        }
    }
    catch (err) {
        console.warn('Failed to store in semantic cache:', err);
    }
}
|
|
334
680
|
/**
 * Select relevant context sections based on task type and instruction.
 * OPT 5: Returns at least 2 sections for minimal mode.
 *
 * Scores every known context section against the instruction, keeps those at
 * or above RELEVANCE_THRESHOLD (highest score first), then guarantees the
 * default sections for the primary task type and (OPT 9) for any secondary
 * categories, appending only the ones not already selected.
 *
 * @param {string} instruction - Raw task instruction text.
 * @param {string} taskType - Primary task category.
 * @param {string[]} [secondaryCategories] - Optional additional categories.
 * @returns {string[]} Ordered list of section names to load.
 */
export function selectRelevantSections(instruction, taskType, secondaryCategories) {
    const scored = [];
    for (const [name, config] of Object.entries(CONTEXT_SECTIONS)) {
        const relevance = calculateSectionRelevance(instruction, config);
        if (relevance >= RELEVANCE_THRESHOLD) {
            scored.push({ name, score: relevance, patterns: config.patterns });
        }
    }
    // Highest relevance first.
    scored.sort((a, b) => b.score - a.score);
    const sections = scored.map((entry) => entry.name);
    const addIfMissing = (section) => {
        if (!sections.includes(section))
            sections.push(section);
    };
    // Default sections guaranteed for each primary task type.
    const primaryDefaults = {
        security: ['security'],
        'file-ops': ['file_formats'],
        legacy: ['legacy'],
        'git-recovery': ['git_recovery'],
        'data-processing': ['data_processing'],
        'theorem-proving': ['theorem_proving'],
        chess: ['chess_vision', 'domain_libraries'],
    };
    for (const section of primaryDefaults[taskType] ?? []) {
        addIfMissing(section);
    }
    // OPT 9: Add sections for secondary categories too (narrower mapping).
    const secondaryDefaults = {
        security: 'security',
        'file-ops': 'file_formats',
        'git-recovery': 'git_recovery',
    };
    for (const cat of secondaryCategories ?? []) {
        const section = secondaryDefaults[cat];
        if (section)
            addIfMissing(section);
    }
    return sections;
}
|
|
@@ -368,18 +737,32 @@ export function calculateOverhead(sections) {
|
|
|
368
737
|
}
|
|
369
738
|
return totalTokens * MS_PER_TOKEN;
|
|
370
739
|
}
|
|
740
|
+
/**
 * OPT 6: Get relevant patterns for task type.
 *
 * Collects the deduplicated union of the pattern names registered for the
 * task type (TASK_TYPE_PATTERNS) and those declared by each selected
 * context section, preserving first-seen order.
 *
 * @param {string} taskType - Primary task category.
 * @param {string[]} sections - Section names chosen for this task.
 * @returns {string[]} Unique pattern identifiers.
 */
export function getRelevantPatterns(taskType, sections) {
    // Seed with the task type's own patterns (if any).
    const collected = new Set(TASK_TYPE_PATTERNS[taskType] || []);
    // Merge in the patterns declared by every selected section.
    for (const name of sections) {
        const config = CONTEXT_SECTIONS[name];
        for (const pattern of config?.patterns ?? []) {
            collected.add(pattern);
        }
    }
    return [...collected];
}
|
|
371
759
|
/**
|
|
372
760
|
* Main decision function: determine optimal context level using hybrid approach
|
|
373
|
-
*
|
|
374
|
-
* Decision Matrix:
|
|
375
|
-
* 1. Task type is pure reasoning → skip UAM
|
|
376
|
-
* 2. Historical benefit < threshold → skip UAM
|
|
377
|
-
* 3. Critical time pressure → skip UAM
|
|
378
|
-
* 4. High time pressure → minimal UAM (essential only)
|
|
379
|
-
* 5. Default → full UAM with relevant sections
|
|
380
761
|
*/
|
|
381
762
|
export function decideContextLevel(instruction, metadata = {}) {
|
|
382
|
-
|
|
763
|
+
// OPT 9: Use multi-category classification
|
|
764
|
+
const multiClass = classifyTaskMultiCategory(instruction);
|
|
765
|
+
const taskType = multiClass.primary;
|
|
383
766
|
const timeoutSec = metadata.timeout_sec || 300;
|
|
384
767
|
const difficulty = metadata.difficulty || 'medium';
|
|
385
768
|
// Factor 1: Task classification - skip for pure reasoning
|
|
@@ -392,14 +775,14 @@ export function decideContextLevel(instruction, metadata = {}) {
|
|
|
392
775
|
taskType,
|
|
393
776
|
timePressure: 'low',
|
|
394
777
|
historicalBenefit: 0,
|
|
778
|
+
secondaryCategories: multiClass.secondary,
|
|
395
779
|
};
|
|
396
780
|
}
|
|
397
781
|
// Factor 2: Time pressure assessment
|
|
398
782
|
const timePressure = assessTimePressure(timeoutSec, taskType, difficulty);
|
|
399
|
-
// Factor 3: Historical benefit
|
|
783
|
+
// Factor 3: Historical benefit (now from SQLite - OPT 1)
|
|
400
784
|
const historicalBenefit = metadata.historical_uam_benefit ?? getHistoricalBenefit(taskType);
|
|
401
785
|
// Factor 4: Check if historical data suggests skipping UAM
|
|
402
|
-
// BUT never skip for categories that are proven to benefit from domain knowledge
|
|
403
786
|
if (historicalBenefit < BENEFIT_THRESHOLD && !ALWAYS_BENEFIT_CATEGORIES.has(taskType)) {
|
|
404
787
|
return {
|
|
405
788
|
level: 'none',
|
|
@@ -409,6 +792,7 @@ export function decideContextLevel(instruction, metadata = {}) {
|
|
|
409
792
|
taskType,
|
|
410
793
|
timePressure,
|
|
411
794
|
historicalBenefit,
|
|
795
|
+
secondaryCategories: multiClass.secondary,
|
|
412
796
|
};
|
|
413
797
|
}
|
|
414
798
|
// Factor 5: Critical time pressure - skip UAM
|
|
@@ -421,11 +805,14 @@ export function decideContextLevel(instruction, metadata = {}) {
|
|
|
421
805
|
taskType,
|
|
422
806
|
timePressure,
|
|
423
807
|
historicalBenefit,
|
|
808
|
+
secondaryCategories: multiClass.secondary,
|
|
424
809
|
};
|
|
425
810
|
}
|
|
426
|
-
// Factor 6: Select relevant sections
|
|
427
|
-
const relevantSections = selectRelevantSections(instruction, taskType);
|
|
811
|
+
// Factor 6: Select relevant sections (OPT 9: including secondary categories)
|
|
812
|
+
const relevantSections = selectRelevantSections(instruction, taskType, multiClass.secondary);
|
|
428
813
|
const estimatedOverhead = calculateOverhead(relevantSections);
|
|
814
|
+
// OPT 6: Get relevant patterns
|
|
815
|
+
const relevantPatterns = getRelevantPatterns(taskType, relevantSections);
|
|
429
816
|
// Factor 7: Check if overhead fits within time budget
|
|
430
817
|
const overheadRatio = estimatedOverhead / (timeoutSec * 1000);
|
|
431
818
|
// Time-critical tasks (<120s): cap overhead to TIME_CRITICAL_MAX_TOKENS
|
|
@@ -447,11 +834,13 @@ export function decideContextLevel(instruction, metadata = {}) {
|
|
|
447
834
|
taskType,
|
|
448
835
|
timePressure,
|
|
449
836
|
historicalBenefit,
|
|
837
|
+
secondaryCategories: multiClass.secondary,
|
|
838
|
+
relevantPatterns,
|
|
450
839
|
};
|
|
451
840
|
}
|
|
841
|
+
// OPT 5: Use 2 sections instead of 1 for minimal mode
|
|
452
842
|
if (timePressure === 'high' || overheadRatio > 0.1) {
|
|
453
|
-
|
|
454
|
-
const minimalSections = relevantSections.slice(0, 1);
|
|
843
|
+
const minimalSections = relevantSections.slice(0, 2); // Changed from 1 to 2
|
|
455
844
|
return {
|
|
456
845
|
level: 'minimal',
|
|
457
846
|
sections: minimalSections,
|
|
@@ -460,6 +849,8 @@ export function decideContextLevel(instruction, metadata = {}) {
|
|
|
460
849
|
taskType,
|
|
461
850
|
timePressure,
|
|
462
851
|
historicalBenefit,
|
|
852
|
+
secondaryCategories: multiClass.secondary,
|
|
853
|
+
relevantPatterns,
|
|
463
854
|
};
|
|
464
855
|
}
|
|
465
856
|
// Default: Full context for everything else
|
|
@@ -471,6 +862,8 @@ export function decideContextLevel(instruction, metadata = {}) {
|
|
|
471
862
|
taskType,
|
|
472
863
|
timePressure,
|
|
473
864
|
historicalBenefit,
|
|
865
|
+
secondaryCategories: multiClass.secondary,
|
|
866
|
+
relevantPatterns,
|
|
474
867
|
};
|
|
475
868
|
}
|
|
476
869
|
/**
|
|
@@ -487,22 +880,31 @@ export function generateContext(decision) {
|
|
|
487
880
|
contextParts.push(sectionConfig.content);
|
|
488
881
|
}
|
|
489
882
|
}
|
|
883
|
+
// OPT 6: Add relevant patterns hint
|
|
884
|
+
if (decision.relevantPatterns && decision.relevantPatterns.length > 0) {
|
|
885
|
+
contextParts.push(`\n### Relevant Patterns: ${decision.relevantPatterns.join(', ')}`);
|
|
886
|
+
}
|
|
490
887
|
return contextParts.join('\n');
|
|
491
888
|
}
|
|
492
889
|
/**
|
|
493
|
-
*
|
|
494
|
-
*
|
|
495
|
-
* Returns context levels to try in order based on initial failure analysis.
|
|
890
|
+
* OPT 7: Enhanced progressive context strategy with error-to-section mapping
|
|
496
891
|
*/
|
|
497
892
|
export function getProgressiveContextLevels(instruction, initialError, metadata = {}) {
|
|
498
893
|
const decision = decideContextLevel(instruction, metadata);
|
|
499
|
-
// If we already decided 'none' for a good reason, don't retry with more
|
|
500
894
|
if (decision.level === 'none' && LOW_BENEFIT_CATEGORIES.has(decision.taskType)) {
|
|
501
|
-
return ['none'];
|
|
895
|
+
return ['none'];
|
|
502
896
|
}
|
|
503
|
-
// Analyze error to see if context might help
|
|
504
897
|
const errorLower = initialError.toLowerCase();
|
|
505
|
-
|
|
898
|
+
// OPT 7: Check error-to-section mapping for targeted escalation
|
|
899
|
+
let suggestedSections = [];
|
|
900
|
+
for (const [errorPattern, sections] of Object.entries(ERROR_SECTION_MAPPING)) {
|
|
901
|
+
if (errorLower.includes(errorPattern)) {
|
|
902
|
+
suggestedSections.push(...sections);
|
|
903
|
+
}
|
|
904
|
+
}
|
|
905
|
+
// Standard context-might-help checks
|
|
906
|
+
const contextMightHelp = suggestedSections.length > 0 ||
|
|
907
|
+
errorLower.includes('unknown') ||
|
|
506
908
|
errorLower.includes('how to') ||
|
|
507
909
|
errorLower.includes('what is') ||
|
|
508
910
|
errorLower.includes('command not found') ||
|
|
@@ -510,7 +912,7 @@ export function getProgressiveContextLevels(instruction, initialError, metadata
|
|
|
510
912
|
errorLower.includes('format') ||
|
|
511
913
|
errorLower.includes('parse');
|
|
512
914
|
if (!contextMightHelp) {
|
|
513
|
-
return [decision.level];
|
|
915
|
+
return [decision.level];
|
|
514
916
|
}
|
|
515
917
|
// Progressive escalation based on starting point
|
|
516
918
|
switch (decision.level) {
|
|
@@ -519,11 +921,26 @@ export function getProgressiveContextLevels(instruction, initialError, metadata
|
|
|
519
921
|
case 'minimal':
|
|
520
922
|
return ['minimal', 'full'];
|
|
521
923
|
case 'full':
|
|
522
|
-
return ['full'];
|
|
924
|
+
return ['full'];
|
|
523
925
|
default:
|
|
524
926
|
return ['none', 'minimal', 'full'];
|
|
525
927
|
}
|
|
526
928
|
}
|
|
929
|
+
/**
 * OPT 7: Get additional sections to add based on error analysis.
 *
 * Scans the lowercased error text for every known error substring in
 * ERROR_SECTION_MAPPING and returns the deduplicated union of the context
 * sections mapped to each match.
 *
 * @param {string} error - Raw error message from a failed attempt.
 * @returns {string[]} Unique section names suggested by the error.
 */
export function getSectionsForError(error) {
    const needle = error.toLowerCase();
    const matched = new Set();
    for (const [pattern, sectionList] of Object.entries(ERROR_SECTION_MAPPING)) {
        if (!needle.includes(pattern)) {
            continue;
        }
        for (const section of sectionList) {
            matched.add(section);
        }
    }
    return [...matched];
}
|
|
527
944
|
/**
|
|
528
945
|
* Export configuration for Python agent integration
|
|
529
946
|
*/
|
|
@@ -538,12 +955,53 @@ export function exportConfigForPython(instruction, metadata = {}) {
|
|
|
538
955
|
taskType: decision.taskType,
|
|
539
956
|
timePressure: decision.timePressure,
|
|
540
957
|
historicalBenefit: decision.historicalBenefit,
|
|
958
|
+
secondaryCategories: decision.secondaryCategories,
|
|
959
|
+
relevantPatterns: decision.relevantPatterns,
|
|
541
960
|
context,
|
|
542
961
|
}, null, 2);
|
|
543
962
|
}
|
|
963
|
+
/**
 * OPT 8: Get model fingerprint for routing integration.
 *
 * Scans the known model fingerprints for per-category success statistics and
 * recommends the model with the best success rate for the given task type.
 * A model is only considered once it has at least 3 recorded attempts.
 *
 * @param {string} taskType - Task category to route for.
 * @returns {{recommended: string, reason: string}|null} Best model with a
 *          human-readable justification, or null when no model qualifies.
 */
export function getModelFingerprintForTask(taskType) {
    const candidates = ['claude-opus-4.5', 'gpt-5.2', 'glm-4.7', 'gpt-5.2-codex'];
    let best = { model: 'claude-opus-4.5', score: 0 };
    for (const modelId of candidates) {
        const fingerprint = getModelFingerprint(modelId);
        const stats = fingerprint?.categoryStats?.[taskType];
        // Require a minimum sample size before trusting the success rate.
        if (!stats || stats.attempts < 3) {
            continue;
        }
        const rate = stats.successes / stats.attempts;
        if (rate > best.score) {
            best = { model: modelId, score: rate };
        }
    }
    if (best.score > 0) {
        return {
            recommended: best.model,
            reason: `${best.model} has ${(best.score * 100).toFixed(0)}% success rate for ${taskType} tasks`,
        };
    }
    return null;
}
|
|
992
|
+
/**
 * Close database connection (for cleanup).
 *
 * Closes the module-level SQLite handle if one is open and clears the
 * reference. Safe to call when no connection exists.
 */
export function closeHistoricalDb() {
    if (!historicalDb) {
        return;
    }
    historicalDb.close();
    historicalDb = null;
}
|
|
544
1001
|
// Export main interface
|
|
545
1002
|
export const HybridAdaptiveContext = {
|
|
546
1003
|
classifyTask,
|
|
1004
|
+
classifyTaskMultiCategory,
|
|
547
1005
|
assessTimePressure,
|
|
548
1006
|
getHistoricalBenefit,
|
|
549
1007
|
recordOutcome,
|
|
@@ -552,7 +1010,13 @@ export const HybridAdaptiveContext = {
|
|
|
552
1010
|
selectRelevantSections,
|
|
553
1011
|
calculateOverhead,
|
|
554
1012
|
getProgressiveContextLevels,
|
|
1013
|
+
getSectionsForError,
|
|
1014
|
+
getRelevantPatterns,
|
|
555
1015
|
exportConfigForPython,
|
|
1016
|
+
lookupSemanticCache,
|
|
1017
|
+
storeSemanticCache,
|
|
1018
|
+
getModelFingerprintForTask,
|
|
1019
|
+
closeHistoricalDb,
|
|
556
1020
|
};
|
|
557
1021
|
export default HybridAdaptiveContext;
|
|
558
1022
|
//# sourceMappingURL=adaptive-context.js.map
|