@getmikk/ai-context 1.9.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@getmikk/ai-context",
3
- "version": "1.9.0",
3
+ "version": "2.0.0",
4
4
  "license": "Apache-2.0",
5
5
  "repository": {
6
6
  "type": "git",
@@ -21,8 +21,8 @@
21
21
  "dev": "tsc --watch"
22
22
  },
23
23
  "dependencies": {
24
- "@getmikk/core": "^1.9.0",
25
- "@getmikk/intent-engine": "^1.9.0"
24
+ "@getmikk/core": "^2.0.0",
25
+ "@getmikk/intent-engine": "^2.0.0"
26
26
  },
27
27
  "devDependencies": {
28
28
  "typescript": "^5.7.0",
@@ -373,16 +373,24 @@ export class ContextBuilder {
373
373
 
374
374
  // ── Step 5: Fill token budget ──────────────────────────────────────
375
375
  let selected: MikkLockFunction[] = []
376
+
377
+ // Pre-calculate baseline overhead (context files, routes, constraints)
376
378
  let usedTokens = 0
379
+ const routesStr = (!strictMode && this.lock.routes) ? JSON.stringify(this.lock.routes) : ''
380
+ const ctxStr = (!strictMode && this.lock.contextFiles)
381
+ ? this.lock.contextFiles.map(cf => readContextFile(cf.path, query.projectRoot).slice(0, 2000)).join('\n')
382
+ : ''
383
+ usedTokens += estimateTokens(routesStr + ctxStr + JSON.stringify(this.contract.declared.constraints))
377
384
 
378
385
  for (const { fn, score } of scored) {
379
386
  if (score <= 0 && seeds.length > 0) break // Nothing relevant left
380
387
  if (selected.length >= (query.maxFunctions ?? 80)) break
381
388
 
382
389
  const snippet = this.buildFunctionSnippet(fn, query)
383
- const tokens = estimateTokens(snippet)
390
+ // Multiply tokens by 2.2 to account for it being in both JSON and text prompt, plus JSON framing
391
+ const tokens = estimateTokens(snippet) * 2.2
384
392
 
385
- if (usedTokens + tokens > tokenBudget) continue // skip, try smaller ones later
393
+ if (usedTokens + tokens > tokenBudget && selected.length > 0) continue // skip, try smaller ones later
386
394
  selected.push(fn)
387
395
  usedTokens += tokens
388
396
  }
@@ -1,224 +1,157 @@
1
1
  /**
2
- * Improved Token Counter
3
- *
4
- * Provides more accurate token counting than the simple length/4 approximation.
5
- * Uses a GPT-4 compatible tokenizer approximation for better budget management.
2
+ * Token Counter — accurate, fast token estimation for context budget management.
3
+ *
4
+ * Design:
5
+ * - `countTokens(text)` — accurate, linear-scan, O(n)
6
+ * - `countTokensFast(text)` — single-pass heuristic, O(n) for hot paths
7
+ * - `estimateFileTokens(content, path)` — file-type-aware wrapper
8
+ * - `TokenBudget` — budget manager with truncation
9
+ *
10
+ * The previous implementation used a character-position Set to track processed
11
+ * ranges across multiple regex scans — O(n²) per call on large files.
12
+ * Replaced with a single linear scan that categorises characters without
13
+ * per-character Set lookups.
6
14
  */
7
15
 
8
- // Character-based token approximation (more accurate than simple division)
9
- const CHARS_PER_TOKEN = 3.8 // Average for GPT-4 tokenizer
10
- const MIN_CHARS_PER_TOKEN = 2.0 // For dense code
11
- const MAX_CHARS_PER_TOKEN = 6.0 // For sparse text
12
-
13
- // Special token patterns that affect tokenization
14
- const TOKEN_PATTERNS = {
15
- // Common programming patterns that typically tokenize as single tokens
16
- SINGLE_TOKEN_PATTERNS: [
17
- /\b(if|else|for|while|function|return|const|let|var|class|import|export)\b/g,
18
- /\b(true|false|null|undefined)\b/g,
19
- /\b(async|await|try|catch|throw|new|this)\b/g,
20
- // Operators and punctuation
21
- /[+\-*\/=<>!&|]+/g,
22
- /[{}()\[\];,\.]/g,
23
- // Common function names
24
- /\b(console\.log|console\.error|console\.warn)\b/g,
25
- /\b(Math\.(floor|ceil|round|max|min))\b/g,
26
- ],
27
-
28
- // Patterns that typically increase token count
29
- HIGH_TOKEN_PATTERNS: [
30
- // String literals (each character ~0.25 tokens)
31
- /'[^']*'/g,
32
- /"[^"]*"/g,
33
- /`[^`]*`/g,
34
- // Numbers (digits ~0.5 tokens each)
35
- /\b\d+\.?\d*\b/g,
36
- // Long identifiers (split into multiple tokens)
37
- /\b[a-z][a-zA-Z0-9]{8,}\b/g,
38
- ]
39
- }
16
+ const CHARS_PER_TOKEN = 3.8 // GPT-4 average
17
+ const MIN_CHARS_PER_TOKEN = 2.0 // Dense code
18
+ const MAX_CHARS_PER_TOKEN = 6.0 // Sparse natural language
40
19
 
41
20
  /**
42
- * Count tokens with improved accuracy using position-based pattern matching
21
+ * Count tokens with reasonable accuracy — O(n) single linear scan.
22
+ *
23
+ * Classifies runs of characters into:
24
+ * - whitespace: free (separators, not tokens)
25
+ * - string literals: ~4 chars/token
26
+ * - digit runs: ~2 chars/token (numbers tokenise finely)
27
+ * - identifiers/keywords: short → 1 token, long → ~3.5 chars/token
28
+ * - operators/punctuation: 1 char = 1 token
43
29
  */
44
30
  export function countTokens(text: string): number {
45
- if (!text || text.length === 0) return 0
46
-
47
- let tokenCount = 0
48
- const processedPositions = new Set<number>() // Track positions to avoid double-counting
49
-
50
- // Count single-token patterns with position tracking
51
- for (const pattern of TOKEN_PATTERNS.SINGLE_TOKEN_PATTERNS) {
52
- for (const match of text.matchAll(pattern)) {
53
- const start = match.index!
54
- const end = start + match[0].length
55
-
56
- // Check if this range overlaps with already processed ranges
57
- let overlaps = false
58
- for (let i = start; i < end; i++) {
59
- if (processedPositions.has(i)) {
60
- overlaps = true
61
- break
62
- }
63
- }
64
-
65
- if (!overlaps) {
66
- tokenCount += 1
67
- // Mark positions as processed
68
- for (let i = start; i < end; i++) {
69
- processedPositions.add(i)
70
- }
71
- }
31
+ if (!text) return 0
32
+
33
+ let tokens = 0
34
+ let i = 0
35
+ const n = text.length
36
+
37
+ while (i < n) {
38
+ const ch = text[i]
39
+
40
+ // Whitespace — boundary only, no token cost
41
+ if (ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r') {
42
+ i++
43
+ continue
72
44
  }
73
- }
74
-
75
- // Count high-token patterns (strings, numbers, long identifiers)
76
- for (const pattern of TOKEN_PATTERNS.HIGH_TOKEN_PATTERNS) {
77
- for (const match of text.matchAll(pattern)) {
78
- const start = match.index!
79
- const end = start + match[0].length
80
-
81
- // Check for overlaps
82
- let overlaps = false
83
- for (let i = start; i < end; i++) {
84
- if (processedPositions.has(i)) {
85
- overlaps = true
86
- break
87
- }
45
+
46
+ // String literals — scan to closing quote
47
+ if (ch === '"' || ch === "'" || ch === '`') {
48
+ const q = ch
49
+ let len = 1
50
+ i++
51
+ while (i < n) {
52
+ if (text[i] === '\\') { i += 2; len += 2; continue }
53
+ if (text[i] === q) { i++; len++; break }
54
+ i++; len++
88
55
  }
89
-
90
- if (!overlaps) {
91
- let tokensToAdd = 0
92
- if (match[0].startsWith('\'') || match[0].startsWith('"') || match[0].startsWith('`')) {
93
- // String literal: roughly 1 token per 4 characters
94
- tokensToAdd = Math.ceil(match[0].length / 4)
95
- } else if (/^\d/.test(match[0])) {
96
- // Number: roughly 1 token per 2 digits
97
- tokensToAdd = Math.ceil(match[0].length / 2)
98
- } else {
99
- // Long identifier: roughly 1 token per 6 characters
100
- tokensToAdd = Math.ceil(match[0].length / 6)
101
- }
102
-
103
- tokenCount += tokensToAdd
104
- // Mark positions as processed
105
- for (let i = start; i < end; i++) {
106
- processedPositions.add(i)
107
- }
56
+ tokens += Math.max(1, Math.ceil(len / 4))
57
+ continue
58
+ }
59
+
60
+ // Digit runs — token-heavy
61
+ if (ch >= '0' && ch <= '9') {
62
+ let len = 0
63
+ while (i < n && ((text[i] >= '0' && text[i] <= '9') || text[i] === '.')) {
64
+ i++; len++
108
65
  }
66
+ tokens += Math.max(1, Math.ceil(len / 2))
67
+ continue
109
68
  }
110
- }
111
-
112
- // Count remaining characters (general text)
113
- const remainingText = Array.from(text.split(''))
114
- .map((char, index) => processedPositions.has(index) ? '' : char)
115
- .join('')
116
-
117
- if (remainingText.length > 0) {
118
- // Use variable rate based on character density
119
- const avgWordLength = remainingText.split(/\s+/).reduce((sum, word) => sum + word.length, 0) / Math.max(remainingText.split(/\s+/).length, 1)
120
-
121
- let charsPerToken = CHARS_PER_TOKEN
122
- if (avgWordLength < 4) {
123
- charsPerToken = MIN_CHARS_PER_TOKEN // Dense code
124
- } else if (avgWordLength > 8) {
125
- charsPerToken = MAX_CHARS_PER_TOKEN // Sparse text
69
+
70
+ // Identifier / keyword runs
71
+ if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch === '_' || ch === '$') {
72
+ let len = 0
73
+ while (
74
+ i < n &&
75
+ ((text[i] >= 'a' && text[i] <= 'z') || (text[i] >= 'A' && text[i] <= 'Z') ||
76
+ (text[i] >= '0' && text[i] <= '9') || text[i] === '_' || text[i] === '$')
77
+ ) { i++; len++ }
78
+ tokens += len <= 6 ? 1 : Math.ceil(len / 3.5)
79
+ continue
126
80
  }
127
-
128
- tokenCount += Math.ceil(remainingText.length / charsPerToken)
81
+
82
+ // Operators, punctuation, brackets — 1 char per token
83
+ tokens++
84
+ i++
129
85
  }
130
-
131
- // Apply bounds checking for sanity
86
+
132
87
  const minEstimate = Math.ceil(text.length / MAX_CHARS_PER_TOKEN)
133
88
  const maxEstimate = Math.ceil(text.length / MIN_CHARS_PER_TOKEN)
134
-
135
- return Math.max(minEstimate, Math.min(maxEstimate, tokenCount))
89
+ return Math.max(minEstimate, Math.min(maxEstimate, tokens))
136
90
  }
137
91
 
138
92
  /**
139
- * Fast token count for quick estimates (still more accurate than length/4)
93
+ * Fast O(n) single-pass heuristic for hot paths (context builder scoring loops).
140
94
  */
141
95
  export function countTokensFast(text: string): number {
142
- if (!text || text.length === 0) return 0
143
-
144
- // Quick heuristic based on character patterns
145
- const codeDensity = (text.match(/[a-zA-Z0-9]/g) || []).length / text.length
146
- const stringRatio = (text.match(/['"`]/g) || []).length / text.length
147
-
148
- // Adjust chars per token based on content type
149
- let charsPerToken = CHARS_PER_TOKEN
150
- if (codeDensity > 0.7) {
151
- charsPerToken = 3.2 // Dense code
152
- } else if (stringRatio > 0.2) {
153
- charsPerToken = 4.5 // String-heavy
154
- } else if (codeDensity < 0.3) {
155
- charsPerToken = 5.0 // Sparse text/comments
96
+ if (!text) return 0
97
+
98
+ let alphaNum = 0, punct = 0
99
+ for (let i = 0; i < text.length; i++) {
100
+ const c = text.charCodeAt(i)
101
+ if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122) || (c >= 48 && c <= 57)) {
102
+ alphaNum++
103
+ } else if (c !== 32 && c !== 9 && c !== 10 && c !== 13) {
104
+ punct++
105
+ }
156
106
  }
157
-
158
- return Math.ceil(text.length / charsPerToken)
107
+
108
+ const nonWs = alphaNum + punct
109
+ if (nonWs === 0) return 0
110
+
111
+ const punctRatio = nonWs > 0 ? punct / nonWs : 0
112
+ const charsPerToken = punctRatio > 0.3 ? 2.8 : CHARS_PER_TOKEN
113
+ return Math.max(1, Math.ceil(text.length / charsPerToken))
159
114
  }
160
115
 
161
116
  /**
162
- * Estimate tokens for a file with content type awareness
117
+ * Estimate tokens for a file with content-type awareness.
163
118
  */
164
119
  export function estimateFileTokens(content: string, filePath: string): number {
165
- const extension = filePath.split('.').pop()?.toLowerCase()
166
-
167
- // Adjust counting based on file type
168
- switch (extension) {
169
- case 'json':
170
- // JSON is token-heavy due to strings and structure
171
- return countTokens(content) * 1.1
172
- case 'md':
173
- // Markdown has more natural language
174
- return countTokens(content) * 0.9
175
- case 'ts':
176
- case 'tsx':
177
- case 'js':
178
- case 'jsx':
179
- // Code files benefit from pattern recognition
180
- return countTokens(content)
181
- default:
182
- // Use standard counting for unknown types
183
- return countTokens(content)
184
- }
120
+ const ext = filePath.split('.').pop()?.toLowerCase()
121
+ if (ext === 'md') return Math.ceil(countTokens(content) * 0.9)
122
+ return countTokens(content)
185
123
  }
186
124
 
187
125
  /**
188
- * Token budget manager with overflow protection
126
+ * Token budget manager — tracks usage and truncates content to fit.
189
127
  */
190
128
  export class TokenBudget {
191
- constructor(private maxTokens: number, private overflowAllowance: number = 0.1) {}
192
-
193
- /**
194
- * Check if content fits within budget
195
- */
129
+ private used = 0
130
+
131
+ constructor(
132
+ private readonly maxTokens: number,
133
+ private readonly overflowAllowance: number = 0.1,
134
+ ) {}
135
+
136
+ get remaining(): number {
137
+ return Math.max(0, this.maxTokens - this.used)
138
+ }
139
+
196
140
  fits(content: string): boolean {
197
- const tokens = countTokens(content)
198
- return tokens <= this.maxTokens * (1 + this.overflowAllowance)
141
+ return countTokensFast(content) <= this.remaining * (1 + this.overflowAllowance)
199
142
  }
200
-
201
- /**
202
- * Get remaining token count
203
- */
204
- remaining(usedTokens: number): number {
205
- return Math.max(0, this.maxTokens - usedTokens)
143
+
144
+ consume(tokens: number): boolean {
145
+ this.used += tokens
146
+ return this.used <= this.maxTokens * (1 + this.overflowAllowance)
206
147
  }
207
-
208
- /**
209
- * Truncate content to fit within budget
210
- */
211
- truncate(content: string, usedTokens: number = 0): string {
212
- const available = this.remaining(usedTokens)
213
- if (available <= 0) return ''
214
-
215
- const estimatedTokens = countTokens(content)
216
- if (estimatedTokens <= available) return content
217
-
218
- // Rough truncation based on character ratio
219
- const ratio = available / estimatedTokens
220
- const truncateAt = Math.floor(content.length * ratio * 0.9) // 10% buffer
221
-
222
- return content.substring(0, truncateAt) + '\n... [truncated due to token budget]'
148
+
149
+ truncate(content: string): string {
150
+ if (this.remaining <= 0) return ''
151
+ const estimated = countTokensFast(content)
152
+ if (estimated <= this.remaining) return content
153
+ const ratio = this.remaining / estimated
154
+ const cutAt = Math.floor(content.length * ratio * 0.9)
155
+ return content.slice(0, cutAt) + '\n… [truncated — token budget reached]'
223
156
  }
224
157
  }