repoburg 1.3.11 → 1.3.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/backend/dist/packages/tokenpatch/index.d.ts +5 -1
- package/backend/dist/packages/tokenpatch/index.js +42 -24
- package/backend/dist/packages/tokenpatch/index.js.map +1 -1
- package/backend/dist/packages/tokenpatch/patcher.js +62 -20
- package/backend/dist/packages/tokenpatch/patcher.js.map +1 -1
- package/backend/dist/packages/tokenpatch/strategies/tiktoken-tokenizer.d.ts +6 -0
- package/backend/dist/packages/tokenpatch/strategies/tiktoken-tokenizer.js +28 -0
- package/backend/dist/packages/tokenpatch/strategies/tiktoken-tokenizer.js.map +1 -0
- package/backend/dist/packages/tokenpatch/strategies/tree-sitter-tokenizer.d.ts +9 -0
- package/backend/dist/packages/tokenpatch/strategies/tree-sitter-tokenizer.js +36 -0
- package/backend/dist/packages/tokenpatch/strategies/tree-sitter-tokenizer.js.map +1 -0
- package/backend/dist/packages/tokenpatch/tokenizer.interface.d.ts +4 -0
- package/backend/dist/packages/tokenpatch/tokenizer.interface.js +3 -0
- package/backend/dist/packages/tokenpatch/tokenizer.interface.js.map +1 -0
- package/backend/dist/packages/tokenpatch/tokens.d.ts +0 -2
- package/backend/dist/packages/tokenpatch/tokens.js +4 -23
- package/backend/dist/packages/tokenpatch/tokens.js.map +1 -1
- package/backend/dist/packages/tokenpatch/types.d.ts +2 -2
- package/backend/dist/src/llm-orchestration/action-handlers/patch.handler.js +130 -51
- package/backend/dist/src/llm-orchestration/action-handlers/patch.handler.js.map +1 -1
- package/backend/dist/src/llm-orchestration/parser/parsing.constants.d.ts +2 -0
- package/backend/dist/src/llm-orchestration/parser/parsing.constants.js +3 -1
- package/backend/dist/src/llm-orchestration/parser/parsing.constants.js.map +1 -1
- package/backend/dist/src/seeding/data/system-prompts/experimental_eta_master-agent.d.ts +1 -1
- package/backend/dist/src/seeding/data/system-prompts/experimental_eta_master-agent.js +44 -55
- package/backend/dist/src/seeding/data/system-prompts/experimental_eta_master-agent.js.map +1 -1
- package/backend/dist/tsconfig.build.tsbuildinfo +1 -1
- package/backend/packages/tokenpatch/index.spec.ts +55 -37
- package/backend/packages/tokenpatch/index.ts +61 -31
- package/backend/packages/tokenpatch/patcher.ts +119 -30
- package/backend/packages/tokenpatch/strategies/tiktoken-tokenizer.ts +35 -0
- package/backend/packages/tokenpatch/strategies/tree-sitter-tokenizer.ts +37 -0
- package/backend/packages/tokenpatch/tokenizer.interface.ts +5 -0
- package/backend/packages/tokenpatch/tokens.ts +10 -28
- package/backend/packages/tokenpatch/types.ts +4 -4
- package/package.json +2 -1
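
The substantive change in this release is a pluggable tokenizer behind applySnippetPatch: the third argument now accepts either a tree-sitter grammar path (the old form) or an ApplyPatchOptions object, and a new js-tiktoken BPE strategy is the default whenever no grammar path is given. A minimal sketch of the two call shapes, assuming the exports shown in the diffs below (the import path is illustrative, not part of this diff):

// Sketch only; the real import path depends on how the internal package is consumed.
import { applySnippetPatch } from './packages/tokenpatch';

const source = 'const x = 1;\nconst y = 2;\n';
const patch = 'const y = 3;';

// Old form, still supported: third argument is a grammar path string.
const viaTreeSitter = await applySnippetPatch(
  source,
  patch,
  './grammar/tree-sitter-typescript.wasm',
);

// New form: an options object; useTiktoken (or omitting grammarPath)
// selects the BPE strategy.
const viaTiktoken = await applySnippetPatch(source, patch, { useTiktoken: true });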
package/backend/packages/tokenpatch/index.spec.ts (+55 -37):

@@ -1,15 +1,39 @@
 import { applySnippetPatch } from './index';
 import * as path from 'path';
+import {
+  SPECIAL_PATCH_BEGIN_FILE_MARKER,
+  SPECIAL_PATCH_END_FILE_MARKER,
+} from '../../src/llm-orchestration/parser/parsing.constants';
 
 const TS_WASM_PATH = path.join(
   __dirname,
   './grammar/tree-sitter-typescript.wasm',
 );
 
-const TSX_WASM_PATH = path.join(__dirname, './grammar/tree-sitter-tsx.wasm');
-
 // Helper to normalize whitespace for robust comparison
-const normalize = (str: string) =>
+const normalize = (str: string) =>
+  str
+    .replace(/}/g, '} ')
+    .replace(/\s+/g, ' ')
+    .trim();
+
+const runTest = async (
+  sourceCode: string,
+  patchCode: string,
+  expectedResult: string,
+) => {
+  const resultTiktoken = await applySnippetPatch(sourceCode, patchCode, {
+    useTiktoken: true,
+  });
+  expect(normalize(resultTiktoken)).toEqual(normalize(expectedResult));
+
+  const resultTreeSitter = await applySnippetPatch(
+    sourceCode,
+    patchCode,
+    TS_WASM_PATH,
+  );
+  expect(normalize(resultTreeSitter)).toEqual(normalize(expectedResult));
+};
 
 describe('applySnippetPatch', () => {
   it('should replace a method body in a class by automatically finding anchor size', async () => {

@@ -35,12 +59,11 @@ class Greeter {
     // A new implementation
     return "Hello, TypeScript!";
   }
-
+  fc(){ return 42; }
 
 }
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
   it('should replace a data structure definition', async () => {

@@ -63,8 +86,7 @@ interface MyData {
 }
 console.log("hello");
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
   it('should replace a full function definition', async () => {

@@ -95,11 +117,10 @@ function calculate() {
 
 export { calculate };
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
-  it(
+  it(`should replace a beginning of the file ${SPECIAL_PATCH_BEGIN_FILE_MARKER}`, async () => {
     const sourceCode = `
 import { ModuleA } from './moduleA';
 import { ModuleB } from './moduleB';

@@ -107,7 +128,7 @@ import { ModuleB } from './moduleB';
 console.log('starting up');
 `;
     const patchCode = `
-//
+// ${SPECIAL_PATCH_BEGIN_FILE_MARKER}
 import groupBy from 'lodash';
 import { ModuleA } from './moduleA';
 `;

@@ -118,11 +139,10 @@ import { ModuleB } from './moduleB';
 
 console.log('starting up');
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
-  it(
+  it(`should handle ${SPECIAL_PATCH_BEGIN_FILE_MARKER} marker with variations`, async () => {
     const sourceCode = `
 import { ModuleA } from './moduleA';
 import { ModuleB } from './moduleB';

@@ -130,7 +150,7 @@ import { ModuleB } from './moduleB';
 console.log('starting up');
 `;
     const patchCode = `
-
+//${SPECIAL_PATCH_BEGIN_FILE_MARKER} extra text
 import groupBy from 'lodash';
 import { ModuleA } from './moduleA';
 `;

@@ -141,11 +161,10 @@ import { ModuleB } from './moduleB';
 
 console.log('starting up');
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
-  it(
+  it(`should replace a end of the file ${SPECIAL_PATCH_END_FILE_MARKER}`, async () => {
     const sourceCode = `
 import fs from 'fs';
 

@@ -158,7 +177,7 @@ export { calculate };
     const patchCode = `
 export { calculate };
 export { sum };
-//
+// ${SPECIAL_PATCH_END_FILE_MARKER}
 `;
     const expectedResult = `
 import fs from 'fs';

@@ -170,11 +189,10 @@ function calculate() {
 export { calculate };
 export { sum };
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
-  it(
+  it(`should handle ${SPECIAL_PATCH_END_FILE_MARKER} marker with variations`, async () => {
     const sourceCode = `
 import fs from 'fs';
 

@@ -187,7 +205,7 @@ export { calculate };
     const patchCode = `
 export { calculate };
 export { sum };
-
+//${SPECIAL_PATCH_END_FILE_MARKER} some extra text
 `;
     const expectedResult = `
 import fs from 'fs';

@@ -199,11 +217,11 @@ function calculate() {
 export { calculate };
 export { sum };
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
   it('should succeed with dynamic anchor sizing when initial anchors are ambiguous', async () => {
     const sourceCode = `
+import dotenv from 'dotenv';
 const config = {
   port: 8080,
   host: 'localhost',

@@ -219,6 +237,7 @@ const config2 = {
 };
 `;
     const patchCode = `
+import dotenv from 'dotenv';
 const config = {
   port: 9000,
   host: 'localhost',

@@ -228,6 +247,7 @@ const config = {
 function connect() {
 `;
     const expectedResult = `
+import dotenv from 'dotenv';
 const config = {
   port: 9000,
   host: 'localhost',

@@ -243,8 +263,7 @@ const config2 = {
   host: 'remote',
 };
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
   it('should handle partial type definitions by trimming ambiguous tokens', async () => {

@@ -275,6 +294,8 @@ const config2 = {
 */
 combinedStatusNew2?: everest_appserver_primitive_Text | null;
 /**
+* Contains Billable Expense line
+*/
 `;
     const expectedResult = `
 /**

@@ -296,8 +317,7 @@ const config2 = {
 */
 containsPrepaidItem?: everest_appserver_primitive_TrueFalse | null;
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
   it('should patch TSX correctly by trimming tokens from invalid partial snippets', async () => {

@@ -408,7 +428,10 @@ export function TransactionsPage() {
           <DataTable
             columns={billColumns}
             data={bills}
-            onRowClick={(row) =>
+            onRowClick={(row) => {
+              console.log(\`[UI] Selected matchable bill ID\`)
+              selectMatchable(row.id)
+            }}
             selectedId={selectedMatchableId}
           />
         </div>

@@ -569,11 +592,6 @@ export function TransactionsPage() {
 )
 }
 `;
-
-      sourceCode,
-      patchCode,
-      TSX_WASM_PATH,
-    );
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 });
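
Every spec scenario above now funnels through runTest, which applies the same patch with both strategies (Tiktoken via the options object, tree-sitter via the legacy grammar-path argument) and compares whitespace-normalized output. Begin- and end-of-file patches are signalled by a // comment carrying a marker constant; the literal marker strings live in parsing.constants and are not shown in this diff, so the payload below uses a hypothetical value:

// Hypothetical marker value; assume SPECIAL_PATCH_BEGIN_FILE_MARKER === 'PATCH_BEGIN_FILE'.
const patchCode = `
// PATCH_BEGIN_FILE
import groupBy from 'lodash';
import { ModuleA } from './moduleA';
`;
// applySnippetPatch strips the marker line with a regex, then anchors the
// remaining tokens against the start of the source file (see index.ts below).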
package/backend/packages/tokenpatch/index.ts (+61 -31):

@@ -1,23 +1,48 @@
 import { initializeParser } from './parser';
-import { collectTokens } from './tokens';
 import {
   handleBeginOfFilePatch,
   handleEndOfFilePatch,
   handleStandardPatch,
 } from './patcher';
+import {
+  SPECIAL_PATCH_BEGIN_FILE_MARKER,
+  SPECIAL_PATCH_END_FILE_MARKER,
+} from '../../src/llm-orchestration/parser/parsing.constants';
+import { TokenizerStrategy } from './tokenizer.interface';
+import { TreeSitterTokenizer } from './strategies/tree-sitter-tokenizer';
+import { TiktokenTokenizer } from './strategies/tiktoken-tokenizer';
+
+export interface ApplyPatchOptions {
+  grammarPath?: string;
+  useTiktoken?: boolean;
+}
 
 export async function applySnippetPatch(
   sourceCode: string,
   patchCode: string,
-
+  optionsOrGrammarPath: string | ApplyPatchOptions,
 ): Promise<string> {
-
+  let strategy: TokenizerStrategy;
+  let options: ApplyPatchOptions;
+
+  // Backward compatibility for when the 3rd argument was just grammarPath string
+  if (typeof optionsOrGrammarPath === 'string') {
+    options = { grammarPath: optionsOrGrammarPath };
+  } else {
+    options = optionsOrGrammarPath;
+  }
 
-
-
-
+  if (options.useTiktoken) {
+    strategy = new TiktokenTokenizer();
+  } else if (options.grammarPath) {
+    const parser = await initializeParser(options.grammarPath);
+    strategy = new TreeSitterTokenizer(parser);
+  } else {
+    // Default to Tiktoken if no grammar path is provided
+    strategy = new TiktokenTokenizer();
   }
-
+
+  const sourceTokens = strategy.tokenize(sourceCode);
 
   let patchResult: {
     replaceStart: number;

@@ -27,42 +52,47 @@ export async function applySnippetPatch(
   };
   let processedPatchCode = patchCode;
 
-  const beginOfFileRegex =
-
+  const beginOfFileRegex = new RegExp(
+    `//\\s*${SPECIAL_PATCH_BEGIN_FILE_MARKER}.*`,
+  );
+  const endOfFileRegex = new RegExp(`//\\s*${SPECIAL_PATCH_END_FILE_MARKER}.*`);
 
   const hasBeginOfFile = beginOfFileRegex.test(patchCode);
   const hasEndOfFile = endOfFileRegex.test(patchCode);
 
   if (hasBeginOfFile) {
     processedPatchCode = patchCode.replace(beginOfFileRegex, '');
-
-    if
-
-
-
-      (t) => t.text !== '',
-    );
+    // We need to tokenize the processed patch code
+    // We filter out empty text tokens if any strategy produces them, though likely not needed for tiktoken
+    const patchTokens = strategy
+      .tokenize(processedPatchCode)
+      .filter((t) => t.text !== '' && t.text !== '\n');
     patchResult = handleBeginOfFilePatch(sourceTokens, patchTokens);
   } else if (hasEndOfFile) {
     processedPatchCode = patchCode.replace(endOfFileRegex, '');
-    const
-
-
-    }
-    const patchTokens = collectTokens(patchTree, processedPatchCode).filter(
-      (t) => t.text !== '',
-    );
+    const patchTokens = strategy
+      .tokenize(processedPatchCode)
+      .filter((t) => t.text !== '' && t.text !== '\n');
     patchResult = handleEndOfFilePatch(sourceTokens, patchTokens, sourceCode);
   } else {
     processedPatchCode = patchCode.trim();
-    const
-
-
+    const patchTokens = strategy
+      .tokenize(processedPatchCode)
+      .filter((t) => t.text !== '');
+
+    // Heuristic: If using Tiktoken, we discard the first and last 3 tokens to improve matching resilience.
+    // This prevents issues where the LLM hallucinating extra delimiters or context at the edges of the snippet.
+    const isTiktoken = options.useTiktoken || !options.grammarPath;
+    if (
+      isTiktoken &&
+      patchTokens.length >= 6 &&
+      patchTokens.length < sourceTokens.length
+    ) {
+      const innerTokens = patchTokens.slice(2, -2);
+      patchResult = handleStandardPatch(sourceTokens, innerTokens);
+    } else {
+      patchResult = handleStandardPatch(sourceTokens, patchTokens);
     }
-    const patchTokens = collectTokens(patchTree, processedPatchCode).filter(
-      (t) => t.text !== '',
-    );
-    patchResult = handleStandardPatch(sourceTokens, patchTokens);
   }
 
   // NOTE: replaceStart/End are byte offsets.

@@ -77,4 +107,4 @@ export async function applySnippetPatch(
   );
 
   return prefix + finalPatchContent + suffix;
-}
+}
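
One subtlety in the rewritten standard-patch branch: on the Tiktoken path, a patch with at least six tokens that is shorter than the source has its edges trimmed before anchor matching. The in-code comment speaks of the first and last three tokens, but slice(2, -2) actually drops two from each end. A toy illustration (not library API):

// The trimmed edges absorb stray delimiters an LLM may emit around a snippet;
// anchor matching then runs on innerTokens only.
const patchTokens = ['``', 'const', ' y', ' =', ' 3', ';', '``'];
const innerTokens = patchTokens.slice(2, -2); // [' y', ' =', ' 3']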
package/backend/packages/tokenpatch/patcher.ts (+119 -30):

@@ -1,5 +1,9 @@
 import type { Token } from './types';
 import { findAllSequences, formatAnchor } from './tokens';
+import {
+  SPECIAL_PATCH_BEGIN_FILE_MARKER,
+  SPECIAL_PATCH_END_FILE_MARKER,
+} from '../../src/llm-orchestration/parser/parsing.constants';
 
 interface PatchResult {
   replaceStart: number;

@@ -13,6 +17,23 @@ interface SimplePatchResult {
   replaceEnd: number;
 }
 
+// Helper to prioritize "Ambiguous" errors over "Not found" errors
+const updateLastError = (
+  currentError: Error | null,
+  newError: Error,
+): Error => {
+  if (!currentError) return newError;
+  // If we already have an Ambiguous error, keep it unless the new one is also Ambiguous
+  // (Assuming Ambiguous is more useful/specific than "Not found")
+  const currentIsAmbiguous = currentError.message.includes('Ambiguous');
+  const newIsAmbiguous = newError.message.includes('Ambiguous');
+
+  if (currentIsAmbiguous && !newIsAmbiguous) {
+    return currentError;
+  }
+  return newError;
+};
+
 // Internal helper for the original matching logic
 function _findBeginOfFilePatchLocation(
   sourceTokens: Token[],

@@ -36,7 +57,10 @@ function _findBeginOfFilePatchLocation(
   if (indices.length > 1) {
     const formattedAnchor = formatAnchor(suffixAnchor);
     const locations = indices
-      .map(
+      .map(
+        (i) =>
+          `line ${sourceTokens[i].startPosition?.row ? sourceTokens[i].startPosition.row + 1 : '?'}`,
+      )
       .join(', ');
     lastError = `Ambiguous suffix anchor. The sequence "${formattedAnchor}" was found at ${indices.length} locations: ${locations}.`;
   }

@@ -60,7 +84,9 @@ export function handleBeginOfFilePatch(
   originalPatchTokens: Token[],
 ): PatchResult {
   if (originalPatchTokens.length === 0) {
-    throw new Error(
+    throw new Error(
+      `Patch is empty after removing ${SPECIAL_PATCH_BEGIN_FILE_MARKER} marker.`,
+    );
   }
 
   let patchAttempt = [...originalPatchTokens];

@@ -77,13 +103,13 @@ export function handleBeginOfFilePatch(
       const patchInsertEnd = patchAttempt[patchAttempt.length - 1].endIndex;
       return { replaceStart, replaceEnd, patchInsertStart, patchInsertEnd };
     } catch (e) {
-      lastError = e as Error;
+      lastError = updateLastError(lastError, e as Error);
       patchAttempt = patchAttempt.slice(1); // Trim one token from the beginning
     }
   }
 
   throw new Error(
-    `Failed to apply
+    `Failed to apply ${SPECIAL_PATCH_BEGIN_FILE_MARKER} patch. Could not find a unique anchor, even after trimming tokens. Last known error: ${lastError?.message}`,
   );
 }
 

@@ -110,7 +136,10 @@ function _findEndOfFilePatchLocation(
   if (indices.length > 1) {
     const formattedAnchor = formatAnchor(prefixAnchor);
     const locations = indices
-      .map(
+      .map(
+        (i) =>
+          `line ${sourceTokens[i].startPosition?.row ? sourceTokens[i].startPosition.row + 1 : '?'}`,
+      )
       .join(', ');
     lastError = `Ambiguous prefix anchor. The sequence "${formattedAnchor}" was found at ${indices.length} locations: ${locations}.`;
   }

@@ -134,7 +163,9 @@ export function handleEndOfFilePatch(
   sourceCode: string,
 ): PatchResult {
   if (originalPatchTokens.length === 0) {
-    throw new Error(
+    throw new Error(
+      `Patch is empty after removing ${SPECIAL_PATCH_END_FILE_MARKER} marker.`,
+    );
   }
 
   let patchAttempt = [...originalPatchTokens];

@@ -152,22 +183,26 @@ export function handleEndOfFilePatch(
       const patchInsertEnd = patchAttempt[patchAttempt.length - 1].endIndex;
       return { replaceStart, replaceEnd, patchInsertStart, patchInsertEnd };
     } catch (e) {
-      lastError = e as Error;
+      lastError = updateLastError(lastError, e as Error);
       patchAttempt = patchAttempt.slice(0, -1); // Trim one token from the end
     }
   }
 
   throw new Error(
-    `Failed to apply
+    `Failed to apply ${SPECIAL_PATCH_END_FILE_MARKER} patch. Could not find a unique anchor, even after trimming tokens. Last known error: ${lastError?.message}`,
   );
 }
 
-
-
+interface PrefixResult {
+  prefixAnchor: Token[];
+  prefixIndex: number;
+  replaceStart: number;
+}
+
+function _findPrefixLocation(
   sourceTokens: Token[],
   patchTokens: Token[],
-):
-  // 1. Find smallest unique prefix
+): PrefixResult {
   let prefixAnchor: Token[] | null = null;
   let prefixIndex: number | null = null;
   let bestPrefixError: string | null = null;

@@ -185,7 +220,10 @@ function _findStandardPatchLocation(
     if (prefixIndices.length > 1) {
       const formatted = formatAnchor(currentPrefix);
       const locations = prefixIndices
-        .map(
+        .map(
+          (i) =>
+            `line ${sourceTokens[i].startPosition?.row ? sourceTokens[i].startPosition.row + 1 : '?'}`,
+        )
         .join(', ');
       bestPrefixError = `Ambiguous prefix anchor. The sequence "${formatted}" was found at ${prefixIndices.length} locations: ${locations}.`;
     }

@@ -199,12 +237,24 @@ function _findStandardPatchLocation(
   }
 
   if (!prefixAnchor || prefixIndex === null) {
-    throw new Error(
-      bestPrefixError || 'Could not find a unique prefix anchor.',
-    );
+    throw new Error(bestPrefixError || 'Could not find a unique prefix anchor.');
   }
 
-
+  const replaceStart = sourceTokens[prefixIndex].startIndex;
+
+  return { prefixAnchor, prefixIndex, replaceStart };
+}
+
+interface SuffixResult {
+  replaceEnd: number;
+}
+
+function _findSuffixLocation(
+  sourceTokens: Token[],
+  patchTokens: Token[],
+  prefixAnchor: Token[],
+  prefixIndex: number,
+): SuffixResult {
   let suffixAnchor: Token[] | null = null;
   let suffixIndex: number | null = null;
   let bestSuffixError: string | null = null;

@@ -231,7 +281,9 @@ function _findStandardPatchLocation(
         .map(
           (i) =>
             `line ${
-              sourceTokens[searchStartIndex + i].startPosition
+              sourceTokens[searchStartIndex + i].startPosition?.row
+                ? sourceTokens[searchStartIndex + i].startPosition.row + 1
+                : '?'
             }`,
         )
         .join(', ');

@@ -243,7 +295,9 @@ function _findStandardPatchLocation(
     throw new Error(bestSuffixError);
   }
   const prefixLocation = `line ${
-    sourceTokens[prefixIndex].startPosition
+    sourceTokens[prefixIndex].startPosition?.row
+      ? sourceTokens[prefixIndex].startPosition.row + 1
+      : '?'
   }`;
   const formattedPrefix = formatAnchor(prefixAnchor);
   const smallestSuffix = formatAnchor(

@@ -256,12 +310,10 @@ function _findStandardPatchLocation(
     );
   }
 
-  // 3. Apply patch
-  const replaceStart = sourceTokens[prefixIndex].startIndex;
   const replaceEnd =
     sourceTokens[suffixIndex + suffixAnchor.length - 1].endIndex;
 
-  return {
+  return { replaceEnd };
 }
 
 export function handleStandardPatch(

@@ -274,27 +326,64 @@ export function handleStandardPatch(
     );
   }
 
-  let
+  let startTrim = 0;
+  const endTrim = 0;
   let lastError: Error | null = null;
 
-
+  let prefixInfo: PrefixResult | null = null;
+
+  // Loop 1: Find Prefix (trim from start)
+  let patchAttempt = [...originalPatchTokens];
   while (patchAttempt.length >= 2) {
     try {
-
+      prefixInfo = _findPrefixLocation(sourceTokens, patchAttempt);
+      // Success finding prefix
+      startTrim = originalPatchTokens.length - patchAttempt.length;
+      break;
+    } catch (e) {
+      lastError = updateLastError(lastError, e as Error);
+      // Trim one token from the start
+      patchAttempt = patchAttempt.slice(1);
+    }
+  }
+
+  if (!prefixInfo) {
+    throw new Error(
+      `Failed to apply patch. Could not find a unique prefix anchor, even after trimming tokens. Last known error: ${lastError?.message}`,
+    );
+  }
+
+  // Loop 2: Find Suffix (trim from end)
+  // Reset patchAttempt to start from the found prefix startTrim, but allow trimming end
+  patchAttempt = originalPatchTokens.slice(startTrim);
+  lastError = null; // Reset last error for suffix search phase
+
+  while (patchAttempt.length >= prefixInfo.prefixAnchor.length + 1) {
+    // Need at least prefix + 1 token? Or just prefix + suffix?
+    try {
+      const { replaceEnd } = _findSuffixLocation(
        sourceTokens,
        patchAttempt,
+        prefixInfo.prefixAnchor,
+        prefixInfo.prefixIndex,
      );
+      // Success finding suffix
      const patchInsertStart = patchAttempt[0].startIndex;
      const patchInsertEnd = patchAttempt[patchAttempt.length - 1].endIndex;
-      return {
+      return {
+        replaceStart: prefixInfo.replaceStart,
+        replaceEnd,
+        patchInsertStart,
+        patchInsertEnd,
+      };
    } catch (e) {
-      lastError = e as Error;
-      // Trim one token from the
-      patchAttempt = patchAttempt.slice(
+      lastError = updateLastError(lastError, e as Error);
+      // Trim one token from the end
+      patchAttempt = patchAttempt.slice(0, -1);
    }
  }
 
  throw new Error(
-    `Failed to apply patch. Could not find a unique anchor
+    `Failed to apply patch. Could not find a unique suffix anchor, even after trimming tokens. Last known error: ${lastError?.message}`,
  );
}
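
handleStandardPatch is now two trim loops: the first trims tokens from the start until _findPrefixLocation resolves a unique prefix anchor, and the second keeps that prefix while trimming from the end until _findSuffixLocation resolves the suffix. Failures in both loops funnel through updateLastError, so an "Ambiguous" diagnosis is not overwritten by a later "not found". A sketch of that priority rule:

// Sketch of the rule updateLastError implements (see the helper above).
let lastError: Error | null = null;
lastError = updateLastError(lastError, new Error('Ambiguous prefix anchor. ...'));
lastError = updateLastError(lastError, new Error('Could not find a unique prefix anchor.'));
// lastError still carries the 'Ambiguous' message, so the final
// "Failed to apply patch" error surfaces the more specific diagnosis.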
package/backend/packages/tokenpatch/strategies/tiktoken-tokenizer.ts (new file, +35 -0):

@@ -0,0 +1,35 @@
+import { getEncoding } from 'js-tiktoken';
+import { Token } from '../types';
+import { TokenizerStrategy } from '../tokenizer.interface';
+
+export class TiktokenTokenizer implements TokenizerStrategy {
+  // Use cl100k_base (GPT-4) as the standard encoding
+  private enc = getEncoding('cl100k_base');
+
+  tokenize(content: string): Token[] {
+    const tokens: Token[] = [];
+    const encoded = this.enc.encode(content);
+
+    let currentIndex = 0;
+
+    // Iterate through token IDs, decode them individually to get text and length.
+    // This allows us to reconstruct the offsets (startIndex/endIndex).
+    for (const tokenId of encoded) {
+      // decoding a single token is the only way to get its exact text representation
+      // to map back to the source string indices.
+      const text = this.enc.decode([tokenId]);
+      const length = text.length;
+
+      tokens.push({
+        text,
+        type: 'bpe',
+        startIndex: currentIndex,
+        endIndex: currentIndex + length,
+        // startPosition is not calculated for Tiktoken strategy as it's computationally expensive
+        // and not strictly required for the patching algorithm which relies on text matching.
+      });
+      currentIndex += length;
+    }
+    return tokens;
+  }
+}
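
The strategy reconstructs character offsets by decoding each BPE token id on its own; for typical source text the decoded pieces concatenate back to the input, so the offsets partition the string exactly (a token boundary splitting a multi-byte code point would break this assumption). A quick check, assuming the class above is importable:

const tokens = new TiktokenTokenizer().tokenize('const x = 1;');
console.assert(tokens.map((t) => t.text).join('') === 'const x = 1;');
console.assert(
  tokens.every((t, i) => i === 0 || t.startIndex === tokens[i - 1].endIndex),
);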