repoburg 1.3.12 → 1.3.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/backend/dist/packages/tokenpatch/index.d.ts +5 -1
- package/backend/dist/packages/tokenpatch/index.js +39 -22
- package/backend/dist/packages/tokenpatch/index.js.map +1 -1
- package/backend/dist/packages/tokenpatch/patcher.js +57 -16
- package/backend/dist/packages/tokenpatch/patcher.js.map +1 -1
- package/backend/dist/packages/tokenpatch/strategies/tiktoken-tokenizer.d.ts +6 -0
- package/backend/dist/packages/tokenpatch/strategies/tiktoken-tokenizer.js +28 -0
- package/backend/dist/packages/tokenpatch/strategies/tiktoken-tokenizer.js.map +1 -0
- package/backend/dist/packages/tokenpatch/strategies/tree-sitter-tokenizer.d.ts +9 -0
- package/backend/dist/packages/tokenpatch/strategies/tree-sitter-tokenizer.js +36 -0
- package/backend/dist/packages/tokenpatch/strategies/tree-sitter-tokenizer.js.map +1 -0
- package/backend/dist/packages/tokenpatch/tokenizer.interface.d.ts +4 -0
- package/backend/dist/packages/tokenpatch/tokenizer.interface.js +3 -0
- package/backend/dist/packages/tokenpatch/tokenizer.interface.js.map +1 -0
- package/backend/dist/packages/tokenpatch/tokens.d.ts +0 -2
- package/backend/dist/packages/tokenpatch/tokens.js +4 -23
- package/backend/dist/packages/tokenpatch/tokens.js.map +1 -1
- package/backend/dist/packages/tokenpatch/types.d.ts +2 -2
- package/backend/dist/src/llm-orchestration/action-handlers/patch.handler.js +125 -47
- package/backend/dist/src/llm-orchestration/action-handlers/patch.handler.js.map +1 -1
- package/backend/dist/src/seeding/data/system-prompts/experimental_eta_master-agent.d.ts +1 -1
- package/backend/dist/src/seeding/data/system-prompts/experimental_eta_master-agent.js +44 -55
- package/backend/dist/src/seeding/data/system-prompts/experimental_eta_master-agent.js.map +1 -1
- package/backend/dist/tsconfig.build.tsbuildinfo +1 -1
- package/backend/packages/tokenpatch/index.spec.ts +44 -30
- package/backend/packages/tokenpatch/index.ts +54 -32
- package/backend/packages/tokenpatch/patcher.ts +107 -26
- package/backend/packages/tokenpatch/strategies/tiktoken-tokenizer.ts +35 -0
- package/backend/packages/tokenpatch/strategies/tree-sitter-tokenizer.ts +37 -0
- package/backend/packages/tokenpatch/tokenizer.interface.ts +5 -0
- package/backend/packages/tokenpatch/tokens.ts +10 -28
- package/backend/packages/tokenpatch/types.ts +4 -4
- package/package.json +2 -1

package/backend/packages/tokenpatch/index.spec.ts

@@ -10,10 +10,30 @@ const TS_WASM_PATH = path.join(
   './grammar/tree-sitter-typescript.wasm',
 );
 
-const TSX_WASM_PATH = path.join(__dirname, './grammar/tree-sitter-tsx.wasm');
-
 // Helper to normalize whitespace for robust comparison
-const normalize = (str: string) =>
+const normalize = (str: string) =>
+  str
+    .replace(/}/g, '} ')
+    .replace(/\s+/g, ' ')
+    .trim();
+
+const runTest = async (
+  sourceCode: string,
+  patchCode: string,
+  expectedResult: string,
+) => {
+  const resultTiktoken = await applySnippetPatch(sourceCode, patchCode, {
+    useTiktoken: true,
+  });
+  expect(normalize(resultTiktoken)).toEqual(normalize(expectedResult));
+
+  const resultTreeSitter = await applySnippetPatch(
+    sourceCode,
+    patchCode,
+    TS_WASM_PATH,
+  );
+  expect(normalize(resultTreeSitter)).toEqual(normalize(expectedResult));
+};
 
 describe('applySnippetPatch', () => {
   it('should replace a method body in a class by automatically finding anchor size', async () => {
@@ -39,12 +59,11 @@ class Greeter {
     // A new implementation
     return "Hello, TypeScript!";
   }
-
+  fc(){ return 42; }
 
 }
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
   it('should replace a data structure definition', async () => {
@@ -67,8 +86,7 @@ interface MyData {
 }
 console.log("hello");
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
   it('should replace a full function definition', async () => {
@@ -99,8 +117,7 @@ function calculate() {
 
 export { calculate };
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
   it(`should replace a beginning of the file ${SPECIAL_PATCH_BEGIN_FILE_MARKER}`, async () => {
@@ -122,8 +139,7 @@ import { ModuleB } from './moduleB';
 
 console.log('starting up');
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
   it(`should handle ${SPECIAL_PATCH_BEGIN_FILE_MARKER} marker with variations`, async () => {
@@ -145,8 +161,7 @@ import { ModuleB } from './moduleB';
 
 console.log('starting up');
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
   it(`should replace a end of the file ${SPECIAL_PATCH_END_FILE_MARKER}`, async () => {
@@ -174,8 +189,7 @@ function calculate() {
 export { calculate };
 export { sum };
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
  });
 
   it(`should handle ${SPECIAL_PATCH_END_FILE_MARKER} marker with variations`, async () => {
@@ -203,11 +217,11 @@ function calculate() {
 export { calculate };
 export { sum };
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
   it('should succeed with dynamic anchor sizing when initial anchors are ambiguous', async () => {
     const sourceCode = `
+import dotenv from 'dotenv';
 const config = {
   port: 8080,
   host: 'localhost',
@@ -223,6 +237,7 @@ const config2 = {
 };
 `;
     const patchCode = `
+import dotenv from 'dotenv';
 const config = {
   port: 9000,
   host: 'localhost',
@@ -232,6 +247,7 @@ const config = {
 function connect() {
 `;
     const expectedResult = `
+import dotenv from 'dotenv';
 const config = {
   port: 9000,
   host: 'localhost',
@@ -247,8 +263,7 @@ const config2 = {
   host: 'remote',
 };
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
   it('should handle partial type definitions by trimming ambiguous tokens', async () => {
@@ -279,6 +294,8 @@ const config2 = {
   */
   combinedStatusNew2?: everest_appserver_primitive_Text | null;
   /**
+   * Contains Billable Expense line
+   */
 `;
     const expectedResult = `
 /**
@@ -300,8 +317,7 @@ const config2 = {
   */
   containsPrepaidItem?: everest_appserver_primitive_TrueFalse | null;
 `;
-
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
 
   it('should patch TSX correctly by trimming tokens from invalid partial snippets', async () => {
@@ -412,7 +428,10 @@ export function TransactionsPage() {
           <DataTable
             columns={billColumns}
             data={bills}
-            onRowClick={(row) =>
+            onRowClick={(row) => {
+              console.log(\`[UI] Selected matchable bill ID\`)
+              selectMatchable(row.id)
+            }}
             selectedId={selectedMatchableId}
           />
         </div>
@@ -573,11 +592,6 @@ export function TransactionsPage() {
   )
 }
 `;
-
-      sourceCode,
-      patchCode,
-      TSX_WASM_PATH,
-    );
-    expect(normalize(result)).toEqual(normalize(expectedResult));
+    await runTest(sourceCode, patchCode, expectedResult);
   });
-});
+});

package/backend/packages/tokenpatch/index.ts

@@ -1,5 +1,4 @@
 import { initializeParser } from './parser';
-import { collectTokens } from './tokens';
 import {
   handleBeginOfFilePatch,
   handleEndOfFilePatch,
@@ -9,19 +8,41 @@ import {
   SPECIAL_PATCH_BEGIN_FILE_MARKER,
   SPECIAL_PATCH_END_FILE_MARKER,
 } from '../../src/llm-orchestration/parser/parsing.constants';
+import { TokenizerStrategy } from './tokenizer.interface';
+import { TreeSitterTokenizer } from './strategies/tree-sitter-tokenizer';
+import { TiktokenTokenizer } from './strategies/tiktoken-tokenizer';
+
+export interface ApplyPatchOptions {
+  grammarPath?: string;
+  useTiktoken?: boolean;
+}
 
 export async function applySnippetPatch(
   sourceCode: string,
   patchCode: string,
-
+  optionsOrGrammarPath: string | ApplyPatchOptions,
 ): Promise<string> {
-
+  let strategy: TokenizerStrategy;
+  let options: ApplyPatchOptions;
 
-
-  if (
-
+  // Backward compatibility for when the 3rd argument was just grammarPath string
+  if (typeof optionsOrGrammarPath === 'string') {
+    options = { grammarPath: optionsOrGrammarPath };
+  } else {
+    options = optionsOrGrammarPath;
+  }
+
+  if (options.useTiktoken) {
+    strategy = new TiktokenTokenizer();
+  } else if (options.grammarPath) {
+    const parser = await initializeParser(options.grammarPath);
+    strategy = new TreeSitterTokenizer(parser);
+  } else {
+    // Default to Tiktoken if no grammar path is provided
+    strategy = new TiktokenTokenizer();
   }
-
+
+  const sourceTokens = strategy.tokenize(sourceCode);
 
   let patchResult: {
     replaceStart: number;
@@ -34,43 +55,44 @@ export async function applySnippetPatch(
   const beginOfFileRegex = new RegExp(
     `//\\s*${SPECIAL_PATCH_BEGIN_FILE_MARKER}.*`,
   );
-  const endOfFileRegex = new RegExp(
-    `//\\s*${SPECIAL_PATCH_END_FILE_MARKER}.*`,
-  );
+  const endOfFileRegex = new RegExp(`//\\s*${SPECIAL_PATCH_END_FILE_MARKER}.*`);
 
   const hasBeginOfFile = beginOfFileRegex.test(patchCode);
   const hasEndOfFile = endOfFileRegex.test(patchCode);
 
   if (hasBeginOfFile) {
     processedPatchCode = patchCode.replace(beginOfFileRegex, '');
-
-    if
-
-
-
-      (t) => t.text !== '',
-    );
+    // We need to tokenize the processed patch code
+    // We filter out empty text tokens if any strategy produces them, though likely not needed for tiktoken
+    const patchTokens = strategy
+      .tokenize(processedPatchCode)
+      .filter((t) => t.text !== '' && t.text !== '\n');
     patchResult = handleBeginOfFilePatch(sourceTokens, patchTokens);
   } else if (hasEndOfFile) {
     processedPatchCode = patchCode.replace(endOfFileRegex, '');
-    const
-
-
-    }
-    const patchTokens = collectTokens(patchTree, processedPatchCode).filter(
-      (t) => t.text !== '',
-    );
+    const patchTokens = strategy
+      .tokenize(processedPatchCode)
+      .filter((t) => t.text !== '' && t.text !== '\n');
     patchResult = handleEndOfFilePatch(sourceTokens, patchTokens, sourceCode);
   } else {
     processedPatchCode = patchCode.trim();
-    const
-
-
+    const patchTokens = strategy
+      .tokenize(processedPatchCode)
+      .filter((t) => t.text !== '');
+
+    // Heuristic: If using Tiktoken, we discard the first and last 3 tokens to improve matching resilience.
+    // This prevents issues where the LLM hallucinating extra delimiters or context at the edges of the snippet.
+    const isTiktoken = options.useTiktoken || !options.grammarPath;
+    if (
+      isTiktoken &&
+      patchTokens.length >= 6 &&
+      patchTokens.length < sourceTokens.length
+    ) {
+      const innerTokens = patchTokens.slice(2, -2);
+      patchResult = handleStandardPatch(sourceTokens, innerTokens);
+    } else {
+      patchResult = handleStandardPatch(sourceTokens, patchTokens);
     }
-    const patchTokens = collectTokens(patchTree, processedPatchCode).filter(
-      (t) => t.text !== '',
-    );
-    patchResult = handleStandardPatch(sourceTokens, patchTokens);
   }
 
   // NOTE: replaceStart/End are byte offsets.
@@ -85,4 +107,4 @@ export async function applySnippetPatch(
   );
 
   return prefix + finalPatchContent + suffix;
-}
+}
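
For orientation: the third argument of applySnippetPatch now accepts either the legacy grammar-path string or an ApplyPatchOptions object, and falls back to the Tiktoken strategy when no grammar path is supplied. A minimal usage sketch, assuming the import path and the snippet strings here, which are illustrative and not taken from the package:

    import { applySnippetPatch } from './index';

    async function demo() {
      // Hypothetical source and patch snippets for illustration only.
      const source = "function greet() {\n  return 'hi';\n}\n";
      const patch = "function greet() {\n  return 'hello';\n}\n";

      // New options form: BPE tokenization via js-tiktoken, no grammar needed.
      const viaTiktoken = await applySnippetPatch(source, patch, { useTiktoken: true });

      // Legacy form: a plain string is still treated as grammarPath (tree-sitter).
      const viaTreeSitter = await applySnippetPatch(
        source,
        patch,
        './grammar/tree-sitter-typescript.wasm',
      );

      return { viaTiktoken, viaTreeSitter };
    }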

package/backend/packages/tokenpatch/patcher.ts

@@ -17,6 +17,23 @@ interface SimplePatchResult {
   replaceEnd: number;
 }
 
+// Helper to prioritize "Ambiguous" errors over "Not found" errors
+const updateLastError = (
+  currentError: Error | null,
+  newError: Error,
+): Error => {
+  if (!currentError) return newError;
+  // If we already have an Ambiguous error, keep it unless the new one is also Ambiguous
+  // (Assuming Ambiguous is more useful/specific than "Not found")
+  const currentIsAmbiguous = currentError.message.includes('Ambiguous');
+  const newIsAmbiguous = newError.message.includes('Ambiguous');
+
+  if (currentIsAmbiguous && !newIsAmbiguous) {
+    return currentError;
+  }
+  return newError;
+};
+
 // Internal helper for the original matching logic
 function _findBeginOfFilePatchLocation(
   sourceTokens: Token[],
@@ -40,7 +57,10 @@ function _findBeginOfFilePatchLocation(
   if (indices.length > 1) {
     const formattedAnchor = formatAnchor(suffixAnchor);
     const locations = indices
-      .map(
+      .map(
+        (i) =>
+          `line ${sourceTokens[i].startPosition?.row ? sourceTokens[i].startPosition.row + 1 : '?'}`,
+      )
       .join(', ');
     lastError = `Ambiguous suffix anchor. The sequence "${formattedAnchor}" was found at ${indices.length} locations: ${locations}.`;
   }
@@ -83,7 +103,7 @@ export function handleBeginOfFilePatch(
       const patchInsertEnd = patchAttempt[patchAttempt.length - 1].endIndex;
       return { replaceStart, replaceEnd, patchInsertStart, patchInsertEnd };
     } catch (e) {
-      lastError = e as Error;
+      lastError = updateLastError(lastError, e as Error);
       patchAttempt = patchAttempt.slice(1); // Trim one token from the beginning
     }
   }
@@ -116,7 +136,10 @@ function _findEndOfFilePatchLocation(
   if (indices.length > 1) {
     const formattedAnchor = formatAnchor(prefixAnchor);
     const locations = indices
-      .map(
+      .map(
+        (i) =>
+          `line ${sourceTokens[i].startPosition?.row ? sourceTokens[i].startPosition.row + 1 : '?'}`,
+      )
      .join(', ');
     lastError = `Ambiguous prefix anchor. The sequence "${formattedAnchor}" was found at ${indices.length} locations: ${locations}.`;
   }
@@ -160,7 +183,7 @@ export function handleEndOfFilePatch(
       const patchInsertEnd = patchAttempt[patchAttempt.length - 1].endIndex;
       return { replaceStart, replaceEnd, patchInsertStart, patchInsertEnd };
     } catch (e) {
-      lastError = e as Error;
+      lastError = updateLastError(lastError, e as Error);
       patchAttempt = patchAttempt.slice(0, -1); // Trim one token from the end
     }
   }
@@ -170,12 +193,16 @@ export function handleEndOfFilePatch(
   );
 }
 
-
-
+interface PrefixResult {
+  prefixAnchor: Token[];
+  prefixIndex: number;
+  replaceStart: number;
+}
+
+function _findPrefixLocation(
   sourceTokens: Token[],
   patchTokens: Token[],
-):
-  // 1. Find smallest unique prefix
+): PrefixResult {
   let prefixAnchor: Token[] | null = null;
   let prefixIndex: number | null = null;
   let bestPrefixError: string | null = null;
@@ -193,7 +220,10 @@ function _findStandardPatchLocation(
     if (prefixIndices.length > 1) {
       const formatted = formatAnchor(currentPrefix);
       const locations = prefixIndices
-        .map(
+        .map(
+          (i) =>
+            `line ${sourceTokens[i].startPosition?.row ? sourceTokens[i].startPosition.row + 1 : '?'}`,
+        )
         .join(', ');
       bestPrefixError = `Ambiguous prefix anchor. The sequence "${formatted}" was found at ${prefixIndices.length} locations: ${locations}.`;
     }
@@ -207,12 +237,24 @@ function _findStandardPatchLocation(
   }
 
   if (!prefixAnchor || prefixIndex === null) {
-    throw new Error(
-      bestPrefixError || 'Could not find a unique prefix anchor.',
-    );
+    throw new Error(bestPrefixError || 'Could not find a unique prefix anchor.');
   }
 
-
+  const replaceStart = sourceTokens[prefixIndex].startIndex;
+
+  return { prefixAnchor, prefixIndex, replaceStart };
+}
+
+interface SuffixResult {
+  replaceEnd: number;
+}
+
+function _findSuffixLocation(
+  sourceTokens: Token[],
+  patchTokens: Token[],
+  prefixAnchor: Token[],
+  prefixIndex: number,
+): SuffixResult {
   let suffixAnchor: Token[] | null = null;
   let suffixIndex: number | null = null;
   let bestSuffixError: string | null = null;
@@ -239,7 +281,9 @@ function _findStandardPatchLocation(
         .map(
           (i) =>
             `line ${
-              sourceTokens[searchStartIndex + i].startPosition
+              sourceTokens[searchStartIndex + i].startPosition?.row
+                ? sourceTokens[searchStartIndex + i].startPosition.row + 1
+                : '?'
             }`,
         )
         .join(', ');
@@ -251,7 +295,9 @@ function _findStandardPatchLocation(
     throw new Error(bestSuffixError);
   }
   const prefixLocation = `line ${
-    sourceTokens[prefixIndex].startPosition
+    sourceTokens[prefixIndex].startPosition?.row
+      ? sourceTokens[prefixIndex].startPosition.row + 1
+      : '?'
   }`;
   const formattedPrefix = formatAnchor(prefixAnchor);
   const smallestSuffix = formatAnchor(
@@ -264,12 +310,10 @@ function _findStandardPatchLocation(
     );
   }
 
-  // 3. Apply patch
-  const replaceStart = sourceTokens[prefixIndex].startIndex;
   const replaceEnd =
     sourceTokens[suffixIndex + suffixAnchor.length - 1].endIndex;
 
-  return {
+  return { replaceEnd };
 }
 
 export function handleStandardPatch(
@@ -282,27 +326,64 @@ export function handleStandardPatch(
     );
   }
 
-  let
+  let startTrim = 0;
+  const endTrim = 0;
   let lastError: Error | null = null;
 
-
+  let prefixInfo: PrefixResult | null = null;
+
+  // Loop 1: Find Prefix (trim from start)
+  let patchAttempt = [...originalPatchTokens];
   while (patchAttempt.length >= 2) {
     try {
-
+      prefixInfo = _findPrefixLocation(sourceTokens, patchAttempt);
+      // Success finding prefix
+      startTrim = originalPatchTokens.length - patchAttempt.length;
+      break;
+    } catch (e) {
+      lastError = updateLastError(lastError, e as Error);
+      // Trim one token from the start
+      patchAttempt = patchAttempt.slice(1);
+    }
+  }
+
+  if (!prefixInfo) {
+    throw new Error(
+      `Failed to apply patch. Could not find a unique prefix anchor, even after trimming tokens. Last known error: ${lastError?.message}`,
+    );
+  }
+
+  // Loop 2: Find Suffix (trim from end)
+  // Reset patchAttempt to start from the found prefix startTrim, but allow trimming end
+  patchAttempt = originalPatchTokens.slice(startTrim);
+  lastError = null; // Reset last error for suffix search phase
+
+  while (patchAttempt.length >= prefixInfo.prefixAnchor.length + 1) {
+    // Need at least prefix + 1 token? Or just prefix + suffix?
+    try {
+      const { replaceEnd } = _findSuffixLocation(
         sourceTokens,
         patchAttempt,
+        prefixInfo.prefixAnchor,
+        prefixInfo.prefixIndex,
       );
+      // Success finding suffix
       const patchInsertStart = patchAttempt[0].startIndex;
       const patchInsertEnd = patchAttempt[patchAttempt.length - 1].endIndex;
-      return {
+      return {
+        replaceStart: prefixInfo.replaceStart,
+        replaceEnd,
+        patchInsertStart,
+        patchInsertEnd,
+      };
     } catch (e) {
-      lastError = e as Error;
-      // Trim one token from the
-      patchAttempt = patchAttempt.slice(
+      lastError = updateLastError(lastError, e as Error);
+      // Trim one token from the end
+      patchAttempt = patchAttempt.slice(0, -1);
     }
   }
 
   throw new Error(
-    `Failed to apply patch. Could not find a unique anchor
+    `Failed to apply patch. Could not find a unique suffix anchor, even after trimming tokens. Last known error: ${lastError?.message}`,
   );
 }

package/backend/packages/tokenpatch/strategies/tiktoken-tokenizer.ts (new file)

@@ -0,0 +1,35 @@
+import { getEncoding } from 'js-tiktoken';
+import { Token } from '../types';
+import { TokenizerStrategy } from '../tokenizer.interface';
+
+export class TiktokenTokenizer implements TokenizerStrategy {
+  // Use cl100k_base (GPT-4) as the standard encoding
+  private enc = getEncoding('cl100k_base');
+
+  tokenize(content: string): Token[] {
+    const tokens: Token[] = [];
+    const encoded = this.enc.encode(content);
+
+    let currentIndex = 0;
+
+    // Iterate through token IDs, decode them individually to get text and length.
+    // This allows us to reconstruct the offsets (startIndex/endIndex).
+    for (const tokenId of encoded) {
+      // decoding a single token is the only way to get its exact text representation
+      // to map back to the source string indices.
+      const text = this.enc.decode([tokenId]);
+      const length = text.length;
+
+      tokens.push({
+        text,
+        type: 'bpe',
+        startIndex: currentIndex,
+        endIndex: currentIndex + length,
+        // startPosition is not calculated for Tiktoken strategy as it's computationally expensive
+        // and not strictly required for the patching algorithm which relies on text matching.
+      });
+      currentIndex += length;
+    }
+    return tokens;
+  }
+}
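
The Tiktoken strategy recovers offsets by decoding each BPE token id on its own and accumulating lengths, so concatenating the token texts reproduces the input and each startIndex/endIndex points back into it (for plain ASCII input; characters split across byte-level tokens are an edge case). A rough sketch of that invariant, assuming the relative import path shown; it is not a test shipped with the package:

    import { TiktokenTokenizer } from './strategies/tiktoken-tokenizer';

    const tokenizer = new TiktokenTokenizer();
    const source = "const x = 1;\n"; // illustrative ASCII snippet
    const tokens = tokenizer.tokenize(source);

    // The decoded pieces tile the source string without gaps or overlaps.
    console.assert(tokens.map((t) => t.text).join('') === source);
    console.assert(
      tokens.every((t) => source.slice(t.startIndex, t.endIndex) === t.text),
    );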

package/backend/packages/tokenpatch/strategies/tree-sitter-tokenizer.ts (new file)

@@ -0,0 +1,37 @@
+import { Token } from '../types';
+import { TokenizerStrategy } from '../tokenizer.interface';
+import type { Tree, Node, Parser } from 'web-tree-sitter';
+
+export class TreeSitterTokenizer implements TokenizerStrategy {
+  constructor(private parser: Parser) {}
+
+  tokenize(content: string): Token[] {
+    const tree = this.parser.parse(content);
+    return this.collectTokens(tree, content);
+  }
+
+  private collectTokens(tree: Tree, code: string): Token[] {
+    const tokens: Token[] = [];
+
+    function visit(node: Node) {
+      if (node.childCount === 0) {
+        tokens.push({
+          text: code.slice(node.startIndex, node.endIndex),
+          type: node.type,
+          startIndex: node.startIndex,
+          endIndex: node.endIndex,
+          startPosition: node.startPosition,
+        });
+        return;
+      }
+      // By iterating over all children (not just named), we include punctuation
+      for (let i = 0; i < node.childCount; i++) {
+        const child = node.child(i);
+        if (child) visit(child);
+      }
+    }
+
+    visit(tree.rootNode);
+    return tokens;
+  }
+}

package/backend/packages/tokenpatch/tokens.ts

@@ -1,33 +1,15 @@
-import type { Tree, Node } from 'web-tree-sitter';
 import type { Token } from './types';
 
-export function collectTokens(tree: Tree, code: string): Token[] {
-  const tokens: Token[] = [];
-
-  function visit(node: Node) {
-    if (node.childCount === 0) {
-      tokens.push({
-        text: code.slice(node.startIndex, node.endIndex),
-        type: node.type,
-        startIndex: node.startIndex,
-        endIndex: node.endIndex,
-        startPosition: node.startPosition,
-      });
-      return;
-    }
-    // By iterating over all children (not just named), we include punctuation
-    // which is critical for anchor-based matching.
-    for (let i = 0; i < node.childCount; i++) {
-      const child = node.child(i);
-      if (child) visit(child);
-    }
-  }
-
-  visit(tree.rootNode);
-  return tokens;
-}
-
 export function tokensEqual(a: Token, b: Token): boolean {
+  // If using Tiktoken (BPE), we relax the comparison to ignore whitespace differences
+  // and newlines by trimming and stripping them. This helps when the patch might
+  // have slightly different indentation or spacing than the source.
+  if (a.type === 'bpe' || b.type === 'bpe') {
+    return (
+      a.text.replace(/\r?\n/g, '').trim() ===
+      b.text.replace(/\r?\n/g, '').trim()
+    );
+  }
   return a.text === b.text;
 }
 
@@ -47,4 +29,4 @@ export function findAllSequences(haystack: Token[], needle: Token[]): number[] {
     indices.push(i);
   }
   return indices;
-}
+}
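
With the relaxed branch above, two tokens compare equal whenever either side is a BPE token and their texts match after stripping newlines and trimming, while tree-sitter tokens still need an exact text match. A small illustration with hypothetical tokens (not from the test suite):

    import { tokensEqual } from './tokens';
    import type { Token } from './types';

    const bpeA: Token = { text: ' foo', type: 'bpe', startIndex: 0, endIndex: 4 };
    const bpeB: Token = { text: 'foo\n', type: 'bpe', startIndex: 10, endIndex: 14 };
    console.assert(tokensEqual(bpeA, bpeB) === true); // whitespace/newlines ignored

    const tsA: Token = { text: ' foo', type: 'identifier', startIndex: 0, endIndex: 4 };
    const tsB: Token = { text: 'foo', type: 'identifier', startIndex: 10, endIndex: 13 };
    console.assert(tokensEqual(tsA, tsB) === false); // exact match required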

package/backend/packages/tokenpatch/types.ts

@@ -2,8 +2,8 @@ import type { Point } from 'web-tree-sitter';
 
 export interface Token {
   text: string;
-  type
-  startIndex: number; // byte offsets
+  type?: string;
+  startIndex: number; // byte offsets
   endIndex: number;
-  startPosition
-}
+  startPosition?: Point;
+}