@afterxleep/doc-bot 1.18.0 → 1.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,398 @@
+import { PaginationService } from '../PaginationService.js';
+
+describe('PaginationService', () => {
+  let paginationService;
+
+  beforeEach(() => {
+    paginationService = new PaginationService();
+  });
+
+  describe('estimateTokens', () => {
+    it('should estimate tokens using realistic tokenization', () => {
+      expect(paginationService.estimateTokens('test')).toBe(2); // Realistic: single word + overhead
+      expect(paginationService.estimateTokens('a'.repeat(100))).toBeGreaterThan(25); // More realistic than 4:1 ratio
+      expect(paginationService.estimateTokens('a'.repeat(1000))).toBeGreaterThan(250); // More realistic than 4:1 ratio
+    });
+
+    it('should handle empty or null input', () => {
+      expect(paginationService.estimateTokens('')).toBe(0);
+      expect(paginationService.estimateTokens(null)).toBe(0);
+      expect(paginationService.estimateTokens(undefined)).toBe(0);
+    });
+  });
+
+  describe('needsPagination', () => {
+    it('should return true for content over 24000 tokens', () => {
+      const largeContent = 'a'.repeat(100000); // 25000 tokens
+      expect(paginationService.needsPagination(largeContent)).toBe(true);
+    });
+
+    it('should return false for content under 24000 tokens', () => {
+      const smallContent = 'a'.repeat(50000); // 12500 tokens
+      expect(paginationService.needsPagination(smallContent)).toBe(false);
+    });
+  });
+
+  describe('paginateArray', () => {
+    const testItems = Array.from({ length: 25 }, (_, i) => ({ id: i + 1, name: `Item ${i + 1}` }));
+
+    it('should paginate array with default page size', () => {
+      const result = paginationService.paginateArray(testItems, 1);
+
+      expect(result.items).toHaveLength(10);
+      expect(result.page).toBe(1);
+      expect(result.pageSize).toBe(10);
+      expect(result.totalPages).toBe(3);
+      expect(result.totalItems).toBe(25);
+      expect(result.hasMore).toBe(true);
+      expect(result.nextPage).toBe(2);
+      expect(result.prevPage).toBe(null);
+    });
+
+    it('should paginate array with custom page size', () => {
+      const result = paginationService.paginateArray(testItems, 1, 5);
+
+      expect(result.items).toHaveLength(5);
+      expect(result.pageSize).toBe(5);
+      expect(result.totalPages).toBe(5);
+      expect(result.items[0].id).toBe(1);
+      expect(result.items[4].id).toBe(5);
+    });
+
+    it('should handle page 2 correctly', () => {
+      const result = paginationService.paginateArray(testItems, 2, 10);
+
+      expect(result.items).toHaveLength(10);
+      expect(result.page).toBe(2);
+      expect(result.items[0].id).toBe(11);
+      expect(result.items[9].id).toBe(20);
+      expect(result.prevPage).toBe(1);
+      expect(result.nextPage).toBe(3);
+    });
+
+    it('should handle last page correctly', () => {
+      const result = paginationService.paginateArray(testItems, 3, 10);
+
+      expect(result.items).toHaveLength(5);
+      expect(result.page).toBe(3);
+      expect(result.hasMore).toBe(false);
+      expect(result.nextPage).toBe(null);
+      expect(result.prevPage).toBe(2);
+    });
+
+    it('should handle empty array', () => {
+      const result = paginationService.paginateArray([], 1, 10);
+
+      expect(result.items).toHaveLength(0);
+      expect(result.totalPages).toBe(0);
+      expect(result.totalItems).toBe(0);
+      expect(result.hasMore).toBe(false);
+    });
+
+    it('should handle out of bounds page number', () => {
+      const result = paginationService.paginateArray(testItems, 10, 10);
+
+      expect(result.page).toBe(3); // Should cap at max page
+      expect(result.items).toHaveLength(5);
+    });
+
+    it('should handle negative page number', () => {
+      const result = paginationService.paginateArray(testItems, -1, 10);
+
+      expect(result.page).toBe(1); // Should default to page 1
+      expect(result.items[0].id).toBe(1);
+    });
+  });
+
+  describe('smartPaginate', () => {
+    const createLargeItems = (count) => {
+      return Array.from({ length: count }, (_, i) => ({
+        id: i + 1,
+        content: 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. '.repeat(100) // ~1400 chars each
+      }));
+    };
+
+    const formatter = (items) => {
+      return items.map(item => `Item ${item.id}: ${item.content}`).join('\n\n');
+    };
+
+    it('should auto-fit items within token limit when no page size specified', () => {
+      const items = createLargeItems(50);
+      const result = paginationService.smartPaginate(items, formatter, 1);
+
+      expect(result.pagination.totalItems).toBe(50);
+      expect(result.pagination.page).toBe(1);
+      expect(result.pagination.hasMore).toBe(true);
+
+      // Content should be under 20000 tokens
+      const estimatedTokens = paginationService.estimateTokens(result.content);
+      expect(estimatedTokens).toBeLessThanOrEqual(20000);
+      expect(estimatedTokens).toBeGreaterThan(0);
+    });
+
+    it('should use specified page size when provided', () => {
+      const items = createLargeItems(20);
+      const result = paginationService.smartPaginate(items, formatter, 1, 5);
+
+      expect(result.pagination.page).toBe(1);
+      expect(result.pagination.pageSize).toBe(5);
+      expect(result.pagination.totalPages).toBe(4);
+      expect(result.pagination.totalItems).toBe(20);
+    });
+
+    it('should handle page navigation in smart mode', () => {
+      const items = createLargeItems(100);
+      const page1 = paginationService.smartPaginate(items, formatter, 1);
+      const page2 = paginationService.smartPaginate(items, formatter, 2);
+
+      expect(page1.pagination.page).toBe(1);
+      expect(page1.pagination.nextPage).toBe(2);
+      expect(page1.pagination.prevPage).toBe(null);
+
+      expect(page2.pagination.page).toBe(2);
+      expect(page2.pagination.prevPage).toBe(1);
+    });
+
+    it('should handle empty items array', () => {
+      const result = paginationService.smartPaginate([], formatter, 1);
+
+      expect(result.content).toBe('No items found.');
+      expect(result.pagination.totalItems).toBe(0);
+      expect(result.pagination.hasMore).toBe(false);
+    });
+
+    it('should handle single small item', () => {
+      const items = [{ id: 1, content: 'Small content' }];
+      const result = paginationService.smartPaginate(items, formatter, 1);
+
+      expect(result.pagination.itemsInPage).toBe(1);
+      expect(result.pagination.totalItems).toBe(1);
+      expect(result.pagination.hasMore).toBe(false);
+    });
+  });
+
+  describe('formatPaginationInfo', () => {
+    it('should format pagination info with total pages', () => {
+      const pagination = {
+        page: 2,
+        totalPages: 5,
+        itemsInPage: 10,
+        totalItems: 50,
+        hasMore: true,
+        nextPage: 3,
+        prevPage: 1
+      };
+
+      const formatted = paginationService.formatPaginationInfo(pagination);
+
+      expect(formatted).toContain('Page 2 of 5');
+      expect(formatted).toContain('Showing 10 of 50 items');
+      expect(formatted).toContain('Previous: Add `page: 1`');
+      expect(formatted).toContain('Next: Add `page: 3`');
+    });
+
+    it('should format pagination info with estimated pages', () => {
+      const pagination = {
+        page: 1,
+        estimatedTotalPages: 3,
+        itemsInPage: 15,
+        totalItems: 45,
+        hasMore: true,
+        nextPage: 2,
+        prevPage: null
+      };
+
+      const formatted = paginationService.formatPaginationInfo(pagination);
+
+      expect(formatted).toContain('Page 1 of ~3');
+      expect(formatted).not.toContain('Previous:');
+      expect(formatted).toContain('Next: Add `page: 2`');
+    });
+
+    it('should format pagination info without navigation on last page', () => {
+      const pagination = {
+        page: 3,
+        totalPages: 3,
+        pageSize: 10,
+        totalItems: 25,
+        hasMore: false,
+        nextPage: null,
+        prevPage: 2
+      };
+
+      const formatted = paginationService.formatPaginationInfo(pagination);
+
+      expect(formatted).toContain('Page 3 of 3');
+      expect(formatted).toContain('Previous: Add `page: 2`');
+      expect(formatted).not.toContain('Next:');
+    });
+
+    it('should handle pageSize display correctly', () => {
+      const pagination = {
+        page: 2,
+        pageSize: 20,
+        totalPages: 3,
+        totalItems: 50,
+        hasMore: true,
+        nextPage: 3,
+        prevPage: 1
+      };
+
+      const formatted = paginationService.formatPaginationInfo(pagination);
+
+      expect(formatted).toContain('Showing items 21-40 of 50');
+    });
+  });
+
+  describe('chunkText', () => {
+    it('should not chunk text under token limit', () => {
+      const text = 'a'.repeat(50000); // 12500 tokens
+      const chunks = paginationService.chunkText(text);
+
+      expect(chunks).toHaveLength(1);
+      expect(chunks[0]).toBe(text);
+    });
+
+    it('should chunk large text into multiple parts', () => {
+      const text = 'a'.repeat(100000); // 25000 tokens
+      const chunks = paginationService.chunkText(text);
+
+      expect(chunks).toHaveLength(2);
+      chunks.forEach(chunk => {
+        const tokens = paginationService.estimateTokens(chunk);
+        expect(tokens).toBeLessThanOrEqual(20500); // Allow small buffer for realistic tokenization
+      });
+    });
+
+    it('should preserve line breaks when chunking', () => {
+      const lines = Array.from({ length: 1000 }, (_, i) => 'a'.repeat(100)).join('\n');
+      const chunks = paginationService.chunkText(lines);
+
+      expect(chunks.length).toBeGreaterThan(1);
+      chunks.forEach(chunk => {
+        // Each chunk should be properly formatted (allow trimming)
+        expect(chunk.length).toBeGreaterThan(0);
+        expect(typeof chunk).toBe('string');
+      });
+    });
+
+    it('should handle very long single lines', () => {
+      const longLine = 'word '.repeat(20000); // Single line with ~100000 chars
+      const chunks = paginationService.chunkText(longLine);
+
+      expect(chunks.length).toBeGreaterThan(1);
+      chunks.forEach(chunk => {
+        const tokens = paginationService.estimateTokens(chunk);
+        expect(tokens).toBeLessThanOrEqual(20500); // Allow small buffer for realistic tokenization
+      });
+    });
+
+    it('should handle empty text', () => {
+      const chunks = paginationService.chunkText('');
+      expect(chunks).toEqual(['']);
+    });
+
+    it('should handle null text', () => {
+      const chunks = paginationService.chunkText(null);
+      expect(chunks).toEqual([null]);
+    });
+
+    it('should respect target token parameter', () => {
+      const text = 'word '.repeat(50000); // More complex text that tokenizes differently
+      const chunks = paginationService.chunkText(text, 10000); // 10000 token target
+
+      expect(chunks.length).toBeGreaterThan(1); // Should need chunking
+      chunks.forEach(chunk => {
+        const tokens = paginationService.estimateTokens(chunk);
+        expect(tokens).toBeLessThanOrEqual(10500); // Allow small buffer
+      });
+    });
+  });
+
+  describe('Edge Cases', () => {
+    it('should handle mixed content sizes in smart pagination', () => {
+      const items = [
+        { id: 1, content: 'a'.repeat(50000) }, // Large item
+        { id: 2, content: 'b'.repeat(10) }, // Small item
+        { id: 3, content: 'c'.repeat(30000) }, // Medium item
+      ];
+
+      const formatter = (items) => items.map(i => i.content).join('');
+      const result = paginationService.smartPaginate(items, formatter, 1);
+
+      // Should include at least the first item, maybe more
+      expect(result.pagination.itemsInPage).toBeGreaterThanOrEqual(1);
+      const tokens = paginationService.estimateTokens(result.content);
+      expect(tokens).toBeLessThanOrEqual(20000);
+    });
+
+    it('should handle single item exceeding token limit', () => {
+      const items = [
+        { id: 1, content: 'x'.repeat(100000) } // 25000 tokens - exceeds limit
+      ];
+
+      const formatter = (items) => items.map(i => i.content).join('');
+      const result = paginationService.smartPaginate(items, formatter, 1);
+
+      // Should still include the item even if it exceeds limit
+      expect(result.pagination.itemsInPage).toBe(1);
+      expect(result.pagination.hasMore).toBe(false);
+    });
+
+    it('should handle unicode characters correctly', () => {
+      const text = '你好世界🌍'.repeat(1000);
+      const tokens = paginationService.estimateTokens(text);
+
+      // Unicode chars should still be counted
+      expect(tokens).toBeGreaterThan(0);
+
+      const chunks = paginationService.chunkText(text.repeat(10));
+      expect(chunks.length).toBeGreaterThanOrEqual(1);
+    });
+
+    it('should handle formatter that returns empty string', () => {
+      const items = [{ id: 1 }, { id: 2 }];
+      const formatter = () => '';
+
+      const result = paginationService.smartPaginate(items, formatter, 1);
+      expect(result.content).toBe('');
+      expect(result.pagination.itemsInPage).toBe(1); // New pagination logic: 1 item per page
+    });
+
+    it('should handle formatter that throws error gracefully', () => {
+      const items = [{ id: 1 }];
+      const formatter = () => {
+        throw new Error('Formatter error');
+      };
+
+      expect(() => {
+        paginationService.smartPaginate(items, formatter, 1);
+      }).toThrow('Formatter error');
+    });
+  });
+
+  describe('Performance', () => {
+    it('should handle very large arrays efficiently', () => {
+      const largeArray = Array.from({ length: 10000 }, (_, i) => ({ id: i }));
+
+      const startTime = Date.now();
+      const result = paginationService.paginateArray(largeArray, 1, 100);
+      const endTime = Date.now();
+
+      expect(result.items).toHaveLength(100);
+      expect(result.totalItems).toBe(10000);
+      expect(endTime - startTime).toBeLessThan(100); // Should be fast
+    });
+
+    it('should handle very large text efficiently', () => {
+      const largeText = 'a'.repeat(1000000); // 1M chars
+
+      const startTime = Date.now();
+      const chunks = paginationService.chunkText(largeText);
+      const endTime = Date.now();
+
+      expect(chunks.length).toBeGreaterThanOrEqual(12); // 1M chars / 80K = ~12.5 chunks
+      expect(endTime - startTime).toBeLessThan(500); // Should complete quickly
+    });
+  });
+});
@@ -0,0 +1,134 @@
+/**
+ * TokenEstimator - Realistic token counting for LLM content
+ *
+ * Implements proper tokenization estimation based on modern LLM patterns.
+ * Much more accurate than naive character-count approaches.
+ */
+export class TokenEstimator {
+
+  /**
+   * Estimate token count using realistic tokenization patterns
+   * Based on GPT-style tokenization rules and observed patterns
+   *
+   * @param {string} text - Text to analyze
+   * @returns {number} Estimated token count
+   */
+  static estimateTokens(text) {
+    if (!text) return 0;
+
+    let tokens = 0;
+
+    // Split by whitespace and punctuation patterns
+    const words = text.split(/(\s+|[^\w\s])/);
+
+    for (const word of words) {
+      if (!word) continue;
+
+      // Whitespace is often merged with adjacent tokens
+      if (/^\s+$/.test(word)) {
+        continue; // Don't count pure whitespace as tokens
+      }
+
+      // Single punctuation marks are usually 1 token
+      if (/^[^\w\s]$/.test(word)) {
+        tokens += 1;
+        continue;
+      }
+
+      // Handle different word types
+      if (/^\w+$/.test(word)) {
+        // Regular words: estimate based on length and common patterns
+        if (word.length <= 3) {
+          tokens += 1; // Short words: 1 token
+        } else if (word.length <= 6) {
+          tokens += 1; // Medium words: usually 1 token
+        } else if (word.length <= 10) {
+          tokens += Math.ceil(word.length / 5); // Longer words: ~5 chars per token
+        } else {
+          // Very long words (often technical terms): ~4 chars per token
+          tokens += Math.ceil(word.length / 4);
+        }
+      } else {
+        // Mixed content (URLs, emails, code, etc.)
+        // These are often tokenized more aggressively
+        if (word.includes('://') || word.includes('@')) {
+          // URLs and emails: roughly 3-4 chars per token
+          tokens += Math.ceil(word.length / 3.5);
+        } else if (/[A-Z]{2,}/.test(word) || /\d+/.test(word)) {
+          // Acronyms and numbers: often 2-3 chars per token
+          tokens += Math.ceil(word.length / 2.5);
+        } else {
+          // Other mixed content: 4 chars per token
+          tokens += Math.ceil(word.length / 4);
+        }
+      }
+    }
+
+    // Account for special sequences that are tokenized differently
+    // Code blocks, markdown, JSON, etc. tend to have more tokens
+    const specialPatterns = [
+      /```[\s\S]*?```/g, // Code blocks
+      /`[^`]+`/g, // Inline code
+      /\[[^\]]*\]\([^)]*\)/g, // Markdown links
+      /\*\*[^*]+\*\*/g, // Bold text
+      /\*[^*]+\*/g, // Italic text
+      /{[^}]*}/g, // JSON-like structures
+      /\([^)]*\)/g, // Parenthetical content
+    ];
+
+    let specialTokens = 0;
+    for (const pattern of specialPatterns) {
+      const matches = text.match(pattern);
+      if (matches) {
+        for (const match of matches) {
+          // Special content has higher token density
+          specialTokens += Math.ceil(match.length / 3);
+        }
+      }
+    }
+
+    // Use the higher of the two estimates (word-based vs special-pattern-based)
+    // This accounts for content that's heavily formatted vs plain text
+    const wordBasedEstimate = tokens;
+    const specialContentRatio = specialTokens / Math.max(1, text.length);
+
+    if (specialContentRatio > 0.1) {
+      // High special content - use pattern-based estimate with adjustment
+      tokens = Math.max(wordBasedEstimate, Math.ceil(text.length / 3.2));
+    } else {
+      // Regular content - use word-based estimate
+      tokens = wordBasedEstimate;
+    }
+
+    // Add buffer for control tokens, formatting, etc. (5-10% overhead)
+    tokens = Math.ceil(tokens * 1.08);
+
+    return tokens;
+  }
+
+  /**
+   * Get the average characters per token for specific text
+   * Useful for chunking operations
+   */
+  static getAvgCharsPerToken(text) {
+    if (!text) return 4; // Fallback
+    const tokens = this.estimateTokens(text);
+    return tokens > 0 ? text.length / tokens : 4;
+  }
+
+  /**
+   * Check if text exceeds a token limit
+   */
+  static exceedsLimit(text, maxTokens) {
+    return this.estimateTokens(text) > maxTokens;
+  }
+
+  /**
+   * Estimate how many characters would fit within a token budget
+   * for a given text style
+   */
+  static estimateCharsForTokens(sampleText, targetTokens) {
+    const avgCharsPerToken = this.getAvgCharsPerToken(sampleText);
+    return Math.floor(targetTokens * avgCharsPerToken);
+  }
+}
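For orientation, a minimal usage sketch of the new TokenEstimator helpers added in this release. Only the static methods visible in the diff above are relied on; the import path and the sample string are illustrative assumptions, not part of the package's documented API.

// Hypothetical import path; adjust to wherever the package exposes TokenEstimator.js
import { TokenEstimator } from './TokenEstimator.js';

const text = 'Summarize the README, then list the exported functions.';

// Word- and punctuation-based estimate, including the ~8% overhead buffer applied internally
const tokens = TokenEstimator.estimateTokens(text);

// Budget checks built on the same estimate
const overBudget = TokenEstimator.exceedsLimit(text, 24000);          // true if estimate > 24000
const charBudget = TokenEstimator.estimateCharsForTokens(text, 20000); // chars that roughly fit 20000 tokens

console.log({ tokens, overBudget, charBudget });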