smart-review 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1671 @@
1
+ /**
2
+ * 智能文件分批处理模块
3
+ * 基于Token限制进行最优分组,支持大文件分段处理
4
+ */
5
+
6
+ import { BATCH_CONSTANTS } from './utils/constants.js';
7
+
8
+ export class SmartBatching {
9
+ constructor(config = {}) {
10
+ this.config = {
11
+ // 每批次最大Token数(默认4000)
12
+ maxRequestTokens: config.maxRequestTokens || BATCH_CONSTANTS.DEFAULT_MAX_REQUEST_TOKENS,
13
+ // Token估算比例(字符数/Token数,默认4)
14
+ tokenRatio: config.tokenRatio || BATCH_CONSTANTS.TOKEN_RATIO,
15
+ // 分段重叠行数(保持上下文连贯性)
16
+ chunkOverlapLines: config.chunkOverlapLines || BATCH_CONSTANTS.CHUNK_OVERLAP_LINES,
17
+ // 动态文件数量计算的边界值
18
+ minFilesPerBatch: config.minFilesPerBatch || BATCH_CONSTANTS.MIN_FILES_PER_BATCH,
19
+ maxFilesPerBatch: config.maxFilesPerBatch || BATCH_CONSTANTS.MAX_FILES_PER_BATCH,
20
+ ...config
21
+ };
22
+
23
+ // 大文件分段阈值设为批次最大Token的90%,确保每段接近最大利用率
24
+ this.maxFileTokens = Math.floor(this.config.maxRequestTokens * BATCH_CONSTANTS.UTILIZATION_THRESHOLD);
25
+
26
+ // Token估算缓存,避免重复计算
27
+ this.tokenCache = new Map();
28
+ this.cacheStats = { hits: 0, misses: 0 };
29
+
30
+ // 添加对象池和缓存机制以减少对象创建
31
+ this.batchObjectPool = []; // 批次对象池
32
+ this.itemObjectPool = []; // 项目对象池
33
+ this.chunkObjectPool = []; // 分块对象池
34
+ this.statsObjectPool = []; // 统计对象池
35
+
36
+ // 预分配一些对象到池中
37
+ this.initializeObjectPools();
38
+ }
39
+
40
+ /**
41
+ * 初始化对象池,预分配一些对象以减少运行时创建开销
42
+ */
43
+ initializeObjectPools() {
44
+ // 预分配批次对象
45
+ for (let i = 0; i < BATCH_CONSTANTS.INITIAL_BATCH_POOL_SIZE; i++) {
46
+ this.batchObjectPool.push(this.createBatchObject());
47
+ }
48
+
49
+ // 预分配项目对象
50
+ for (let i = 0; i < BATCH_CONSTANTS.INITIAL_ITEM_POOL_SIZE; i++) {
51
+ this.itemObjectPool.push(this.createItemObject());
52
+ }
53
+
54
+ // 预分配分块对象
55
+ for (let i = 0; i < BATCH_CONSTANTS.INITIAL_CHUNK_POOL_SIZE; i++) {
56
+ this.chunkObjectPool.push(this.createChunkObject());
57
+ }
58
+
59
+ // 预分配统计对象
60
+ for (let i = 0; i < BATCH_CONSTANTS.INITIAL_STATS_POOL_SIZE; i++) {
61
+ this.statsObjectPool.push(this.createStatsObject());
62
+ }
63
+ }
64
+
65
+ /**
66
+ * 创建批次对象
67
+ * @returns {Object} 新的批次对象
68
+ */
69
+ createBatchObject() {
70
+ return {
71
+ items: [],
72
+ totalTokens: 0,
73
+ totalFiles: 0,
74
+ utilization: 0,
75
+ isLargeFileSegment: false,
76
+ segmentedFile: null
77
+ };
78
+ }
79
+
80
+ /**
81
+ * 创建项目对象
82
+ * @returns {Object} 新的项目对象
83
+ */
84
+ createItemObject() {
85
+ return {
86
+ filePath: '',
87
+ content: '',
88
+ tokens: 0,
89
+ isSegment: false,
90
+ segmentInfo: null
91
+ };
92
+ }
93
+
94
+ /**
95
+ * 创建分块对象
96
+ * @returns {Object} 新的分块对象
97
+ */
98
+ createChunkObject() {
99
+ return {
100
+ content: '',
101
+ startLine: 0,
102
+ endLine: 0,
103
+ tokens: 0
104
+ };
105
+ }
106
+
107
+ /**
108
+ * 创建统计对象
109
+ * @returns {Object} 新的统计对象
110
+ */
111
+ createStatsObject() {
112
+ return {
113
+ totalFiles: 0,
114
+ totalTokens: 0,
115
+ avgTokensPerFile: 0,
116
+ maxTokensInBatch: 0,
117
+ minTokensInBatch: 0,
118
+ avgUtilization: 0
119
+ };
120
+ }
121
+
122
+ /**
123
+ * 从池中获取批次对象
124
+ * @returns {Object} 批次对象
125
+ */
126
+ getBatchFromPool() {
127
+ if (this.batchObjectPool.length > 0) {
128
+ const obj = this.batchObjectPool.pop();
129
+ // 重置对象状态
130
+ obj.items = [];
131
+ obj.totalTokens = 0;
132
+ obj.totalFiles = 0;
133
+ obj.utilization = 0;
134
+ obj.isLargeFileSegment = false;
135
+ obj.segmentedFile = null;
136
+ return obj;
137
+ }
138
+ return this.createBatchObject();
139
+ }
140
+
141
+ /**
142
+ * 从池中获取项目对象
143
+ * @returns {Object} 项目对象
144
+ */
145
+ getItemFromPool() {
146
+ if (this.itemObjectPool.length > 0) {
147
+ const obj = this.itemObjectPool.pop();
148
+ // 重置对象状态
149
+ obj.filePath = '';
150
+ obj.content = '';
151
+ obj.tokens = 0;
152
+ obj.isSegment = false;
153
+ obj.segmentInfo = null;
154
+ return obj;
155
+ }
156
+ return this.createItemObject();
157
+ }
158
+
159
+ /**
160
+ * 从池中获取分块对象
161
+ * @returns {Object} 分块对象
162
+ */
163
+ getChunkFromPool() {
164
+ if (this.chunkObjectPool.length > 0) {
165
+ const obj = this.chunkObjectPool.pop();
166
+ // 重置对象状态
167
+ obj.content = '';
168
+ obj.startLine = 0;
169
+ obj.endLine = 0;
170
+ obj.tokens = 0;
171
+ return obj;
172
+ }
173
+ return this.createChunkObject();
174
+ }
175
+
176
+ /**
177
+ * 从池中获取统计对象
178
+ * @returns {Object} 统计对象
179
+ */
180
+ getStatsFromPool() {
181
+ if (this.statsObjectPool.length > 0) {
182
+ const obj = this.statsObjectPool.pop();
183
+ // 重置对象状态
184
+ obj.totalFiles = 0;
185
+ obj.totalTokens = 0;
186
+ obj.avgTokensPerFile = 0;
187
+ obj.maxTokensInBatch = 0;
188
+ obj.minTokensInBatch = 0;
189
+ obj.avgUtilization = 0;
190
+ return obj;
191
+ }
192
+ return this.createStatsObject();
193
+ }
194
+
195
+ /**
196
+ * 将批次对象回收到池中
197
+ * @param {Object} obj - 要回收的批次对象
198
+ */
199
+ recycleBatchToPool(obj) {
200
+ if (this.batchObjectPool.length < BATCH_CONSTANTS.MAX_BATCH_POOL_SIZE) { // 限制池大小
201
+ this.batchObjectPool.push(obj);
202
+ }
203
+ }
204
+
205
+ /**
206
+ * 将项目对象回收到池中
207
+ * @param {Object} obj - 要回收的项目对象
208
+ */
209
+ recycleItemToPool(obj) {
210
+ if (this.itemObjectPool.length < BATCH_CONSTANTS.MAX_ITEM_POOL_SIZE) { // 限制池大小
211
+ this.itemObjectPool.push(obj);
212
+ }
213
+ }
214
+
215
+ /**
216
+ * 将分块对象回收到池中
217
+ * @param {Object} obj - 要回收的分块对象
218
+ */
219
+ recycleChunkToPool(obj) {
220
+ if (this.chunkObjectPool.length < BATCH_CONSTANTS.MAX_CHUNK_POOL_SIZE) { // 限制池大小
221
+ this.chunkObjectPool.push(obj);
222
+ }
223
+ }
224
+
225
+ /**
226
+ * 将统计对象回收到池中
227
+ * @param {Object} obj - 要回收的统计对象
228
+ */
229
+ recycleStatsToPool(obj) {
230
+ if (this.statsObjectPool.length < BATCH_CONSTANTS.MAX_STATS_POOL_SIZE) { // 限制池大小
231
+ this.statsObjectPool.push(obj);
232
+ }
233
+ }
234
+
235
+ /**
236
+ * 获取对象池统计信息
237
+ * @returns {Object} 对象池统计信息
238
+ */
239
+ getObjectPoolStats() {
240
+ return {
241
+ batchPool: {
242
+ size: this.batchObjectPool.length,
243
+ type: 'batch'
244
+ },
245
+ itemPool: {
246
+ size: this.itemObjectPool.length,
247
+ type: 'item'
248
+ },
249
+ chunkPool: {
250
+ size: this.chunkObjectPool.length,
251
+ type: 'chunk'
252
+ },
253
+ statsPool: {
254
+ size: this.statsObjectPool.length,
255
+ type: 'stats'
256
+ }
257
+ };
258
+ }
259
+
260
+ /**
261
+ * 根据文件平均大小动态计算每批次最大文件数
262
+ * @param {Array} files - 文件列表
263
+ * @returns {number} 动态计算的最大文件数
264
+ */
265
+ calculateDynamicMaxFiles(files) {
266
+ if (files.length === 0) return this.config.minFilesPerBatch;
267
+
268
+ // 计算平均文件token数
269
+ const totalTokens = files.reduce((sum, file) => sum + this.estimateTokens(file.content), 0);
270
+ const avgTokensPerFile = totalTokens / files.length;
271
+
272
+ // 基于平均文件大小计算合理的文件数量
273
+ const estimatedMaxFiles = Math.floor(this.config.maxRequestTokens / avgTokensPerFile);
274
+
275
+ // 调试信息
276
+ // 应用边界限制
277
+ const result = Math.max(
278
+ this.config.minFilesPerBatch,
279
+ Math.min(this.config.maxFilesPerBatch, estimatedMaxFiles)
280
+ );
281
+ return result;
282
+ }
283
+
284
+ /**
285
+ * 基于处理后的项目计算动态最大文件数
286
+ * @param {Array} items - 处理后的文件项目数组
287
+ * @returns {number} 动态最大文件数
288
+ */
289
+ calculateDynamicMaxFilesFromItems(items) {
290
+ if (items.length === 0) return this.config.minFilesPerBatch;
291
+
292
+ // 按文件分组,计算每个文件的平均token数
293
+ const fileGroups = new Map();
294
+ items.forEach(item => {
295
+ const filePath = item.originalFilePath || item.filePath;
296
+ if (!fileGroups.has(filePath)) {
297
+ fileGroups.set(filePath, []);
298
+ }
299
+ fileGroups.get(filePath).push(item);
300
+ });
301
+
302
+ // 计算每个文件的token数组
303
+ const fileTokens = [];
304
+ fileGroups.forEach((fileItems, filePath) => {
305
+ const totalTokens = fileItems.reduce((sum, item) => sum + item.tokens, 0);
306
+ const avgTokensPerSegment = totalTokens / fileItems.length;
307
+ fileTokens.push(avgTokensPerSegment);
308
+ });
309
+
310
+ // 排序文件token数,用于更精确的估算
311
+ fileTokens.sort((a, b) => a - b);
312
+
313
+ // 使用更智能的策略:尝试找到能组合的最大文件数
314
+ let maxFiles = this.config.minFilesPerBatch;
315
+ const maxTokens = this.config.maxRequestTokens;
316
+
317
+ // 从小文件开始累加,找到理论最大文件数
318
+ let currentTokens = 0;
319
+ let currentFiles = 0;
320
+
321
+ for (const tokens of fileTokens) {
322
+ if (currentTokens + tokens <= maxTokens) {
323
+ currentTokens += tokens;
324
+ currentFiles++;
325
+ } else {
326
+ break;
327
+ }
328
+ }
329
+
330
+ // 为了保险起见,允许比理论最大值稍大一些的组合
331
+ // 因为实际组合可能有更好的搭配
332
+ const theoreticalMax = currentFiles;
333
+ const allowedMax = Math.min(
334
+ Math.max(theoreticalMax, Math.ceil(theoreticalMax * 1.5)), // 允许50%的余量
335
+ this.config.maxFilesPerBatch
336
+ );
337
+
338
+
339
+
340
+ return Math.max(this.config.minFilesPerBatch, allowedMax);
341
+ }
342
+
343
+ /**
344
+ * 估算文本的Token数量(带缓存)
345
+ * @param {string} text - 文本内容
346
+ * @returns {number} 估算的Token数量
347
+ */
348
+ estimateTokens(text) {
349
+ if (!text || typeof text !== 'string') return 0;
350
+
351
+ // 生成缓存键(使用文本长度和前100字符的哈希)
352
+ const cacheKey = this.generateTokenCacheKey(text);
353
+
354
+ // 检查缓存
355
+ if (this.tokenCache.has(cacheKey)) {
356
+ this.cacheStats.hits++;
357
+ return this.tokenCache.get(cacheKey);
358
+ }
359
+
360
+ this.cacheStats.misses++;
361
+
362
+ // 计算token数量
363
+ const tokenCount = this.calculateTokenCount(text);
364
+
365
+ // 缓存结果(限制缓存大小)
366
+ if (this.tokenCache.size >= BATCH_CONSTANTS.MAX_TOKEN_CACHE_SIZE) {
367
+ // 清理最旧的缓存项
368
+ const firstKey = this.tokenCache.keys().next().value;
369
+ this.tokenCache.delete(firstKey);
370
+ }
371
+
372
+ this.tokenCache.set(cacheKey, tokenCount);
373
+ return tokenCount;
374
+ }
375
+
376
+ /**
377
+ * 生成token缓存键
378
+ * @param {string} text - 文本内容
379
+ * @returns {string} 缓存键
380
+ */
381
+ generateTokenCacheKey(text) {
382
+ const length = text.length;
383
+ const prefix = text.substring(0, Math.min(BATCH_CONSTANTS.CACHE_KEY_PREFIX_LENGTH, length));
384
+ // 简单哈希函数
385
+ let hash = 0;
386
+ for (let i = 0; i < prefix.length; i++) {
387
+ const char = prefix.charCodeAt(i);
388
+ hash = ((hash << 5) - hash) + char;
389
+ hash = hash & hash; // 转换为32位整数
390
+ }
391
+ return `${length}_${hash}`;
392
+ }
393
+
394
+ /**
395
+ * 计算文本的Token数量
396
+ * @param {string} text - 文本内容
397
+ * @returns {number} Token数量
398
+ */
399
+ calculateTokenCount(text) {
400
+ // 基础字符数估算
401
+ const charCount = text.length;
402
+
403
+ // 考虑中文字符(通常占用更多Token)
404
+ const chineseCharCount = (text.match(/[\u4e00-\u9fff]/g) || []).length;
405
+ const englishCharCount = charCount - chineseCharCount;
406
+
407
+ // 中文字符按1.5倍计算,英文字符按标准比例
408
+ const adjustedCharCount = englishCharCount + (chineseCharCount * BATCH_CONSTANTS.CHINESE_CHAR_MULTIPLIER);
409
+
410
+ // 考虑代码结构(括号、关键字等会增加Token数)
411
+ const codeStructureBonus = (text.match(/[{}()\[\];,]/g) || []).length * BATCH_CONSTANTS.CODE_STRUCTURE_BONUS;
412
+
413
+ return Math.ceil((adjustedCharCount + codeStructureBonus) / this.config.tokenRatio);
414
+ }
415
+
416
+ /**
417
+ * 获取缓存统计信息
418
+ * @returns {Object} 缓存统计
419
+ */
420
+ getCacheStats() {
421
+ const total = this.cacheStats.hits + this.cacheStats.misses;
422
+ const hitRate = total > 0 ? (this.cacheStats.hits / total * 100).toFixed(2) : 0;
423
+
424
+ return {
425
+ ...this.cacheStats,
426
+ total,
427
+ hitRate: `${hitRate}%`,
428
+ cacheSize: this.tokenCache.size
429
+ };
430
+ }
431
+
432
+ /**
433
+ * 将大文件分段
434
+ * @param {string} content - 文件内容
435
+ * @param {string} filePath - 文件路径
436
+ * @returns {Array} 分段信息数组
437
+ */
438
+ chunkLargeFile(content, filePath) {
439
+ const lines = content.split('\n');
440
+ // 每段目标大小接近maxRequestTokens,确保最大利用率
441
+ const maxTokensPerChunk = Math.floor(this.config.maxRequestTokens * 0.95);
442
+ const overlapLines = this.config.chunkOverlapLines;
443
+ const chunks = [];
444
+ let currentChunk = [];
445
+ let currentTokens = 0;
446
+ let startLine = 1;
447
+
448
+ for (let i = 0; i < lines.length; i++) {
449
+ const line = lines[i];
450
+ const lineTokens = this.estimateTokens(line);
451
+
452
+ // 如果添加这一行会超过限制,且当前块不为空,则结束当前块
453
+ if (currentTokens + lineTokens > maxTokensPerChunk && currentChunk.length > 0) {
454
+ // 保存当前块,使用对象池
455
+ const chunkInfo = this.getChunkFromPool();
456
+ chunkInfo.content = currentChunk.join('\n');
457
+ chunkInfo.startLine = startLine;
458
+ chunkInfo.endLine = startLine + currentChunk.length - 1;
459
+ chunkInfo.tokens = currentTokens;
460
+ chunkInfo.chunkIndex = chunks.length;
461
+ chunkInfo.isChunk = true;
462
+ chunks.push(chunkInfo);
463
+
464
+ // 开始新块,保留重叠行
465
+ const overlapStart = Math.max(0, currentChunk.length - overlapLines);
466
+ const overlapContent = currentChunk.slice(overlapStart);
467
+
468
+ startLine = startLine + currentChunk.length - overlapContent.length;
469
+ currentChunk = overlapContent;
470
+ currentTokens = this.estimateTokens(currentChunk.join('\n'));
471
+ }
472
+
473
+ currentChunk.push(line);
474
+ currentTokens += lineTokens;
475
+ }
476
+
477
+ // 添加最后一个块,使用对象池
478
+ if (currentChunk.length > 0) {
479
+ const chunkInfo = this.getChunkFromPool();
480
+ chunkInfo.content = currentChunk.join('\n');
481
+ chunkInfo.startLine = startLine;
482
+ chunkInfo.endLine = startLine + currentChunk.length - 1;
483
+ chunkInfo.tokens = currentTokens;
484
+ chunkInfo.chunkIndex = chunks.length;
485
+ chunkInfo.isChunk = true;
486
+ chunks.push(chunkInfo);
487
+ }
488
+
489
+ // 如果只有一个块且不超过限制,标记为非分块
490
+ if (chunks.length === 1 && chunks[0].tokens <= maxTokensPerChunk) {
491
+ chunks[0].isChunk = false;
492
+ }
493
+
494
+ return chunks;
495
+ }
496
+
497
+ /**
498
+ * 智能分批处理文件
499
+ * @param {Array} files - 文件信息数组 [{filePath, content, ...}]
500
+ * @returns {Object} 分批结果
501
+ */
502
+ createSmartBatches(files) {
503
+
504
+
505
+ const fileStats = new Map();
506
+ const smallFiles = [];
507
+ const largeBatches = [];
508
+ for (const file of files) {
509
+ const { filePath, content } = file;
510
+ const originalTokens = this.estimateTokens(content);
511
+
512
+ if (originalTokens > this.maxFileTokens) {
513
+ // 大文件:分段处理,但同一文件的所有分段合并为一个批次
514
+ const chunks = this.chunkLargeFile(content, filePath);
515
+
516
+ const segmentItems = [];
517
+ let sumTokens = 0;
518
+ for (const chunk of chunks) {
519
+ const item = this.getItemFromPool();
520
+ Object.assign(item, file);
521
+ item.content = chunk.content;
522
+ item.tokens = chunk.tokens;
523
+ item.isChunk = true;
524
+ item.chunkIndex = chunk.chunkIndex;
525
+ item.totalChunks = chunks.length;
526
+ item.startLine = chunk.startLine;
527
+ item.endLine = chunk.endLine;
528
+ item.originalFilePath = filePath;
529
+ item.segmentGroup = filePath;
530
+ // 合并为单批次,不需要分段等待标记
531
+ item.needsWaiting = false;
532
+
533
+ segmentItems.push(item);
534
+ sumTokens += (chunk.tokens || 0);
535
+ }
536
+
537
+ const batch = this.getBatchFromPool();
538
+ batch.batchIndex = -1; // 稍后重新编号
539
+ batch.items = segmentItems; // 同一文件的所有分段放入一个批次
540
+ batch.totalTokens = sumTokens;
541
+ batch.fileCount = 1; // 单文件的分段
542
+ batch.utilization = (sumTokens / this.config.maxRequestTokens) * 100;
543
+ batch.isLargeFileSegment = true;
544
+ batch.segmentedFile = filePath;
545
+ batch.totalSegments = chunks.length;
546
+ // 不再使用currentSegment(批次包含全部分段)
547
+
548
+ largeBatches.push(batch);
549
+
550
+ const stats = this.getStatsFromPool();
551
+ stats.originalTokens = originalTokens;
552
+ stats.needsChunking = true;
553
+ stats.chunkCount = chunks.length;
554
+ stats.processedTokens = chunks.reduce((sum, chunk) => sum + chunk.tokens, 0);
555
+ fileStats.set(filePath, stats);
556
+ } else {
557
+ // 小文件:可以与其他小文件组合,使用对象池
558
+ const item = this.getItemFromPool();
559
+ Object.assign(item, file);
560
+ item.tokens = originalTokens;
561
+ item.isChunk = false;
562
+ item.chunkIndex = 0;
563
+ item.totalChunks = 1;
564
+ smallFiles.push(item);
565
+
566
+ const stats = this.getStatsFromPool();
567
+ stats.originalTokens = originalTokens;
568
+ stats.needsChunking = false;
569
+ stats.chunkCount = 1;
570
+ stats.processedTokens = originalTokens;
571
+ fileStats.set(filePath, stats);
572
+ }
573
+ }
574
+
575
+ // 对小文件进行最优组合
576
+ const smallFileBatches = [];
577
+ if (smallFiles.length > 0) {
578
+ // 动态计算每批次最大文件数(基于小文件)
579
+ const dynamicMaxFiles = this.calculateDynamicMaxFilesFromItems(smallFiles);
580
+
581
+ // 使用优化算法对小文件进行分批
582
+ const combinedBatches = this.optimizedBinPacking(smallFiles, dynamicMaxFiles);
583
+ smallFileBatches.push(...combinedBatches);
584
+ }
585
+
586
+ // 合并所有批次并重新编号
587
+ const allBatches = [...smallFileBatches, ...largeBatches];
588
+ allBatches.forEach((batch, index) => {
589
+ batch.batchIndex = index;
590
+ });
591
+
592
+ // 后处理分段文件,确保同一文件的分段能够正确组织
593
+ const processedBatches = this.postProcessSegmentedFiles(allBatches);
594
+
595
+ // 生成统计信息
596
+ const stats = this.generateBatchStats(processedBatches, fileStats);
597
+
598
+
599
+
600
+ return {
601
+ batches: processedBatches,
602
+ stats,
603
+ fileStats: Object.fromEntries(fileStats)
604
+ };
605
+ }
606
+
607
+ /**
608
+ * 最优组合算法 - 实现最接近token限制的最优分批
609
+ * @param {Array} items - 处理后的文件项目
610
+ * @param {number} maxFiles - 每批次最大文件数
611
+ * @returns {Array} 批次数组
612
+ */
613
+ optimizedBinPacking(items, maxFiles) {
614
+ const maxTokens = this.config.maxRequestTokens;
615
+
616
+ // 第一步:分离超大文件和普通文件
617
+ const { largeFiles, normalFiles } = this.separateFilesBySize(items, maxTokens);
618
+
619
+ // 第二步:处理超大文件(需要分段)
620
+ const largeBatches = this.handleLargeFiles(largeFiles, maxFiles);
621
+
622
+ // 第三步:对普通文件进行最优组合
623
+ const optimalBatches = this.findOptimalCombinations(normalFiles, maxFiles, maxTokens);
624
+
625
+ // 第四步:合并所有批次并重新编号
626
+ const allBatches = [...largeBatches, ...optimalBatches];
627
+ allBatches.forEach((batch, index) => {
628
+ batch.batchIndex = index;
629
+ });
630
+
631
+ return allBatches;
632
+ }
633
+
634
+ /**
635
+ * 分离超大文件和普通文件
636
+ */
637
+ separateFilesBySize(items, maxTokens) {
638
+ const largeFiles = [];
639
+ const normalFiles = [];
640
+
641
+ items.forEach(item => {
642
+ if (item.tokens > maxTokens) {
643
+ largeFiles.push(item);
644
+ } else {
645
+ normalFiles.push(item);
646
+ }
647
+ });
648
+
649
+ return { largeFiles, normalFiles };
650
+ }
651
+
652
+ /**
653
+ * 处理超大文件 - 需要分段的文件
654
+ */
655
+ handleLargeFiles(largeFiles, maxFiles) {
656
+ const batches = [];
657
+
658
+ largeFiles.forEach(file => {
659
+ // 超大文件需要分段处理:同一文件的所有分段合并为一个批次
660
+ const segments = this.chunkLargeFile(file.content, file.filePath);
661
+
662
+ const items = [];
663
+ let sumTokens = 0;
664
+ for (const seg of segments) {
665
+ const item = this.getItemFromPool();
666
+ Object.assign(item, file);
667
+ item.content = seg.content;
668
+ item.tokens = seg.tokens;
669
+ item.isChunk = true;
670
+ item.chunkIndex = seg.chunkIndex;
671
+ item.totalChunks = segments.length;
672
+ item.startLine = seg.startLine;
673
+ item.endLine = seg.endLine;
674
+ item.originalFilePath = file.filePath;
675
+ item.segmentGroup = file.filePath;
676
+ item.needsWaiting = false;
677
+ items.push(item);
678
+ sumTokens += (seg.tokens || 0);
679
+ }
680
+
681
+ const batch = this.getBatchFromPool();
682
+ batch.batchIndex = 0; // 稍后重新编号
683
+ batch.items = items;
684
+ batch.totalTokens = sumTokens;
685
+ batch.fileCount = 1;
686
+ batch.utilization = (sumTokens / this.config.maxRequestTokens) * 100;
687
+ batch.isLargeFileSegment = true;
688
+ batch.segmentedFile = file.filePath;
689
+ batch.totalSegments = segments.length;
690
+ batches.push(batch);
691
+ });
692
+
693
+ return batches;
694
+ }
695
+
696
+ /**
697
+ * 找到最优组合 - 多重背包问题的最优解
698
+ */
699
+ findOptimalCombinations(normalFiles, maxFiles, maxTokens) {
700
+ if (normalFiles.length === 0) return [];
701
+
702
+ // 使用动态规划找到最优组合
703
+ const combinations = this.findAllOptimalCombinations(normalFiles, maxFiles, maxTokens);
704
+
705
+ // 将组合转换为批次格式
706
+ const batches = combinations.map(combination => {
707
+ const totalTokens = combination.reduce((sum, item) => sum + item.tokens, 0);
708
+ return {
709
+ batchIndex: 0, // 稍后重新编号
710
+ items: combination,
711
+ totalTokens,
712
+ fileCount: combination.length,
713
+ utilization: (totalTokens / maxTokens) * 100,
714
+ isSegmented: false
715
+ };
716
+ });
717
+
718
+ return batches;
719
+ }
720
+
721
+ /**
722
+ * 找到所有最优组合 - 真正的最优算法
723
+ */
724
+ findAllOptimalCombinations(files, maxFiles, maxTokens) {
725
+ if (files.length === 0) return [];
726
+
727
+ // 第一步:分离超大文件(单个文件就超过限制的)
728
+ const largeFiles = files.filter(f => f.tokens > maxTokens);
729
+ const normalFiles = files.filter(f => f.tokens <= maxTokens);
730
+
731
+ // 第二步:对普通文件进行最优组合
732
+ const combinations = this.findOptimalNormalFileCombinations(normalFiles, maxFiles, maxTokens);
733
+
734
+ // 第三步:处理超大文件(需要分段)
735
+ largeFiles.forEach(file => {
736
+ combinations.push([file]); // 超大文件单独成批,后续会被分段处理
737
+ });
738
+
739
+ return combinations;
740
+ }
741
+
742
+ /**
743
+ * 对普通文件进行最优组合
744
+ */
745
+ findOptimalNormalFileCombinations(files, maxFiles, maxTokens) {
746
+ if (files.length === 0) return [];
747
+
748
+ // 使用真正的最优算法:尝试所有可能的组合,找到最接近maxTokens的
749
+ const allCombinations = this.generateAllValidCombinations(files, maxFiles, maxTokens);
750
+
751
+ // 选择最优的组合集合
752
+ return this.selectOptimalCombinationSet(allCombinations, files, maxFiles, maxTokens);
753
+ }
754
+
755
+ /**
756
+ * 生成所有有效的组合
757
+ */
758
+ generateAllValidCombinations(files, maxFiles, maxTokens) {
759
+ const validCombinations = [];
760
+
761
+ // 生成所有可能的组合(从1个文件到maxFiles个文件)
762
+ for (let size = 1; size <= Math.min(maxFiles, files.length); size++) {
763
+ const combinations = this.getCombinations(files, size);
764
+
765
+ for (const combo of combinations) {
766
+ const totalTokens = combo.reduce((sum, file) => sum + file.tokens, 0);
767
+ if (totalTokens <= maxTokens) {
768
+ validCombinations.push({
769
+ files: combo,
770
+ totalTokens,
771
+ utilization: (totalTokens / maxTokens) * 100,
772
+ efficiency: totalTokens // 效率 = 总token数
773
+ });
774
+ }
775
+ }
776
+ }
777
+
778
+ // 按效率降序排列
779
+ return validCombinations.sort((a, b) => b.efficiency - a.efficiency);
780
+ }
781
+
782
+ /**
783
+ * 获取指定大小的所有组合
784
+ */
785
+ getCombinations(arr, size) {
786
+ if (size === 1) return arr.map(item => [item]);
787
+ if (size > arr.length) return [];
788
+
789
+ const result = [];
790
+
791
+ for (let i = 0; i <= arr.length - size; i++) {
792
+ const head = arr[i];
793
+ const tailCombinations = this.getCombinations(arr.slice(i + 1), size - 1);
794
+
795
+ for (const tail of tailCombinations) {
796
+ result.push([head, ...tail]);
797
+ }
798
+ }
799
+
800
+ return result;
801
+ }
802
+
803
+ /**
804
+ * 选择最优的组合集合 - 使用高效的贪心算法
805
+ */
806
+ selectOptimalCombinationSet(allCombinations, files, maxFiles, maxTokens) {
807
+ // 对于大量文件,使用高效的贪心算法而不是指数级的动态规划
808
+ if (files.length > 20) {
809
+ return this.findOptimalCombinationSetGreedy(allCombinations, files, maxFiles, maxTokens);
810
+ }
811
+ // 对于小量文件,仍可使用动态规划获得最优解
812
+ return this.findOptimalCombinationSetDP(allCombinations, files, maxFiles, maxTokens);
813
+ }
814
+
815
+ /**
816
+ * 使用高效的贪心算法找到近似最优的组合集合
817
+ * 时间复杂度: O(n * m * log m),其中n是文件数,m是组合数
818
+ */
819
+ findOptimalCombinationSetGreedy(allCombinations, files, maxFiles, maxTokens) {
820
+ // 计算每个组合的效率分数
821
+ const scoredCombinations = allCombinations.map(combo => {
822
+ const utilization = combo.utilization / 100;
823
+ const fileCount = combo.files.length;
824
+ // 效率分数 = 利用率 * 文件数量 / 批次数量
825
+ const efficiency = utilization * fileCount;
826
+
827
+ return {
828
+ ...combo,
829
+ efficiency,
830
+ fileSet: new Set(combo.files.map(f => f.path || f.filePath))
831
+ };
832
+ });
833
+
834
+ // 按效率分数降序排序
835
+ scoredCombinations.sort((a, b) => b.efficiency - a.efficiency);
836
+
837
+ const selectedCombinations = [];
838
+ const usedFiles = new Set();
839
+
840
+ // 贪心选择:优先选择效率最高且不冲突的组合
841
+ for (const combo of scoredCombinations) {
842
+ // 检查是否与已选择的文件冲突
843
+ const hasConflict = Array.from(combo.fileSet).some(filePath => usedFiles.has(filePath));
844
+
845
+ if (!hasConflict) {
846
+ selectedCombinations.push(combo.files);
847
+ combo.fileSet.forEach(filePath => usedFiles.add(filePath));
848
+ }
849
+ }
850
+
851
+ // 处理剩余文件
852
+ const remainingFiles = files.filter(file =>
853
+ !usedFiles.has(file.path || file.filePath)
854
+ );
855
+
856
+ if (remainingFiles.length > 0) {
857
+ const finalCombinations = this.combineRemainingFiles(remainingFiles, maxFiles, maxTokens);
858
+ selectedCombinations.push(...finalCombinations);
859
+ }
860
+
861
+ return selectedCombinations;
862
+ }
863
+
864
+ /**
865
+ * 使用动态规划找到最优的组合集合(仅用于小规模问题)
866
+ * 时间复杂度: O(2^n * m),仅在文件数量较少时使用
867
+ */
868
+ findOptimalCombinationSetDP(allCombinations, files, maxFiles, maxTokens) {
869
+ const fileCount = files.length;
870
+
871
+ // 文件数量过多时,回退到贪心算法
872
+ if (fileCount > 20) {
873
+ return this.findOptimalCombinationSetGreedy(allCombinations, files, maxFiles, maxTokens);
874
+ }
875
+
876
+ // 为每个文件创建唯一标识
877
+ const fileIds = new Map();
878
+ files.forEach((file, index) => {
879
+ fileIds.set(file.path || file.filePath, index);
880
+ });
881
+
882
+ // 将组合转换为位掩码表示,并计算价值函数
883
+ const combosWithMask = allCombinations.map(combo => {
884
+ let mask = 0;
885
+ combo.files.forEach(file => {
886
+ const fileId = fileIds.get(file.path || file.filePath);
887
+ if (fileId !== undefined) {
888
+ mask |= (1 << fileId);
889
+ }
890
+ });
891
+
892
+ const utilization = combo.utilization / 100;
893
+ const fileCount = combo.files.length;
894
+ const value = Math.floor(utilization * utilization * fileCount * 1000);
895
+
896
+ return {
897
+ ...combo,
898
+ mask,
899
+ value
900
+ };
901
+ });
902
+
903
+ // 动态规划状态数组
904
+ const stateCount = 1 << fileCount;
905
+ const dp = new Array(stateCount).fill(null).map(() => ({value: 0, count: 0}));
906
+ const parent = new Array(stateCount).fill(-1);
907
+
908
+ // 遍历所有组合
909
+ for (let i = 0; i < combosWithMask.length; i++) {
910
+ const combo = combosWithMask[i];
911
+
912
+ // 从高位到低位遍历状态,避免重复计算
913
+ for (let mask = stateCount - 1; mask >= 0; mask--) {
914
+ if ((mask & combo.mask) === 0) {
915
+ const newMask = mask | combo.mask;
916
+ const newValue = dp[mask].value + combo.value;
917
+ const newCount = dp[mask].count + 1;
918
+
919
+ if (newValue > dp[newMask].value ||
920
+ (newValue === dp[newMask].value && newCount < dp[newMask].count)) {
921
+ dp[newMask] = {value: newValue, count: newCount};
922
+ parent[newMask] = i;
923
+ }
924
+ }
925
+ }
926
+ }
927
+
928
+ // 找到最优解
929
+ const fullMask = stateCount - 1;
930
+ let bestMask = fullMask;
931
+ let bestScore = dp[fullMask];
932
+
933
+ if (bestScore.value === 0) {
934
+ for (let mask = 0; mask < stateCount; mask++) {
935
+ const popCount = this.popCount(mask);
936
+ const score = dp[mask];
937
+ if (score.value > 0 && popCount > this.popCount(bestMask)) {
938
+ bestMask = mask;
939
+ bestScore = score;
940
+ }
941
+ }
942
+ }
943
+
944
+ // 回溯构建解
945
+ const selectedCombinations = [];
946
+ let currentMask = bestMask;
947
+
948
+ while (currentMask > 0 && parent[currentMask] !== -1) {
949
+ const comboIndex = parent[currentMask];
950
+ const combo = combosWithMask[comboIndex];
951
+ selectedCombinations.push(combo.files);
952
+ currentMask ^= combo.mask;
953
+ }
954
+
955
+ // 处理剩余文件
956
+ const usedFiles = new Set();
957
+ selectedCombinations.forEach(combo => {
958
+ combo.forEach(file => {
959
+ usedFiles.add(file.path || file.filePath);
960
+ });
961
+ });
962
+
963
+ const remainingFiles = files.filter(file =>
964
+ !usedFiles.has(file.path || file.filePath)
965
+ );
966
+
967
+ if (remainingFiles.length > 0) {
968
+ const finalCombinations = this.combineRemainingFiles(remainingFiles, maxFiles, maxTokens);
969
+ selectedCombinations.push(...finalCombinations);
970
+ }
971
+
972
+ return selectedCombinations;
973
+ }
974
+
975
+ /**
976
+ * 计算位掩码中1的个数
977
+ */
978
+ popCount(mask) {
979
+ let count = 0;
980
+ while (mask) {
981
+ count += mask & 1;
982
+ mask >>= 1;
983
+ }
984
+ return count;
985
+ }
986
+
987
+ /**
988
+ * 组合剩余文件
989
+ */
990
+ combineRemainingFiles(files, maxFiles, maxTokens) {
991
+ const combinations = [];
992
+ let remaining = [...files];
993
+
994
+ while (remaining.length > 0) {
995
+ // 尝试找到最佳组合
996
+ let bestCombo = [remaining[0]];
997
+ let bestSum = remaining[0].tokens;
998
+
999
+ // 尝试添加更多文件到当前组合
1000
+ for (let i = 1; i < remaining.length && bestCombo.length < maxFiles; i++) {
1001
+ if (bestSum + remaining[i].tokens <= maxTokens) {
1002
+ bestCombo.push(remaining[i]);
1003
+ bestSum += remaining[i].tokens;
1004
+ }
1005
+ }
1006
+
1007
+ combinations.push(bestCombo);
1008
+
1009
+ // 移除已使用的文件
1010
+ remaining = remaining.filter(file =>
1011
+ !bestCombo.some(used => (used.path || used.filePath) === (file.path || file.filePath))
1012
+ );
1013
+ }
1014
+
1015
+ return combinations;
1016
+ }
1017
+
1018
+ /**
1019
+ * 使用贪心算法找到最佳组合
1020
+ */
1021
+ findBestGreedyCombination(files, maxFiles, maxTokens) {
1022
+ if (files.length === 0) return [];
1023
+
1024
+ let bestCombination = [];
1025
+ let bestSum = 0;
1026
+
1027
+ // 尝试以每个文件为起点的组合
1028
+ for (let i = 0; i < files.length; i++) {
1029
+ const currentCombination = [files[i]];
1030
+ let currentSum = files[i].tokens;
1031
+
1032
+ if (currentSum > maxTokens) continue; // 跳过超大文件
1033
+
1034
+ // 贪心添加其他文件
1035
+ for (let j = 0; j < files.length; j++) {
1036
+ if (i === j || currentCombination.length >= maxFiles) continue;
1037
+
1038
+ if (currentSum + files[j].tokens <= maxTokens) {
1039
+ currentCombination.push(files[j]);
1040
+ currentSum += files[j].tokens;
1041
+ }
1042
+ }
1043
+
1044
+ // 如果这个组合更好,更新最优解
1045
+ if (currentSum > bestSum) {
1046
+ bestCombination = [...currentCombination];
1047
+ bestSum = currentSum;
1048
+ }
1049
+ }
1050
+
1051
+ return bestCombination;
1052
+ }
1053
+
1054
+ /**
1055
+ * 组合剩余的小文件
1056
+ */
1057
+ combineSmallFiles(files, maxFiles, maxTokens) {
1058
+ const combinations = [];
1059
+ const used = new Array(files.length).fill(false);
1060
+
1061
+ for (let i = 0; i < files.length; i++) {
1062
+ if (used[i]) continue;
1063
+
1064
+ const currentCombination = [files[i]];
1065
+ let currentSum = files[i].tokens;
1066
+ used[i] = true;
1067
+
1068
+ // 尝试添加更多文件到当前组合
1069
+ for (let j = i + 1; j < files.length; j++) {
1070
+ if (used[j]) continue;
1071
+
1072
+ if (currentSum + files[j].tokens <= maxTokens && currentCombination.length < maxFiles) {
1073
+ currentCombination.push(files[j]);
1074
+ currentSum += files[j].tokens;
1075
+ used[j] = true;
1076
+ }
1077
+ }
1078
+
1079
+ combinations.push(currentCombination);
1080
+ }
1081
+
1082
+ return combinations;
1083
+ }
1084
+
1085
  /**
   * Find the single combination of unused files whose token sum is closest
   * to (without exceeding) maxTokens, holding at most maxFiles entries.
   * Uses exhaustive include/exclude backtracking over the available files,
   * so cost grows exponentially with the number of unused files.
   * @param {Array} files - Candidate files, each with a `tokens` count.
   * @param {Array<boolean>} used - Parallel flags; true entries are skipped.
   * @param {number} maxFiles - Maximum files allowed in the combination.
   * @param {number} maxTokens - Token budget the combination must not exceed.
   * @returns {Array} Best combination found (single-file fallback if none).
   */
  findSingleOptimalCombination(files, used, maxFiles, maxTokens) {
    const availableFiles = files.filter((file, index) => !used[index]);

    if (availableFiles.length === 0) return [];

    let bestCombination = [];
    let bestSum = 0;

    // Backtracking search for the combination closest to maxTokens.
    const backtrack = (index, currentCombination, currentSum) => {
      // Record the current combination if it beats the best found so far.
      if (currentSum > bestSum && currentSum <= maxTokens && currentCombination.length <= maxFiles) {
        bestCombination = [...currentCombination];
        bestSum = currentSum;
      }

      // Stop when the file limit is reached or all files were considered.
      if (currentCombination.length >= maxFiles || index >= availableFiles.length) {
        return;
      }

      // Branch 1: include the current file (only if it fits the budget).
      const file = availableFiles[index];
      if (currentSum + file.tokens <= maxTokens) {
        currentCombination.push(file);
        backtrack(index + 1, currentCombination, currentSum + file.tokens);
        currentCombination.pop();
      }

      // Branch 2: skip the current file and move on.
      backtrack(index + 1, currentCombination, currentSum);
    };

    backtrack(0, [], 0);

    // Guarantee progress: fall back to a single file so callers never loop forever.
    if (bestCombination.length === 0 && availableFiles.length > 0) {
      bestCombination = [availableFiles[0]];
    }

    return bestCombination;
  }
1130
+
1131
+ /**
1132
+ * First Fit Decreasing算法 - 按大小降序排列后依次放入第一个能容纳的批次
1133
+ */
1134
+ firstFitDecreasingPacking(items, maxFiles) {
1135
+ // 按token数降序排列
1136
+ const sortedItems = [...items].sort((a, b) => b.tokens - a.tokens);
1137
+ const batches = [];
1138
+
1139
+ for (const item of sortedItems) {
1140
+ let placed = false;
1141
+
1142
+ // 尝试放入现有批次
1143
+ for (const batch of batches) {
1144
+ if (this.canAddToBatch(batch, item, maxFiles)) {
1145
+ batch.items.push(item);
1146
+ batch.totalTokens += item.tokens;
1147
+ batch.fileCount = new Set(batch.items.map(i => i.originalFilePath || i.filePath)).size;
1148
+ batch.utilization = (batch.totalTokens / this.config.maxRequestTokens) * 100;
1149
+ placed = true;
1150
+ break;
1151
+ }
1152
+ }
1153
+
1154
+ // 如果无法放入现有批次,创建新批次,使用对象池
1155
+ if (!placed) {
1156
+ const batch = this.getBatchFromPool();
1157
+ batch.batchIndex = batches.length;
1158
+ batch.items = [item];
1159
+ batch.totalTokens = item.tokens;
1160
+ batch.fileCount = 1;
1161
+ batch.utilization = (item.tokens / this.config.maxRequestTokens) * 100;
1162
+ batches.push(batch);
1163
+ }
1164
+ }
1165
+
1166
+ return batches;
1167
+ }
1168
+
1169
  /**
   * Global optimization pass: repeatedly move the smallest item out of a
   * fuller batch into the least-utilized batch to raise overall utilization.
   * Mutates the batch objects in place and drops batches left empty.
   * @param {Array} batches - Batches with items/totalTokens/utilization.
   * @param {number} maxFiles - Maximum distinct files per batch.
   * @returns {Array} Batches with empty ones filtered out.
   */
  globalOptimization(batches, maxFiles) {
    const maxTokens = this.config.maxRequestTokens;
    let improved = true;
    let iterations = 0;
    const maxIterations = 10; // Guard against endless move/counter-move cycles.

    // Iterate until no move helps or the iteration cap is hit.
    while (improved && iterations < maxIterations) {
      improved = false;
      iterations++;

      // Pick the batch with the lowest utilization as the move target.
      const sortedBatches = [...batches].sort((a, b) => a.utilization - b.utilization);
      const lowUtilBatch = sortedBatches[0];

      if (!lowUtilBatch || lowUtilBatch.utilization >= 80) break; // Target already >= 80% utilized; stop optimizing.

      // Try moving the smallest item from some fuller batch into the target.
      for (let i = 1; i < sortedBatches.length; i++) {
        const sourceBatch = sortedBatches[i];

        if (sourceBatch.items.length === 0) continue; // Skip empty batches.

        // Smallest item of the source batch (fewest tokens).
        const smallestItem = sourceBatch.items.reduce((min, item) =>
          item.tokens < min.tokens ? item : min
        );

        // Move only if the target batch can take it without breaking limits.
        if (this.canAddToBatch(lowUtilBatch, smallestItem, maxFiles)) {
          // Perform the move and refresh both batches' bookkeeping.
          const itemIndex = sourceBatch.items.indexOf(smallestItem);
          if (itemIndex !== -1) {
            sourceBatch.items.splice(itemIndex, 1);
            sourceBatch.totalTokens -= smallestItem.tokens;
            sourceBatch.fileCount = new Set(sourceBatch.items.map(i => i.originalFilePath || i.filePath)).size;
            sourceBatch.utilization = sourceBatch.totalTokens > 0 ? (sourceBatch.totalTokens / maxTokens) * 100 : 0;

            lowUtilBatch.items.push(smallestItem);
            lowUtilBatch.totalTokens += smallestItem.tokens;
            lowUtilBatch.fileCount = new Set(lowUtilBatch.items.map(i => i.originalFilePath || i.filePath)).size;
            lowUtilBatch.utilization = (lowUtilBatch.totalTokens / maxTokens) * 100;

            improved = true;
            break;
          }
        }
      }
    }

    // Drop any batch that ended up with no items.
    return batches.filter(batch => batch.items.length > 0);
  }
1225
+
1226
  /**
   * Backfill pass: merge pairs of low-utilization batches whenever their
   * combined tokens and distinct-file count still fit within the limits.
   * Mutates `batches` in place (absorbed batches are spliced out) and
   * renumbers `batchIndex` at the end.
   * @param {Array} batches - Batch objects.
   * @param {number} maxFiles - Maximum distinct files per batch.
   * @returns {Array} The same array, compacted and renumbered.
   */
  backfillOptimization(batches, maxFiles) {
    const maxTokens = this.config.maxRequestTokens;
    let optimized = true;
    let iterations = 0;
    const maxIterations = 5; // Guard against endless merge attempts.

    while (optimized && iterations < maxIterations) {
      optimized = false;
      iterations++;

      // Consider the least-utilized batches first.
      const sortedBatches = [...batches].sort((a, b) => a.utilization - b.utilization);

      for (let i = 0; i < sortedBatches.length - 1; i++) {
        const batch1 = sortedBatches[i];

        if (!batch1 || batch1.items.length === 0) continue;

        for (let j = i + 1; j < sortedBatches.length; j++) {
          const batch2 = sortedBatches[j];

          if (!batch2 || batch2.items.length === 0) continue;

          // A merge is legal when both limits hold for the union; distinct
          // files are counted via a Set over both batches' paths.
          const combinedTokens = batch1.totalTokens + batch2.totalTokens;
          const combinedFileCount = new Set([
            ...batch1.items.map(i => i.originalFilePath || i.filePath),
            ...batch2.items.map(i => i.originalFilePath || i.filePath)
          ]).size;

          if (combinedTokens <= maxTokens && combinedFileCount <= maxFiles) {
            // Fold batch2 into batch1.
            batch1.items.push(...batch2.items);
            batch1.totalTokens = combinedTokens;
            batch1.fileCount = combinedFileCount;
            batch1.utilization = (combinedTokens / maxTokens) * 100;

            // Remove the absorbed batch from the original array.
            const batch2Index = batches.indexOf(batch2);
            if (batch2Index !== -1) {
              batches.splice(batch2Index, 1);
            }

            optimized = true;
            break;
          }
        }

        // Restart the outer scan after any successful merge.
        if (optimized) break;
      }
    }

    // Renumber batches after compaction.
    batches.forEach((batch, index) => {
      batch.batchIndex = index;
    });

    return batches;
  }
1288
+
1289
+ /**
1290
+ * 后处理分段文件,确保同一文件的分段能够正确组织
1291
+ * @param {Array} batches - 原始批次数组
1292
+ * @returns {Array} 处理后的批次数组
1293
+ */
1294
+ postProcessSegmentedFiles(batches) {
1295
+ const processedBatches = [];
1296
+ const segmentGroups = new Map(); // 存储分段组信息
1297
+
1298
+ // 第一步:识别所有分段组
1299
+ for (const batch of batches) {
1300
+ for (const item of batch.items) {
1301
+ if (item.isChunk && item.segmentGroup) {
1302
+ if (!segmentGroups.has(item.segmentGroup)) {
1303
+ segmentGroups.set(item.segmentGroup, {
1304
+ segments: [],
1305
+ totalChunks: item.totalChunks,
1306
+ batchIndices: new Set()
1307
+ });
1308
+ }
1309
+ segmentGroups.get(item.segmentGroup).segments.push({
1310
+ item,
1311
+ batchIndex: batch.batchIndex
1312
+ });
1313
+ segmentGroups.get(item.segmentGroup).batchIndices.add(batch.batchIndex);
1314
+ }
1315
+ }
1316
+ }
1317
+
1318
+ // 第二步:为每个批次添加分段等待信息
1319
+ for (const batch of batches) {
1320
+ const processedBatch = {
1321
+ ...batch,
1322
+ hasSegmentedFiles: false,
1323
+ segmentInfo: {},
1324
+ needsWaiting: false
1325
+ };
1326
+
1327
+ // 检查批次中是否有分段文件
1328
+ for (const item of batch.items) {
1329
+ if (item.isChunk && item.segmentGroup) {
1330
+ processedBatch.hasSegmentedFiles = true;
1331
+ processedBatch.needsWaiting = true;
1332
+
1333
+ const groupInfo = segmentGroups.get(item.segmentGroup);
1334
+ if (!processedBatch.segmentInfo[item.segmentGroup]) {
1335
+ processedBatch.segmentInfo[item.segmentGroup] = {
1336
+ currentChunk: item.chunkIndex,
1337
+ totalChunks: item.totalChunks,
1338
+ allBatchIndices: Array.from(groupInfo.batchIndices).sort(),
1339
+ isComplete: groupInfo.segments.length === item.totalChunks
1340
+ };
1341
+ }
1342
+ }
1343
+ }
1344
+
1345
+ processedBatches.push(processedBatch);
1346
+ }
1347
+
1348
+ return processedBatches;
1349
+ }
1350
+
1351
+ /**
1352
+ * 找到最接近maxRequestTokens的最优文件组合
1353
+ * @param {Array} items - 可选择的文件项目
1354
+ * @param {number} maxFiles - 最大文件数限制
1355
+ * @returns {Object} 最优组合 {items: [], totalTokens: number}
1356
+ */
1357
+ findOptimalCombination(items, maxFiles) {
1358
+ const maxTokens = this.config.maxRequestTokens;
1359
+ let bestCombination = { items: [], totalTokens: 0 };
1360
+
1361
+ // 使用贪心算法 + 回溯优化
1362
+ // 首先按token密度排序(token/文件数比例)
1363
+ const sortedItems = [...items].sort((a, b) => {
1364
+ return b.tokens - a.tokens; // 降序
1365
+ });
1366
+
1367
+ // 尝试不同的组合策略
1368
+ this.tryGreedyCombination(sortedItems, maxFiles, maxTokens, bestCombination);
1369
+ this.tryBalancedCombination(sortedItems, maxFiles, maxTokens, bestCombination);
1370
+
1371
+ return bestCombination;
1372
+ }
1373
+
1374
+ /**
1375
+ * 贪心策略:优先选择大文件,然后填充小文件
1376
+ */
1377
+ tryGreedyCombination(items, maxFiles, maxTokens, bestCombination) {
1378
+ const combination = { items: [], totalTokens: 0 };
1379
+ const uniqueFiles = new Set();
1380
+
1381
+ for (const item of items) {
1382
+ const itemFilePath = item.originalFilePath || item.filePath;
1383
+ const wouldExceedTokens = combination.totalTokens + item.tokens > maxTokens;
1384
+ const wouldExceedFiles = !uniqueFiles.has(itemFilePath) && uniqueFiles.size >= maxFiles;
1385
+
1386
+ if (!wouldExceedTokens && !wouldExceedFiles) {
1387
+ combination.items.push(item);
1388
+ combination.totalTokens += item.tokens;
1389
+ uniqueFiles.add(itemFilePath);
1390
+ }
1391
+ }
1392
+
1393
+ // 如果这个组合更好,更新最佳组合
1394
+ if (combination.totalTokens > bestCombination.totalTokens) {
1395
+ bestCombination.items = [...combination.items];
1396
+ bestCombination.totalTokens = combination.totalTokens;
1397
+ }
1398
+ }
1399
+
1400
+ /**
1401
+ * 平衡策略:尝试找到更均衡的文件大小组合
1402
+ */
1403
+ tryBalancedCombination(items, maxFiles, maxTokens, bestCombination) {
1404
+ // 按token数升序排列,尝试组合多个小文件
1405
+ const ascendingItems = [...items].sort((a, b) => a.tokens - b.tokens);
1406
+ const combination = { items: [], totalTokens: 0 };
1407
+ const uniqueFiles = new Set();
1408
+
1409
+ for (const item of ascendingItems) {
1410
+ const itemFilePath = item.originalFilePath || item.filePath;
1411
+ const wouldExceedTokens = combination.totalTokens + item.tokens > maxTokens;
1412
+ const wouldExceedFiles = !uniqueFiles.has(itemFilePath) && uniqueFiles.size >= maxFiles;
1413
+
1414
+ if (!wouldExceedTokens && !wouldExceedFiles) {
1415
+ combination.items.push(item);
1416
+ combination.totalTokens += item.tokens;
1417
+ uniqueFiles.add(itemFilePath);
1418
+ }
1419
+ }
1420
+
1421
+ // 如果这个组合的利用率更高,更新最佳组合
1422
+ const currentUtilization = bestCombination.totalTokens / maxTokens;
1423
+ const newUtilization = combination.totalTokens / maxTokens;
1424
+
1425
+ if (newUtilization > currentUtilization ||
1426
+ (Math.abs(newUtilization - currentUtilization) < 0.1 && combination.items.length > bestCombination.items.length)) {
1427
+ bestCombination.items = [...combination.items];
1428
+ bestCombination.totalTokens = combination.totalTokens;
1429
+ }
1430
+ }
1431
+
1432
+ /**
1433
+ * 检查是否可以将项目添加到批次中
1434
+ * @param {Object} batch - 批次对象
1435
+ * @param {Object} item - 文件项目
1436
+ * @param {number} maxFiles - 每批次最大文件数
1437
+ * @returns {boolean} 是否可以添加
1438
+ */
1439
+ canAddToBatch(batch, item, maxFiles) {
1440
+ // 检查Token限制
1441
+ if (batch.totalTokens + item.tokens > this.config.maxRequestTokens) {
1442
+ return false;
1443
+ }
1444
+
1445
+ // 检查文件数限制
1446
+ const uniqueFiles = new Set(batch.items.map(i => i.originalFilePath || i.filePath));
1447
+ const itemFilePath = item.originalFilePath || item.filePath;
1448
+ if (!uniqueFiles.has(itemFilePath) && uniqueFiles.size >= maxFiles) {
1449
+ return false;
1450
+ }
1451
+
1452
+ return true;
1453
+ }
1454
+
1455
+ /**
1456
+ * 生成批次统计信息
1457
+ * @param {Array} batches - 批次数组
1458
+ * @param {Map} fileStats - 文件统计信息
1459
+ * @returns {Object} 统计信息
1460
+ */
1461
+ generateBatchStats(batches, fileStats) {
1462
+ const totalFiles = fileStats.size;
1463
+ const totalBatches = batches.length;
1464
+ const totalTokens = Array.from(fileStats.values()).reduce((sum, stat) => sum + stat.originalTokens, 0);
1465
+ const chunkedFiles = Array.from(fileStats.values()).filter(stat => stat.needsChunking).length;
1466
+
1467
+ const avgUtilization = batches.length > 0
1468
+ ? batches.reduce((sum, batch) => sum + batch.utilization, 0) / batches.length
1469
+ : 0;
1470
+
1471
+ return {
1472
+ totalFiles,
1473
+ totalBatches,
1474
+ totalTokens,
1475
+ chunkedFiles,
1476
+ avgUtilization: Math.round(avgUtilization * 100) / 100,
1477
+ maxBatchTokens: this.config.maxBatchTokens,
1478
+ maxFileTokens: this.config.maxFileTokens
1479
+ };
1480
+ }
1481
+
1482
+ /**
1483
+ * 格式化批次信息用于AI分析
1484
+ * @param {Object} batch - 批次对象
1485
+ * @returns {Object} 格式化的批次信息
1486
+ */
1487
+ formatBatchForAI(batch) {
1488
+ const fileGroups = new Map();
1489
+
1490
+ // 按文件分组
1491
+ for (const item of batch.items) {
1492
+ const filePath = item.originalFilePath || item.filePath;
1493
+ if (!fileGroups.has(filePath)) {
1494
+ fileGroups.set(filePath, []);
1495
+ }
1496
+ fileGroups.get(filePath).push(item);
1497
+ }
1498
+
1499
+ const formattedFiles = [];
1500
+
1501
+ for (const [filePath, items] of fileGroups) {
1502
+ if (items.length === 1 && !items[0].isChunk) {
1503
+ // 单个完整文件
1504
+ formattedFiles.push({
1505
+ filePath,
1506
+ content: items[0].content,
1507
+ isChunked: false,
1508
+ staticIssues: items[0].staticIssues || []
1509
+ });
1510
+ } else {
1511
+ // 分段文件
1512
+ const sortedItems = items.sort((a, b) => a.chunkIndex - b.chunkIndex);
1513
+ formattedFiles.push({
1514
+ filePath,
1515
+ content: sortedItems.map(item => item.content).join('\n'),
1516
+ isChunked: true,
1517
+ totalChunks: sortedItems[0].totalChunks,
1518
+ chunks: sortedItems.map(item => ({
1519
+ index: item.chunkIndex,
1520
+ content: item.content,
1521
+ startLine: item.startLine,
1522
+ endLine: item.endLine,
1523
+ tokens: item.tokens
1524
+ })),
1525
+ staticIssues: sortedItems[0].staticIssues || []
1526
+ });
1527
+ }
1528
+ }
1529
+
1530
+ return {
1531
+ batchIndex: batch.batchIndex,
1532
+ totalTokens: batch.totalTokens,
1533
+ utilization: batch.utilization,
1534
+ files: formattedFiles
1535
+ };
1536
+ }
1537
+
1538
+
1539
+
1540
+ /**
1541
+ * 全局优化组合 - 重新分配以提高整体利用率
1542
+ */
1543
+ globalOptimizeCombinations(combinations, maxFiles, maxTokens) {
1544
+ if (combinations.length <= 1) return combinations;
1545
+
1546
+ // 转换为批次格式
1547
+ const batches = combinations.map((combo, index) => ({
1548
+ batchIndex: index,
1549
+ items: combo,
1550
+ totalTokens: combo.reduce((sum, file) => sum + file.tokens, 0),
1551
+ fileCount: combo.length,
1552
+ utilization: (combo.reduce((sum, file) => sum + file.tokens, 0) / maxTokens) * 100
1553
+ }));
1554
+
1555
+ // 尝试合并低利用率的批次
1556
+ const optimizedBatches = this.mergeLowUtilizationBatches(batches, maxFiles, maxTokens);
1557
+
1558
+ // 转换回组合格式
1559
+ return optimizedBatches.map(batch => batch.items);
1560
+ }
1561
+
1562
+ /**
1563
+ * 合并低利用率批次
1564
+ */
1565
+ mergeLowUtilizationBatches(batches, maxFiles, maxTokens) {
1566
+ const result = [];
1567
+ const used = new Array(batches.length).fill(false);
1568
+
1569
+ // 按利用率排序
1570
+ const sortedBatches = batches
1571
+ .map((batch, index) => ({ ...batch, originalIndex: index }))
1572
+ .sort((a, b) => b.utilization - a.utilization);
1573
+
1574
+ for (const batch of sortedBatches) {
1575
+ if (used[batch.originalIndex]) continue;
1576
+
1577
+ let currentBatch = { ...batch, items: [...batch.items] };
1578
+ used[batch.originalIndex] = true;
1579
+
1580
+ // 尝试合并其他低利用率批次
1581
+ for (const otherBatch of sortedBatches) {
1582
+ if (used[otherBatch.originalIndex]) continue;
1583
+
1584
+ const combinedTokens = currentBatch.totalTokens + otherBatch.totalTokens;
1585
+ const combinedFileCount = currentBatch.fileCount + otherBatch.fileCount;
1586
+
1587
+ if (combinedTokens <= maxTokens && combinedFileCount <= maxFiles) {
1588
+ currentBatch.items.push(...otherBatch.items);
1589
+ currentBatch.totalTokens = combinedTokens;
1590
+ currentBatch.fileCount = combinedFileCount;
1591
+ currentBatch.utilization = (combinedTokens / maxTokens) * 100;
1592
+ used[otherBatch.originalIndex] = true;
1593
+ }
1594
+ }
1595
+
1596
+ result.push(currentBatch);
1597
+ }
1598
+
1599
+ return result;
1600
+ }
1601
+
1602
+ /**
1603
+ * 清理资源并回收对象到池中
1604
+ * @param {Array} batches - 要清理的批次数组
1605
+ */
1606
+ cleanupBatches(batches) {
1607
+ if (!batches || !Array.isArray(batches)) return;
1608
+
1609
+ for (const batch of batches) {
1610
+ if (batch.items && Array.isArray(batch.items)) {
1611
+ for (const item of batch.items) {
1612
+ this.recycleItemToPool(item);
1613
+ }
1614
+ }
1615
+ this.recycleBatchToPool(batch);
1616
+ }
1617
+ }
1618
+
1619
+ /**
1620
+ * 清理文件统计对象
1621
+ * @param {Map} fileStats - 文件统计Map
1622
+ */
1623
+ cleanupFileStats(fileStats) {
1624
+ if (!fileStats || !(fileStats instanceof Map)) return;
1625
+
1626
+ for (const stats of fileStats.values()) {
1627
+ this.recycleStatsToPool(stats);
1628
+ }
1629
+ }
1630
+
1631
+ /**
1632
+ * 清理所有缓存和对象池
1633
+ */
1634
+ cleanup() {
1635
+ // 清理token缓存
1636
+ this.tokenCache.clear();
1637
+ this.cacheStats = { hits: 0, misses: 0 };
1638
+
1639
+ // 清理对象池
1640
+ this.batchObjectPool.length = 0;
1641
+ this.itemObjectPool.length = 0;
1642
+ this.chunkObjectPool.length = 0;
1643
+ this.statsObjectPool.length = 0;
1644
+ }
1645
+
1646
+ /**
1647
+ * 获取对象池统计信息
1648
+ * @returns {Object} 对象池统计
1649
+ */
1650
+ getPoolStats() {
1651
+ return {
1652
+ batchPool: {
1653
+ size: this.batchObjectPool.length,
1654
+ type: 'batch'
1655
+ },
1656
+ itemPool: {
1657
+ size: this.itemObjectPool.length,
1658
+ type: 'item'
1659
+ },
1660
+ chunkPool: {
1661
+ size: this.chunkObjectPool.length,
1662
+ type: 'chunk'
1663
+ },
1664
+ statsPool: {
1665
+ size: this.statsObjectPool.length,
1666
+ type: 'stats'
1667
+ },
1668
+ tokenCache: this.getCacheStats()
1669
+ };
1670
+ }
1671
+ }