git-coco 0.24.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -38,6 +38,21 @@ type BaseLLMService = {
38
38
  * @default 6
39
39
  */
40
40
  maxConcurrent?: number;
41
+ /**
42
+ * Minimum token count for a directory/file group to be eligible for summarization.
43
+ * Groups below this threshold preserve raw diffs to maintain detail.
44
+ *
45
+ * @default 400
46
+ */
47
+ minTokensForSummary?: number;
48
+ /**
49
+ * Maximum tokens allowed for a single file diff before it gets pre-summarized.
50
+ * Prevents large files from biasing the overall summary.
51
+ * If not set, defaults to 25% of tokenLimit.
52
+ *
53
+ * @default undefined (uses 0.25 * tokenLimit)
54
+ */
55
+ maxFileTokens?: number;
41
56
  authentication: Authentication;
42
57
  requestOptions?: {
43
58
  timeout?: number;
@@ -339,6 +354,21 @@ interface BaseParserOptions {
339
354
  git: SimpleGit;
340
355
  logger: Logger;
341
356
  maxTokens?: number;
357
+ /**
358
+ * Minimum token count for a directory/file group to be eligible for summarization.
359
+ * @default 400
360
+ */
361
+ minTokensForSummary?: number;
362
+ /**
363
+ * Maximum tokens allowed for a single file diff before it gets pre-summarized.
364
+ * Defaults to 25% of maxTokens if not specified.
365
+ */
366
+ maxFileTokens?: number;
367
+ /**
368
+ * Maximum number of concurrent summarization requests.
369
+ * @default 6
370
+ */
371
+ maxConcurrent?: number;
342
372
  }
343
373
  interface BaseParserInput {
344
374
  options: BaseParserOptions;
@@ -27,7 +27,6 @@ import { RUN_KEY } from '@langchain/core/outputs';
27
27
  import { CallbackManager, parseCallbackConfigArg } from '@langchain/core/callbacks/manager';
28
28
  import '@langchain/core/utils/json_patch';
29
29
  import { simpleGit } from 'simple-git';
30
- import pQueue from 'p-queue';
31
30
  import { Document, BaseDocumentTransformer } from '@langchain/core/documents';
32
31
  import { createTwoFilesPatch } from 'diff';
33
32
  import '@langchain/core/messages';
@@ -47,7 +46,7 @@ import { pathToFileURL } from 'url';
47
46
  /**
48
47
  * Current build version from package.json
49
48
  */
50
- const BUILD_VERSION = "0.24.0";
49
+ const BUILD_VERSION = "0.25.0";
51
50
 
52
51
  const isInteractive = (config) => {
53
52
  return config?.mode === 'interactive' || !!config?.interactive;
@@ -162,6 +161,17 @@ async function updateFileSection({ filePath, startComment, endComment, getNewCon
162
161
  fs__default.writeFileSync(filePath, newLines.join('\n'));
163
162
  }
164
163
 
164
+ /**
165
+ * Prompt template for summarizing code diffs.
166
+ *
167
+ * TODO: Future improvements to consider:
168
+ * - Separate prompts for file-level vs directory-level summarization
169
+ * - Include file type context (e.g., "This is a React component", "This is a test file")
170
+ * - Add guidance for preserving semantic meaning of changes
171
+ * - Consider change type (added/modified/deleted) in prompt for better context
172
+ * - Include hints about the programming language for more idiomatic summaries
173
+ * - Add support for custom user-provided summarization prompts via config
174
+ */
165
175
  const template$5 = `GOAL: Use functional abstractions to summarize the following text
166
176
 
167
177
  RULES: Avoid phrases like "this change", "this code", or "this function" etc. Instead refer to the function, variable, or class by name.
@@ -1039,6 +1049,16 @@ const schema$1 = {
1039
1049
  "description": "The maximum number of requests to make concurrently.",
1040
1050
  "default": 6
1041
1051
  },
1052
+ "minTokensForSummary": {
1053
+ "type": "number",
1054
+ "description": "Minimum token count for a directory/file group to be eligible for summarization. Groups below this threshold preserve raw diffs to maintain detail.",
1055
+ "default": 400
1056
+ },
1057
+ "maxFileTokens": {
1058
+ "type": "number",
1059
+ "description": "Maximum tokens allowed for a single file diff before it gets pre-summarized. Prevents large files from biasing the overall summary. If not set, defaults to 25% of tokenLimit.",
1060
+ "default": "undefined (uses 0.25 * tokenLimit)"
1061
+ },
1042
1062
  "authentication": {
1043
1063
  "anyOf": [
1044
1064
  {
@@ -1799,6 +1819,16 @@ const schema$1 = {
1799
1819
  "description": "The maximum number of requests to make concurrently.",
1800
1820
  "default": 6
1801
1821
  },
1822
+ "minTokensForSummary": {
1823
+ "type": "number",
1824
+ "description": "Minimum token count for a directory/file group to be eligible for summarization. Groups below this threshold preserve raw diffs to maintain detail.",
1825
+ "default": 400
1826
+ },
1827
+ "maxFileTokens": {
1828
+ "type": "number",
1829
+ "description": "Maximum tokens allowed for a single file diff before it gets pre-summarized. Prevents large files from biasing the overall summary. If not set, defaults to 25% of tokenLimit.",
1830
+ "default": "undefined (uses 0.25 * tokenLimit)"
1831
+ },
1802
1832
  "authentication": {
1803
1833
  "anyOf": [
1804
1834
  {
@@ -1950,6 +1980,16 @@ const schema$1 = {
1950
1980
  "description": "The maximum number of requests to make concurrently.",
1951
1981
  "default": 6
1952
1982
  },
1983
+ "minTokensForSummary": {
1984
+ "type": "number",
1985
+ "description": "Minimum token count for a directory/file group to be eligible for summarization. Groups below this threshold preserve raw diffs to maintain detail.",
1986
+ "default": 400
1987
+ },
1988
+ "maxFileTokens": {
1989
+ "type": "number",
1990
+ "description": "Maximum tokens allowed for a single file diff before it gets pre-summarized. Prevents large files from biasing the overall summary. If not set, defaults to 25% of tokenLimit.",
1991
+ "default": "undefined (uses 0.25 * tokenLimit)"
1992
+ },
1953
1993
  "authentication": {
1954
1994
  "anyOf": [
1955
1995
  {
@@ -8217,6 +8257,114 @@ async function summarize(documents, { chain, textSplitter, options }) {
8217
8257
  return res.text && res.text.trim();
8218
8258
  }
8219
8259
 
8260
/**
 * Summarize a single file diff that exceeds the token threshold.
 *
 * The diff text is wrapped in a one-element document list (with the file
 * name and any existing summary as metadata) and handed to `summarize`.
 *
 * @param fileDiff - File diff record; `file`, `diff` and `summary` are read.
 * @param options - `chain` and `textSplitter` are forwarded to `summarize`;
 *   `tokenizer` is called on the produced summary to obtain its token count.
 * @returns A copy of `fileDiff` whose `diff` is the summary text and whose
 *   `tokenCount` is the summary's token count. The original `fileDiff` is
 *   returned unchanged when summarization throws or produces no text.
 */
async function summarizeFileDiff(fileDiff, { chain, textSplitter, tokenizer }) {
    try {
        const documents = [
            {
                pageContent: fileDiff.diff,
                metadata: {
                    file: fileDiff.file,
                    summary: fileDiff.summary,
                },
            },
        ];
        const fileSummary = await summarize(documents, {
            chain,
            textSplitter,
            options: {
                returnIntermediateSteps: false,
            },
        });
        // `summarize` can come back empty or undefined. Replacing the raw diff
        // with nothing would silently drop this file's changes from the
        // prompt, so keep the original diff instead.
        if (!fileSummary) {
            return fileDiff;
        }
        return {
            ...fileDiff,
            diff: fileSummary,
            tokenCount: tokenizer(fileSummary),
        };
    }
    catch (error) {
        // On error, return original diff unchanged
        console.error(`Failed to summarize file ${fileDiff.file}:`, error);
        return fileDiff;
    }
}
8293
/**
 * Process items in sequential waves to respect a concurrency limit.
 *
 * Each wave holds at most `maxConcurrent` items; all items of a wave are
 * started together and awaited with `Promise.all` before the next wave begins.
 *
 * @param items - Items to process.
 * @param processor - Async callback invoked via `Array.prototype.map` on each wave.
 * @param maxConcurrent - Maximum items per wave. Values that are not a number
 *   of at least 1 (0, negative, null, undefined, NaN) fall back to 1.
 * @returns The processor results, in the same order as `items`.
 */
async function processInWaves(items, processor, maxConcurrent) {
    // The wave size is also the loop stride. A stride of 0 (or null) never
    // advances and loops forever; a NaN/undefined stride yields an empty first
    // wave and then stops, silently processing nothing. Clamp to a usable size.
    const requested = Number(maxConcurrent);
    const waveSize = requested >= 1 ? Math.floor(requested) : 1;
    const results = [];
    for (let start = 0; start < items.length; start += waveSize) {
        const wave = items.slice(start, start + waveSize);
        const waveResults = await Promise.all(wave.map(processor));
        results.push(...waveResults);
    }
    return results;
}
8305
/**
 * Pre-summarize every file diff whose token count is above `maxFileTokens`
 * and at least `minTokensForSummary`, leaving all other diffs untouched.
 * Doing so keeps one oversized file from consuming the token budget and
 * skewing the final commit message.
 *
 * @param diffs - Array of file diffs to process
 * @param options - Thresholds (`maxFileTokens`, `minTokensForSummary`),
 *   `maxConcurrent` wave size, plus `tokenizer`, `logger`, `chain` and `textSplitter`
 * @returns A new array, same order as `diffs`, with the large files replaced
 *   by their summarized versions
 */
async function summarizeLargeFiles(diffs, options) {
    const { maxFileTokens, minTokensForSummary, maxConcurrent, tokenizer, logger, chain, textSplitter } = options;
    const output = diffs.slice();
    // Remember each oversized diff together with its position in the input.
    const oversized = diffs.reduce((picked, diff, index) => {
        const isLarge = diff.tokenCount > maxFileTokens && diff.tokenCount >= minTokensForSummary;
        if (isLarge) {
            picked.push({ index, diff });
        }
        return picked;
    }, []);
    if (oversized.length === 0) {
        return output;
    }
    logger.verbose(`Pre-summarizing ${oversized.length} large file(s)...`, { color: 'blue' });
    const summarize = async ({ diff }) => summarizeFileDiff(diff, { chain, textSplitter, tokenizer });
    const summarized = await processInWaves(oversized, summarize, maxConcurrent);
    // Write each summarized diff back into the slot it was taken from.
    for (let position = 0; position < summarized.length; position++) {
        const replacement = summarized[position];
        const slot = oversized[position].index;
        const before = output[slot].tokenCount;
        const after = replacement.tokenCount;
        logger.verbose(` - ${replacement.file}: ${before} -> ${after} tokens`, { color: 'magenta' });
        output[slot] = replacement;
    }
    return output;
}
8340
/**
 * Run large-file summarization over every diff in a DiffNode tree.
 *
 * The tree is flattened depth-first (a node's own diffs, then its children),
 * the flat list is passed to `summarizeLargeFiles`, and a new tree with the
 * same `path`/`children` shape is returned in which each diff is replaced by
 * the processed diff registered under the same `file` value.
 */
async function preprocessLargeFiles(rootNode, options) {
    // Flatten the tree into a single list of diffs.
    const flattened = [];
    const gather = (node) => {
        flattened.push(...node.diffs);
        node.children.forEach((child) => gather(child));
    };
    gather(rootNode);
    const processed = await summarizeLargeFiles(flattened, options);
    // Index processed diffs by file; a later entry for the same file wins.
    const byFile = new Map();
    for (const diff of processed) {
        byFile.set(diff.file, diff);
    }
    // Mirror the original tree, substituting processed diffs where available.
    const rebuild = (node) => {
        const diffs = node.diffs.map((diff) => byFile.get(diff.file) || diff);
        const children = node.children.map((child) => rebuild(child));
        return { path: node.path, diffs, children };
    };
    return rebuild(rootNode);
}
8367
+
8220
8368
  /**
8221
8369
  * Create groups from a given node info.
8222
8370
  * @param {DiffNode} node - The node info to start grouping.
@@ -8269,6 +8417,16 @@ async function summarizeDirectoryDiff(directory, { chain, textSplitter, tokenize
8269
8417
  return directory;
8270
8418
  }
8271
8419
  }
8420
+ /**
8421
+ * Default output formatter for directory diffs.
8422
+ *
8423
+ * TODO: Future improvements to consider:
8424
+ * - Hierarchical output showing file -> directory -> overall summary
8425
+ * - Configurable verbosity levels (compact, standard, detailed)
8426
+ * - Machine-readable format option (JSON) for programmatic use
8427
+ * - Semantic grouping by change type (added/modified/deleted) or feature area
8428
+ * - Visual diff indicators showing magnitude of changes
8429
+ */
8272
8430
  const defaultOutputCallback = (group) => {
8273
8431
  let output = `
8274
8432
  -------\n* changes in "/${group.path}"\n\n`;
@@ -8280,41 +8438,124 @@ const defaultOutputCallback = (group) => {
8280
8438
  }
8281
8439
  return output;
8282
8440
  };
8283
- async function summarizeDiffs(rootDiffNode, { tokenizer, logger, maxTokens = 2048, textSplitter, chain, handleOutput = defaultOutputCallback, }) {
8284
- const queue = new pQueue({ concurrency: 8 });
8441
/**
 * Summarize directory groups wave by wave until the running token total is
 * within `maxTokens` or no eligible directory remains.
 *
 * Candidates are visited in descending order of their initial token count.
 * A candidate is skipped when its initial token count is below
 * `minTokensForSummary` or when its entry already carries a `summary`.
 * Each wave holds at most `maxConcurrent` directories and is summarized in
 * parallel through `summarizeDirectoryDiff`.
 *
 * @returns `{ directories, totalTokenCount }`: a copy of the input array with
 *   summarized entries swapped in, and the updated token total.
 */
async function summarizeInWaves(directories, options) {
    const { totalTokenCount: initialTotal, maxTokens, minTokensForSummary, maxConcurrent, logger, chain, textSplitter, tokenizer, } = options;
    const results = directories.slice();
    let totalTokenCount = initialTotal;
    // Largest directories first; keep the original index so results stay in input order.
    const ranked = directories.map((d, i) => ({ index: i, tokens: d.tokenCount }));
    ranked.sort((a, b) => b.tokens - a.tokens);
    let cursor = 0;
    while (totalTokenCount > maxTokens && cursor < ranked.length) {
        // Fill the next wave, consuming ranked entries whether or not they qualify.
        const wave = [];
        while (cursor < ranked.length && wave.length < maxConcurrent) {
            const { index, tokens } = ranked[cursor];
            cursor += 1;
            const eligible = !(tokens < minTokensForSummary) && !results[index].summary;
            if (eligible) {
                wave.push(index);
            }
        }
        // No more eligible candidates
        if (wave.length === 0) {
            break;
        }
        logger.verbose(`\nProcessing wave of ${wave.length} directories...`, { color: 'blue' });
        const pending = wave.map((idx) => summarizeDirectoryDiff(results[idx], { chain, textSplitter, tokenizer }));
        const waveResults = await Promise.all(pending);
        // Swap in each summarized directory and deduct the tokens it saved.
        for (let position = 0; position < waveResults.length; position++) {
            const result = waveResults[position];
            const idx = wave[position];
            const originalTokens = results[idx].tokenCount;
            const newTokens = result.tokenCount;
            totalTokenCount -= originalTokens - newTokens;
            results[idx] = result;
            logger.verbose(` • Summarized "/${result.path}": ${originalTokens} -> ${newTokens} tokens`, {
                color: 'magenta',
            });
        }
        const overBudget = totalTokenCount > maxTokens;
        logger.verbose(`Total token count: ${totalTokenCount}`, {
            color: overBudget ? 'yellow' : 'green',
        });
        if (totalTokenCount <= maxTokens) {
            logger.verbose(`Under token budget, stopping summarization.`, { color: 'green' });
            break;
        }
    }
    return { directories: results, totalTokenCount };
}
8502
+ /**
8503
+ * Summarize diffs using a three-phase approach:
8504
+ *
8505
+ * Phase 1: Pre-process large files to prevent any single file from dominating
8506
+ * Phase 2: Group diffs by directory and assess total token count
8507
+ * Phase 3: Wave-based parallel summarization until under budget
8508
+ *
8509
+ * This approach ensures:
8510
+ * - Large files don't bias the summary
8511
+ * - Small changes preserve their detail (minTokensForSummary threshold)
8512
+ * - Efficient parallel processing with predictable behavior
8513
+ * - Early exit when under token budget
8514
+ */
8515
+ async function summarizeDiffs(rootDiffNode, { tokenizer, logger, maxTokens = 2048, minTokensForSummary = 400, maxFileTokens, maxConcurrent = 6, textSplitter, chain, handleOutput = defaultOutputCallback, }) {
8516
+ // Calculate maxFileTokens as 25% of maxTokens if not specified
8517
+ const effectiveMaxFileTokens = maxFileTokens ?? Math.floor(maxTokens * 0.25);
8518
+ // PHASE 1: Pre-process large files
8519
+ logger.startTimer().startSpinner(`Pre-processing large files...`, { color: 'blue' });
8520
+ const preprocessedNode = await preprocessLargeFiles(rootDiffNode, {
8521
+ maxFileTokens: effectiveMaxFileTokens,
8522
+ minTokensForSummary,
8523
+ maxConcurrent,
8524
+ tokenizer,
8525
+ logger,
8526
+ chain,
8527
+ textSplitter,
8528
+ });
8529
+ logger.stopSpinner('Files pre-processed').stopTimer();
8530
+ // PHASE 2: Directory grouping & assessment
8285
8531
  logger.startTimer().startSpinner(`Organizing Diffs...`, { color: 'blue' });
8286
- const directoryDiffs = createDirectoryDiffs(rootDiffNode);
8287
- // Sort by token count descending
8532
+ const directoryDiffs = createDirectoryDiffs(preprocessedNode);
8533
+ // Sort by token count descending for consistent output ordering
8288
8534
  directoryDiffs.sort((a, b) => b.tokenCount - a.tokenCount);
8289
- let totalTokenCount = directoryDiffs.reduce((sum, group) => sum + group.tokenCount, 0);
8535
+ const totalTokenCount = directoryDiffs.reduce((sum, group) => sum + group.tokenCount, 0);
8290
8536
  logger.stopSpinner('Diffs Organized').stopTimer();
8291
- logger.startSpinner(`Consolidating Diffs`, { color: 'blue' });
8292
- const processingTasks = directoryDiffs.map((group, i) => {
8293
- return queue.add(async () => {
8294
- // If the diff token count is already less than the average req, we can skip summarizing.
8295
- const isLessThanAvgTokenReq = group.tokenCount <= maxTokens / directoryDiffs.length;
8296
- if (totalTokenCount <= maxTokens || isLessThanAvgTokenReq) {
8297
- return group;
8298
- }
8299
- group = await summarizeDirectoryDiff(group, {
8300
- chain,
8301
- textSplitter,
8302
- tokenizer,
8303
- });
8304
- // We need to subtract the old token count and add the new one
8305
- totalTokenCount = totalTokenCount - directoryDiffs[i].tokenCount + group.tokenCount;
8306
- directoryDiffs[i] = group;
8307
- logger
8308
- .verbose(`\n • Summarized diffs in "/${group.path}" `, { color: 'blue' })
8309
- .verbose(`\nTotal token count: ${totalTokenCount}`, {
8310
- color: totalTokenCount > maxTokens ? 'yellow' : 'green',
8311
- });
8312
- return group;
8313
- }, { priority: group.tokenCount });
8537
+ logger.verbose(`Total token count: ${totalTokenCount}, max allowed: ${maxTokens}`, {
8538
+ color: totalTokenCount > maxTokens ? 'yellow' : 'green',
8539
+ });
8540
+ // Early exit if already under budget
8541
+ if (totalTokenCount <= maxTokens) {
8542
+ logger.verbose(`Already under token budget, skipping summarization.`, { color: 'green' });
8543
+ return directoryDiffs.map(handleOutput).join('');
8544
+ }
8545
+ // PHASE 3: Wave-based summarization
8546
+ logger.startTimer().startSpinner(`Consolidating Diffs...`, { color: 'blue' });
8547
+ const { directories: summarizedDiffs } = await summarizeInWaves(directoryDiffs, {
8548
+ totalTokenCount,
8549
+ maxTokens,
8550
+ minTokensForSummary,
8551
+ maxConcurrent,
8552
+ logger,
8553
+ chain,
8554
+ textSplitter,
8555
+ tokenizer,
8314
8556
  });
8315
- await Promise.all(processingTasks);
8316
- logger.stopSpinner(`Summarized Diffs`);
8317
- return directoryDiffs.map(handleOutput).join('');
8557
+ logger.stopSpinner(`Diffs Consolidated`).stopTimer();
8558
+ return summarizedDiffs.map(handleOutput).join('');
8318
8559
  }
8319
8560
 
8320
8561
  /**
@@ -11314,7 +11555,7 @@ for (var i = 0; i < 256; i++) {
11314
11555
  simpleEscapeMap[i] = simpleEscapeSequence(i);
11315
11556
  }
11316
11557
 
11317
- async function fileChangeParser({ changes, commit, options: { tokenizer, git, llm: model, logger, maxTokens }, }) {
11558
+ async function fileChangeParser({ changes, commit, options: { tokenizer, git, llm: model, logger, maxTokens, minTokensForSummary, maxFileTokens, maxConcurrent, }, }) {
11318
11559
  const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 10000, chunkOverlap: 250 });
11319
11560
  const summarizationChain = loadSummarizationChain(model, {
11320
11561
  type: 'map_reduce',
@@ -11328,11 +11569,17 @@ async function fileChangeParser({ changes, commit, options: { tokenizer, git, ll
11328
11569
  logger.startTimer().startSpinner(`Collecting Diffs...\n`, { color: 'blue' });
11329
11570
  const diffs = await collectDiffs(rootTreeNode, (path) => getDiff(path, commit, { git, logger }), tokenizer, logger);
11330
11571
  logger.stopSpinner('Diffs Collected').stopTimer();
11331
- // Summarize diffs
11572
+ // Summarize diffs using three-phase approach:
11573
+ // 1. Pre-process large files to prevent bias
11574
+ // 2. Group by directory and assess token count
11575
+ // 3. Wave-based parallel summarization until under budget
11332
11576
  logger.startTimer();
11333
11577
  const summary = await summarizeDiffs(diffs, {
11334
11578
  tokenizer,
11335
- maxTokens: maxTokens || 4096,
11579
+ maxTokens: maxTokens || 2048,
11580
+ minTokensForSummary,
11581
+ maxFileTokens,
11582
+ maxConcurrent,
11336
11583
  textSplitter,
11337
11584
  chain: summarizationChain,
11338
11585
  logger,
@@ -11635,7 +11882,16 @@ const handler$3 = async (argv, logger) => {
11635
11882
  return await fileChangeParser({
11636
11883
  changes,
11637
11884
  commit: '--staged',
11638
- options: { tokenizer, git, llm, logger, maxTokens: config.service.tokenLimit },
11885
+ options: {
11886
+ tokenizer,
11887
+ git,
11888
+ llm,
11889
+ logger,
11890
+ maxTokens: config.service.tokenLimit,
11891
+ minTokensForSummary: config.service.minTokensForSummary,
11892
+ maxFileTokens: config.service.maxFileTokens,
11893
+ maxConcurrent: config.service.maxConcurrent,
11894
+ },
11639
11895
  });
11640
11896
  }
11641
11897
  const commitMsg = await generateAndReviewLoop({
package/dist/index.js CHANGED
@@ -26,7 +26,6 @@ var outputs = require('@langchain/core/outputs');
26
26
  var manager = require('@langchain/core/callbacks/manager');
27
27
  require('@langchain/core/utils/json_patch');
28
28
  var simpleGit = require('simple-git');
29
- var pQueue = require('p-queue');
30
29
  var documents = require('@langchain/core/documents');
31
30
  var diff = require('diff');
32
31
  require('@langchain/core/messages');
@@ -69,7 +68,7 @@ var readline__namespace = /*#__PURE__*/_interopNamespaceDefault(readline);
69
68
  /**
70
69
  * Current build version from package.json
71
70
  */
72
- const BUILD_VERSION = "0.24.0";
71
+ const BUILD_VERSION = "0.25.0";
73
72
 
74
73
  const isInteractive = (config) => {
75
74
  return config?.mode === 'interactive' || !!config?.interactive;
@@ -184,6 +183,17 @@ async function updateFileSection({ filePath, startComment, endComment, getNewCon
184
183
  fs.writeFileSync(filePath, newLines.join('\n'));
185
184
  }
186
185
 
186
+ /**
187
+ * Prompt template for summarizing code diffs.
188
+ *
189
+ * TODO: Future improvements to consider:
190
+ * - Separate prompts for file-level vs directory-level summarization
191
+ * - Include file type context (e.g., "This is a React component", "This is a test file")
192
+ * - Add guidance for preserving semantic meaning of changes
193
+ * - Consider change type (added/modified/deleted) in prompt for better context
194
+ * - Include hints about the programming language for more idiomatic summaries
195
+ * - Add support for custom user-provided summarization prompts via config
196
+ */
187
197
  const template$5 = `GOAL: Use functional abstractions to summarize the following text
188
198
 
189
199
  RULES: Avoid phrases like "this change", "this code", or "this function" etc. Instead refer to the function, variable, or class by name.
@@ -1061,6 +1071,16 @@ const schema$1 = {
1061
1071
  "description": "The maximum number of requests to make concurrently.",
1062
1072
  "default": 6
1063
1073
  },
1074
+ "minTokensForSummary": {
1075
+ "type": "number",
1076
+ "description": "Minimum token count for a directory/file group to be eligible for summarization. Groups below this threshold preserve raw diffs to maintain detail.",
1077
+ "default": 400
1078
+ },
1079
+ "maxFileTokens": {
1080
+ "type": "number",
1081
+ "description": "Maximum tokens allowed for a single file diff before it gets pre-summarized. Prevents large files from biasing the overall summary. If not set, defaults to 25% of tokenLimit.",
1082
+ "default": "undefined (uses 0.25 * tokenLimit)"
1083
+ },
1064
1084
  "authentication": {
1065
1085
  "anyOf": [
1066
1086
  {
@@ -1821,6 +1841,16 @@ const schema$1 = {
1821
1841
  "description": "The maximum number of requests to make concurrently.",
1822
1842
  "default": 6
1823
1843
  },
1844
+ "minTokensForSummary": {
1845
+ "type": "number",
1846
+ "description": "Minimum token count for a directory/file group to be eligible for summarization. Groups below this threshold preserve raw diffs to maintain detail.",
1847
+ "default": 400
1848
+ },
1849
+ "maxFileTokens": {
1850
+ "type": "number",
1851
+ "description": "Maximum tokens allowed for a single file diff before it gets pre-summarized. Prevents large files from biasing the overall summary. If not set, defaults to 25% of tokenLimit.",
1852
+ "default": "undefined (uses 0.25 * tokenLimit)"
1853
+ },
1824
1854
  "authentication": {
1825
1855
  "anyOf": [
1826
1856
  {
@@ -1972,6 +2002,16 @@ const schema$1 = {
1972
2002
  "description": "The maximum number of requests to make concurrently.",
1973
2003
  "default": 6
1974
2004
  },
2005
+ "minTokensForSummary": {
2006
+ "type": "number",
2007
+ "description": "Minimum token count for a directory/file group to be eligible for summarization. Groups below this threshold preserve raw diffs to maintain detail.",
2008
+ "default": 400
2009
+ },
2010
+ "maxFileTokens": {
2011
+ "type": "number",
2012
+ "description": "Maximum tokens allowed for a single file diff before it gets pre-summarized. Prevents large files from biasing the overall summary. If not set, defaults to 25% of tokenLimit.",
2013
+ "default": "undefined (uses 0.25 * tokenLimit)"
2014
+ },
1975
2015
  "authentication": {
1976
2016
  "anyOf": [
1977
2017
  {
@@ -8239,6 +8279,114 @@ async function summarize(documents$1, { chain, textSplitter, options }) {
8239
8279
  return res.text && res.text.trim();
8240
8280
  }
8241
8281
 
8282
/**
 * Summarize a single file diff that exceeds the token threshold.
 *
 * The diff text is wrapped in a one-element document list (with the file
 * name and any existing summary as metadata) and handed to `summarize`.
 *
 * @param fileDiff - File diff record; `file`, `diff` and `summary` are read.
 * @param options - `chain` and `textSplitter` are forwarded to `summarize`;
 *   `tokenizer` is called on the produced summary to obtain its token count.
 * @returns A copy of `fileDiff` whose `diff` is the summary text and whose
 *   `tokenCount` is the summary's token count. The original `fileDiff` is
 *   returned unchanged when summarization throws or produces no text.
 */
async function summarizeFileDiff(fileDiff, { chain, textSplitter, tokenizer }) {
    try {
        const documents = [
            {
                pageContent: fileDiff.diff,
                metadata: {
                    file: fileDiff.file,
                    summary: fileDiff.summary,
                },
            },
        ];
        const fileSummary = await summarize(documents, {
            chain,
            textSplitter,
            options: {
                returnIntermediateSteps: false,
            },
        });
        // `summarize` can come back empty or undefined. Replacing the raw diff
        // with nothing would silently drop this file's changes from the
        // prompt, so keep the original diff instead.
        if (!fileSummary) {
            return fileDiff;
        }
        return {
            ...fileDiff,
            diff: fileSummary,
            tokenCount: tokenizer(fileSummary),
        };
    }
    catch (error) {
        // On error, return original diff unchanged
        console.error(`Failed to summarize file ${fileDiff.file}:`, error);
        return fileDiff;
    }
}
8315
/**
 * Process items in sequential waves to respect a concurrency limit.
 *
 * Each wave holds at most `maxConcurrent` items; all items of a wave are
 * started together and awaited with `Promise.all` before the next wave begins.
 *
 * @param items - Items to process.
 * @param processor - Async callback invoked via `Array.prototype.map` on each wave.
 * @param maxConcurrent - Maximum items per wave. Values that are not a number
 *   of at least 1 (0, negative, null, undefined, NaN) fall back to 1.
 * @returns The processor results, in the same order as `items`.
 */
async function processInWaves(items, processor, maxConcurrent) {
    // The wave size is also the loop stride. A stride of 0 (or null) never
    // advances and loops forever; a NaN/undefined stride yields an empty first
    // wave and then stops, silently processing nothing. Clamp to a usable size.
    const requested = Number(maxConcurrent);
    const waveSize = requested >= 1 ? Math.floor(requested) : 1;
    const results = [];
    for (let start = 0; start < items.length; start += waveSize) {
        const wave = items.slice(start, start + waveSize);
        const waveResults = await Promise.all(wave.map(processor));
        results.push(...waveResults);
    }
    return results;
}
8327
/**
 * Pre-summarize every file diff whose token count is above `maxFileTokens`
 * and at least `minTokensForSummary`, leaving all other diffs untouched.
 * Doing so keeps one oversized file from consuming the token budget and
 * skewing the final commit message.
 *
 * @param diffs - Array of file diffs to process
 * @param options - Thresholds (`maxFileTokens`, `minTokensForSummary`),
 *   `maxConcurrent` wave size, plus `tokenizer`, `logger`, `chain` and `textSplitter`
 * @returns A new array, same order as `diffs`, with the large files replaced
 *   by their summarized versions
 */
async function summarizeLargeFiles(diffs, options) {
    const { maxFileTokens, minTokensForSummary, maxConcurrent, tokenizer, logger, chain, textSplitter } = options;
    const output = diffs.slice();
    // Remember each oversized diff together with its position in the input.
    const oversized = diffs.reduce((picked, diff, index) => {
        const isLarge = diff.tokenCount > maxFileTokens && diff.tokenCount >= minTokensForSummary;
        if (isLarge) {
            picked.push({ index, diff });
        }
        return picked;
    }, []);
    if (oversized.length === 0) {
        return output;
    }
    logger.verbose(`Pre-summarizing ${oversized.length} large file(s)...`, { color: 'blue' });
    const summarize = async ({ diff }) => summarizeFileDiff(diff, { chain, textSplitter, tokenizer });
    const summarized = await processInWaves(oversized, summarize, maxConcurrent);
    // Write each summarized diff back into the slot it was taken from.
    for (let position = 0; position < summarized.length; position++) {
        const replacement = summarized[position];
        const slot = oversized[position].index;
        const before = output[slot].tokenCount;
        const after = replacement.tokenCount;
        logger.verbose(` - ${replacement.file}: ${before} -> ${after} tokens`, { color: 'magenta' });
        output[slot] = replacement;
    }
    return output;
}
8362
/**
 * Run large-file summarization over every diff in a DiffNode tree.
 *
 * The tree is flattened depth-first (a node's own diffs, then its children),
 * the flat list is passed to `summarizeLargeFiles`, and a new tree with the
 * same `path`/`children` shape is returned in which each diff is replaced by
 * the processed diff registered under the same `file` value.
 */
async function preprocessLargeFiles(rootNode, options) {
    // Flatten the tree into a single list of diffs.
    const flattened = [];
    const gather = (node) => {
        flattened.push(...node.diffs);
        node.children.forEach((child) => gather(child));
    };
    gather(rootNode);
    const processed = await summarizeLargeFiles(flattened, options);
    // Index processed diffs by file; a later entry for the same file wins.
    const byFile = new Map();
    for (const diff of processed) {
        byFile.set(diff.file, diff);
    }
    // Mirror the original tree, substituting processed diffs where available.
    const rebuild = (node) => {
        const diffs = node.diffs.map((diff) => byFile.get(diff.file) || diff);
        const children = node.children.map((child) => rebuild(child));
        return { path: node.path, diffs, children };
    };
    return rebuild(rootNode);
}
8389
+
8242
8390
  /**
8243
8391
  * Create groups from a given node info.
8244
8392
  * @param {DiffNode} node - The node info to start grouping.
@@ -8291,6 +8439,16 @@ async function summarizeDirectoryDiff(directory, { chain, textSplitter, tokenize
8291
8439
  return directory;
8292
8440
  }
8293
8441
  }
8442
+ /**
8443
+ * Default output formatter for directory diffs.
8444
+ *
8445
+ * TODO: Future improvements to consider:
8446
+ * - Hierarchical output showing file -> directory -> overall summary
8447
+ * - Configurable verbosity levels (compact, standard, detailed)
8448
+ * - Machine-readable format option (JSON) for programmatic use
8449
+ * - Semantic grouping by change type (added/modified/deleted) or feature area
8450
+ * - Visual diff indicators showing magnitude of changes
8451
+ */
8294
8452
  const defaultOutputCallback = (group) => {
8295
8453
  let output = `
8296
8454
  -------\n* changes in "/${group.path}"\n\n`;
@@ -8302,41 +8460,124 @@ const defaultOutputCallback = (group) => {
8302
8460
  }
8303
8461
  return output;
8304
8462
  };
8305
- async function summarizeDiffs(rootDiffNode, { tokenizer, logger, maxTokens = 2048, textSplitter, chain, handleOutput = defaultOutputCallback, }) {
8306
- const queue = new pQueue({ concurrency: 8 });
8463
+ /**
8464
+ * Process directory summarization in waves to respect concurrency limits
8465
+ * while maintaining predictable behavior.
8466
+ */
8467
+ async function summarizeInWaves(directories, options) {
8468
+ const { totalTokenCount: initialTotal, maxTokens, minTokensForSummary, maxConcurrent, logger, chain, textSplitter, tokenizer, } = options;
8469
+ let totalTokenCount = initialTotal;
8470
+ const results = [...directories];
8471
+ // Create sorted indices by token count (descending) for prioritized processing
8472
+ const sortedIndices = directories
8473
+ .map((d, i) => ({ index: i, tokens: d.tokenCount }))
8474
+ .sort((a, b) => b.tokens - a.tokens);
8475
+ let cursor = 0;
8476
+ while (totalTokenCount > maxTokens && cursor < sortedIndices.length) {
8477
+ // Select wave candidates: directories that exceed minTokensForSummary
8478
+ const wave = [];
8479
+ for (let i = cursor; i < sortedIndices.length && wave.length < maxConcurrent; i++) {
8480
+ const { index, tokens } = sortedIndices[i];
8481
+ // Skip directories below the minimum threshold
8482
+ if (tokens < minTokensForSummary) {
8483
+ cursor = i + 1;
8484
+ continue;
8485
+ }
8486
+ // Skip directories that have already been summarized
8487
+ if (results[index].summary) {
8488
+ cursor = i + 1;
8489
+ continue;
8490
+ }
8491
+ wave.push(index);
8492
+ cursor = i + 1;
8493
+ }
8494
+ // No more eligible candidates
8495
+ if (wave.length === 0) {
8496
+ break;
8497
+ }
8498
+ logger.verbose(`\nProcessing wave of ${wave.length} directories...`, { color: 'blue' });
8499
+ // Process wave in parallel
8500
+ const waveResults = await Promise.all(wave.map((idx) => summarizeDirectoryDiff(results[idx], { chain, textSplitter, tokenizer })));
8501
+ // Update results and recalculate total
8502
+ waveResults.forEach((result, i) => {
8503
+ const idx = wave[i];
8504
+ const originalTokens = results[idx].tokenCount;
8505
+ const newTokens = result.tokenCount;
8506
+ const reduction = originalTokens - newTokens;
8507
+ totalTokenCount -= reduction;
8508
+ results[idx] = result;
8509
+ logger.verbose(` • Summarized "/${result.path}": ${originalTokens} -> ${newTokens} tokens`, {
8510
+ color: 'magenta',
8511
+ });
8512
+ });
8513
+ logger.verbose(`Total token count: ${totalTokenCount}`, {
8514
+ color: totalTokenCount > maxTokens ? 'yellow' : 'green',
8515
+ });
8516
+ // Check if we're now under budget
8517
+ if (totalTokenCount <= maxTokens) {
8518
+ logger.verbose(`Under token budget, stopping summarization.`, { color: 'green' });
8519
+ break;
8520
+ }
8521
+ }
8522
+ return { directories: results, totalTokenCount };
8523
+ }
8524
+ /**
8525
+ * Summarize diffs using a three-phase approach:
8526
+ *
8527
+ * Phase 1: Pre-process large files to prevent any single file from dominating
8528
+ * Phase 2: Group diffs by directory and assess total token count
8529
+ * Phase 3: Wave-based parallel summarization until under budget
8530
+ *
8531
+ * This approach ensures:
8532
+ * - Large files don't bias the summary
8533
+ * - Small changes preserve their detail (minTokensForSummary threshold)
8534
+ * - Efficient parallel processing with predictable behavior
8535
+ * - Early exit when under token budget
8536
+ */
8537
+ async function summarizeDiffs(rootDiffNode, { tokenizer, logger, maxTokens = 2048, minTokensForSummary = 400, maxFileTokens, maxConcurrent = 6, textSplitter, chain, handleOutput = defaultOutputCallback, }) {
8538
+ // Calculate maxFileTokens as 25% of maxTokens if not specified
8539
+ const effectiveMaxFileTokens = maxFileTokens ?? Math.floor(maxTokens * 0.25);
8540
+ // PHASE 1: Pre-process large files
8541
+ logger.startTimer().startSpinner(`Pre-processing large files...`, { color: 'blue' });
8542
+ const preprocessedNode = await preprocessLargeFiles(rootDiffNode, {
8543
+ maxFileTokens: effectiveMaxFileTokens,
8544
+ minTokensForSummary,
8545
+ maxConcurrent,
8546
+ tokenizer,
8547
+ logger,
8548
+ chain,
8549
+ textSplitter,
8550
+ });
8551
+ logger.stopSpinner('Files pre-processed').stopTimer();
8552
+ // PHASE 2: Directory grouping & assessment
8307
8553
  logger.startTimer().startSpinner(`Organizing Diffs...`, { color: 'blue' });
8308
- const directoryDiffs = createDirectoryDiffs(rootDiffNode);
8309
- // Sort by token count descending
8554
+ const directoryDiffs = createDirectoryDiffs(preprocessedNode);
8555
+ // Sort by token count descending for consistent output ordering
8310
8556
  directoryDiffs.sort((a, b) => b.tokenCount - a.tokenCount);
8311
- let totalTokenCount = directoryDiffs.reduce((sum, group) => sum + group.tokenCount, 0);
8557
+ const totalTokenCount = directoryDiffs.reduce((sum, group) => sum + group.tokenCount, 0);
8312
8558
  logger.stopSpinner('Diffs Organized').stopTimer();
8313
- logger.startSpinner(`Consolidating Diffs`, { color: 'blue' });
8314
- const processingTasks = directoryDiffs.map((group, i) => {
8315
- return queue.add(async () => {
8316
- // If the diff token count is already less than the average req, we can skip summarizing.
8317
- const isLessThanAvgTokenReq = group.tokenCount <= maxTokens / directoryDiffs.length;
8318
- if (totalTokenCount <= maxTokens || isLessThanAvgTokenReq) {
8319
- return group;
8320
- }
8321
- group = await summarizeDirectoryDiff(group, {
8322
- chain,
8323
- textSplitter,
8324
- tokenizer,
8325
- });
8326
- // We need to subtract the old token count and add the new one
8327
- totalTokenCount = totalTokenCount - directoryDiffs[i].tokenCount + group.tokenCount;
8328
- directoryDiffs[i] = group;
8329
- logger
8330
- .verbose(`\n • Summarized diffs in "/${group.path}" `, { color: 'blue' })
8331
- .verbose(`\nTotal token count: ${totalTokenCount}`, {
8332
- color: totalTokenCount > maxTokens ? 'yellow' : 'green',
8333
- });
8334
- return group;
8335
- }, { priority: group.tokenCount });
8559
+ logger.verbose(`Total token count: ${totalTokenCount}, max allowed: ${maxTokens}`, {
8560
+ color: totalTokenCount > maxTokens ? 'yellow' : 'green',
8561
+ });
8562
+ // Early exit if already under budget
8563
+ if (totalTokenCount <= maxTokens) {
8564
+ logger.verbose(`Already under token budget, skipping summarization.`, { color: 'green' });
8565
+ return directoryDiffs.map(handleOutput).join('');
8566
+ }
8567
+ // PHASE 3: Wave-based summarization
8568
+ logger.startTimer().startSpinner(`Consolidating Diffs...`, { color: 'blue' });
8569
+ const { directories: summarizedDiffs } = await summarizeInWaves(directoryDiffs, {
8570
+ totalTokenCount,
8571
+ maxTokens,
8572
+ minTokensForSummary,
8573
+ maxConcurrent,
8574
+ logger,
8575
+ chain,
8576
+ textSplitter,
8577
+ tokenizer,
8336
8578
  });
8337
- await Promise.all(processingTasks);
8338
- logger.stopSpinner(`Summarized Diffs`);
8339
- return directoryDiffs.map(handleOutput).join('');
8579
+ logger.stopSpinner(`Diffs Consolidated`).stopTimer();
8580
+ return summarizedDiffs.map(handleOutput).join('');
8340
8581
  }
8341
8582
 
8342
8583
  /**
@@ -11336,7 +11577,7 @@ for (var i = 0; i < 256; i++) {
11336
11577
  simpleEscapeMap[i] = simpleEscapeSequence(i);
11337
11578
  }
11338
11579
 
11339
- async function fileChangeParser({ changes, commit, options: { tokenizer, git, llm: model, logger, maxTokens }, }) {
11580
+ async function fileChangeParser({ changes, commit, options: { tokenizer, git, llm: model, logger, maxTokens, minTokensForSummary, maxFileTokens, maxConcurrent, }, }) {
11340
11581
  const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 10000, chunkOverlap: 250 });
11341
11582
  const summarizationChain = loadSummarizationChain(model, {
11342
11583
  type: 'map_reduce',
@@ -11350,11 +11591,17 @@ async function fileChangeParser({ changes, commit, options: { tokenizer, git, ll
11350
11591
  logger.startTimer().startSpinner(`Collecting Diffs...\n`, { color: 'blue' });
11351
11592
  const diffs = await collectDiffs(rootTreeNode, (path) => getDiff(path, commit, { git, logger }), tokenizer, logger);
11352
11593
  logger.stopSpinner('Diffs Collected').stopTimer();
11353
- // Summarize diffs
11594
+ // Summarize diffs using three-phase approach:
11595
+ // 1. Pre-process large files to prevent bias
11596
+ // 2. Group by directory and assess token count
11597
+ // 3. Wave-based parallel summarization until under budget
11354
11598
  logger.startTimer();
11355
11599
  const summary = await summarizeDiffs(diffs, {
11356
11600
  tokenizer,
11357
- maxTokens: maxTokens || 4096,
11601
+ maxTokens: maxTokens || 2048,
11602
+ minTokensForSummary,
11603
+ maxFileTokens,
11604
+ maxConcurrent,
11358
11605
  textSplitter,
11359
11606
  chain: summarizationChain,
11360
11607
  logger,
@@ -11657,7 +11904,16 @@ const handler$3 = async (argv, logger) => {
11657
11904
  return await fileChangeParser({
11658
11905
  changes,
11659
11906
  commit: '--staged',
11660
- options: { tokenizer, git, llm, logger, maxTokens: config.service.tokenLimit },
11907
+ options: {
11908
+ tokenizer,
11909
+ git,
11910
+ llm,
11911
+ logger,
11912
+ maxTokens: config.service.tokenLimit,
11913
+ minTokensForSummary: config.service.minTokensForSummary,
11914
+ maxFileTokens: config.service.maxFileTokens,
11915
+ maxConcurrent: config.service.maxConcurrent,
11916
+ },
11661
11917
  });
11662
11918
  }
11663
11919
  const commitMsg = await generateAndReviewLoop({
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "git-coco",
3
- "version": "0.24.0",
3
+ "version": "0.25.0",
4
4
  "description": "zero-effort git commits with coco.",
5
5
  "author": "gfargo <ghfargo@gmail.com>",
6
6
  "license": "MIT",
@@ -77,20 +77,20 @@
77
77
  "@commitlint/core": "^19.8.0",
78
78
  "@inquirer/prompts": "3.3.0",
79
79
  "@langchain/anthropic": "^0.3.14",
80
- "@langchain/community": "^0.3.32",
81
- "@langchain/core": "^0.3.40",
80
+ "@langchain/community": "^0.3.58",
81
+ "@langchain/core": "^0.3.80",
82
82
  "@langchain/ollama": "^0.2.0",
83
83
  "@langchain/openai": "^0.6.7",
84
84
  "ajv": "^8.16.0",
85
85
  "chalk": "4.1.2",
86
- "diff": "8.0.2",
86
+ "diff": "8.0.3",
87
87
  "ini": "5.0.0",
88
88
  "minimatch": "^9.0.5",
89
89
  "ora": "5.4.1",
90
90
  "p-queue": "5.0.0",
91
91
  "performance-now": "2.1.0",
92
92
  "pretty-ms": "7.0.1",
93
- "simple-git": "3.28.0",
93
+ "simple-git": "3.30.0",
94
94
  "tiktoken": "^1.0.21",
95
95
  "yargs": "17.7.2"
96
96
  },