npm - git-coco - Versions diffs - 0.24.0 → 0.25.0 - Mend

git-coco 0.24.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -38,6 +38,21 @@ type BaseLLMService = {
      * @default 6
      */
     maxConcurrent?: number;
+    /**
+     * Minimum token count for a directory/file group to be eligible for summarization.
+     * Groups below this threshold preserve raw diffs to maintain detail.
+     *
+     * @default 400
+     */
+    minTokensForSummary?: number;
+    /**
+     * Maximum tokens allowed for a single file diff before it gets pre-summarized.
+     * Prevents large files from biasing the overall summary.
+     * If not set, defaults to 25% of tokenLimit.
+     *
+     * @default undefined (uses 0.25 * tokenLimit)
+     */
+    maxFileTokens?: number;
     authentication: Authentication;
     requestOptions?: {
         timeout?: number;
@@ -339,6 +354,21 @@ interface BaseParserOptions {
     git: SimpleGit;
     logger: Logger;
     maxTokens?: number;
+    /**
+     * Minimum token count for a directory/file group to be eligible for summarization.
+     * @default 400
+     */
+    minTokensForSummary?: number;
+    /**
+     * Maximum tokens allowed for a single file diff before it gets pre-summarized.
+     * Defaults to 25% of maxTokens if not specified.
+     */
+    maxFileTokens?: number;
+    /**
+     * Maximum number of concurrent summarization requests.
+     * @default 6
+     */
+    maxConcurrent?: number;
 }
 interface BaseParserInput {
     options: BaseParserOptions;

package/dist/index.esm.mjs CHANGED Viewed

@@ -27,7 +27,6 @@ import { RUN_KEY } from '@langchain/core/outputs';
 import { CallbackManager, parseCallbackConfigArg } from '@langchain/core/callbacks/manager';
 import '@langchain/core/utils/json_patch';
 import { simpleGit } from 'simple-git';
-import pQueue from 'p-queue';
 import { Document, BaseDocumentTransformer } from '@langchain/core/documents';
 import { createTwoFilesPatch } from 'diff';
 import '@langchain/core/messages';
@@ -47,7 +46,7 @@ import { pathToFileURL } from 'url';
 /**
  * Current build version from package.json
  */
-const BUILD_VERSION = "0.24.0";
+const BUILD_VERSION = "0.25.0";
 const isInteractive = (config) => {
     return config?.mode === 'interactive' || !!config?.interactive;
@@ -162,6 +161,17 @@ async function updateFileSection({ filePath, startComment, endComment, getNewCon
     fs__default.writeFileSync(filePath, newLines.join('\n'));
 }
+/**
+ * Prompt template for summarizing code diffs.
+ *
+ * TODO: Future improvements to consider:
+ * - Separate prompts for file-level vs directory-level summarization
+ * - Include file type context (e.g., "This is a React component", "This is a test file")
+ * - Add guidance for preserving semantic meaning of changes
+ * - Consider change type (added/modified/deleted) in prompt for better context
+ * - Include hints about the programming language for more idiomatic summaries
+ * - Add support for custom user-provided summarization prompts via config
+ */
 const template$5 = `GOAL: Use functional abstractions to summarize the following text
 RULES: Avoid phrases like  "this change", "this code", or "this function" etc. Instead refer to the function, variable, or class by name.
@@ -1039,6 +1049,16 @@ const schema$1 = {
                     "description": "The maximum number of requests to make concurrently.",
                     "default": 6
                 },
+                "minTokensForSummary": {
+                    "type": "number",
+                    "description": "Minimum token count for a directory/file group to be eligible for summarization. Groups below this threshold preserve raw diffs to maintain detail.",
+                    "default": 400
+                },
+                "maxFileTokens": {
+                    "type": "number",
+                    "description": "Maximum tokens allowed for a single file diff before it gets pre-summarized. Prevents large files from biasing the overall summary. If not set, defaults to 25% of tokenLimit.",
+                    "default": "undefined (uses 0.25 * tokenLimit)"
+                },
                 "authentication": {
                     "anyOf": [
                         {
@@ -1799,6 +1819,16 @@ const schema$1 = {
                     "description": "The maximum number of requests to make concurrently.",
                     "default": 6
                 },
+                "minTokensForSummary": {
+                    "type": "number",
+                    "description": "Minimum token count for a directory/file group to be eligible for summarization. Groups below this threshold preserve raw diffs to maintain detail.",
+                    "default": 400
+                },
+                "maxFileTokens": {
+                    "type": "number",
+                    "description": "Maximum tokens allowed for a single file diff before it gets pre-summarized. Prevents large files from biasing the overall summary. If not set, defaults to 25% of tokenLimit.",
+                    "default": "undefined (uses 0.25 * tokenLimit)"
+                },
                 "authentication": {
                     "anyOf": [
                         {
@@ -1950,6 +1980,16 @@ const schema$1 = {
                     "description": "The maximum number of requests to make concurrently.",
                     "default": 6
                 },
+                "minTokensForSummary": {
+                    "type": "number",
+                    "description": "Minimum token count for a directory/file group to be eligible for summarization. Groups below this threshold preserve raw diffs to maintain detail.",
+                    "default": 400
+                },
+                "maxFileTokens": {
+                    "type": "number",
+                    "description": "Maximum tokens allowed for a single file diff before it gets pre-summarized. Prevents large files from biasing the overall summary. If not set, defaults to 25% of tokenLimit.",
+                    "default": "undefined (uses 0.25 * tokenLimit)"
+                },
                 "authentication": {
                     "anyOf": [
                         {
@@ -8217,6 +8257,114 @@ async function summarize(documents, { chain, textSplitter, options }) {
     return res.text && res.text.trim();
 }
+/**
+ * Summarize a single file diff that exceeds the token threshold.
+ *
+ * Wraps the diff in a single document, runs it through `summarize`, and returns a
+ * copy of `fileDiff` whose `diff` is the summary text and whose `tokenCount` is
+ * recomputed with `tokenizer`.
+ *
+ * The original `fileDiff` is returned unchanged when summarization throws or when
+ * it yields no text, so a failed summary never erases the file's changes.
+ *
+ * @param fileDiff - File diff entry; `file`, `diff` and `summary` are read here.
+ * @param context - `chain` and `textSplitter` are forwarded to `summarize`;
+ * `tokenizer` is called with the summary text to obtain the new token count.
+ * @returns The summarized copy, or the original `fileDiff` on failure or empty output.
+ */
+async function summarizeFileDiff(fileDiff, { chain, textSplitter, tokenizer }) {
+    try {
+        const fileSummary = await summarize([
+            {
+                pageContent: fileDiff.diff,
+                metadata: {
+                    file: fileDiff.file,
+                    summary: fileDiff.summary,
+                },
+            },
+        ], {
+            chain,
+            textSplitter,
+            options: {
+                returnIntermediateSteps: false,
+            },
+        });
+        // `summarize` can resolve to an empty string or undefined; replacing the
+        // diff with that would silently discard the file's changes.
+        if (!fileSummary) {
+            console.error(`Empty summary returned for file ${fileDiff.file}; keeping original diff.`);
+            return fileDiff;
+        }
+        const newTokenCount = tokenizer(fileSummary);
+        return {
+            ...fileDiff,
+            diff: fileSummary,
+            tokenCount: newTokenCount,
+        };
+    }
+    catch (error) {
+        // On error, return original diff unchanged
+        console.error(`Failed to summarize file ${fileDiff.file}:`, error);
+        return fileDiff;
+    }
+}
+/**
+ * Process items in consecutive waves to respect a concurrency limit.
+ *
+ * Each wave is started with `Promise.all` and must settle completely before the
+ * next wave begins. Results are returned in the same order as `items`.
+ *
+ * @param items - Items to process.
+ * @param processor - Async callback invoked through `Array.prototype.map` for each wave.
+ * @param maxConcurrent - Maximum number of items per wave. Values that are missing,
+ * not a number, or below 1 fall back to 1 so the loop always makes progress and no
+ * item is dropped; fractional values are rounded down.
+ * @returns The processor results for every item, in input order.
+ */
+async function processInWaves(items, processor, maxConcurrent) {
+    // A step of 0 (or a negative step) would never advance the loop, and a NaN
+    // step would end it after an empty first wave, so clamp the wave size.
+    const requested = Math.floor(Number(maxConcurrent));
+    const waveSize = requested >= 1 ? requested : 1;
+    const results = [];
+    for (let i = 0; i < items.length; i += waveSize) {
+        const wave = items.slice(i, i + waveSize);
+        const waveResults = await Promise.all(wave.map(processor));
+        results.push(...waveResults);
+    }
+    return results;
+}
+/**
+ * Replace oversized file diffs with summaries before directory grouping.
+ *
+ * A diff is selected when its `tokenCount` is greater than `maxFileTokens` and at
+ * least `minTokensForSummary`. Selected diffs are summarized through
+ * `processInWaves` / `summarizeFileDiff`; all other entries are passed through
+ * untouched. The input array is not modified.
+ *
+ * @param diffs - Array of file diffs to process
+ * @param options - Thresholds (`maxFileTokens`, `minTokensForSummary`), the wave
+ * size (`maxConcurrent`), plus `tokenizer`, `logger`, `chain` and `textSplitter`
+ * @returns A new array, same length and order as `diffs`, with large files summarized
+ */
+async function summarizeLargeFiles(diffs, options) {
+    const { maxFileTokens, minTokensForSummary, maxConcurrent, tokenizer, logger, chain, textSplitter } = options;
+    const output = Array.from(diffs);
+    // Pair each diff with its position so the summary can be written back in place.
+    const oversized = diffs
+        .map((diff, index) => ({ index, diff }))
+        .filter(({ diff }) => diff.tokenCount > maxFileTokens && diff.tokenCount >= minTokensForSummary);
+    if (!oversized.length) {
+        return output;
+    }
+    logger.verbose(`Pre-summarizing ${oversized.length} large file(s)...`, { color: 'blue' });
+    const summarizeEntry = async ({ diff }) => summarizeFileDiff(diff, { chain, textSplitter, tokenizer });
+    const condensed = await processInWaves(oversized, summarizeEntry, maxConcurrent);
+    for (let position = 0; position < condensed.length; position++) {
+        const condensedDiff = condensed[position];
+        const target = oversized[position].index;
+        const before = output[target].tokenCount;
+        const after = condensedDiff.tokenCount;
+        logger.verbose(` - ${condensedDiff.file}: ${before} -> ${after} tokens`, { color: 'magenta' });
+        output[target] = condensedDiff;
+    }
+    return output;
+}
+/**
+ * Run large-file summarization over every diff in a DiffNode tree.
+ *
+ * Diffs are gathered depth-first (a node's own diffs, then each child in order),
+ * handed to `summarizeLargeFiles`, and the tree is then rebuilt with each diff
+ * swapped for its processed counterpart, matched by `file`. The input tree is
+ * not mutated; a fresh node object is created for every node.
+ *
+ * @param rootNode - Root of the tree; nodes expose `path`, `diffs` and `children`
+ * @param options - Passed through unchanged to `summarizeLargeFiles`
+ * @returns A new tree with the same shape and processed diffs
+ */
+async function preprocessLargeFiles(rootNode, options) {
+    // Pre-order flattening: own diffs first, then descendants.
+    const gather = (node) => [
+        ...node.diffs,
+        ...node.children.flatMap((child) => gather(child)),
+    ];
+    const processed = await summarizeLargeFiles(gather(rootNode), options);
+    // Index processed diffs by file path; a later duplicate overrides an earlier one.
+    const byFile = new Map(processed.map((diff) => [diff.file, diff]));
+    const rebuild = (node) => ({
+        path: node.path,
+        diffs: node.diffs.map((diff) => byFile.get(diff.file) || diff),
+        children: node.children.map((child) => rebuild(child)),
+    });
+    return rebuild(rootNode);
+}
 /**
  * Create groups from a given node info.
  * @param {DiffNode} node - The node info to start grouping.
@@ -8269,6 +8417,16 @@ async function summarizeDirectoryDiff(directory, { chain, textSplitter, tokenize
         return directory;
     }
 }
+/**
+ * Default output formatter for directory diffs.
+ *
+ * TODO: Future improvements to consider:
+ * - Hierarchical output showing file -> directory -> overall summary
+ * - Configurable verbosity levels (compact, standard, detailed)
+ * - Machine-readable format option (JSON) for programmatic use
+ * - Semantic grouping by change type (added/modified/deleted) or feature area
+ * - Visual diff indicators showing magnitude of changes
+ */
 const defaultOutputCallback = (group) => {
     let output = `
 -------\n* changes in "/${group.path}"\n\n`;
@@ -8280,41 +8438,124 @@ const defaultOutputCallback = (group) => {
     }
     return output;
 };
-async function summarizeDiffs(rootDiffNode, { tokenizer, logger, maxTokens = 2048, textSplitter, chain, handleOutput = defaultOutputCallback, }) {
-    const queue = new pQueue({ concurrency: 8 });
+/**
+ * Process directory summarization in waves to respect concurrency limits
+ * while maintaining predictable behavior.
+ *
+ * Directories are visited largest-first (by `tokenCount`). While the running
+ * total is above `maxTokens`, up to `maxConcurrent` eligible directories are
+ * summarized in parallel with `summarizeDirectoryDiff`, and the total is reduced
+ * by the tokens each summary saved. A directory is skipped when its original
+ * token count is below `minTokensForSummary` or when it already has a truthy
+ * `summary`. The cursor only moves forward, so each directory is considered at
+ * most once. The input array is copied, not mutated.
+ *
+ * Note: if `maxConcurrent` is 0 or undefined, no candidate can be selected and
+ * the loop ends on the first pass without summarizing anything.
+ *
+ * @param directories - Directory diff groups to consider
+ * @param options - `totalTokenCount` (starting total), `maxTokens`,
+ * `minTokensForSummary`, `maxConcurrent`, `logger`, `chain`, `textSplitter`, `tokenizer`
+ * @returns `{ directories, totalTokenCount }`: the updated groups in their
+ * original order and the final running total
+ */
+async function summarizeInWaves(directories, options) {
+    const { totalTokenCount: initialTotal, maxTokens, minTokensForSummary, maxConcurrent, logger, chain, textSplitter, tokenizer, } = options;
+    let totalTokenCount = initialTotal;
+    const results = [...directories];
+    // Create sorted indices by token count (descending) for prioritized processing
+    const sortedIndices = directories
+        .map((d, i) => ({ index: i, tokens: d.tokenCount }))
+        .sort((a, b) => b.tokens - a.tokens);
+    let cursor = 0;
+    while (totalTokenCount > maxTokens && cursor < sortedIndices.length) {
+        // Select wave candidates: directories that meet or exceed minTokensForSummary
+        const wave = [];
+        for (let i = cursor; i < sortedIndices.length && wave.length < maxConcurrent; i++) {
+            const { index, tokens } = sortedIndices[i];
+            // Skip directories below the minimum threshold
+            if (tokens < minTokensForSummary) {
+                cursor = i + 1;
+                continue;
+            }
+            // Skip directories that have already been summarized
+            if (results[index].summary) {
+                cursor = i + 1;
+                continue;
+            }
+            wave.push(index);
+            cursor = i + 1;
+        }
+        // No more eligible candidates
+        if (wave.length === 0) {
+            break;
+        }
+        logger.verbose(`\nProcessing wave of ${wave.length} directories...`, { color: 'blue' });
+        // Process wave in parallel
+        const waveResults = await Promise.all(wave.map((idx) => summarizeDirectoryDiff(results[idx], { chain, textSplitter, tokenizer })));
+        // Update results and recalculate total
+        waveResults.forEach((result, i) => {
+            const idx = wave[i];
+            const originalTokens = results[idx].tokenCount;
+            const newTokens = result.tokenCount;
+            const reduction = originalTokens - newTokens;
+            totalTokenCount -= reduction;
+            results[idx] = result;
+            logger.verbose(` • Summarized "/${result.path}": ${originalTokens} -> ${newTokens} tokens`, {
+                color: 'magenta',
+            });
+        });
+        logger.verbose(`Total token count: ${totalTokenCount}`, {
+            color: totalTokenCount > maxTokens ? 'yellow' : 'green',
+        });
+        // Check if we're now under budget
+        if (totalTokenCount <= maxTokens) {
+            logger.verbose(`Under token budget, stopping summarization.`, { color: 'green' });
+            break;
+        }
+    }
+    return { directories: results, totalTokenCount };
+}
+/**
+ * Summarize diffs using a three-phase approach:
+ *
+ * Phase 1: Pre-process large files to prevent any single file from dominating
+ * Phase 2: Group diffs by directory and assess total token count
+ * Phase 3: Wave-based parallel summarization until under budget
+ *
+ * This approach ensures:
+ * - Large files don't bias the summary
+ * - Small changes preserve their detail (minTokensForSummary threshold)
+ * - Efficient parallel processing with predictable behavior
+ * - Early exit when under token budget
+ */
+async function summarizeDiffs(rootDiffNode, { tokenizer, logger, maxTokens = 2048, minTokensForSummary = 400, maxFileTokens, maxConcurrent = 6, textSplitter, chain, handleOutput = defaultOutputCallback, }) {
+    // Calculate maxFileTokens as 25% of maxTokens if not specified
+    const effectiveMaxFileTokens = maxFileTokens ?? Math.floor(maxTokens * 0.25);
+    // PHASE 1: Pre-process large files
+    logger.startTimer().startSpinner(`Pre-processing large files...`, { color: 'blue' });
+    const preprocessedNode = await preprocessLargeFiles(rootDiffNode, {
+        maxFileTokens: effectiveMaxFileTokens,
+        minTokensForSummary,
+        maxConcurrent,
+        tokenizer,
+        logger,
+        chain,
+        textSplitter,
+    });
+    logger.stopSpinner('Files pre-processed').stopTimer();
+    // PHASE 2: Directory grouping & assessment
     logger.startTimer().startSpinner(`Organizing Diffs...`, { color: 'blue' });
-    const directoryDiffs = createDirectoryDiffs(rootDiffNode);
-    // Sort by token count descending
+    const directoryDiffs = createDirectoryDiffs(preprocessedNode);
+    // Sort by token count descending for consistent output ordering
     directoryDiffs.sort((a, b) => b.tokenCount - a.tokenCount);
-    let totalTokenCount = directoryDiffs.reduce((sum, group) => sum + group.tokenCount, 0);
+    const totalTokenCount = directoryDiffs.reduce((sum, group) => sum + group.tokenCount, 0);
     logger.stopSpinner('Diffs Organized').stopTimer();
-    logger.startSpinner(`Consolidating Diffs`, { color: 'blue' });
-    const processingTasks = directoryDiffs.map((group, i) => {
-        return queue.add(async () => {
-            // If the diff token count is already less than the average req, we can skip summarizing.
-            const isLessThanAvgTokenReq = group.tokenCount <= maxTokens / directoryDiffs.length;
-            if (totalTokenCount <= maxTokens || isLessThanAvgTokenReq) {
-                return group;
-            }
-            group = await summarizeDirectoryDiff(group, {
-                chain,
-                textSplitter,
-                tokenizer,
-            });
-            // We need to subtract the old token count and add the new one
-            totalTokenCount = totalTokenCount - directoryDiffs[i].tokenCount + group.tokenCount;
-            directoryDiffs[i] = group;
-            logger
-                .verbose(`\n • Summarized diffs in "/${group.path}" `, { color: 'blue' })
-                .verbose(`\nTotal token count: ${totalTokenCount}`, {
-                color: totalTokenCount > maxTokens ? 'yellow' : 'green',
-            });
-            return group;
-        }, { priority: group.tokenCount });
+    logger.verbose(`Total token count: ${totalTokenCount}, max allowed: ${maxTokens}`, {
+        color: totalTokenCount > maxTokens ? 'yellow' : 'green',
+    });
+    // Early exit if already under budget
+    if (totalTokenCount <= maxTokens) {
+        logger.verbose(`Already under token budget, skipping summarization.`, { color: 'green' });
+        return directoryDiffs.map(handleOutput).join('');
+    }
+    // PHASE 3: Wave-based summarization
+    logger.startTimer().startSpinner(`Consolidating Diffs...`, { color: 'blue' });
+    const { directories: summarizedDiffs } = await summarizeInWaves(directoryDiffs, {
+        totalTokenCount,
+        maxTokens,
+        minTokensForSummary,
+        maxConcurrent,
+        logger,
+        chain,
+        textSplitter,
+        tokenizer,
     });
-    await Promise.all(processingTasks);
-    logger.stopSpinner(`Summarized Diffs`);
-    return directoryDiffs.map(handleOutput).join('');
+    logger.stopSpinner(`Diffs Consolidated`).stopTimer();
+    return summarizedDiffs.map(handleOutput).join('');
 }
 /**
@@ -11314,7 +11555,7 @@ for (var i = 0; i < 256; i++) {
   simpleEscapeMap[i] = simpleEscapeSequence(i);
 }
-async function fileChangeParser({ changes, commit, options: { tokenizer, git, llm: model, logger, maxTokens }, }) {
+async function fileChangeParser({ changes, commit, options: { tokenizer, git, llm: model, logger, maxTokens, minTokensForSummary, maxFileTokens, maxConcurrent, }, }) {
     const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 10000, chunkOverlap: 250 });
     const summarizationChain = loadSummarizationChain(model, {
         type: 'map_reduce',
@@ -11328,11 +11569,17 @@ async function fileChangeParser({ changes, commit, options: { tokenizer, git, ll
     logger.startTimer().startSpinner(`Collecting Diffs...\n`, { color: 'blue' });
     const diffs = await collectDiffs(rootTreeNode, (path) => getDiff(path, commit, { git, logger }), tokenizer, logger);
     logger.stopSpinner('Diffs Collected').stopTimer();
-    // Summarize diffs
+    // Summarize diffs using three-phase approach:
+    // 1. Pre-process large files to prevent bias
+    // 2. Group by directory and assess token count
+    // 3. Wave-based parallel summarization until under budget
     logger.startTimer();
     const summary = await summarizeDiffs(diffs, {
         tokenizer,
-        maxTokens: maxTokens || 4096,
+        maxTokens: maxTokens || 2048,
+        minTokensForSummary,
+        maxFileTokens,
+        maxConcurrent,
         textSplitter,
         chain: summarizationChain,
         logger,
@@ -11635,7 +11882,16 @@ const handler$3 = async (argv, logger) => {
         return await fileChangeParser({
             changes,
             commit: '--staged',
-            options: { tokenizer, git, llm, logger, maxTokens: config.service.tokenLimit },
+            options: {
+                tokenizer,
+                git,
+                llm,
+                logger,
+                maxTokens: config.service.tokenLimit,
+                minTokensForSummary: config.service.minTokensForSummary,
+                maxFileTokens: config.service.maxFileTokens,
+                maxConcurrent: config.service.maxConcurrent,
+            },
         });
     }
     const commitMsg = await generateAndReviewLoop({

package/dist/index.js CHANGED Viewed

@@ -26,7 +26,6 @@ var outputs = require('@langchain/core/outputs');
 var manager = require('@langchain/core/callbacks/manager');
 require('@langchain/core/utils/json_patch');
 var simpleGit = require('simple-git');
-var pQueue = require('p-queue');
 var documents = require('@langchain/core/documents');
 var diff = require('diff');
 require('@langchain/core/messages');
@@ -69,7 +68,7 @@ var readline__namespace = /*#__PURE__*/_interopNamespaceDefault(readline);
 /**
  * Current build version from package.json
  */
-const BUILD_VERSION = "0.24.0";
+const BUILD_VERSION = "0.25.0";
 const isInteractive = (config) => {
     return config?.mode === 'interactive' || !!config?.interactive;
@@ -184,6 +183,17 @@ async function updateFileSection({ filePath, startComment, endComment, getNewCon
     fs.writeFileSync(filePath, newLines.join('\n'));
 }
+/**
+ * Prompt template for summarizing code diffs.
+ *
+ * TODO: Future improvements to consider:
+ * - Separate prompts for file-level vs directory-level summarization
+ * - Include file type context (e.g., "This is a React component", "This is a test file")
+ * - Add guidance for preserving semantic meaning of changes
+ * - Consider change type (added/modified/deleted) in prompt for better context
+ * - Include hints about the programming language for more idiomatic summaries
+ * - Add support for custom user-provided summarization prompts via config
+ */
 const template$5 = `GOAL: Use functional abstractions to summarize the following text
 RULES: Avoid phrases like  "this change", "this code", or "this function" etc. Instead refer to the function, variable, or class by name.
@@ -1061,6 +1071,16 @@ const schema$1 = {
                     "description": "The maximum number of requests to make concurrently.",
                     "default": 6
                 },
+                "minTokensForSummary": {
+                    "type": "number",
+                    "description": "Minimum token count for a directory/file group to be eligible for summarization. Groups below this threshold preserve raw diffs to maintain detail.",
+                    "default": 400
+                },
+                "maxFileTokens": {
+                    "type": "number",
+                    "description": "Maximum tokens allowed for a single file diff before it gets pre-summarized. Prevents large files from biasing the overall summary. If not set, defaults to 25% of tokenLimit.",
+                    "default": "undefined (uses 0.25 * tokenLimit)"
+                },
                 "authentication": {
                     "anyOf": [
                         {
@@ -1821,6 +1841,16 @@ const schema$1 = {
                     "description": "The maximum number of requests to make concurrently.",
                     "default": 6
                 },
+                "minTokensForSummary": {
+                    "type": "number",
+                    "description": "Minimum token count for a directory/file group to be eligible for summarization. Groups below this threshold preserve raw diffs to maintain detail.",
+                    "default": 400
+                },
+                "maxFileTokens": {
+                    "type": "number",
+                    "description": "Maximum tokens allowed for a single file diff before it gets pre-summarized. Prevents large files from biasing the overall summary. If not set, defaults to 25% of tokenLimit.",
+                    "default": "undefined (uses 0.25 * tokenLimit)"
+                },
                 "authentication": {
                     "anyOf": [
                         {
@@ -1972,6 +2002,16 @@ const schema$1 = {
                     "description": "The maximum number of requests to make concurrently.",
                     "default": 6
                 },
+                "minTokensForSummary": {
+                    "type": "number",
+                    "description": "Minimum token count for a directory/file group to be eligible for summarization. Groups below this threshold preserve raw diffs to maintain detail.",
+                    "default": 400
+                },
+                "maxFileTokens": {
+                    "type": "number",
+                    "description": "Maximum tokens allowed for a single file diff before it gets pre-summarized. Prevents large files from biasing the overall summary. If not set, defaults to 25% of tokenLimit.",
+                    "default": "undefined (uses 0.25 * tokenLimit)"
+                },
                 "authentication": {
                     "anyOf": [
                         {
@@ -8239,6 +8279,114 @@ async function summarize(documents$1, { chain, textSplitter, options }) {
     return res.text && res.text.trim();
 }
+/**
+ * Summarize a single file diff that exceeds the token threshold.
+ *
+ * Wraps the diff in a single document, runs it through `summarize`, and returns a
+ * copy of `fileDiff` whose `diff` is the summary text and whose `tokenCount` is
+ * recomputed with `tokenizer`.
+ *
+ * The original `fileDiff` is returned unchanged when summarization throws or when
+ * it yields no text, so a failed summary never erases the file's changes.
+ *
+ * @param fileDiff - File diff entry; `file`, `diff` and `summary` are read here.
+ * @param context - `chain` and `textSplitter` are forwarded to `summarize`;
+ * `tokenizer` is called with the summary text to obtain the new token count.
+ * @returns The summarized copy, or the original `fileDiff` on failure or empty output.
+ */
+async function summarizeFileDiff(fileDiff, { chain, textSplitter, tokenizer }) {
+    try {
+        const fileSummary = await summarize([
+            {
+                pageContent: fileDiff.diff,
+                metadata: {
+                    file: fileDiff.file,
+                    summary: fileDiff.summary,
+                },
+            },
+        ], {
+            chain,
+            textSplitter,
+            options: {
+                returnIntermediateSteps: false,
+            },
+        });
+        // `summarize` can resolve to an empty string or undefined; replacing the
+        // diff with that would silently discard the file's changes.
+        if (!fileSummary) {
+            console.error(`Empty summary returned for file ${fileDiff.file}; keeping original diff.`);
+            return fileDiff;
+        }
+        const newTokenCount = tokenizer(fileSummary);
+        return {
+            ...fileDiff,
+            diff: fileSummary,
+            tokenCount: newTokenCount,
+        };
+    }
+    catch (error) {
+        // On error, return original diff unchanged
+        console.error(`Failed to summarize file ${fileDiff.file}:`, error);
+        return fileDiff;
+    }
+}
+/**
+ * Process items in consecutive waves to respect a concurrency limit.
+ *
+ * Each wave is started with `Promise.all` and must settle completely before the
+ * next wave begins. Results are returned in the same order as `items`.
+ *
+ * @param items - Items to process.
+ * @param processor - Async callback invoked through `Array.prototype.map` for each wave.
+ * @param maxConcurrent - Maximum number of items per wave. Values that are missing,
+ * not a number, or below 1 fall back to 1 so the loop always makes progress and no
+ * item is dropped; fractional values are rounded down.
+ * @returns The processor results for every item, in input order.
+ */
+async function processInWaves(items, processor, maxConcurrent) {
+    // A step of 0 (or a negative step) would never advance the loop, and a NaN
+    // step would end it after an empty first wave, so clamp the wave size.
+    const requested = Math.floor(Number(maxConcurrent));
+    const waveSize = requested >= 1 ? requested : 1;
+    const results = [];
+    for (let i = 0; i < items.length; i += waveSize) {
+        const wave = items.slice(i, i + waveSize);
+        const waveResults = await Promise.all(wave.map(processor));
+        results.push(...waveResults);
+    }
+    return results;
+}
+/**
+ * Replace oversized file diffs with summaries before directory grouping.
+ *
+ * A diff is selected when its `tokenCount` is greater than `maxFileTokens` and at
+ * least `minTokensForSummary`. Selected diffs are summarized through
+ * `processInWaves` / `summarizeFileDiff`; all other entries are passed through
+ * untouched. The input array is not modified.
+ *
+ * @param diffs - Array of file diffs to process
+ * @param options - Thresholds (`maxFileTokens`, `minTokensForSummary`), the wave
+ * size (`maxConcurrent`), plus `tokenizer`, `logger`, `chain` and `textSplitter`
+ * @returns A new array, same length and order as `diffs`, with large files summarized
+ */
+async function summarizeLargeFiles(diffs, options) {
+    const { maxFileTokens, minTokensForSummary, maxConcurrent, tokenizer, logger, chain, textSplitter } = options;
+    const output = Array.from(diffs);
+    // Pair each diff with its position so the summary can be written back in place.
+    const oversized = diffs
+        .map((diff, index) => ({ index, diff }))
+        .filter(({ diff }) => diff.tokenCount > maxFileTokens && diff.tokenCount >= minTokensForSummary);
+    if (!oversized.length) {
+        return output;
+    }
+    logger.verbose(`Pre-summarizing ${oversized.length} large file(s)...`, { color: 'blue' });
+    const summarizeEntry = async ({ diff }) => summarizeFileDiff(diff, { chain, textSplitter, tokenizer });
+    const condensed = await processInWaves(oversized, summarizeEntry, maxConcurrent);
+    for (let position = 0; position < condensed.length; position++) {
+        const condensedDiff = condensed[position];
+        const target = oversized[position].index;
+        const before = output[target].tokenCount;
+        const after = condensedDiff.tokenCount;
+        logger.verbose(` - ${condensedDiff.file}: ${before} -> ${after} tokens`, { color: 'magenta' });
+        output[target] = condensedDiff;
+    }
+    return output;
+}
+/**
+ * Run large-file summarization over every diff in a DiffNode tree.
+ *
+ * Diffs are gathered depth-first (a node's own diffs, then each child in order),
+ * handed to `summarizeLargeFiles`, and the tree is then rebuilt with each diff
+ * swapped for its processed counterpart, matched by `file`. The input tree is
+ * not mutated; a fresh node object is created for every node.
+ *
+ * @param rootNode - Root of the tree; nodes expose `path`, `diffs` and `children`
+ * @param options - Passed through unchanged to `summarizeLargeFiles`
+ * @returns A new tree with the same shape and processed diffs
+ */
+async function preprocessLargeFiles(rootNode, options) {
+    // Pre-order flattening: own diffs first, then descendants.
+    const gather = (node) => [
+        ...node.diffs,
+        ...node.children.flatMap((child) => gather(child)),
+    ];
+    const processed = await summarizeLargeFiles(gather(rootNode), options);
+    // Index processed diffs by file path; a later duplicate overrides an earlier one.
+    const byFile = new Map(processed.map((diff) => [diff.file, diff]));
+    const rebuild = (node) => ({
+        path: node.path,
+        diffs: node.diffs.map((diff) => byFile.get(diff.file) || diff),
+        children: node.children.map((child) => rebuild(child)),
+    });
+    return rebuild(rootNode);
+}
 /**
  * Create groups from a given node info.
  * @param {DiffNode} node - The node info to start grouping.
@@ -8291,6 +8439,16 @@ async function summarizeDirectoryDiff(directory, { chain, textSplitter, tokenize
         return directory;
     }
 }
+/**
+ * Default output formatter for directory diffs.
+ *
+ * TODO: Future improvements to consider:
+ * - Hierarchical output showing file -> directory -> overall summary
+ * - Configurable verbosity levels (compact, standard, detailed)
+ * - Machine-readable format option (JSON) for programmatic use
+ * - Semantic grouping by change type (added/modified/deleted) or feature area
+ * - Visual diff indicators showing magnitude of changes
+ */
 const defaultOutputCallback = (group) => {
     let output = `
 -------\n* changes in "/${group.path}"\n\n`;
@@ -8302,41 +8460,124 @@ const defaultOutputCallback = (group) => {
     }
     return output;
 };
-async function summarizeDiffs(rootDiffNode, { tokenizer, logger, maxTokens = 2048, textSplitter, chain, handleOutput = defaultOutputCallback, }) {
-    const queue = new pQueue({ concurrency: 8 });
+/**
+ * Summarize directories in waves of at most `maxConcurrent`, largest first,
+ * until the running total fits within `maxTokens` or no candidates remain.
+ */
+async function summarizeInWaves(directories, options) {
+    const { totalTokenCount: initialTotal, maxTokens, minTokensForSummary, maxConcurrent, logger, chain, textSplitter, tokenizer, } = options;
+    let totalTokenCount = initialTotal;
+    const results = [...directories];
+    // Create sorted indices by token count (descending) for prioritized processing
+    const sortedIndices = directories
+        .map((d, i) => ({ index: i, tokens: d.tokenCount }))
+        .sort((a, b) => b.tokens - a.tokens);
+    let cursor = 0;
+    while (totalTokenCount > maxTokens && cursor < sortedIndices.length) {
+        // Select wave candidates: not-yet-summarized directories at or above minTokensForSummary
+        const wave = [];
+        for (let i = cursor; i < sortedIndices.length && wave.length < maxConcurrent; i++) {
+            const { index, tokens } = sortedIndices[i];
+            // Skip directories below the minimum threshold
+            if (tokens < minTokensForSummary) {
+                cursor = i + 1;
+                continue;
+            }
+            // Skip directories that have already been summarized
+            if (results[index].summary) {
+                cursor = i + 1;
+                continue;
+            }
+            wave.push(index);
+            cursor = i + 1;
+        }
+        // No more eligible candidates
+        if (wave.length === 0) {
+            break;
+        }
+        logger.verbose(`\nProcessing wave of ${wave.length} directories...`, { color: 'blue' });
+        // Process wave in parallel
+        const waveResults = await Promise.all(wave.map((idx) => summarizeDirectoryDiff(results[idx], { chain, textSplitter, tokenizer })));
+        // Update results and recalculate total
+        waveResults.forEach((result, i) => {
+            const idx = wave[i];
+            const originalTokens = results[idx].tokenCount;
+            const newTokens = result.tokenCount;
+            const reduction = originalTokens - newTokens;
+            totalTokenCount -= reduction;
+            results[idx] = result;
+            logger.verbose(` • Summarized "/${result.path}": ${originalTokens} -> ${newTokens} tokens`, {
+                color: 'magenta',
+            });
+        });
+        logger.verbose(`Total token count: ${totalTokenCount}`, {
+            color: totalTokenCount > maxTokens ? 'yellow' : 'green',
+        });
+        // Check if we're now under budget
+        if (totalTokenCount <= maxTokens) {
+            logger.verbose(`Under token budget, stopping summarization.`, { color: 'green' });
+            break;
+        }
+    }
+    return { directories: results, totalTokenCount };
+}
+/**
+ * Summarize diffs using a three-phase approach:
+ *
+ * Phase 1: Pre-process large files to prevent any single file from dominating
+ * Phase 2: Group diffs by directory and assess total token count
+ * Phase 3: Wave-based parallel summarization until under budget
+ *
+ * This approach ensures:
+ * - Large files don't bias the summary
+ * - Small changes preserve their detail (minTokensForSummary threshold)
+ * - Efficient parallel processing with predictable behavior
+ * - Early exit when under token budget
+ */
+async function summarizeDiffs(rootDiffNode, { tokenizer, logger, maxTokens = 2048, minTokensForSummary = 400, maxFileTokens, maxConcurrent = 6, textSplitter, chain, handleOutput = defaultOutputCallback, }) {
+    // Calculate maxFileTokens as 25% of maxTokens if not specified
+    const effectiveMaxFileTokens = maxFileTokens ?? Math.floor(maxTokens * 0.25);
+    // PHASE 1: Pre-process large files
+    logger.startTimer().startSpinner(`Pre-processing large files...`, { color: 'blue' });
+    const preprocessedNode = await preprocessLargeFiles(rootDiffNode, {
+        maxFileTokens: effectiveMaxFileTokens,
+        minTokensForSummary,
+        maxConcurrent,
+        tokenizer,
+        logger,
+        chain,
+        textSplitter,
+    });
+    logger.stopSpinner('Files pre-processed').stopTimer();
+    // PHASE 2: Directory grouping & assessment
     logger.startTimer().startSpinner(`Organizing Diffs...`, { color: 'blue' });
-    const directoryDiffs = createDirectoryDiffs(rootDiffNode);
-    // Sort by token count descending
+    const directoryDiffs = createDirectoryDiffs(preprocessedNode);
+    // Sort by token count descending for consistent output ordering
     directoryDiffs.sort((a, b) => b.tokenCount - a.tokenCount);
-    let totalTokenCount = directoryDiffs.reduce((sum, group) => sum + group.tokenCount, 0);
+    const totalTokenCount = directoryDiffs.reduce((sum, group) => sum + group.tokenCount, 0);
     logger.stopSpinner('Diffs Organized').stopTimer();
-    logger.startSpinner(`Consolidating Diffs`, { color: 'blue' });
-    const processingTasks = directoryDiffs.map((group, i) => {
-        return queue.add(async () => {
-            // If the diff token count is already less than the average req, we can skip summarizing.
-            const isLessThanAvgTokenReq = group.tokenCount <= maxTokens / directoryDiffs.length;
-            if (totalTokenCount <= maxTokens || isLessThanAvgTokenReq) {
-                return group;
-            }
-            group = await summarizeDirectoryDiff(group, {
-                chain,
-                textSplitter,
-                tokenizer,
-            });
-            // We need to subtract the old token count and add the new one
-            totalTokenCount = totalTokenCount - directoryDiffs[i].tokenCount + group.tokenCount;
-            directoryDiffs[i] = group;
-            logger
-                .verbose(`\n • Summarized diffs in "/${group.path}" `, { color: 'blue' })
-                .verbose(`\nTotal token count: ${totalTokenCount}`, {
-                color: totalTokenCount > maxTokens ? 'yellow' : 'green',
-            });
-            return group;
-        }, { priority: group.tokenCount });
+    logger.verbose(`Total token count: ${totalTokenCount}, max allowed: ${maxTokens}`, {
+        color: totalTokenCount > maxTokens ? 'yellow' : 'green',
+    });
+    // Early exit if already under budget
+    if (totalTokenCount <= maxTokens) {
+        logger.verbose(`Already under token budget, skipping summarization.`, { color: 'green' });
+        return directoryDiffs.map(handleOutput).join('');
+    }
+    // PHASE 3: Wave-based summarization
+    logger.startTimer().startSpinner(`Consolidating Diffs...`, { color: 'blue' });
+    const { directories: summarizedDiffs } = await summarizeInWaves(directoryDiffs, {
+        totalTokenCount,
+        maxTokens,
+        minTokensForSummary,
+        maxConcurrent,
+        logger,
+        chain,
+        textSplitter,
+        tokenizer,
     });
-    await Promise.all(processingTasks);
-    logger.stopSpinner(`Summarized Diffs`);
-    return directoryDiffs.map(handleOutput).join('');
+    logger.stopSpinner(`Diffs Consolidated`).stopTimer();
+    return summarizedDiffs.map(handleOutput).join('');
 }
 /**
@@ -11336,7 +11577,7 @@ for (var i = 0; i < 256; i++) {
   simpleEscapeMap[i] = simpleEscapeSequence(i);
 }
-async function fileChangeParser({ changes, commit, options: { tokenizer, git, llm: model, logger, maxTokens }, }) {
+async function fileChangeParser({ changes, commit, options: { tokenizer, git, llm: model, logger, maxTokens, minTokensForSummary, maxFileTokens, maxConcurrent, }, }) {
     const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 10000, chunkOverlap: 250 });
     const summarizationChain = loadSummarizationChain(model, {
         type: 'map_reduce',
@@ -11350,11 +11591,17 @@ async function fileChangeParser({ changes, commit, options: { tokenizer, git, ll
     logger.startTimer().startSpinner(`Collecting Diffs...\n`, { color: 'blue' });
     const diffs = await collectDiffs(rootTreeNode, (path) => getDiff(path, commit, { git, logger }), tokenizer, logger);
     logger.stopSpinner('Diffs Collected').stopTimer();
-    // Summarize diffs
+    // Summarize diffs using three-phase approach:
+    // 1. Pre-process large files to prevent bias
+    // 2. Group by directory and assess token count
+    // 3. Wave-based parallel summarization until under budget
     logger.startTimer();
     const summary = await summarizeDiffs(diffs, {
         tokenizer,
-        maxTokens: maxTokens || 4096,
+        maxTokens: maxTokens || 2048,
+        minTokensForSummary,
+        maxFileTokens,
+        maxConcurrent,
         textSplitter,
         chain: summarizationChain,
         logger,
@@ -11657,7 +11904,16 @@ const handler$3 = async (argv, logger) => {
         return await fileChangeParser({
             changes,
             commit: '--staged',
-            options: { tokenizer, git, llm, logger, maxTokens: config.service.tokenLimit },
+            options: {
+                tokenizer,
+                git,
+                llm,
+                logger,
+                maxTokens: config.service.tokenLimit,
+                minTokensForSummary: config.service.minTokensForSummary,
+                maxFileTokens: config.service.maxFileTokens,
+                maxConcurrent: config.service.maxConcurrent,
+            },
         });
     }
     const commitMsg = await generateAndReviewLoop({

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "git-coco",
-  "version": "0.24.0",
+  "version": "0.25.0",
   "description": "zero-effort git commits with coco.",
   "author": "gfargo <ghfargo@gmail.com>",
   "license": "MIT",
@@ -77,20 +77,20 @@
     "@commitlint/core": "^19.8.0",
     "@inquirer/prompts": "3.3.0",
     "@langchain/anthropic": "^0.3.14",
-    "@langchain/community": "^0.3.32",
-    "@langchain/core": "^0.3.40",
+    "@langchain/community": "^0.3.58",
+    "@langchain/core": "^0.3.80",
     "@langchain/ollama": "^0.2.0",
     "@langchain/openai": "^0.6.7",
     "ajv": "^8.16.0",
     "chalk": "4.1.2",
-    "diff": "8.0.2",
+    "diff": "8.0.3",
     "ini": "5.0.0",
     "minimatch": "^9.0.5",
     "ora": "5.4.1",
     "p-queue": "5.0.0",
     "performance-now": "2.1.0",
     "pretty-ms": "7.0.1",
-    "simple-git": "3.28.0",
+    "simple-git": "3.30.0",
     "tiktoken": "^1.0.21",
     "yargs": "17.7.2"
   },