npm - task-summary-extractor - Versions diffs - 9.7.0 → 9.8.0 - Mend

task-summary-extractor 9.7.0 → 9.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/README.md +2 -1
package/package.json +1 -1
package/src/modes/deep-summary.js +37 -0
package/src/modes/focused-reanalysis.js +16 -1
package/src/phases/process-media.js +36 -11
package/src/utils/cli.js +5 -2
package/src/utils/schema-validator.js +33 -2

package/README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 # Task Summary Extractor
-> **v9.7.0** — AI-powered content analysis CLI — meetings, recordings, documents, or any mix. Install globally, run anywhere.
+> **v9.8.0** — AI-powered content analysis CLI — meetings, recordings, documents, or any mix. Install globally, run anywhere.
 <p align="center">
   <img src="https://img.shields.io/badge/node-%3E%3D18.0.0-green" alt="Node.js" />
@@ -598,6 +598,7 @@ task-summary-extractor/
 | Version | Highlights |
 |---------|-----------|
+| **v9.8.0** | **Schema hardening & transcript handling** — VTT/SRT auto-excluded from deep-summary (transcripts routed to workflow, not summarizer), `normalizeAnalysis()` fills missing `summary`/`confidence`/`discussed_state` defaults before validation, batch Storage URL→File API auto-retry on `INVALID_ARGUMENT`, focused re-analysis skips sparse segments (≤2 items + low density), 367 tests |
 | **v9.7.0** | **Multi-segment batching** — groups consecutive video segments into single Gemini API calls when context window has headroom, greedy bin-packing by token budget (`planSegmentBatches`), `processSegmentBatch()` multi-video API calls, automatic fallback to single-segment on failure, `--no-batch` to disable, codebase audit fixes (unused imports, variable shadowing) |
 | **v9.6.0** | **Interactive CLI UX** — arrow-key navigation for all selectors (folder, model, run mode, formats, confidence, doc exclusion), zero-dependency prompt engine (`interactive.js`), `selectOne()` with ↑↓+Enter, `selectMany()` with Space toggle + A all/none, non-TTY fallback to number input |
 | **v9.5.0** | **Video processing flags** — `--no-compress`, `--speed`, `--segment-time` CLI flags, hardcoded 1200s for raw mode, deprecated `--skip-compression` |

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "task-summary-extractor",
-  "version": "9.7.0",
+  "version": "9.8.0",
   "description": "AI-powered meeting analysis & document generation CLI — video + document processing, deep dive docs, dynamic mode, interactive CLI with model selection, confidence scoring, learning loop, git progress tracking",
   "main": "process_and_upload.js",
   "bin": {

package/src/modes/deep-summary.js CHANGED Viewed

@@ -27,6 +27,20 @@ const config = require('../config');
 // ======================== CONSTANTS ========================
+/**
+ * Transcript file extensions that should NEVER be summarized.
+ * VTT/SRT files are time-sliced per segment during analysis — summarising
+ * them would destroy the timestamp-indexed structure that `sliceVttForSegment`
+ * relies on. They are automatically kept at full fidelity.
+ */
+const TRANSCRIPT_EXTENSIONS = ['.vtt', '.srt'];
+/**
+ * Check whether a filename is a transcript file (VTT/SRT).
+ *
+ * The name is lower-cased before comparison, so the match is
+ * case-insensitive, and only the end of the name is inspected: it is
+ * compared against every entry of `TRANSCRIPT_EXTENSIONS`.
+ *
+ * @param {string} [fileName] - File name to test; a falsy value is treated as an empty string.
+ * @returns {boolean} `true` when the lower-cased name ends with one of the transcript extensions.
+ */
+function isTranscriptFile(fileName) {
+  const lower = (fileName || '').toLowerCase(); // falsy input becomes '' so null/undefined yields false instead of throwing
+  return TRANSCRIPT_EXTENSIONS.some(ext => lower.endsWith(ext));
+}
 /** Max tokens for a single summarization call output */
 const SUMMARY_MAX_OUTPUT = 16384;
@@ -262,6 +276,13 @@ async function deepSummarize(ai, contextDocs, opts = {}) {
       continue;
     }
+    // Auto-exclude transcript files (VTT/SRT) — they are time-sliced per
+    // segment during analysis and must retain their timestamp structure.
+    if (isTranscriptFile(doc.fileName)) {
+      keepFull.push(doc);
+      continue;
+    }
     // Keep excluded docs at full fidelity
     if (excludeSet.has(doc.fileName.toLowerCase())) {
       keepFull.push(doc);
@@ -294,14 +315,22 @@ async function deepSummarize(ai, contextDocs, opts = {}) {
   }
   // Build focus topics from excluded docs (tell summarizer what to prioritize)
+  // NOTE: transcript files (VTT/SRT) are auto-excluded but NOT used as focus
+  // topics — they are time-sliced per segment and don't represent "topics".
   const focusTopics = keepFull
     .filter(d => d.type === 'inlineText' && excludeSet.has(d.fileName.toLowerCase()))
     .map(d => d.fileName);
+  // Count auto-excluded transcript files for logging
+  const autoExcludedTranscripts = keepFull.filter(d => isTranscriptFile(d.fileName));
   // Batch documents
   const batches = buildBatches(toSummarize);
   console.log(`    Batched ${c.highlight(toSummarize.length)} doc(s) into ${c.highlight(batches.length)} summarization batch(es)`);
+  if (autoExcludedTranscripts.length > 0) {
+    console.log(`    Auto-excluded ${c.highlight(autoExcludedTranscripts.length)} transcript file(s) (VTT/SRT — time-sliced per segment)`);
+  }
   if (focusTopics.length > 0) {
     console.log(`    Focus topics from ${c.highlight(focusTopics.length)} excluded doc(s):`);
     focusTopics.forEach(t => console.log(`      ${c.dim('•')} ${c.cyan(t)}`));
@@ -350,6 +379,12 @@ async function deepSummarize(ai, contextDocs, opts = {}) {
       continue;
     }
+    // Auto-exclude transcript files (VTT/SRT)
+    if (isTranscriptFile(doc.fileName)) {
+      resultDocs.push(doc);
+      continue;
+    }
     // Check if we have a summary for this doc
     const summaryKey = doc.fileName.toLowerCase();
     const summary = allSummaries.get(summaryKey);
@@ -399,6 +434,8 @@ module.exports = {
   deepSummarize,
   summarizeBatch,
   buildBatches,
+  isTranscriptFile,
+  TRANSCRIPT_EXTENSIONS,
   SUMMARY_MAX_OUTPUT,
   BATCH_MAX_CHARS,
   MIN_SUMMARIZE_LENGTH,

package/src/modes/focused-reanalysis.js CHANGED Viewed

@@ -136,9 +136,24 @@ function identifyWeaknesses(qualityReport, analysis) {
     );
   }
+  // ── Skip focused pass for simple / sparse segments ──────────────────────
+  // When the analysis has very few extracted items AND the density dimension
+  // is low, the segment is likely simple (chit-chat, small-talk, intro) or
+  // the AI legitimately had nothing to extract. A focused pass won't help.
+  const totalItems = [
+    ...(analysis.tickets || []),
+    ...(analysis.action_items || []),
+    ...(analysis.change_requests || []),
+    ...(analysis.blockers || []),
+    ...(analysis.scope_changes || []),
+  ].length;
+  const isSparseSegment = totalItems <= 2 && dims.density && dims.density.score < 30;
   const shouldReanalyze = focusInstructions.length > 0 &&
     qualityReport.score < 60 &&       // Only re-analyze if quality is truly lacking
-    weakAreas.length >= 2;            // At least 2 weak areas to justify the cost
+    weakAreas.length >= 2 &&          // At least 2 weak areas to justify the cost
+    !isSparseSegment;                 // Don't waste tokens on sparse / simple segments
   const focusPrompt = focusInstructions.length > 0
     ? focusInstructions.join('\n\n')

package/src/phases/process-media.js CHANGED Viewed

@@ -352,18 +352,43 @@ async function phaseProcessVideo(ctx, videoPath, videoIndex) {
         }
         try {
-          const batchRun = await processSegmentBatch(
-            ai, batchSegs,
-            `${callName}_${baseName}_batch${bIdx}`,
-            contextDocs, previousAnalyses, userName, PKG_ROOT,
-            {
-              segmentIndices: batchIndices,
-              totalSegments: segments.length,
-              segmentTimes: batchTimes,
-              thinkingBudget: opts.thinkingBudget || 24576,
-              noStorageUrl: !!opts.noStorageUrl,
+          let batchRun;
+          try {
+            batchRun = await processSegmentBatch(
+              ai, batchSegs,
+              `${callName}_${baseName}_batch${bIdx}`,
+              contextDocs, previousAnalyses, userName, PKG_ROOT,
+              {
+                segmentIndices: batchIndices,
+                totalSegments: segments.length,
+                segmentTimes: batchTimes,
+                thinkingBudget: opts.thinkingBudget || 24576,
+                noStorageUrl: !!opts.noStorageUrl,
+              }
+            );
+          } catch (batchErr) {
+            const msg = batchErr.message || '';
+            // If Storage URL was rejected, retry batch with forced File API uploads
+            if (!opts.noStorageUrl && msg.includes('INVALID_ARGUMENT') && batchSegs.some(s => s.storageUrl)) {
+              console.log(`    ${c.warn('Storage URL rejected — retrying batch with File API uploads...')}`);
+              log.warn(`Batch ${bIdx} Storage URL rejected — retrying with noStorageUrl=true`);
+              batchRun = await processSegmentBatch(
+                ai, batchSegs,
+                `${callName}_${baseName}_batch${bIdx}`,
+                contextDocs, previousAnalyses, userName, PKG_ROOT,
+                {
+                  segmentIndices: batchIndices,
+                  totalSegments: segments.length,
+                  segmentTimes: batchTimes,
+                  thinkingBudget: opts.thinkingBudget || 24576,
+                  noStorageUrl: true,
+                }
+              );
+              console.log(`    ${c.success('File API batch retry succeeded')}`);
+            } else {
+              throw batchErr;
             }
-          );
+          }
           // Save batch run file
           const ts = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);

package/src/utils/cli.js CHANGED Viewed

@@ -587,9 +587,12 @@ async function selectConfidence() {
  * @returns {Promise<string[]>} Array of excluded fileName strings
  */
 async function selectDocsToExclude(contextDocs) {
-  // Only show inlineText docs with actual content
+  const { isTranscriptFile } = require('../modes/deep-summary');
+  // Only show inlineText docs with actual content, excluding transcript files
+  // (VTT/SRT are auto-excluded from summarization — no need to show them)
   const eligible = contextDocs
-    .filter(d => d.type === 'inlineText' && d.content && d.content.length > 0)
+    .filter(d => d.type === 'inlineText' && d.content && d.content.length > 0 && !isTranscriptFile(d.fileName))
     .map(d => ({
       fileName: d.fileName,
       chars: d.content.length,

package/src/utils/schema-validator.js CHANGED Viewed

@@ -316,8 +316,12 @@ const ARRAY_DEFAULTS = [
 /**
  * Normalize a parsed analysis object by filling in missing array fields
- * with empty arrays. This prevents downstream code from crashing when
- * a segment legitimately has no tickets/action_items/etc.
+ * with empty arrays, ensuring required string fields exist, and patching
+ * item-level required fields (e.g. confidence) with sensible defaults.
+ *
+ * This prevents downstream code from crashing when a segment legitimately
+ * has no tickets/action_items/etc., and avoids schema-validation failures
+ * on truncated AI outputs.
  *
  * Mutates `data` in-place and returns it for convenience.
  *
@@ -326,11 +330,38 @@ const ARRAY_DEFAULTS = [
  */
 function normalizeAnalysis(data) {
   if (!data || typeof data !== 'object') return data; // non-objects (null, primitives) are handed back untouched
+  // Fill missing array fields: every name in ARRAY_DEFAULTS that is undefined or null becomes []
   for (const field of ARRAY_DEFAULTS) {
     if (data[field] === undefined || data[field] === null) {
       data[field] = [];
     }
   }
+  // Ensure top-level required string fields. An explicit empty-string summary is kept as-is;
+  // any other falsy value is replaced by segment_summary, then overview, then ''.
+  if (!data.summary && data.summary !== '') {
+    data.summary = data.segment_summary || data.overview || '';
+  }
+  // Patch ticket items — fill missing required fields with defaults
+  if (Array.isArray(data.tickets)) {
+    for (const ticket of data.tickets) {
+      if (!ticket || typeof ticket !== 'object') continue; // skip null / primitive entries rather than patching them
+      if (!ticket.confidence) ticket.confidence = 'MEDIUM'; // any falsy confidence is replaced by the default
+      if (!ticket.discussed_state && ticket.discussed_state !== null) { // an explicit null is left untouched
+        ticket.discussed_state = { summary: '' };
+      }
+    }
+  }
+  // Patch action_items — fill missing confidence (same 'MEDIUM' default as tickets)
+  if (Array.isArray(data.action_items)) {
+    for (const item of data.action_items) {
+      if (!item || typeof item !== 'object') continue; // skip null / primitive entries
+      if (!item.confidence) item.confidence = 'MEDIUM';
+    }
+  }
   return data; // same object that was passed in, mutated in place
 }