task-summary-extractor 9.2.1 → 9.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +6 -2
- package/package.json +2 -3
- package/src/config.js +1 -1
- package/src/logger.js +6 -3
- package/src/modes/deep-summary.js +375 -0
- package/src/phases/discover.js +1 -0
- package/src/phases/init.js +9 -30
- package/src/phases/services.js +61 -1
- package/src/pipeline.js +16 -3
- package/src/services/gemini.js +3 -3
- package/src/utils/cli.js +89 -1
- package/src/utils/json-parser.js +3 -0
- package/EXPLORATION.md +0 -514
package/.env.example
CHANGED
|
@@ -12,8 +12,8 @@ GEMINI_API_KEY=your_gemini_api_key
|
|
|
12
12
|
GEMINI_MODEL=gemini-2.5-flash
|
|
13
13
|
|
|
14
14
|
# ======================== VIDEO PROCESSING ========================
|
|
15
|
-
# Speed multiplier (default: 1.
|
|
16
|
-
VIDEO_SPEED=1.
|
|
15
|
+
# Speed multiplier (default: 1.6)
|
|
16
|
+
VIDEO_SPEED=1.6
|
|
17
17
|
# Segment duration in seconds (default: 280)
|
|
18
18
|
VIDEO_SEGMENT_TIME=280
|
|
19
19
|
# ffmpeg preset: ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow
|
|
@@ -36,3 +36,7 @@ THINKING_BUDGET=24576
|
|
|
36
36
|
COMPILATION_THINKING_BUDGET=10240
|
|
37
37
|
# Max polling time for Gemini File API processing in ms (default: 300000 = 5 min)
|
|
38
38
|
GEMINI_POLL_TIMEOUT_MS=300000
|
|
39
|
+
|
|
40
|
+
# ======================== NPM PUBLISHING ========================
|
|
41
|
+
# Automation token for npm publish (optional — if not set, browser sign-in is used)
|
|
42
|
+
# NPM_TOKEN=npm_your_token_here
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "task-summary-extractor",
|
|
3
|
-
"version": "9.
|
|
3
|
+
"version": "9.3.1",
|
|
4
4
|
"description": "AI-powered meeting analysis & document generation CLI — video + document processing, deep dive docs, dynamic mode, interactive CLI with model selection, confidence scoring, learning loop, git progress tracking",
|
|
5
5
|
"main": "process_and_upload.js",
|
|
6
6
|
"bin": {
|
|
@@ -15,8 +15,7 @@
|
|
|
15
15
|
".env.example",
|
|
16
16
|
"README.md",
|
|
17
17
|
"QUICK_START.md",
|
|
18
|
-
"ARCHITECTURE.md"
|
|
19
|
-
"EXPLORATION.md"
|
|
18
|
+
"ARCHITECTURE.md"
|
|
20
19
|
],
|
|
21
20
|
"scripts": {
|
|
22
21
|
"setup": "node setup.js",
|
package/src/config.js
CHANGED
|
@@ -220,7 +220,7 @@ function getMaxThinkingBudget() {
|
|
|
220
220
|
|
|
221
221
|
// ======================== VIDEO PROCESSING ========================
|
|
222
222
|
|
|
223
|
-
const SPEED = envFloat('VIDEO_SPEED', 1.
|
|
223
|
+
const SPEED = envFloat('VIDEO_SPEED', 1.6);
|
|
224
224
|
const SEG_TIME = envInt('VIDEO_SEGMENT_TIME', 280); // seconds — produces segments < 5 min
|
|
225
225
|
const PRESET = env('VIDEO_PRESET', 'slow');
|
|
226
226
|
const VIDEO_EXTS = ['.mp4', '.mkv', '.avi', '.mov', '.webm'];
|
package/src/logger.js
CHANGED
|
@@ -321,16 +321,17 @@ class Logger {
|
|
|
321
321
|
/** Flush buffers and close the logger. Safe to call multiple times. */
|
|
322
322
|
close() {
|
|
323
323
|
if (this.closed) return;
|
|
324
|
-
this.closed = true;
|
|
325
324
|
clearInterval(this._flushInterval);
|
|
326
325
|
this.unpatchConsole();
|
|
327
326
|
|
|
328
|
-
// End active phase if any
|
|
327
|
+
// End active phase if any (must happen BEFORE setting closed flag
|
|
328
|
+
// so _writeStructured inside phaseEnd is not blocked)
|
|
329
329
|
if (this._activePhase) {
|
|
330
330
|
this.phaseEnd();
|
|
331
331
|
}
|
|
332
332
|
|
|
333
|
-
// Write footer
|
|
333
|
+
// Write footer and session_end BEFORE setting closed flag
|
|
334
|
+
// so _writeStructured is not blocked by the guard
|
|
334
335
|
const elapsed = ((Date.now() - this.startTime) / 1000).toFixed(1);
|
|
335
336
|
const footer = `\n=== CLOSED | elapsed: ${elapsed}s | ${new Date().toISOString()} ===\n`;
|
|
336
337
|
this._detailedBuffer.push(footer);
|
|
@@ -342,6 +343,8 @@ class Logger {
|
|
|
342
343
|
timestamp: new Date().toISOString(),
|
|
343
344
|
level: 'info',
|
|
344
345
|
});
|
|
346
|
+
|
|
347
|
+
this.closed = true;
|
|
345
348
|
this._flush(true); // sync flush on close to ensure data is written before process exits
|
|
346
349
|
}
|
|
347
350
|
|
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deep Summary — pre-summarizes context documents before segment analysis
|
|
3
|
+
* to dramatically reduce input tokens per segment.
|
|
4
|
+
*
|
|
5
|
+
* Instead of sending full document content (potentially 500K+ tokens) to
|
|
6
|
+
* every segment, this module:
|
|
7
|
+
* 1. Groups documents by priority tier
|
|
8
|
+
* 2. Sends each group to Gemini for intelligent condensation
|
|
9
|
+
* 3. Replaces full content with condensed summaries
|
|
10
|
+
* 4. Preserves "excluded" docs at full fidelity (user-chosen focus docs)
|
|
11
|
+
* 5. Ensures summaries capture all ticket IDs, action items, statuses
|
|
12
|
+
*
|
|
13
|
+
* The user can pick specific docs to EXCLUDE from summarization — these stay
|
|
14
|
+
* full. The summary pass receives extra instructions to focus on extracting
|
|
15
|
+
* information related to these excluded docs' topics.
|
|
16
|
+
*
|
|
17
|
+
* Token savings: typically 60-80% reduction in per-segment context tokens.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
'use strict';
|
|
21
|
+
|
|
22
|
+
const { extractJson } = require('../utils/json-parser');
|
|
23
|
+
const { withRetry } = require('../utils/retry');
|
|
24
|
+
const { estimateTokens } = require('../utils/context-manager');
|
|
25
|
+
const { c } = require('../utils/colors');
|
|
26
|
+
const config = require('../config');
|
|
27
|
+
|
|
28
|
+
// ======================== CONSTANTS ========================
|
|
29
|
+
|
|
30
|
+
/** Max tokens for a single summarization call output */
|
|
31
|
+
const SUMMARY_MAX_OUTPUT = 16384;
|
|
32
|
+
|
|
33
|
+
/** Max input chars to send in one summarization batch (~200K tokens @ 0.3 tok/char) */
|
|
34
|
+
const BATCH_MAX_CHARS = 600000;
|
|
35
|
+
|
|
36
|
+
/** Minimum content length (chars) to bother summarizing — below this, keep full */
|
|
37
|
+
const MIN_SUMMARIZE_LENGTH = 500;
|
|
38
|
+
|
|
39
|
+
// ======================== BATCH BUILDER ========================
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
* Group documents into batches that fit within the batch char limit.
|
|
43
|
+
* Each batch will be summarized in a single Gemini call.
|
|
44
|
+
*
|
|
45
|
+
* @param {Array} docs - Context docs to batch [{type, fileName, content}]
|
|
46
|
+
* @param {number} [maxChars=BATCH_MAX_CHARS] - Max chars per batch
|
|
47
|
+
* @returns {Array<Array>} Batches of docs
|
|
48
|
+
*/
|
|
49
|
+
function buildBatches(docs, maxChars = BATCH_MAX_CHARS) {
|
|
50
|
+
const batches = [];
|
|
51
|
+
let currentBatch = [];
|
|
52
|
+
let currentChars = 0;
|
|
53
|
+
|
|
54
|
+
for (const doc of docs) {
|
|
55
|
+
const docChars = doc.content ? doc.content.length : 0;
|
|
56
|
+
|
|
57
|
+
// If this single doc exceeds the batch limit, it gets its own batch
|
|
58
|
+
if (docChars > maxChars) {
|
|
59
|
+
if (currentBatch.length > 0) {
|
|
60
|
+
batches.push(currentBatch);
|
|
61
|
+
currentBatch = [];
|
|
62
|
+
currentChars = 0;
|
|
63
|
+
}
|
|
64
|
+
batches.push([doc]);
|
|
65
|
+
continue;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
if (currentChars + docChars > maxChars && currentBatch.length > 0) {
|
|
69
|
+
batches.push(currentBatch);
|
|
70
|
+
currentBatch = [];
|
|
71
|
+
currentChars = 0;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
currentBatch.push(doc);
|
|
75
|
+
currentChars += docChars;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
if (currentBatch.length > 0) {
|
|
79
|
+
batches.push(currentBatch);
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
return batches;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
// ======================== SUMMARIZE ONE BATCH ========================
|
|
86
|
+
|
|
87
|
+
/**
|
|
88
|
+
* Summarize a batch of documents into a condensed representation.
|
|
89
|
+
*
|
|
90
|
+
* @param {object} ai - Gemini AI instance
|
|
91
|
+
* @param {Array} docs - Documents in this batch
|
|
92
|
+
* @param {object} [opts]
|
|
93
|
+
* @param {string[]} [opts.focusTopics=[]] - Topics to focus on (from excluded docs)
|
|
94
|
+
* @param {number} [opts.thinkingBudget=8192] - Thinking token budget
|
|
95
|
+
* @param {number} [opts.batchIndex=0] - Batch number for logging
|
|
96
|
+
* @param {number} [opts.totalBatches=1] - Total batches for logging
|
|
97
|
+
* @returns {Promise<{summaries: Map<string, string>, tokenUsage: object}|null>}
|
|
98
|
+
*/
|
|
99
|
+
async function summarizeBatch(ai, docs, opts = {}) {
|
|
100
|
+
const {
|
|
101
|
+
focusTopics = [],
|
|
102
|
+
thinkingBudget = 8192,
|
|
103
|
+
batchIndex = 0,
|
|
104
|
+
totalBatches = 1,
|
|
105
|
+
} = opts;
|
|
106
|
+
|
|
107
|
+
const docEntries = docs
|
|
108
|
+
.filter(d => d.type === 'inlineText' && d.content)
|
|
109
|
+
.map(d => `=== DOCUMENT: ${d.fileName} ===\n${d.content}`);
|
|
110
|
+
|
|
111
|
+
if (docEntries.length === 0) return null;
|
|
112
|
+
|
|
113
|
+
const focusSection = focusTopics.length > 0
|
|
114
|
+
? `\n\nFOCUS AREAS — The user has selected certain documents to keep at full fidelity. ` +
|
|
115
|
+
`Your summaries must be especially thorough about information related to these topics:\n` +
|
|
116
|
+
focusTopics.map((t, i) => ` ${i + 1}. ${t}`).join('\n') +
|
|
117
|
+
`\n\nFor every ticket ID, action item, blocker, or status mentioned in relation to these ` +
|
|
118
|
+
`focus areas, include them verbatim in the summary. Do NOT omit any IDs or assignments.`
|
|
119
|
+
: '';
|
|
120
|
+
|
|
121
|
+
const promptText = `You are a precision document summarizer for a meeting analysis pipeline.
|
|
122
|
+
|
|
123
|
+
Your job: read ALL documents below and produce a CONDENSED version of each that preserves:
|
|
124
|
+
- Every ticket ID, task ID, CR number, or reference number (verbatim)
|
|
125
|
+
- All assignees, reviewers, and responsible parties
|
|
126
|
+
- All statuses (open, closed, in_progress, blocked, etc.)
|
|
127
|
+
- All action items and their owners
|
|
128
|
+
- All blockers, dependencies, and deadlines
|
|
129
|
+
- Key decisions and their rationale
|
|
130
|
+
- File paths and code references
|
|
131
|
+
- Numerical data (percentages, counts, dates, versions)
|
|
132
|
+
|
|
133
|
+
What to remove:
|
|
134
|
+
- Verbose explanations of well-known concepts
|
|
135
|
+
- Redundant phrasing and filler text
|
|
136
|
+
- Formatting-only content (decorative headers, dividers)
|
|
137
|
+
- Boilerplate/template text that adds no information
|
|
138
|
+
${focusSection}
|
|
139
|
+
|
|
140
|
+
OUTPUT FORMAT:
|
|
141
|
+
Return valid JSON with this structure:
|
|
142
|
+
{
|
|
143
|
+
"summaries": {
|
|
144
|
+
"<fileName>": "<condensed text — plain text, preserving all key info>",
|
|
145
|
+
...
|
|
146
|
+
},
|
|
147
|
+
"metadata": {
|
|
148
|
+
"originalTokensEstimate": <number>,
|
|
149
|
+
"summaryTokensEstimate": <number>,
|
|
150
|
+
"compressionRatio": <number between 0 and 1>
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
Aim for 70-80% size reduction while preserving ALL actionable information.
|
|
155
|
+
Every ID, every name, every status must survive the summarization.
|
|
156
|
+
|
|
157
|
+
DOCUMENTS TO SUMMARIZE (${docEntries.length} documents):
|
|
158
|
+
|
|
159
|
+
${docEntries.join('\n\n')}`;
|
|
160
|
+
|
|
161
|
+
const requestPayload = {
|
|
162
|
+
model: config.GEMINI_MODEL,
|
|
163
|
+
contents: [{ role: 'user', parts: [{ text: promptText }] }],
|
|
164
|
+
config: {
|
|
165
|
+
systemInstruction: 'You are a lossless information compressor. Preserve every ID, name, status, assignment, and actionable detail. Output valid JSON only.',
|
|
166
|
+
maxOutputTokens: SUMMARY_MAX_OUTPUT,
|
|
167
|
+
temperature: 0,
|
|
168
|
+
thinkingConfig: { thinkingBudget },
|
|
169
|
+
},
|
|
170
|
+
};
|
|
171
|
+
|
|
172
|
+
try {
|
|
173
|
+
const label = totalBatches > 1
|
|
174
|
+
? `Deep summary batch ${batchIndex + 1}/${totalBatches}`
|
|
175
|
+
: 'Deep summary';
|
|
176
|
+
|
|
177
|
+
const response = await withRetry(
|
|
178
|
+
() => ai.models.generateContent(requestPayload),
|
|
179
|
+
{ label, maxRetries: 2, baseDelay: 3000 }
|
|
180
|
+
);
|
|
181
|
+
|
|
182
|
+
const rawText = response.text;
|
|
183
|
+
const parsed = extractJson(rawText);
|
|
184
|
+
|
|
185
|
+
if (!parsed || !parsed.summaries) return null;
|
|
186
|
+
|
|
187
|
+
const usage = response.usageMetadata || {};
|
|
188
|
+
const tokenUsage = {
|
|
189
|
+
inputTokens: usage.promptTokenCount || 0,
|
|
190
|
+
outputTokens: usage.candidatesTokenCount || 0,
|
|
191
|
+
totalTokens: usage.totalTokenCount || 0,
|
|
192
|
+
thoughtTokens: usage.thoughtsTokenCount || 0,
|
|
193
|
+
};
|
|
194
|
+
|
|
195
|
+
return { summaries: parsed.summaries, metadata: parsed.metadata || {}, tokenUsage };
|
|
196
|
+
} catch (err) {
|
|
197
|
+
console.warn(` ${c.warn(`Deep summary batch ${batchIndex + 1} failed: ${err.message}`)}`);
|
|
198
|
+
return null;
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
// ======================== MAIN ENTRY POINT ========================
|
|
203
|
+
|
|
204
|
+
/**
|
|
205
|
+
* Run deep summarization on context documents.
|
|
206
|
+
*
|
|
207
|
+
* @param {object} ai - Gemini AI instance
|
|
208
|
+
* @param {Array} contextDocs - All prepared context docs
|
|
209
|
+
* @param {object} [opts]
|
|
210
|
+
* @param {string[]} [opts.excludeFileNames=[]] - Doc fileNames to keep at full fidelity
|
|
211
|
+
* @param {number} [opts.thinkingBudget=8192] - Thinking budget per batch
|
|
212
|
+
* @param {Function} [opts.onProgress] - Callback(done, total) for progress
|
|
213
|
+
* @returns {Promise<{docs: Array, stats: object}>}
|
|
214
|
+
*/
|
|
215
|
+
async function deepSummarize(ai, contextDocs, opts = {}) {
|
|
216
|
+
const {
|
|
217
|
+
excludeFileNames = [],
|
|
218
|
+
thinkingBudget = 8192,
|
|
219
|
+
onProgress = null,
|
|
220
|
+
} = opts;
|
|
221
|
+
|
|
222
|
+
const excludeSet = new Set(excludeFileNames.map(n => n.toLowerCase()));
|
|
223
|
+
|
|
224
|
+
// Partition: docs to summarize vs docs to keep full
|
|
225
|
+
const toSummarize = [];
|
|
226
|
+
const keepFull = [];
|
|
227
|
+
|
|
228
|
+
for (const doc of contextDocs) {
|
|
229
|
+
// Keep non-text docs (fileData = PDF etc.) as-is
|
|
230
|
+
if (doc.type !== 'inlineText') {
|
|
231
|
+
keepFull.push(doc);
|
|
232
|
+
continue;
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
// Keep excluded docs at full fidelity
|
|
236
|
+
if (excludeSet.has(doc.fileName.toLowerCase())) {
|
|
237
|
+
keepFull.push(doc);
|
|
238
|
+
continue;
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
// Skip tiny docs — not worth summarizing
|
|
242
|
+
if (!doc.content || doc.content.length < MIN_SUMMARIZE_LENGTH) {
|
|
243
|
+
keepFull.push(doc);
|
|
244
|
+
continue;
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
toSummarize.push(doc);
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
if (toSummarize.length === 0) {
|
|
251
|
+
return {
|
|
252
|
+
docs: contextDocs,
|
|
253
|
+
stats: {
|
|
254
|
+
summarized: 0,
|
|
255
|
+
keptFull: keepFull.length,
|
|
256
|
+
originalTokens: 0,
|
|
257
|
+
summaryTokens: 0,
|
|
258
|
+
savedTokens: 0,
|
|
259
|
+
savingsPercent: 0,
|
|
260
|
+
totalInputTokens: 0,
|
|
261
|
+
totalOutputTokens: 0,
|
|
262
|
+
},
|
|
263
|
+
};
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
// Build focus topics from excluded docs (tell summarizer what to prioritize)
|
|
267
|
+
const focusTopics = keepFull
|
|
268
|
+
.filter(d => d.type === 'inlineText' && excludeSet.has(d.fileName.toLowerCase()))
|
|
269
|
+
.map(d => d.fileName);
|
|
270
|
+
|
|
271
|
+
// Batch documents
|
|
272
|
+
const batches = buildBatches(toSummarize);
|
|
273
|
+
|
|
274
|
+
console.log(` Batched ${c.highlight(toSummarize.length)} doc(s) into ${c.highlight(batches.length)} summarization batch(es)`);
|
|
275
|
+
if (focusTopics.length > 0) {
|
|
276
|
+
console.log(` Focus topics from ${c.highlight(focusTopics.length)} excluded doc(s):`);
|
|
277
|
+
focusTopics.forEach(t => console.log(` ${c.dim('•')} ${c.cyan(t)}`));
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
// Process batches (sequential for now; can add parallelization later)
|
|
281
|
+
const allSummaries = new Map();
|
|
282
|
+
let totalInput = 0;
|
|
283
|
+
let totalOutput = 0;
|
|
284
|
+
let batchesDone = 0;
|
|
285
|
+
|
|
286
|
+
for (let i = 0; i < batches.length; i++) {
|
|
287
|
+
const result = await summarizeBatch(ai, batches[i], {
|
|
288
|
+
focusTopics,
|
|
289
|
+
thinkingBudget,
|
|
290
|
+
batchIndex: i,
|
|
291
|
+
totalBatches: batches.length,
|
|
292
|
+
});
|
|
293
|
+
|
|
294
|
+
batchesDone++;
|
|
295
|
+
if (onProgress) onProgress(batchesDone, batches.length);
|
|
296
|
+
|
|
297
|
+
if (result && result.summaries) {
|
|
298
|
+
for (const [fileName, summary] of Object.entries(result.summaries)) {
|
|
299
|
+
allSummaries.set(fileName.toLowerCase(), summary);
|
|
300
|
+
}
|
|
301
|
+
totalInput += result.tokenUsage.inputTokens;
|
|
302
|
+
totalOutput += result.tokenUsage.outputTokens;
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
// Replace doc content with summaries
|
|
307
|
+
let originalTokens = 0;
|
|
308
|
+
let summaryTokens = 0;
|
|
309
|
+
const resultDocs = [];
|
|
310
|
+
|
|
311
|
+
for (const doc of contextDocs) {
|
|
312
|
+
if (doc.type !== 'inlineText') {
|
|
313
|
+
resultDocs.push(doc);
|
|
314
|
+
continue;
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
// Check if this doc was excluded (kept full)
|
|
318
|
+
if (excludeSet.has(doc.fileName.toLowerCase())) {
|
|
319
|
+
resultDocs.push(doc);
|
|
320
|
+
continue;
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
// Check if we have a summary for this doc
|
|
324
|
+
const summaryKey = doc.fileName.toLowerCase();
|
|
325
|
+
const summary = allSummaries.get(summaryKey);
|
|
326
|
+
|
|
327
|
+
if (summary && summary.length > 0) {
|
|
328
|
+
const origTokens = estimateTokens(doc.content);
|
|
329
|
+
const sumTokens = estimateTokens(summary);
|
|
330
|
+
originalTokens += origTokens;
|
|
331
|
+
summaryTokens += sumTokens;
|
|
332
|
+
|
|
333
|
+
resultDocs.push({
|
|
334
|
+
...doc,
|
|
335
|
+
content: `[Deep Summary — original: ~${origTokens.toLocaleString()} tokens → condensed: ~${sumTokens.toLocaleString()} tokens]\n\n${summary}`,
|
|
336
|
+
_originalLength: doc.content.length,
|
|
337
|
+
_summaryLength: summary.length,
|
|
338
|
+
_deepSummarized: true,
|
|
339
|
+
});
|
|
340
|
+
} else {
|
|
341
|
+
// No summary returned — keep original
|
|
342
|
+
resultDocs.push(doc);
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
const savedTokens = originalTokens - summaryTokens;
|
|
347
|
+
const savingsPercent = originalTokens > 0
|
|
348
|
+
? parseFloat(((savedTokens / originalTokens) * 100).toFixed(1))
|
|
349
|
+
: 0;
|
|
350
|
+
|
|
351
|
+
return {
|
|
352
|
+
docs: resultDocs,
|
|
353
|
+
stats: {
|
|
354
|
+
summarized: allSummaries.size,
|
|
355
|
+
keptFull: keepFull.length,
|
|
356
|
+
originalTokens,
|
|
357
|
+
summaryTokens,
|
|
358
|
+
savedTokens,
|
|
359
|
+
savingsPercent,
|
|
360
|
+
totalInputTokens: totalInput,
|
|
361
|
+
totalOutputTokens: totalOutput,
|
|
362
|
+
},
|
|
363
|
+
};
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
// ======================== EXPORTS ========================
|
|
367
|
+
|
|
368
|
+
module.exports = {
|
|
369
|
+
deepSummarize,
|
|
370
|
+
summarizeBatch,
|
|
371
|
+
buildBatches,
|
|
372
|
+
SUMMARY_MAX_OUTPUT,
|
|
373
|
+
BATCH_MAX_CHARS,
|
|
374
|
+
MIN_SUMMARIZE_LENGTH,
|
|
375
|
+
};
|
package/src/phases/discover.js
CHANGED
|
@@ -85,6 +85,7 @@ async function phaseDiscover(ctx) {
|
|
|
85
85
|
if (opts.resume) activeFlags.push('resume');
|
|
86
86
|
if (opts.reanalyze) activeFlags.push('reanalyze');
|
|
87
87
|
if (opts.dryRun) activeFlags.push('dry-run');
|
|
88
|
+
if (opts.deepSummary) activeFlags.push('deep-summary');
|
|
88
89
|
if (activeFlags.length > 0) {
|
|
89
90
|
console.log(` Flags: ${c.yellow(activeFlags.join(', '))}`);
|
|
90
91
|
}
|
package/src/phases/init.js
CHANGED
|
@@ -67,6 +67,10 @@ async function phaseInit() {
|
|
|
67
67
|
disableDiff: !!flags['no-diff'],
|
|
68
68
|
noHtml: !!flags['no-html'],
|
|
69
69
|
deepDive: !!flags['deep-dive'],
|
|
70
|
+
deepSummary: !!flags['deep-summary'],
|
|
71
|
+
deepSummaryExclude: typeof flags['exclude-docs'] === 'string'
|
|
72
|
+
? flags['exclude-docs'].split(',').map(s => s.trim()).filter(Boolean)
|
|
73
|
+
: [], // populated by CLI flag, interactive picker, or kept empty
|
|
70
74
|
dynamic: !!flags.dynamic,
|
|
71
75
|
request: typeof flags.request === 'string' ? flags.request : null,
|
|
72
76
|
updateProgress: !!flags['update-progress'],
|
|
@@ -94,36 +98,10 @@ async function phaseInit() {
|
|
|
94
98
|
opts.runMode = mode;
|
|
95
99
|
|
|
96
100
|
if (mode !== 'custom') {
|
|
97
|
-
// Apply preset overrides
|
|
98
|
-
const {
|
|
99
|
-
|
|
100
|
-
const
|
|
101
|
-
fast: {
|
|
102
|
-
disableFocusedPass: true,
|
|
103
|
-
disableLearning: true,
|
|
104
|
-
disableDiff: true,
|
|
105
|
-
format: 'md,json',
|
|
106
|
-
formats: new Set(['md', 'json']),
|
|
107
|
-
modelTier: 'economy',
|
|
108
|
-
},
|
|
109
|
-
balanced: {
|
|
110
|
-
disableFocusedPass: false,
|
|
111
|
-
disableLearning: false,
|
|
112
|
-
disableDiff: false,
|
|
113
|
-
format: 'all',
|
|
114
|
-
formats: new Set(['md', 'html', 'json', 'pdf', 'docx']),
|
|
115
|
-
modelTier: 'balanced',
|
|
116
|
-
},
|
|
117
|
-
detailed: {
|
|
118
|
-
disableFocusedPass: false,
|
|
119
|
-
disableLearning: false,
|
|
120
|
-
disableDiff: false,
|
|
121
|
-
format: 'all',
|
|
122
|
-
formats: new Set(['md', 'html', 'json', 'pdf', 'docx']),
|
|
123
|
-
modelTier: 'premium',
|
|
124
|
-
},
|
|
125
|
-
};
|
|
126
|
-
const preset = presetOverrides[mode];
|
|
101
|
+
// Apply preset overrides from the shared RUN_PRESETS definition
|
|
102
|
+
const { RUN_PRESETS } = require('../utils/cli');
|
|
103
|
+
const presetDef = RUN_PRESETS[mode];
|
|
104
|
+
const preset = presetDef ? presetDef.overrides : null;
|
|
127
105
|
if (preset) {
|
|
128
106
|
opts.disableFocusedPass = preset.disableFocusedPass;
|
|
129
107
|
opts.disableLearning = preset.disableLearning;
|
|
@@ -322,6 +300,7 @@ function _printRunSummary(opts, modelId, models, targetDir) {
|
|
|
322
300
|
if (!opts.disableLearning) features.push(c.green('learning'));
|
|
323
301
|
if (!opts.disableDiff) features.push(c.green('diff'));
|
|
324
302
|
if (opts.deepDive) features.push(c.cyan('deep-dive'));
|
|
303
|
+
if (opts.deepSummary) features.push(c.cyan('deep-summary'));
|
|
325
304
|
if (opts.dynamic) features.push(c.cyan('dynamic'));
|
|
326
305
|
if (opts.resume) features.push(c.yellow('resume'));
|
|
327
306
|
if (opts.dryRun) features.push(c.yellow('dry-run'));
|
package/src/phases/services.js
CHANGED
|
@@ -7,6 +7,9 @@ const path = require('path');
|
|
|
7
7
|
const { initFirebase, uploadToStorage, storageExists } = require('../services/firebase');
|
|
8
8
|
const { initGemini, prepareDocsForGemini } = require('../services/gemini');
|
|
9
9
|
|
|
10
|
+
// --- Modes ---
|
|
11
|
+
const { deepSummarize } = require('../modes/deep-summary');
|
|
12
|
+
|
|
10
13
|
// --- Utils ---
|
|
11
14
|
const { parallelMap } = require('../utils/retry');
|
|
12
15
|
|
|
@@ -101,4 +104,61 @@ async function phaseServices(ctx) {
|
|
|
101
104
|
return { ...ctx, storage, firebaseReady, ai, contextDocs, docStorageUrls, callName };
|
|
102
105
|
}
|
|
103
106
|
|
|
104
|
-
|
|
107
|
+
// ======================== PHASE: DEEP SUMMARY ========================
|
|
108
|
+
|
|
109
|
+
/**
|
|
110
|
+
* Pre-summarize context documents to save input tokens per segment.
|
|
111
|
+
* Runs only when --deep-summary flag is active.
|
|
112
|
+
*
|
|
113
|
+
* @param {object} ctx - Pipeline context with ai, contextDocs, opts
|
|
114
|
+
* @returns {object} Updated ctx with summarized contextDocs and deepSummaryStats
|
|
115
|
+
*/
|
|
116
|
+
async function phaseDeepSummary(ctx) {
|
|
117
|
+
const log = getLog();
|
|
118
|
+
const { opts, ai, contextDocs } = ctx;
|
|
119
|
+
|
|
120
|
+
if (!opts.deepSummary || !ai || contextDocs.length === 0) {
|
|
121
|
+
return { ...ctx, deepSummaryStats: null };
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
console.log('');
|
|
125
|
+
console.log(c.cyan(' ── Deep Summary — Pre-summarizing context documents ──'));
|
|
126
|
+
log.step('Deep summary: starting context document pre-summarization');
|
|
127
|
+
if (log && log.phaseStart) log.phaseStart('deep_summary');
|
|
128
|
+
|
|
129
|
+
const excludeNames = opts.deepSummaryExclude || [];
|
|
130
|
+
let updatedDocs = contextDocs;
|
|
131
|
+
let deepSummaryStats = null;
|
|
132
|
+
|
|
133
|
+
try {
|
|
134
|
+
const result = await deepSummarize(ai, contextDocs, {
|
|
135
|
+
excludeFileNames: excludeNames,
|
|
136
|
+
thinkingBudget: Math.min(8192, opts.thinkingBudget),
|
|
137
|
+
});
|
|
138
|
+
|
|
139
|
+
updatedDocs = result.docs;
|
|
140
|
+
deepSummaryStats = result.stats;
|
|
141
|
+
|
|
142
|
+
if (deepSummaryStats.summarized > 0) {
|
|
143
|
+
console.log(` ${c.success(`Summarized ${c.highlight(deepSummaryStats.summarized)} doc(s) — saved ~${c.highlight(deepSummaryStats.savedTokens.toLocaleString())} tokens (${c.yellow(deepSummaryStats.savingsPercent + '%')} reduction)`)}`);
|
|
144
|
+
console.log(` ${c.dim('Original:')} ~${deepSummaryStats.originalTokens.toLocaleString()} tokens → ${c.dim('Condensed:')} ~${deepSummaryStats.summaryTokens.toLocaleString()} tokens`);
|
|
145
|
+
if (deepSummaryStats.keptFull > 0) {
|
|
146
|
+
console.log(` ${c.dim('Kept full:')} ${deepSummaryStats.keptFull} doc(s) (excluded from summary)`);
|
|
147
|
+
}
|
|
148
|
+
log.step(`Deep summary: ${deepSummaryStats.summarized} docs summarized, ${deepSummaryStats.savedTokens} tokens saved (${deepSummaryStats.savingsPercent}%)`);
|
|
149
|
+
log.metric('deep_summary', deepSummaryStats);
|
|
150
|
+
} else {
|
|
151
|
+
console.log(` ${c.dim('No documents needed summarization')}`);
|
|
152
|
+
}
|
|
153
|
+
} catch (err) {
|
|
154
|
+
console.warn(` ${c.warn(`Deep summary failed (continuing with full docs): ${err.message}`)}`);
|
|
155
|
+
log.warn(`Deep summary failed: ${err.message}`);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
if (log && log.phaseEnd) log.phaseEnd({ stats: deepSummaryStats });
|
|
159
|
+
console.log('');
|
|
160
|
+
|
|
161
|
+
return { ...ctx, contextDocs: updatedDocs, deepSummaryStats };
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
module.exports = { phaseServices, phaseDeepSummary };
|
package/src/pipeline.js
CHANGED
|
@@ -32,7 +32,7 @@ const { getLog, isShuttingDown, PKG_ROOT, PROJECT_ROOT } = require('./phases/_sh
|
|
|
32
32
|
// --- Pipeline phases ---
|
|
33
33
|
const phaseInit = require('./phases/init');
|
|
34
34
|
const phaseDiscover = require('./phases/discover');
|
|
35
|
-
const phaseServices = require('./phases/services');
|
|
35
|
+
const { phaseServices, phaseDeepSummary } = require('./phases/services');
|
|
36
36
|
const phaseProcessVideo = require('./phases/process-media');
|
|
37
37
|
const phaseCompile = require('./phases/compile');
|
|
38
38
|
const phaseOutput = require('./phases/output');
|
|
@@ -46,7 +46,7 @@ const phaseDeepDive = require('./phases/deep-dive');
|
|
|
46
46
|
// --- Utils (for run orchestration + alt modes) ---
|
|
47
47
|
const { c } = require('./utils/colors');
|
|
48
48
|
const { findDocsRecursive } = require('./utils/fs');
|
|
49
|
-
const { promptUserText } = require('./utils/cli');
|
|
49
|
+
const { promptUserText, selectDocsToExclude } = require('./utils/cli');
|
|
50
50
|
const { createProgressBar } = require('./utils/progress-bar');
|
|
51
51
|
const { buildHealthReport, printHealthDashboard } = require('./utils/health-dashboard');
|
|
52
52
|
const { saveHistory, buildHistoryEntry } = require('./utils/learning-loop');
|
|
@@ -92,9 +92,21 @@ async function run() {
|
|
|
92
92
|
|
|
93
93
|
// Phase 3: Services
|
|
94
94
|
bar.setPhase('services');
|
|
95
|
-
const fullCtx = await phaseServices(ctx);
|
|
95
|
+
let fullCtx = await phaseServices(ctx);
|
|
96
96
|
bar.tick('Services ready');
|
|
97
97
|
|
|
98
|
+
// Phase 3.5 (optional): Deep Summary — pre-summarize context docs
|
|
99
|
+
if (fullCtx.opts.deepSummary && fullCtx.ai && fullCtx.contextDocs.length > 0) {
|
|
100
|
+
// Interactive picker: let user choose docs to keep at full fidelity
|
|
101
|
+
if (process.stdin.isTTY && fullCtx.opts.deepSummaryExclude.length === 0) {
|
|
102
|
+
const excluded = await selectDocsToExclude(fullCtx.contextDocs);
|
|
103
|
+
fullCtx.opts.deepSummaryExclude = excluded;
|
|
104
|
+
}
|
|
105
|
+
bar.setPhase('deep-summary', 1);
|
|
106
|
+
fullCtx = await phaseDeepSummary(fullCtx);
|
|
107
|
+
bar.tick('Docs summarized');
|
|
108
|
+
}
|
|
109
|
+
|
|
98
110
|
// Phase 4: Process each media file (video or audio)
|
|
99
111
|
const allSegmentAnalyses = [];
|
|
100
112
|
const allSegmentReports = [];
|
|
@@ -117,6 +129,7 @@ async function run() {
|
|
|
117
129
|
contextDocuments: fullCtx.contextDocs.map(d => d.fileName),
|
|
118
130
|
documentStorageUrls: fullCtx.docStorageUrls,
|
|
119
131
|
firebaseAuthenticated: fullCtx.firebaseReady,
|
|
132
|
+
deepSummary: fullCtx.deepSummaryStats || null,
|
|
120
133
|
files: [],
|
|
121
134
|
};
|
|
122
135
|
|
package/src/services/gemini.js
CHANGED
|
@@ -90,7 +90,7 @@ async function prepareDocsForGemini(ai, docFileList) {
|
|
|
90
90
|
const pollStart = Date.now();
|
|
91
91
|
while (file.state === 'PROCESSING') {
|
|
92
92
|
if (Date.now() - pollStart > GEMINI_POLL_TIMEOUT_MS) {
|
|
93
|
-
console.warn(` ${c.warn(`${name} —
|
|
93
|
+
console.warn(` ${c.warn(`${name} — file is still processing after ${(GEMINI_POLL_TIMEOUT_MS / 1000).toFixed(0)}s, skipping (you can increase the wait time with GEMINI_POLL_TIMEOUT_MS in .env)`)}`);
|
|
94
94
|
file = null;
|
|
95
95
|
break;
|
|
96
96
|
}
|
|
@@ -287,7 +287,7 @@ async function processWithGemini(ai, filePath, displayName, contextDocs = [], pr
|
|
|
287
287
|
const pollStart = Date.now();
|
|
288
288
|
while (uploaded.state === 'PROCESSING') {
|
|
289
289
|
if (Date.now() - pollStart > GEMINI_POLL_TIMEOUT_MS) {
|
|
290
|
-
throw new Error(`
|
|
290
|
+
throw new Error(`File "${displayName}" is still processing after ${(GEMINI_POLL_TIMEOUT_MS / 1000).toFixed(0)}s. Try again or increase the wait time by setting GEMINI_POLL_TIMEOUT_MS in your .env file.`);
|
|
291
291
|
}
|
|
292
292
|
process.stdout.write(` Processing${'.'.repeat((waited % 3) + 1)} \r`);
|
|
293
293
|
await new Promise(r => setTimeout(r, 5000));
|
|
@@ -343,7 +343,7 @@ async function processWithGemini(ai, filePath, displayName, contextDocs = [], pr
|
|
|
343
343
|
buildProgressiveContext(previousAnalyses, userName) || ''
|
|
344
344
|
);
|
|
345
345
|
const docBudget = Math.max(100000, config.GEMINI_CONTEXT_WINDOW - 350000 - prevContextEstimate);
|
|
346
|
-
console.log(`
|
|
346
|
+
console.log(` Reference docs budget: ${(docBudget / 1000).toFixed(0)}K (${contextDocs.length} doc${contextDocs.length !== 1 ? 's' : ''} available)`);
|
|
347
347
|
|
|
348
348
|
const { selected: selectedDocs, excluded, stats } = selectDocsByBudget(
|
|
349
349
|
contextDocs, docBudget, { segmentIndex }
|