task-summary-extractor 8.3.0 → 9.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/.env.example +38 -0
  2. package/ARCHITECTURE.md +99 -3
  3. package/EXPLORATION.md +148 -89
  4. package/QUICK_START.md +5 -2
  5. package/README.md +51 -7
  6. package/bin/taskex.js +11 -4
  7. package/package.json +38 -5
  8. package/src/config.js +52 -3
  9. package/src/modes/focused-reanalysis.js +2 -1
  10. package/src/modes/progress-updater.js +1 -1
  11. package/src/phases/_shared.js +43 -0
  12. package/src/phases/compile.js +101 -0
  13. package/src/phases/deep-dive.js +118 -0
  14. package/src/phases/discover.js +178 -0
  15. package/src/phases/init.js +192 -0
  16. package/src/phases/output.js +238 -0
  17. package/src/phases/process-media.js +633 -0
  18. package/src/phases/services.js +104 -0
  19. package/src/phases/summary.js +86 -0
  20. package/src/pipeline.js +431 -1463
  21. package/src/renderers/docx.js +531 -0
  22. package/src/renderers/html.js +672 -0
  23. package/src/renderers/markdown.js +15 -183
  24. package/src/renderers/pdf.js +90 -0
  25. package/src/renderers/shared.js +211 -0
  26. package/src/schemas/analysis-compiled.schema.json +381 -0
  27. package/src/schemas/analysis-segment.schema.json +380 -0
  28. package/src/services/doc-parser.js +346 -0
  29. package/src/services/gemini.js +101 -44
  30. package/src/services/video.js +123 -8
  31. package/src/utils/adaptive-budget.js +6 -4
  32. package/src/utils/checkpoint.js +2 -1
  33. package/src/utils/cli.js +131 -110
  34. package/src/utils/colors.js +83 -0
  35. package/src/utils/confidence-filter.js +138 -0
  36. package/src/utils/diff-engine.js +2 -1
  37. package/src/utils/global-config.js +6 -5
  38. package/src/utils/health-dashboard.js +11 -9
  39. package/src/utils/json-parser.js +4 -2
  40. package/src/utils/learning-loop.js +3 -2
  41. package/src/utils/progress-bar.js +286 -0
  42. package/src/utils/quality-gate.js +4 -2
  43. package/src/utils/retry.js +3 -1
  44. package/src/utils/schema-validator.js +314 -0
@@ -13,189 +13,12 @@
13
13
 
14
14
  'use strict';
15
15
 
16
- // ════════════════════════════════════════════════════════════
17
- // Name Clustering Utilities
18
- // ════════════════════════════════════════════════════════════
19
-
20
- /**
21
- * Strip parenthetical suffixes and normalize whitespace.
22
- * "Mohamed Elhadi (Service Desk)" → "Mohamed Elhadi"
23
- */
24
- function stripParens(name) {
25
- return (name || '').replace(/\s*\([^)]*\)\s*/g, '').trim();
26
- }
27
-
28
- /**
29
- * Normalize a name to lowercase stripped form for comparison.
30
- */
31
- function normalizeKey(name) {
32
- return stripParens(name).toLowerCase().replace(/\s+/g, ' ').trim();
33
- }
34
-
35
- /**
36
- * Build a Map<canonicalName, Set<rawVariants>> from a list of raw name strings.
37
- * Clustering rules applied in order:
38
- * 1. Exact normalized match (case-insensitive, parens stripped)
39
- * 2. Substring containment after stripping
40
- *
41
- * The canonical name chosen is the longest proper-cased variant (no parens).
42
- */
43
- function clusterNames(rawNames) {
44
- const clusters = new Map(); // normKey → { canonical, variants: Set }
45
- const normToCluster = new Map(); // normKey → cluster ref
46
-
47
- for (const raw of rawNames) {
48
- const stripped = stripParens(raw).trim();
49
- if (!stripped) continue;
50
- const nk = normalizeKey(raw);
51
-
52
- // Check exact match first
53
- if (normToCluster.has(nk)) {
54
- const c = normToCluster.get(nk);
55
- c.variants.add(raw);
56
- // Upgrade canonical: prefer longest proper-cased form without parens
57
- if (stripped.length >= c.canonical.length && stripped[0] === stripped[0].toUpperCase()) {
58
- c.canonical = stripped;
59
- }
60
- continue;
61
- }
62
-
63
- // Check substring containment against existing clusters
64
- let merged = false;
65
- for (const [existNk, c] of normToCluster) {
66
- if (existNk.includes(nk) || nk.includes(existNk)) {
67
- c.variants.add(raw);
68
- normToCluster.set(nk, c);
69
- if (stripped.length >= c.canonical.length && stripped[0] === stripped[0].toUpperCase()) {
70
- c.canonical = stripped;
71
- }
72
- merged = true;
73
- break;
74
- }
75
- }
76
- if (merged) continue;
77
-
78
- // New cluster
79
- const cluster = { canonical: stripped[0] === stripped[0].toUpperCase() ? stripped : raw, variants: new Set([raw]) };
80
- clusters.set(nk, cluster);
81
- normToCluster.set(nk, cluster);
82
- }
83
-
84
- // Build final map: canonical → Set of raw variants
85
- const result = new Map();
86
- for (const c of clusters.values()) {
87
- if (!result.has(c.canonical)) result.set(c.canonical, new Set());
88
- for (const v of c.variants) {
89
- result.get(c.canonical).add(v);
90
- }
91
- }
92
- return result;
93
- }
94
-
95
- /**
96
- * Given a raw name and a cluster map, return the canonical form.
97
- */
98
- function resolve(name, clusterMap) {
99
- if (!name) return name;
100
- const nk = normalizeKey(name);
101
- for (const [canonical, variants] of clusterMap) {
102
- for (const v of variants) {
103
- if (normalizeKey(v) === nk) return canonical;
104
- }
105
- // substring fallback
106
- const cnk = normalizeKey(canonical);
107
- if (cnk.includes(nk) || nk.includes(cnk)) return canonical;
108
- }
109
- return stripParens(name).trim() || name;
110
- }
111
-
112
- // ════════════════════════════════════════════════════════════
113
- // Dedup Utilities
114
- // ════════════════════════════════════════════════════════════
115
-
116
- /** Deduplicate an array by a key function. First occurrence wins (keeps richest data by default). */
117
- function dedupBy(arr, keyFn) {
118
- const seen = new Map();
119
- const result = [];
120
- for (const item of arr) {
121
- const k = keyFn(item);
122
- if (!k) { result.push(item); continue; }
123
- if (seen.has(k)) {
124
- // Merge: overwrite sparse fields from later duplicates
125
- const existing = seen.get(k);
126
- for (const [field, val] of Object.entries(item)) {
127
- if (val && !existing[field]) existing[field] = val;
128
- }
129
- continue;
130
- }
131
- seen.set(k, item);
132
- result.push(item);
133
- }
134
- return result;
135
- }
136
-
137
- /** Normalize a description for fuzzy matching — strips file paths, parenthetical details, punctuation. */
138
- function normalizeDesc(s) {
139
- return (s || '')
140
- .toLowerCase()
141
- // Strip full file paths (keep only the last segment, e.g. "Notifications.cs")
142
- .replace(/[\w\-./\\]+\/[\w\-./\\]+\.(cs|ts|js|json|html|resx|png|md)/g, m => {
143
- const parts = m.replace(/\\/g, '/').split('/');
144
- return parts[parts.length - 1];
145
- })
146
- // Strip parenthetical additions like "(likely code/config ...)"
147
- .replace(/\([^)]*\)/g, '')
148
- // Strip trailing punctuation and whitespace before punctuation
149
- .replace(/\s+([.,;:!?])/g, '$1')
150
- // Collapse whitespace
151
- .replace(/\s+/g, ' ')
152
- .trim();
153
- }
154
-
155
- /** Deduplicate by description text similarity (fallback when IDs are missing). */
156
- function dedupByDesc(arr, descField = 'description') {
157
- const seen = new Set();
158
- return arr.filter(item => {
159
- const key = normalizeDesc(item[descField]);
160
- if (!key || seen.has(key)) return false;
161
- seen.add(key);
162
- return true;
163
- });
164
- }
165
-
166
- // ════════════════════════════════════════════════════════════
167
- // Main Renderer
168
- // ════════════════════════════════════════════════════════════
169
-
170
- /** Format a timestamp string for display, optionally with segment number. */
171
- function fmtTs(ts, seg) {
172
- if (!ts) return '';
173
- if (seg) return `\`${ts}\` _(Seg ${seg})_`;
174
- return `\`${ts}\``;
175
- }
176
-
177
- /** Make a compact priority badge */
178
- function priBadge(p) {
179
- if (!p) return '';
180
- const icons = { high: '🔴', medium: '🟡', low: '🟢', critical: '🔴' };
181
- return ` ${icons[p] || '⚪'} \`${p}\``;
182
- }
183
-
184
- /** Make a compact confidence badge */
185
- function confBadge(c) {
186
- if (!c) return '';
187
- const icons = { HIGH: '🟢', MEDIUM: '🟡', LOW: '🔴' };
188
- return ` ${icons[c] || '⚪'}\`${c}\``;
189
- }
190
-
191
- /** Make a confidence badge with reason tooltip */
192
- function confBadgeFull(c, reason) {
193
- if (!c) return '';
194
- const icons = { HIGH: '🟢', MEDIUM: '🟡', LOW: '🔴' };
195
- const badge = `${icons[c] || '⚪'}\`${c}\``;
196
- if (reason) return ` ${badge} _(${reason})_`;
197
- return ` ${badge}`;
198
- }
16
+ // Shared renderer utilities (name clustering, dedup, badges)
17
+ const {
18
+ stripParens, normalizeKey, clusterNames, resolve,
19
+ dedupBy, normalizeDesc, dedupByDesc,
20
+ fmtTs, priBadge, confBadge, confBadgeFull,
21
+ } = require('./shared');
199
22
 
200
23
  /**
201
24
  * Render the final compiled analysis into a comprehensive Markdown report.
@@ -290,6 +113,15 @@ function renderResultsMarkdown({ compiled, meta }) {
290
113
  }
291
114
  ln('');
292
115
 
116
+ // Confidence filter notice
117
+ if (compiled._filterMeta && compiled._filterMeta.minConfidence !== 'LOW') {
118
+ const fm = compiled._filterMeta;
119
+ const levelLabel = fm.minConfidence === 'HIGH' ? 'HIGH' : 'MEDIUM and HIGH';
120
+ ln(`> ⚠️ **Confidence filter active:** showing only ${levelLabel} confidence items. `);
121
+ ln(`> Kept ${fm.filteredCounts.total}/${fm.originalCounts.total} items (${fm.removed} removed). Full unfiltered data in \`results.json\`. `);
122
+ ln('');
123
+ }
124
+
293
125
  // Segment breakdown
294
126
  const segs = meta.segments || [];
295
127
  if (segs.length > 0) {
@@ -0,0 +1,90 @@
1
+ /**
2
+ * PDF renderer — converts the HTML report to a self-contained PDF.
3
+ *
4
+ * Uses Puppeteer (headless Chrome) to render the full HTML report,
5
+ * preserving all styling, collapsible sections, and visual layout.
6
+ *
7
+ * Usage:
8
+ * const { renderResultsPdf } = require('./pdf');
9
+ * await renderResultsPdf(htmlContent, outputPath);
10
+ */
11
+
12
+ 'use strict';
13
+
14
+ const { c } = require('../utils/colors');
15
+
16
/**
 * Render an HTML string to a PDF file using Puppeteer (headless Chrome).
 *
 * Puppeteer is loaded lazily inside the function so that the rest of the
 * package keeps working when the optional dependency is not installed.
 *
 * @param {string} htmlContent - The full self-contained HTML string
 * @param {string} outputPath - Absolute path for the output .pdf file
 * @param {object} [options] - PDF options
 * @param {string} [options.format='A4'] - Page size (A4, Letter, etc.)
 * @param {boolean} [options.landscape=false] - Landscape orientation
 * @param {string} [options.margin='1.5cm'] - CSS margin applied to all four
 *   sides and to the horizontal padding of the page footer
 * @returns {Promise<{ path: string, pages: number, bytes: number }>}
 *   `pages` is a best-effort count obtained by scanning the PDF bytes for page
 *   objects; it is 0 when none are found.
 * @throws {Error} When puppeteer is missing or rendering fails. The original
 *   error is attached as `cause`.
 */
async function renderResultsPdf(htmlContent, outputPath, options = {}) {
  let browser;
  try {
    const puppeteer = require('puppeteer');
    // NOTE(review): '--no-sandbox' disables Chromium's sandbox. That is only
    // acceptable while htmlContent is fully generated/escaped by this package;
    // confirm no untrusted markup can reach this renderer.
    browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
    });

    const page = await browser.newPage();

    // Set HTML content with generous timeout for complex reports
    await page.setContent(htmlContent, {
      waitUntil: 'networkidle0',
      timeout: 30000,
    });

    // Expand all collapsible sections so they appear in PDF
    await page.evaluate(() => {
      document.querySelectorAll('details').forEach(d => d.setAttribute('open', ''));
    });

    // Generate PDF. The footer padding follows the configured margin so the
    // footer text stays aligned with the page body.
    const margin = options.margin || '1.5cm';
    const pdfData = await page.pdf({
      path: outputPath,
      format: options.format || 'A4',
      landscape: !!options.landscape,
      printBackground: true,
      margin: { top: margin, right: margin, bottom: margin, left: margin },
      displayHeaderFooter: true,
      headerTemplate: '<div></div>',
      footerTemplate: `
        <div style="font-size: 9px; color: #888; width: 100%; text-align: center; padding: 0 ${margin};">
          <span>Generated by taskex</span>
          <span style="float: right;">Page <span class="pageNumber"></span> of <span class="totalPages"></span></span>
        </div>
      `,
    });

    // Depending on the Puppeteer version, page.pdf() resolves to a Buffer or a
    // plain Uint8Array. Uint8Array#toString() ignores the encoding argument,
    // so normalise to a Buffer before decoding.
    const pdfBuffer = Buffer.isBuffer(pdfData)
      ? pdfData
      : Buffer.from(pdfData.buffer, pdfData.byteOffset, pdfData.byteLength);

    // Count "/Type /Page" objects; \b keeps "/Pages" (the page tree) out.
    const pages = pdfBuffer.toString('latin1').match(/\/Type\s*\/Page\b/g)?.length || 0;

    return {
      path: outputPath,
      pages,
      bytes: pdfBuffer.length,
    };
  } catch (err) {
    const wrapped = err.code === 'MODULE_NOT_FOUND'
      ? new Error(
        'PDF generation requires puppeteer. Install it with: npm install puppeteer\n' +
        '  This downloads a bundled Chromium (~300MB) for rendering.'
      )
      : new Error(`PDF generation failed: ${err.message}`);
    // Keep the original error (and its stack) reachable for debugging.
    wrapped.cause = err;
    throw wrapped;
  } finally {
    if (browser) {
      // Best-effort cleanup: a failing close must not mask the real outcome.
      await browser.close().catch(() => {});
    }
  }
}
89
+
90
+ module.exports = { renderResultsPdf };
@@ -0,0 +1,211 @@
1
+ /**
2
+ * Shared renderer utilities — name clustering, deduplication, badge formatting.
3
+ *
4
+ * Extracted from markdown.js so both the Markdown and HTML renderers
5
+ * share the same data normalisation logic.
6
+ */
7
+
8
+ 'use strict';
9
+
10
+ // ════════════════════════════════════════════════════════════
11
+ // Name Clustering Utilities
12
+ // ════════════════════════════════════════════════════════════
13
+
14
/**
 * Strip parenthetical groups from a name and trim the result.
 *   "Mohamed Elhadi (Service Desk)" → "Mohamed Elhadi"
 *   "Mohamed (Mo) Elhadi"           → "Mohamed Elhadi"
 *
 * A run of one or more parenthetical groups is removed together with the
 * whitespace around it. When the run had whitespace on BOTH sides, a single
 * space is left behind so the neighbouring words are not glued together.
 *
 * @param {string} [name] - Raw name; falsy values are treated as ''.
 * @returns {string} The name without parenthetical groups, trimmed.
 */
function stripParens(name) {
  return (name || '')
    .replace(/(?:\s*\([^)]*\))+\s*/g, (match) => (
      /^\s/.test(match) && /\s$/.test(match) ? ' ' : ''
    ))
    .trim();
}
21
+
22
/**
 * Build the comparison key for a name: parentheticals removed (via
 * stripParens), lower-cased, whitespace runs collapsed to one space, trimmed.
 *
 * @param {string} [name] - Raw name.
 * @returns {string} Normalized key; '' when nothing is left.
 */
function normalizeKey(name) {
  const bare = stripParens(name);
  const lowered = bare.toLowerCase();
  const singleSpaced = lowered.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
28
+
29
/**
 * Group raw name strings into clusters and return
 * Map<canonicalName, Set<rawVariants>>.
 *
 * For each raw name, in input order:
 *   1. Names that are empty after stripParens are skipped.
 *   2. If its normalized key is already known, it joins that cluster.
 *   3. Otherwise it joins the first known cluster (insertion order) whose key
 *      contains its key or is contained in it, and its key is registered as an
 *      alias for that cluster.
 *   4. Otherwise it starts a new cluster.
 *
 * When a name joins an existing cluster, it replaces the cluster's canonical
 * name if its stripped form is at least as long as the current canonical and
 * its first character equals its own upper-cased form. A new cluster uses the
 * stripped form as canonical under the same first-character test, else the
 * raw string.
 *
 * @param {Iterable<string>} rawNames - Raw name strings.
 * @returns {Map<string, Set<string>>} canonical name → raw variants.
 */
function clusterNames(rawNames) {
  const allClusters = [];      // clusters in creation order
  const keyIndex = new Map();  // normalized key → cluster

  for (const raw of rawNames) {
    const stripped = stripParens(raw).trim();
    if (!stripped) continue;

    const key = normalizeKey(raw);
    const startsUpper = stripped[0] === stripped[0].toUpperCase();

    // Exact key hit first, then first containment hit in insertion order.
    let target = keyIndex.get(key);
    if (target === undefined) {
      for (const [knownKey, candidate] of keyIndex) {
        if (knownKey.includes(key) || key.includes(knownKey)) {
          target = candidate;
          keyIndex.set(key, candidate);
          break;
        }
      }
    }

    if (target === undefined) {
      const fresh = { canonical: startsUpper ? stripped : raw, variants: new Set([raw]) };
      allClusters.push(fresh);
      keyIndex.set(key, fresh);
      continue;
    }

    target.variants.add(raw);
    if (startsUpper && stripped.length >= target.canonical.length) {
      target.canonical = stripped;
    }
  }

  // Clusters that ended up with the same canonical name are merged here.
  const byCanonical = new Map();
  for (const { canonical, variants } of allClusters) {
    let bucket = byCanonical.get(canonical);
    if (bucket === undefined) {
      bucket = new Set();
      byCanonical.set(canonical, bucket);
    }
    variants.forEach((variant) => bucket.add(variant));
  }
  return byCanonical;
}
83
+
84
/**
 * Given a raw name and a cluster map (as produced by clusterNames), return
 * the canonical form.
 *
 * Clusters are checked in map order. For each one, the name matches if its
 * normalized key equals the key of any variant, or if its key and the
 * canonical name's key contain one another.
 *
 * @param {string} name - Raw name to resolve; falsy values are returned as-is.
 * @param {Map<string, Iterable<string>>} clusterMap - canonical → variants.
 * @returns {string} The matching canonical name, else the name with
 *   parentheticals stripped, else the original name.
 */
function resolve(name, clusterMap) {
  if (!name) return name;
  const nk = normalizeKey(name);

  // A name that is only a parenthetical (e.g. "(unknown)") has an empty key.
  // ''.includes-style checks below would match every cluster, so bail out
  // with the same value the no-match fallback produces for such a name.
  if (!nk) return stripParens(name).trim() || name;

  for (const [canonical, variants] of clusterMap) {
    for (const v of variants) {
      if (normalizeKey(v) === nk) return canonical;
    }
    // substring fallback
    const cnk = normalizeKey(canonical);
    if (cnk.includes(nk) || nk.includes(cnk)) return canonical;
  }
  return stripParens(name).trim() || name;
}
99
+
100
+ // ════════════════════════════════════════════════════════════
101
+ // Dedup Utilities
102
+ // ════════════════════════════════════════════════════════════
103
+
104
/**
 * Deduplicate an array by a key function.
 *
 * The first item seen for a key is kept. Later items with the same key are
 * dropped, but each of their truthy fields is copied onto the kept item where
 * the kept item's field is falsy (the kept item is mutated in place).
 * Items whose key is falsy are always kept and never merged.
 *
 * @param {Array<object>} arr - Items to deduplicate.
 * @param {(item: object) => *} keyFn - Returns the dedup key for an item.
 * @returns {Array<object>} Kept items in their original order.
 */
function dedupBy(arr, keyFn) {
  const firstByKey = new Map();
  const unique = [];

  for (const entry of arr) {
    const key = keyFn(entry);

    if (key && firstByKey.has(key)) {
      const keeper = firstByKey.get(key);
      Object.entries(entry).forEach(([field, value]) => {
        if (value && !keeper[field]) keeper[field] = value;
      });
      continue;
    }

    if (key) firstByKey.set(key, entry);
    unique.push(entry);
  }

  return unique;
}
123
+
124
/**
 * Normalize a description for fuzzy matching.
 *
 * Steps, in order: lower-case; reduce file paths ending in a known extension
 * to their last segment; remove parenthetical groups; remove whitespace that
 * precedes punctuation; collapse whitespace runs; trim.
 *
 * @param {string} [s] - Description text; falsy values are treated as ''.
 * @returns {string} Normalized description.
 */
function normalizeDesc(s) {
  const lowered = (s || '').toLowerCase();

  // Keep only the file name of a path, e.g. "src/app/foo.cs" → "foo.cs".
  const withoutDirs = lowered.replace(
    /[\w\-./\\]+\/[\w\-./\\]+\.(cs|ts|js|json|html|resx|png|md)/g,
    (path) => {
      const segments = path.replace(/\\/g, '/').split('/');
      return segments[segments.length - 1];
    },
  );

  const withoutParens = withoutDirs.replace(/\([^)]*\)/g, '');
  const tightPunctuation = withoutParens.replace(/\s+([.,;:!?])/g, '$1');
  return tightPunctuation.replace(/\s+/g, ' ').trim();
}
137
+
138
/**
 * Deduplicate items by their normalized description text.
 *
 * Keeps the first item for each normalized description (see normalizeDesc).
 * Items whose normalized description is empty are dropped as well.
 *
 * @param {Array<object>} arr - Items to deduplicate.
 * @param {string} [descField='description'] - Field holding the description.
 * @returns {Array<object>} New array with the kept items in original order.
 */
function dedupByDesc(arr, descField = 'description') {
  const fingerprints = new Set();
  return arr.reduce((kept, item) => {
    const fingerprint = normalizeDesc(item[descField]);
    if (fingerprint && !fingerprints.has(fingerprint)) {
      fingerprints.add(fingerprint);
      kept.push(item);
    }
    return kept;
  }, []);
}
148
+
149
+ // ════════════════════════════════════════════════════════════
150
+ // Badge / Formatting Utilities
151
+ // ════════════════════════════════════════════════════════════
152
+
153
/**
 * Format a timestamp as inline Markdown code, optionally followed by an
 * italic segment marker.
 *
 * @param {string} [ts] - Timestamp text; falsy yields ''.
 * @param {number|string} [seg] - Segment number; omitted from output when falsy.
 * @returns {string} Markdown fragment.
 */
function fmtTs(ts, seg) {
  if (!ts) return '';
  const stamp = `\`${ts}\``;
  return seg ? `${stamp} _(Seg ${seg})_` : stamp;
}
159
+
160
/**
 * Build a compact Markdown priority badge: leading space, icon, space, then
 * the priority as inline code. Lookup is case-sensitive on lower-case keys;
 * unknown priorities get the white-circle icon.
 *
 * @param {string} [p] - Priority label; falsy yields ''.
 * @returns {string} Badge fragment.
 */
function priBadge(p) {
  if (!p) return '';
  const iconByPriority = { high: '🔴', medium: '🟡', low: '🟢', critical: '🔴' };
  const icon = iconByPriority[p] || '⚪';
  return ` ${icon} \`${p}\``;
}
166
+
167
/**
 * Build a compact Markdown confidence badge: leading space, icon, then the
 * level as inline code (no space between icon and code). Lookup is
 * case-sensitive on upper-case keys; unknown levels get the white-circle icon.
 *
 * @param {string} [c] - Confidence level; falsy yields ''.
 * @returns {string} Badge fragment.
 */
function confBadge(c) {
  if (!c) return '';
  const iconByLevel = { HIGH: '🟢', MEDIUM: '🟡', LOW: '🔴' };
  const icon = iconByLevel[c] || '⚪';
  return ` ${icon}\`${c}\``;
}
173
+
174
/**
 * Build a Markdown confidence badge, optionally followed by the reason in
 * italics and parentheses.
 *
 * @param {string} [c] - Confidence level; falsy yields ''.
 * @param {string} [reason] - Explanation appended when truthy.
 * @returns {string} Badge fragment with a leading space.
 */
function confBadgeFull(c, reason) {
  if (!c) return '';
  const iconByLevel = { HIGH: '🟢', MEDIUM: '🟡', LOW: '🔴' };
  const badge = `${iconByLevel[c] || '⚪'}\`${c}\``;
  return reason ? ` ${badge} _(${reason})_` : ` ${badge}`;
}
182
+
183
+ // ════════════════════════════════════════════════════════════
184
+ // HTML-safe escaping
185
+ // ════════════════════════════════════════════════════════════
186
+
187
/**
 * Escape a value for safe insertion into HTML text or attribute values.
 *
 * Replaces & < > " ' with their entities. Falsy input (null, undefined, '',
 * false, NaN) yields '', with one exception: numeric zero is rendered as "0"
 * so that counts and indexes are not silently dropped from the output.
 *
 * @param {*} s - Value to escape; non-strings are converted with String().
 * @returns {string} Escaped text.
 */
function escHtml(s) {
  if (!s && s !== 0) return '';
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  // Single pass, so the '&' introduced by one entity is never re-escaped.
  return String(s).replace(/[&<>"']/g, (ch) => entities[ch]);
}
197
+
198
+ module.exports = {
199
+ stripParens,
200
+ normalizeKey,
201
+ clusterNames,
202
+ resolve,
203
+ dedupBy,
204
+ normalizeDesc,
205
+ dedupByDesc,
206
+ fmtTs,
207
+ priBadge,
208
+ confBadge,
209
+ confBadgeFull,
210
+ escHtml,
211
+ };