spec-agent 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/README.md +256 -0
  2. package/bin/spec-agent.js +14 -0
  3. package/dist/commands/analyze.d.ts +16 -0
  4. package/dist/commands/analyze.d.ts.map +1 -0
  5. package/dist/commands/analyze.js +283 -0
  6. package/dist/commands/analyze.js.map +1 -0
  7. package/dist/commands/clean.d.ts +9 -0
  8. package/dist/commands/clean.d.ts.map +1 -0
  9. package/dist/commands/clean.js +109 -0
  10. package/dist/commands/clean.js.map +1 -0
  11. package/dist/commands/dispatch.d.ts +12 -0
  12. package/dist/commands/dispatch.d.ts.map +1 -0
  13. package/dist/commands/dispatch.js +232 -0
  14. package/dist/commands/dispatch.js.map +1 -0
  15. package/dist/commands/doctor.d.ts +9 -0
  16. package/dist/commands/doctor.d.ts.map +1 -0
  17. package/dist/commands/doctor.js +153 -0
  18. package/dist/commands/doctor.js.map +1 -0
  19. package/dist/commands/learn.d.ts +13 -0
  20. package/dist/commands/learn.d.ts.map +1 -0
  21. package/dist/commands/learn.js +234 -0
  22. package/dist/commands/learn.js.map +1 -0
  23. package/dist/commands/merge.d.ts +11 -0
  24. package/dist/commands/merge.d.ts.map +1 -0
  25. package/dist/commands/merge.js +335 -0
  26. package/dist/commands/merge.js.map +1 -0
  27. package/dist/commands/pipeline.d.ts +19 -0
  28. package/dist/commands/pipeline.d.ts.map +1 -0
  29. package/dist/commands/pipeline.js +266 -0
  30. package/dist/commands/pipeline.js.map +1 -0
  31. package/dist/commands/plan.d.ts +13 -0
  32. package/dist/commands/plan.d.ts.map +1 -0
  33. package/dist/commands/plan.js +314 -0
  34. package/dist/commands/plan.js.map +1 -0
  35. package/dist/commands/scan.d.ts +28 -0
  36. package/dist/commands/scan.d.ts.map +1 -0
  37. package/dist/commands/scan.js +488 -0
  38. package/dist/commands/scan.js.map +1 -0
  39. package/dist/commands/status.d.ts +8 -0
  40. package/dist/commands/status.d.ts.map +1 -0
  41. package/dist/commands/status.js +146 -0
  42. package/dist/commands/status.js.map +1 -0
  43. package/dist/index.d.ts +2 -0
  44. package/dist/index.d.ts.map +1 -0
  45. package/dist/index.js +126 -0
  46. package/dist/index.js.map +1 -0
  47. package/dist/services/document-parser.d.ts +49 -0
  48. package/dist/services/document-parser.d.ts.map +1 -0
  49. package/dist/services/document-parser.js +499 -0
  50. package/dist/services/document-parser.js.map +1 -0
  51. package/dist/services/llm.d.ts +61 -0
  52. package/dist/services/llm.d.ts.map +1 -0
  53. package/dist/services/llm.js +716 -0
  54. package/dist/services/llm.js.map +1 -0
  55. package/dist/types.d.ts +159 -0
  56. package/dist/types.d.ts.map +1 -0
  57. package/dist/types.js +4 -0
  58. package/dist/types.js.map +1 -0
  59. package/dist/utils/file.d.ts +10 -0
  60. package/dist/utils/file.d.ts.map +1 -0
  61. package/dist/utils/file.js +96 -0
  62. package/dist/utils/file.js.map +1 -0
  63. package/dist/utils/logger.d.ts +13 -0
  64. package/dist/utils/logger.d.ts.map +1 -0
  65. package/dist/utils/logger.js +55 -0
  66. package/dist/utils/logger.js.map +1 -0
  67. package/package.json +48 -0
  68. package/scripts/publish-npm.js +174 -0
  69. package/spec-agent-implementation.md +750 -0
  70. package/src/commands/analyze.ts +322 -0
  71. package/src/commands/clean.ts +88 -0
  72. package/src/commands/dispatch.ts +250 -0
  73. package/src/commands/doctor.ts +136 -0
  74. package/src/commands/learn.ts +261 -0
  75. package/src/commands/merge.ts +377 -0
  76. package/src/commands/pipeline.ts +306 -0
  77. package/src/commands/plan.ts +331 -0
  78. package/src/commands/scan.ts +568 -0
  79. package/src/commands/status.ts +129 -0
  80. package/src/index.ts +137 -0
  81. package/src/services/document-parser.ts +548 -0
  82. package/src/services/llm.ts +857 -0
  83. package/src/types.ts +161 -0
  84. package/src/utils/file.ts +60 -0
  85. package/src/utils/logger.ts +58 -0
  86. package/tsconfig.json +19 -0
package/src/index.ts ADDED
@@ -0,0 +1,137 @@
1
+ import { Command } from 'commander';
2
+ import chalk from 'chalk';
3
+ import { scanCommand } from './commands/scan';
4
+ import { analyzeCommand } from './commands/analyze';
5
+ import { mergeCommand } from './commands/merge';
6
+ import { planCommand } from './commands/plan';
7
+ import { dispatchCommand } from './commands/dispatch';
8
+ import { learnCommand } from './commands/learn';
9
+ import { pipelineCommand } from './commands/pipeline';
10
+ import { statusCommand } from './commands/status';
11
+ import { cleanCommand } from './commands/clean';
12
+ import { doctorCommand } from './commands/doctor';
13
+
14
+ const program = new Command();
15
+
16
+ program
17
+ .name('spec-agent')
18
+ .description('Multi-agent CLI tool for breaking down large requirement documents')
19
+ .version('1.0.0');
20
+
21
+ program
22
+ .command('scan')
23
+ .description('Scan documents and create chunk manifest')
24
+ .option('-i, --input <path>', 'Input file or directory')
25
+ .option('--stdin', 'Read file paths from stdin')
26
+ .option('-o, --output <path>', 'Output path for manifest.json', 'manifest.json')
27
+ .option('-c, --chunk-size <size>', 'Maximum chunk size for LLM analysis (default: 200kb)', '200kb')
28
+ .option('--min-chunk-size <size>', 'Minimum chunk size - smaller chunks will be merged (default: 10kb)', '10kb')
29
+ .option('--no-llm-chunking', 'Disable LLM-driven document structure analysis (use rule-based only)')
30
+ .option('--strict-llm', 'Fail if LLM chunking fails (no fallback)')
31
+ .option('-f, --format <format>', 'Input format: md, pdf, docx, html, auto', 'auto')
32
+ .option('--dry-run', 'Preview scan plan without creating manifest')
33
+ .option('-y, --yes', 'Skip confirmation prompts')
34
+ .action(scanCommand);
35
+
36
+ program
37
+ .command('analyze')
38
+ .description('Analyze document chunks in parallel using multiple agents')
39
+ .option('-m, --manifest <path>', 'Path to manifest.json', 'manifest.json')
40
+ .option('-o, --output <dir>', 'Output directory for summaries', 'summaries')
41
+ .option('-a, --agents <count>', 'Number of parallel agents', 'auto')
42
+ .option('--chunks <indices>', 'Comma-separated chunk indices to analyze')
43
+ .option('--focus <type>', 'Analysis focus: full, features, data-model, api, pages', 'full')
44
+ .option('--apply-learned', 'Apply learned patterns from previous runs')
45
+ .option('--retries <count>', 'Retry attempts per failed chunk (default: 1)', '1')
46
+ .option('--budget-tokens <count>', 'Max total tokens for analyze run (0 = unlimited)', '0')
47
+ .option('--dry-run', 'Preview analysis plan without running')
48
+ .option('-y, --yes', 'Skip confirmation prompts')
49
+ .action(analyzeCommand);
50
+
51
+ program
52
+ .command('merge')
53
+ .description('Merge chunk summaries into unified spec')
54
+ .option('-s, --summaries <dir>', 'Directory containing summaries', 'summaries')
55
+ .option('-o, --output <path>', 'Output path for spec_summary.json', 'spec_summary.json')
56
+ .option('--strategy <strategy>', 'Merge strategy: conservative, aggressive', 'conservative')
57
+ .option('--dry-run', 'Preview merge plan without running')
58
+ .option('-y, --yes', 'Skip confirmation prompts')
59
+ .action(mergeCommand);
60
+
61
+ program
62
+ .command('plan')
63
+ .description('Generate executable task plan from spec')
64
+ .option('-s, --spec <path>', 'Path to spec_summary.json', 'spec_summary.json')
65
+ .option('-o, --output <path>', 'Output path for task_plan.json', 'task_plan.json')
66
+ .option('-t, --type <type>', 'Output type: prototype, code, docs', 'prototype')
67
+ .option('--framework <fw>', 'Target framework: vue3, react, html', 'vue3')
68
+ .option('-p, --parallel <count>', 'Max parallel tasks', '3')
69
+ .option('--dry-run', 'Preview task plan without generating')
70
+ .option('-y, --yes', 'Skip confirmation prompts')
71
+ .action(planCommand);
72
+
73
+ program
74
+ .command('dispatch')
75
+ .description('Dispatch tasks to specialized agents')
76
+ .option('-p, --plan <path>', 'Path to task_plan.json', 'task_plan.json')
77
+ .option('-o, --output <path>', 'Output path for dispatch_plan.json', 'dispatch_plan.json')
78
+ .option('-a, --agents <mapping>', 'Agent type mapping, e.g. frontend:2,backend:2')
79
+ .option('--strategy <strategy>', 'Dispatch strategy: balanced, skill-first, load-first', 'balanced')
80
+ .option('--dry-run', 'Preview dispatch plan without executing')
81
+ .option('-y, --yes', 'Skip confirmation prompts')
82
+ .action(dispatchCommand);
83
+
84
+ program
85
+ .command('learn')
86
+ .description('Learn and accumulate project-specific patterns')
87
+ .option('-w, --workspace <dir>', 'Workspace directory', '.')
88
+ .option('--from <phase>', 'Learn from: summaries, plan, dispatch', 'summaries')
89
+ .option('--pattern <name>', 'Pattern name to learn')
90
+ .option('--rule <value>', 'Pattern rule value')
91
+ .option('--list', 'List all learned patterns')
92
+ .option('--export <path>', 'Export patterns to JSON file')
93
+ .option('--apply', 'Apply learned patterns to current workspace')
94
+ .action(learnCommand);
95
+
96
+ program
97
+ .command('pipeline')
98
+ .description('Run full pipeline: scan → analyze → merge → plan → dispatch')
99
+ .option('-i, --input <path>', 'Input file or directory')
100
+ .option('-o, --output <dir>', 'Output directory', 'workspace')
101
+ .option('-a, --agents <count>', 'Max parallel agents', 'auto')
102
+ .option('-c, --chunk-size <size>', 'Max chunk size for LLM analysis (default: 200kb)', '200kb')
103
+ .option('--min-chunk-size <size>', 'Minimum chunk size - smaller chunks will be merged (default: 10kb)', '10kb')
104
+ .option('--analyze-retries <count>', 'Retry attempts per failed analyze chunk (default: 1)', '1')
105
+ .option('--analyze-budget-tokens <count>', 'Max total tokens for analyze in pipeline (0 = unlimited)', '0')
106
+ .option('--strict-llm', 'Fail if LLM chunking fails (no fallback)')
107
+ .option('--framework <fw>', 'Target framework', 'vue3')
108
+ .option('--stop-at <phase>', 'Stop after phase: scan, analyze, merge, plan, dispatch')
109
+ .option('--from <phase>', 'Resume from phase: scan, analyze, merge, plan, dispatch')
110
+ .option('--dry-run', 'Preview full pipeline without executing')
111
+ .option('-y, --yes', 'Skip all confirmation prompts')
112
+ .action(pipelineCommand);
113
+
114
+ program
115
+ .command('status')
116
+ .description('Check workspace status')
117
+ .option('-w, --workspace <dir>', 'Workspace directory', '.')
118
+ .option('--format <format>', 'Output format: text, json', 'text')
119
+ .action(statusCommand);
120
+
121
+ program
122
+ .command('clean')
123
+ .description('Clean workspace intermediate files')
124
+ .option('-w, --workspace <dir>', 'Workspace directory', '.')
125
+ .option('--dry-run', 'Preview what would be cleaned')
126
+ .option('-y, --yes', 'Skip confirmation')
127
+ .action(cleanCommand);
128
+
129
+ program
130
+ .command('doctor')
131
+ .description('Run environment and configuration health checks')
132
+ .option('-w, --workspace <dir>', 'Workspace directory', '.')
133
+ .option('--check-llm', 'Run a lightweight real LLM connectivity test')
134
+ .option('--format <format>', 'Output format: text, json', 'text')
135
+ .action(doctorCommand);
136
+
137
+ program.parse();
@@ -0,0 +1,548 @@
1
+ import * as fs from 'fs-extra';
2
+ import * as path from 'path';
3
+ import { execFile } from 'child_process';
4
+ import { promisify } from 'util';
5
+
6
+ const execFileAsync = promisify(execFile);
7
+
8
/**
 * Result of parsing a single document file. All supported formats are
 * normalized to Markdown before downstream chunking.
 */
export interface ParsedDocument {
  content: string;          // normalized Markdown text, with base64 images replaced by placeholders
  format: 'markdown' | 'text' | 'html'; // format tag; every parser in this file emits 'markdown'
  images?: EmbeddedImage[]; // base64 images lifted out of the content, if any
  metadata: {
    title?: string;     // first H1 heading when detectable (populated by the DOCX parser)
    author?: string;    // NOTE(review): never populated by the parsers in this file — confirm intended
    pages?: number;     // page count (PDF parser only)
    wordCount?: number; // whitespace-delimited token count of the extracted text
  };
}

/** One base64 data-URI image extracted from document content. */
export interface EmbeddedImage {
  id: string;            // placeholder id referenced in the content, e.g. "IMG0001" / "PDFIMG0001"
  alt: string;           // alt text (may be empty)
  mimeType: string;      // e.g. "image/png"
  estimatedSize: number; // approximate decoded size in bytes (base64 length * 0.75)
  dataUri: string;       // full "data:image/...;base64,..." URI
}
27
+
28
+ /**
29
+ * Parse a document file and extract text content as Markdown
30
+ * Supports: .md, .txt, .html, .pdf, .docx
31
+ *
32
+ * All formats are normalized to Markdown for consistent chunking:
33
+ * - Headings become # ## ###
34
+ * - Lists become - or 1.
35
+ * - Tables become Markdown tables
36
+ */
37
+ export async function parseDocument(filePath: string): Promise<ParsedDocument> {
38
+ const ext = path.extname(filePath).toLowerCase();
39
+ const buffer = await fs.readFile(filePath);
40
+
41
+ switch (ext) {
42
+ case '.md':
43
+ return parseMarkdownFile(buffer);
44
+ case '.txt':
45
+ return parseTextFile(buffer);
46
+ case '.html':
47
+ case '.htm':
48
+ return parseHtmlToMarkdown(buffer);
49
+ case '.pdf':
50
+ return parsePdfToMarkdown(buffer, filePath);
51
+ case '.docx':
52
+ return parseDocxToMarkdown(buffer);
53
+ default:
54
+ // Try to read as text
55
+ return parseTextFile(buffer);
56
+ }
57
+ }
58
+
59
+ function parseMarkdownFile(buffer: Buffer): ParsedDocument {
60
+ const content = buffer.toString('utf-8');
61
+ const normalized = normalizeMarkdown(content);
62
+ const extracted = extractEmbeddedImages(normalized);
63
+ return {
64
+ content: extracted.content,
65
+ format: 'markdown',
66
+ images: extracted.images,
67
+ metadata: {
68
+ wordCount: content.split(/\s+/).length,
69
+ },
70
+ };
71
+ }
72
+
73
+ function parseTextFile(buffer: Buffer): ParsedDocument {
74
+ const content = buffer.toString('utf-8');
75
+ const normalized = normalizeMarkdown(content);
76
+ const extracted = extractEmbeddedImages(normalized);
77
+ return {
78
+ content: extracted.content,
79
+ format: 'markdown',
80
+ images: extracted.images,
81
+ metadata: {
82
+ wordCount: content.split(/\s+/).length,
83
+ },
84
+ };
85
+ }
86
+
87
+ /**
88
+ * Normalize content to proper Markdown format
89
+ * - Ensures consistent heading syntax
90
+ * - Normalizes list markers
91
+ * - Fixes spacing around headers
92
+ * - Removes base64 images (replaces with placeholder)
93
+ */
94
+ function normalizeMarkdown(content: string): string {
95
+ return content
96
+ // Ensure space after # for headers
97
+ .replace(/^(#{1,6})([^\s#])/gm, '$1 $2')
98
+ // Normalize list markers (convert * to -)
99
+ .replace(/^(\s*)\*[ \t]/gm, '$1- ')
100
+ // Ensure blank line before headers
101
+ .replace(/([^\n])\n(#{1,6}\s)/g, '$1\n\n$2')
102
+ // Remove excessive blank lines (max 2)
103
+ .replace(/\n{4,}/g, '\n\n\n')
104
+ .trim();
105
+ }
106
+
107
+ export function extractEmbeddedImages(content: string): { content: string; images: EmbeddedImage[] } {
108
+ const images: EmbeddedImage[] = [];
109
+ let imageIndex = 1;
110
+
111
+ const withMarkdownImages = content.replace(
112
+ /!\[([^\]]*)\]\((data:image\/([^;]+);base64,([A-Za-z0-9+/=]+))\)/g,
113
+ (_match, altText: string, dataUri: string, mimeSubType: string, base64Data: string) => {
114
+ const id = `IMG${String(imageIndex++).padStart(4, '0')}`;
115
+ const estimatedSize = Math.round(base64Data.length * 0.75);
116
+ const mimeType = `image/${mimeSubType}`;
117
+ const alt = (altText || '').trim();
118
+ images.push({ id, alt, mimeType, estimatedSize, dataUri });
119
+ return `\n[图片引用 ${id} | alt="${alt || '无'}" | ${mimeType} | ${estimatedSize} bytes]\n`;
120
+ }
121
+ );
122
+
123
+ const withHtmlImages = withMarkdownImages.replace(
124
+ /<img[^>]*src="(data:image\/([^;]+);base64,([A-Za-z0-9+/=]+))"[^>]*>/gi,
125
+ (match, dataUri: string, mimeSubType: string, base64Data: string) => {
126
+ const altMatch = match.match(/\salt="([^"]*)"/i);
127
+ const alt = altMatch?.[1]?.trim() || '';
128
+ const id = `IMG${String(imageIndex++).padStart(4, '0')}`;
129
+ const estimatedSize = Math.round(base64Data.length * 0.75);
130
+ const mimeType = `image/${mimeSubType}`;
131
+ images.push({ id, alt, mimeType, estimatedSize, dataUri });
132
+ return `\n[图片引用 ${id} | alt="${alt || '无'}" | ${mimeType} | ${estimatedSize} bytes]\n`;
133
+ }
134
+ );
135
+
136
+ return {
137
+ content: withHtmlImages,
138
+ images,
139
+ };
140
+ }
141
+
142
+ /**
143
+ * Check if content contains base64 images and estimate their size
144
+ */
145
+ export function analyzeBase64Images(content: string): { count: number; estimatedSize: number } {
146
+ const base64Pattern = /data:image\/[^;]+;base64,([A-Za-z0-9+/=]+)/g;
147
+ let match;
148
+ let count = 0;
149
+ let totalLength = 0;
150
+
151
+ while ((match = base64Pattern.exec(content)) !== null) {
152
+ count++;
153
+ totalLength += match[1].length;
154
+ }
155
+
156
+ // Base64 is ~4/3 of binary size, so multiply by 0.75 to get approximate binary size
157
+ const estimatedSize = Math.round(totalLength * 0.75);
158
+
159
+ return { count, estimatedSize };
160
+ }
161
+
162
+ async function parseHtmlToMarkdown(buffer: Buffer): Promise<ParsedDocument> {
163
+ const html = buffer.toString('utf-8');
164
+
165
+ // Convert HTML to Markdown with structure preservation
166
+ const markdown = convertHtmlToMarkdown(html);
167
+
168
+ const extracted = extractEmbeddedImages(markdown);
169
+ return {
170
+ content: extracted.content,
171
+ format: 'markdown',
172
+ images: extracted.images,
173
+ metadata: {
174
+ wordCount: markdown.split(/\s+/).length,
175
+ },
176
+ };
177
+ }
178
+
179
+ /**
180
+ * Convert HTML to Markdown while preserving document structure
181
+ */
182
+ function convertHtmlToMarkdown(html: string): string {
183
+ let md = html;
184
+
185
+ // Remove script and style tags with content
186
+ md = md.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
187
+ md = md.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
188
+
189
+ // Convert headings
190
+ md = md.replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, '\n\n# $1\n\n');
191
+ md = md.replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, '\n\n## $1\n\n');
192
+ md = md.replace(/<h3[^>]*>([\s\S]*?)<\/h3>/gi, '\n\n### $1\n\n');
193
+ md = md.replace(/<h4[^>]*>([\s\S]*?)<\/h4>/gi, '\n\n#### $1\n\n');
194
+ md = md.replace(/<h5[^>]*>([\s\S]*?)<\/h5>/gi, '\n\n##### $1\n\n');
195
+ md = md.replace(/<h6[^>]*>([\s\S]*?)<\/h6>/gi, '\n\n###### $1\n\n');
196
+
197
+ // Convert paragraphs
198
+ md = md.replace(/<p[^>]*>([\s\S]*?)<\/p>/gi, '\n\n$1\n\n');
199
+
200
+ // Convert line breaks
201
+ md = md.replace(/<br\s*\/?>/gi, '\n');
202
+
203
+ // Convert strong/b and em/i
204
+ md = md.replace(/<(strong|b)[^>]*>([\s\S]*?)<\/(strong|b)>/gi, '**$2**');
205
+ md = md.replace(/<(em|i)[^>]*>([\s\S]*?)<\/(em|i)>/gi, '*$2*');
206
+
207
+ // Convert code
208
+ md = md.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, '`$1`');
209
+ md = md.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, '\n\n```\n$1\n```\n\n');
210
+
211
+ // Convert unordered lists
212
+ md = md.replace(/<ul[^>]*>([\s\S]*?)<\/ul>/gi, (match, content) => {
213
+ return '\n\n' + content.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n') + '\n';
214
+ });
215
+
216
+ // Convert ordered lists
217
+ md = md.replace(/<ol[^>]*>([\s\S]*?)<\/ol>/gi, (match, content) => {
218
+ let index = 1;
219
+ return '\n\n' + content.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_liMatch: string, itemContent: string) => `${index++}. ${itemContent.trim()}\n`) + '\n';
220
+ });
221
+
222
+ // Convert tables
223
+ md = md.replace(/<table[^>]*>([\s\S]*?)<\/table>/gi, (match, content) => {
224
+ let tableMd = '\n\n';
225
+ const rows = content.match(/<tr[^>]*>([\s\S]*?)<\/tr>/gi) || [];
226
+
227
+ rows.forEach((row: string, rowIndex: number) => {
228
+ const cells = row.match(/<t[dh][^>]*>([\s\S]*?)<\/t[dh]>/gi) || [];
229
+ const cellContents = cells.map((cell: string) =>
230
+ cell.replace(/<[^>]+>/g, '').trim()
231
+ );
232
+
233
+ if (cellContents.length > 0) {
234
+ tableMd += '| ' + cellContents.join(' | ') + ' |\n';
235
+ // Add separator after header row
236
+ if (rowIndex === 0) {
237
+ tableMd += '|' + cellContents.map(() => ' --- |').join('') + '\n';
238
+ }
239
+ }
240
+ });
241
+
242
+ return tableMd + '\n';
243
+ });
244
+
245
+ // Convert links
246
+ md = md.replace(/<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/gi, '[$2]($1)');
247
+
248
+ // Convert images
249
+ md = md.replace(/<img[^>]+src="([^"]+)"[^>]*alt="([^"]*)"[^>]*\/?>/gi, '![$2]($1)');
250
+ md = md.replace(/<img[^>]+alt="([^"]*)"[^>]*src="([^"]+)"[^>]*\/?>/gi, '![$1]($2)');
251
+ md = md.replace(/<img[^>]+src="([^"]+)"[^>]*\/?>/gi, '![]($1)');
252
+
253
+ // Remove remaining HTML tags but keep content
254
+ md = md.replace(/<[^>]+>/g, '');
255
+
256
+ // Decode HTML entities
257
+ md = md.replace(/&amp;/g, '&');
258
+ md = md.replace(/&lt;/g, '<');
259
+ md = md.replace(/&gt;/g, '>');
260
+ md = md.replace(/&quot;/g, '"');
261
+ md = md.replace(/&#39;/g, "'");
262
+ md = md.replace(/&nbsp;/g, ' ');
263
+ md = md.replace(/&mdash;/g, '—');
264
+ md = md.replace(/&ndash;/g, '–');
265
+ md = md.replace(/&hellip;/g, '...');
266
+
267
+ // Clean up excessive whitespace
268
+ md = md.replace(/\n{4,}/g, '\n\n\n');
269
+
270
+ return md.trim();
271
+ }
272
+
273
+ function parseEnvInt(name: string, fallback: number): number {
274
+ const raw = process.env[name];
275
+ if (!raw) return fallback;
276
+ const parsed = parseInt(raw, 10);
277
+ return Number.isFinite(parsed) ? parsed : fallback;
278
+ }
279
+
280
/**
 * Render the first `maxPages` pages of a PDF to PNG data-URI images using
 * the external `pdftoppm` tool (poppler-utils).
 *
 * Best-effort: returns [] when maxPages <= 0, when pdftoppm is not on PATH,
 * or when conversion fails for any reason. The temp directory is always
 * removed, and cleanup failures are swallowed so they never mask the result.
 */
async function extractPdfPageImages(pdfPath: string, maxPages: number): Promise<EmbeddedImage[]> {
  if (maxPages <= 0) {
    return [];
  }

  // Unique scratch dir under CWD so concurrent runs don't collide.
  const tempRoot = path.join(
    process.cwd(),
    '.spec-agent-tmp',
    'pdf-pages',
    `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
  );
  const outPrefix = path.join(tempRoot, 'page');
  await fs.ensureDir(tempRoot);

  try {
    // pdftoppm writes page-1.png .. page-<maxPages>.png under tempRoot.
    await execFileAsync('pdftoppm', [
      '-png',
      '-f', '1',
      '-l', String(maxPages),
      pdfPath,
      outPrefix,
    ]);

    const files = await fs.readdir(tempRoot);
    // Sort numerically by the page number embedded in the filename;
    // readdir order is not guaranteed and lexicographic order breaks at 10+.
    const pngFiles = files
      .filter(f => /^page-\d+\.png$/i.test(f))
      .sort((a, b) => {
        const ai = parseInt(a.match(/\d+/)?.[0] || '0', 10);
        const bi = parseInt(b.match(/\d+/)?.[0] || '0', 10);
        return ai - bi;
      });

    const images: EmbeddedImage[] = [];
    let imageIdx = 1;
    for (const file of pngFiles) {
      const imagePath = path.join(tempRoot, file);
      const buffer = await fs.readFile(imagePath);
      const base64 = buffer.toString('base64');
      // Page number comes from the filename; falls back to the running index.
      const pageNo = parseInt(file.match(/\d+/)?.[0] || String(imageIdx), 10);
      images.push({
        id: `PDFIMG${String(imageIdx++).padStart(4, '0')}`,
        alt: `PDF第${pageNo}页`, // "PDF page N" — placeholder text is intentionally bilingual
        mimeType: 'image/png',
        estimatedSize: buffer.length, // exact byte size here (file was read), not an estimate
        dataUri: `data:image/png;base64,${base64}`,
      });
    }
    return images;
  } catch {
    // pdftoppm unavailable or conversion failed.
    return [];
  } finally {
    await fs.remove(tempRoot).catch(() => undefined);
  }
}
335
+
336
/**
 * Parse a PDF buffer into the normalized Markdown document shape.
 *
 * Text is extracted with `pdf-parse` (dynamically imported) and run through
 * heuristic structuring. When `filePath` is provided, up to
 * PDF_IMAGE_PAGE_LIMIT pages (env var, default 8) are additionally rendered
 * to page images and referenced as placeholder lines at the top of content.
 *
 * Fallback: if pdf-parse fails, the raw bytes are decoded as UTF-8 and used
 * when that yields more than 100 characters; otherwise an error is thrown.
 * NOTE(review): for a genuine binary PDF the UTF-8 decode will usually
 * exceed 100 chars of garbage — confirm this best-effort path is intended.
 */
async function parsePdfToMarkdown(buffer: Buffer, filePath?: string): Promise<ParsedDocument> {
  try {
    // Dynamic import to avoid loading if not needed
    const pdfParse = await import('pdf-parse');
    const result = await pdfParse.default(buffer);

    // Convert PDF text to structured Markdown
    // PDF text often has page breaks and layout artifacts that need cleaning
    const structuredContent = structurePdfContent(result.text);

    // Page-image rendering needs the on-disk path (pdftoppm reads a file).
    const maxPdfImagePages = Math.max(0, parseEnvInt('PDF_IMAGE_PAGE_LIMIT', 8));
    const pageImages = filePath
      ? await extractPdfPageImages(filePath, Math.min(result.numpages, maxPdfImagePages))
      : [];

    // Placeholder header lines announcing which page images were captured.
    const imageHeaders = pageImages.map(
      image => `[图片引用 ${image.id} | alt="${image.alt || '无'}" | ${image.mimeType} | ${image.estimatedSize} bytes]`
    );
    const contentWithPageImages = imageHeaders.length > 0
      ? `${imageHeaders.join('\n')}\n\n${structuredContent}`
      : structuredContent;

    return {
      // extractEmbeddedImages strips any inline data-URIs from the text;
      // the returned images are the rendered page images, not inline ones.
      content: extractEmbeddedImages(contentWithPageImages).content,
      format: 'markdown',
      images: pageImages,
      metadata: {
        pages: result.numpages,
        wordCount: result.text.split(/\s+/).length,
      },
    };
  } catch (error) {
    // Fallback: try to extract text as-is
    const text = buffer.toString('utf-8');
    if (text.length > 100) {
      return {
        content: extractEmbeddedImages(normalizeMarkdown(text)).content,
        format: 'markdown',
        images: [],
        metadata: {},
      };
    }
    throw new Error(`Failed to parse PDF: ${error instanceof Error ? error.message : String(error)}`);
  }
}
381
+
382
/**
 * Structure raw PDF text into Markdown.
 *
 * PDF extraction loses heading markup, so this applies line-level
 * heuristics: standalone numbers are dropped as page numbers, and a short
 * line that starts a paragraph (i.e. follows a blank line) is promoted to
 * a heading when it looks like a chapter, a numbered section, an ALL-CAPS
 * title, or a Title Case line. Heuristic by nature — ordinary text can
 * occasionally be promoted.
 */
function structurePdfContent(text: string): string {
  let md = text;

  // Remove page number lines (standalone numbers).
  // NOTE(review): the match consumes both surrounding newlines, so two
  // consecutive standalone-number lines leave the second one intact —
  // confirm whether that matters for real inputs.
  md = md.replace(/\n\s*\d+\s*\n/g, '\n\n');

  // Detect and convert potential headers:
  // short lines at the start of paragraphs that are all caps or title case.
  const lines = md.split('\n');
  const processedLines: string[] = [];
  let prevLineEmpty = true;

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim();
    const nextLine = lines[i + 1]?.trim() || '';

    // Keep empty lines, but remember them: promotion only happens on the
    // first line after a blank (paragraph start).
    if (!line) {
      processedLines.push('');
      prevLineEmpty = true;
      continue;
    }

    // Detect headers based on various heuristics
    const isShortLine = line.length < 100;
    const isAllCaps = line === line.toUpperCase() && line.length > 3 && /[A-Z]/.test(line);
    const isTitleCase = /^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*$/.test(line) && line.length > 3;
    const looksLikeNumberedHeader = /^\d+(?:\.\d+)*\.?\s+\S/.test(line);
    const looksLikeChapter = /^(Chapter|CHAPTER|第[一二三四五六七八九十\d]+章)/.test(line);

    if (prevLineEmpty && isShortLine) {
      if (looksLikeChapter) {
        // Chapter header ("Chapter ..." / "第N章") - H1
        processedLines.push(`# ${line}`);
      } else if (looksLikeNumberedHeader) {
        // Numbered section - depth derived from the dot count ("1. x" -> H3,
        // "1.1. x" -> H4, ...), capped at H6.
        const level = (line.match(/\./g) || []).length + 2;
        processedLines.push(`${'#'.repeat(Math.min(level, 6))} ${line}`);
      } else if (isAllCaps && line.length < 50) {
        // ALL CAPS short line - likely a section header
        processedLines.push(`## ${line}`);
      } else if (isTitleCase && !nextLine.startsWith(line.substring(0, 10))) {
        // Title case that doesn't continue onto the next line - likely a header
        processedLines.push(`### ${line}`);
      } else {
        processedLines.push(line);
      }
    } else {
      processedLines.push(line);
    }

    prevLineEmpty = false;
  }

  md = processedLines.join('\n');

  // Clean up excessive whitespace
  md = md.replace(/\n{4,}/g, '\n\n\n');

  return normalizeMarkdown(md);
}
447
+
448
/**
 * Parse a DOCX buffer into the normalized Markdown document shape using the
 * `mammoth` library (dynamically imported so it only loads when needed).
 *
 * Primary path: mammoth -> HTML (a style map maps Word heading/title styles
 * to <h1>..<h6>) -> Markdown via convertHtmlToMarkdown.
 * First fallback: mammoth raw-text extraction.
 * Last resort: decode the buffer as UTF-8 and keep it when longer than 100
 * characters; otherwise the original conversion error is rethrown.
 */
async function parseDocxToMarkdown(buffer: Buffer): Promise<ParsedDocument> {
  try {
    // Dynamic import to avoid loading if not needed
    const mammoth = await import('mammoth');

    // Use mammoth's HTML conversion to preserve structure, then convert to Markdown
    const htmlResult = await mammoth.convertToHtml({ buffer }, {
      styleMap: [
        "p[style-name='Heading 1'] => h1:fresh",
        "p[style-name='Heading 2'] => h2:fresh",
        "p[style-name='Heading 3'] => h3:fresh",
        "p[style-name='Heading 4'] => h4:fresh",
        "p[style-name='Heading 5'] => h5:fresh",
        "p[style-name='Heading 6'] => h6:fresh",
        "p[style-name='Title'] => h1.title:fresh",
        "p[style-name='Subtitle'] => h2.subtitle:fresh",
      ]
    });

    // Convert HTML to Markdown
    const markdown = convertHtmlToMarkdown(htmlResult.value);
    const extracted = extractEmbeddedImages(markdown);

    // Extract metadata from document
    const metadata: ParsedDocument['metadata'] = {
      wordCount: markdown.split(/\s+/).length,
    };

    // Try to extract title from the first heading
    const titleMatch = markdown.match(/^#\s+(.+)$/m);
    if (titleMatch) {
      metadata.title = titleMatch[1].trim();
    }

    return {
      content: extracted.content,
      format: 'markdown',
      images: extracted.images,
      metadata,
    };
  } catch (error) {
    // Fallback: try raw text extraction
    try {
      const mammoth = await import('mammoth');
      const result = await mammoth.extractRawText({ buffer });
      return {
        content: extractEmbeddedImages(normalizeMarkdown(result.value)).content,
        format: 'markdown',
        images: [],
        metadata: {
          wordCount: result.value.split(/\s+/).length,
        },
      };
    } catch {
      // Last resort: read as plain text
      const text = buffer.toString('utf-8');
      if (text.length > 100) {
        return {
          content: extractEmbeddedImages(normalizeMarkdown(text)).content,
          format: 'markdown',
          images: [],
          metadata: {},
        };
      }
      // Too short to trust — fall through to the throw below.
    }
    throw new Error(`Failed to parse DOCX: ${error instanceof Error ? error.message : String(error)}`);
  }
}
516
+
517
+ /**
518
+ * Read and concatenate multiple document files
519
+ * All content is normalized to Markdown format
520
+ */
521
+ export async function readChunkContent(filePaths: string[]): Promise<string> {
522
+ const contents: string[] = [];
523
+
524
+ for (const filePath of filePaths) {
525
+ try {
526
+ const parsed = await parseDocument(filePath);
527
+ contents.push(`=== ${path.basename(filePath)} ===\n${parsed.content}`);
528
+ } catch (error) {
529
+ // If parsing fails, try to read as plain text
530
+ try {
531
+ const text = await fs.readFile(filePath, 'utf-8');
532
+ contents.push(`=== ${path.basename(filePath)} ===\n${normalizeMarkdown(text)}`);
533
+ } catch {
534
+ contents.push(`=== ${path.basename(filePath)} ===\n[Error reading file: ${error instanceof Error ? error.message : String(error)}]`);
535
+ }
536
+ }
537
+ }
538
+
539
+ return contents.join('\n\n---\n\n');
540
+ }
541
+
542
+ /**
543
+ * Check if a file format is supported
544
+ */
545
+ export function isSupportedFormat(filePath: string): boolean {
546
+ const ext = path.extname(filePath).toLowerCase();
547
+ return ['.md', '.txt', '.html', '.htm', '.pdf', '.docx'].includes(ext);
548
+ }