spec-agent 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +256 -0
- package/bin/spec-agent.js +14 -0
- package/dist/commands/analyze.d.ts +16 -0
- package/dist/commands/analyze.d.ts.map +1 -0
- package/dist/commands/analyze.js +283 -0
- package/dist/commands/analyze.js.map +1 -0
- package/dist/commands/clean.d.ts +9 -0
- package/dist/commands/clean.d.ts.map +1 -0
- package/dist/commands/clean.js +109 -0
- package/dist/commands/clean.js.map +1 -0
- package/dist/commands/dispatch.d.ts +12 -0
- package/dist/commands/dispatch.d.ts.map +1 -0
- package/dist/commands/dispatch.js +232 -0
- package/dist/commands/dispatch.js.map +1 -0
- package/dist/commands/doctor.d.ts +9 -0
- package/dist/commands/doctor.d.ts.map +1 -0
- package/dist/commands/doctor.js +153 -0
- package/dist/commands/doctor.js.map +1 -0
- package/dist/commands/learn.d.ts +13 -0
- package/dist/commands/learn.d.ts.map +1 -0
- package/dist/commands/learn.js +234 -0
- package/dist/commands/learn.js.map +1 -0
- package/dist/commands/merge.d.ts +11 -0
- package/dist/commands/merge.d.ts.map +1 -0
- package/dist/commands/merge.js +335 -0
- package/dist/commands/merge.js.map +1 -0
- package/dist/commands/pipeline.d.ts +19 -0
- package/dist/commands/pipeline.d.ts.map +1 -0
- package/dist/commands/pipeline.js +266 -0
- package/dist/commands/pipeline.js.map +1 -0
- package/dist/commands/plan.d.ts +13 -0
- package/dist/commands/plan.d.ts.map +1 -0
- package/dist/commands/plan.js +314 -0
- package/dist/commands/plan.js.map +1 -0
- package/dist/commands/scan.d.ts +28 -0
- package/dist/commands/scan.d.ts.map +1 -0
- package/dist/commands/scan.js +488 -0
- package/dist/commands/scan.js.map +1 -0
- package/dist/commands/status.d.ts +8 -0
- package/dist/commands/status.d.ts.map +1 -0
- package/dist/commands/status.js +146 -0
- package/dist/commands/status.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +126 -0
- package/dist/index.js.map +1 -0
- package/dist/services/document-parser.d.ts +49 -0
- package/dist/services/document-parser.d.ts.map +1 -0
- package/dist/services/document-parser.js +499 -0
- package/dist/services/document-parser.js.map +1 -0
- package/dist/services/llm.d.ts +61 -0
- package/dist/services/llm.d.ts.map +1 -0
- package/dist/services/llm.js +716 -0
- package/dist/services/llm.js.map +1 -0
- package/dist/types.d.ts +159 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +4 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/file.d.ts +10 -0
- package/dist/utils/file.d.ts.map +1 -0
- package/dist/utils/file.js +96 -0
- package/dist/utils/file.js.map +1 -0
- package/dist/utils/logger.d.ts +13 -0
- package/dist/utils/logger.d.ts.map +1 -0
- package/dist/utils/logger.js +55 -0
- package/dist/utils/logger.js.map +1 -0
- package/package.json +48 -0
- package/scripts/publish-npm.js +174 -0
- package/spec-agent-implementation.md +750 -0
- package/src/commands/analyze.ts +322 -0
- package/src/commands/clean.ts +88 -0
- package/src/commands/dispatch.ts +250 -0
- package/src/commands/doctor.ts +136 -0
- package/src/commands/learn.ts +261 -0
- package/src/commands/merge.ts +377 -0
- package/src/commands/pipeline.ts +306 -0
- package/src/commands/plan.ts +331 -0
- package/src/commands/scan.ts +568 -0
- package/src/commands/status.ts +129 -0
- package/src/index.ts +137 -0
- package/src/services/document-parser.ts +548 -0
- package/src/services/llm.ts +857 -0
- package/src/types.ts +161 -0
- package/src/utils/file.ts +60 -0
- package/src/utils/logger.ts +58 -0
- package/tsconfig.json +19 -0
package/src/index.ts
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import { Command } from 'commander';
|
|
2
|
+
import chalk from 'chalk';
|
|
3
|
+
import { scanCommand } from './commands/scan';
|
|
4
|
+
import { analyzeCommand } from './commands/analyze';
|
|
5
|
+
import { mergeCommand } from './commands/merge';
|
|
6
|
+
import { planCommand } from './commands/plan';
|
|
7
|
+
import { dispatchCommand } from './commands/dispatch';
|
|
8
|
+
import { learnCommand } from './commands/learn';
|
|
9
|
+
import { pipelineCommand } from './commands/pipeline';
|
|
10
|
+
import { statusCommand } from './commands/status';
|
|
11
|
+
import { cleanCommand } from './commands/clean';
|
|
12
|
+
import { doctorCommand } from './commands/doctor';
|
|
13
|
+
|
|
14
|
+
const program = new Command();
|
|
15
|
+
|
|
16
|
+
program
|
|
17
|
+
.name('spec-agent')
|
|
18
|
+
.description('Multi-agent CLI tool for breaking down large requirement documents')
|
|
19
|
+
.version('1.0.0');
|
|
20
|
+
|
|
21
|
+
program
|
|
22
|
+
.command('scan')
|
|
23
|
+
.description('Scan documents and create chunk manifest')
|
|
24
|
+
.option('-i, --input <path>', 'Input file or directory')
|
|
25
|
+
.option('--stdin', 'Read file paths from stdin')
|
|
26
|
+
.option('-o, --output <path>', 'Output path for manifest.json', 'manifest.json')
|
|
27
|
+
.option('-c, --chunk-size <size>', 'Maximum chunk size for LLM analysis (default: 200kb)', '200kb')
|
|
28
|
+
.option('--min-chunk-size <size>', 'Minimum chunk size - smaller chunks will be merged (default: 10kb)', '10kb')
|
|
29
|
+
.option('--no-llm-chunking', 'Disable LLM-driven document structure analysis (use rule-based only)')
|
|
30
|
+
.option('--strict-llm', 'Fail if LLM chunking fails (no fallback)')
|
|
31
|
+
.option('-f, --format <format>', 'Input format: md, pdf, docx, html, auto', 'auto')
|
|
32
|
+
.option('--dry-run', 'Preview scan plan without creating manifest')
|
|
33
|
+
.option('-y, --yes', 'Skip confirmation prompts')
|
|
34
|
+
.action(scanCommand);
|
|
35
|
+
|
|
36
|
+
program
|
|
37
|
+
.command('analyze')
|
|
38
|
+
.description('Analyze document chunks in parallel using multiple agents')
|
|
39
|
+
.option('-m, --manifest <path>', 'Path to manifest.json', 'manifest.json')
|
|
40
|
+
.option('-o, --output <dir>', 'Output directory for summaries', 'summaries')
|
|
41
|
+
.option('-a, --agents <count>', 'Number of parallel agents', 'auto')
|
|
42
|
+
.option('--chunks <indices>', 'Comma-separated chunk indices to analyze')
|
|
43
|
+
.option('--focus <type>', 'Analysis focus: full, features, data-model, api, pages', 'full')
|
|
44
|
+
.option('--apply-learned', 'Apply learned patterns from previous runs')
|
|
45
|
+
.option('--retries <count>', 'Retry attempts per failed chunk (default: 1)', '1')
|
|
46
|
+
.option('--budget-tokens <count>', 'Max total tokens for analyze run (0 = unlimited)', '0')
|
|
47
|
+
.option('--dry-run', 'Preview analysis plan without running')
|
|
48
|
+
.option('-y, --yes', 'Skip confirmation prompts')
|
|
49
|
+
.action(analyzeCommand);
|
|
50
|
+
|
|
51
|
+
program
|
|
52
|
+
.command('merge')
|
|
53
|
+
.description('Merge chunk summaries into unified spec')
|
|
54
|
+
.option('-s, --summaries <dir>', 'Directory containing summaries', 'summaries')
|
|
55
|
+
.option('-o, --output <path>', 'Output path for spec_summary.json', 'spec_summary.json')
|
|
56
|
+
.option('--strategy <strategy>', 'Merge strategy: conservative, aggressive', 'conservative')
|
|
57
|
+
.option('--dry-run', 'Preview merge plan without running')
|
|
58
|
+
.option('-y, --yes', 'Skip confirmation prompts')
|
|
59
|
+
.action(mergeCommand);
|
|
60
|
+
|
|
61
|
+
program
|
|
62
|
+
.command('plan')
|
|
63
|
+
.description('Generate executable task plan from spec')
|
|
64
|
+
.option('-s, --spec <path>', 'Path to spec_summary.json', 'spec_summary.json')
|
|
65
|
+
.option('-o, --output <path>', 'Output path for task_plan.json', 'task_plan.json')
|
|
66
|
+
.option('-t, --type <type>', 'Output type: prototype, code, docs', 'prototype')
|
|
67
|
+
.option('--framework <fw>', 'Target framework: vue3, react, html', 'vue3')
|
|
68
|
+
.option('-p, --parallel <count>', 'Max parallel tasks', '3')
|
|
69
|
+
.option('--dry-run', 'Preview task plan without generating')
|
|
70
|
+
.option('-y, --yes', 'Skip confirmation prompts')
|
|
71
|
+
.action(planCommand);
|
|
72
|
+
|
|
73
|
+
program
|
|
74
|
+
.command('dispatch')
|
|
75
|
+
.description('Dispatch tasks to specialized agents')
|
|
76
|
+
.option('-p, --plan <path>', 'Path to task_plan.json', 'task_plan.json')
|
|
77
|
+
.option('-o, --output <path>', 'Output path for dispatch_plan.json', 'dispatch_plan.json')
|
|
78
|
+
.option('-a, --agents <mapping>', 'Agent type mapping, e.g. frontend:2,backend:2')
|
|
79
|
+
.option('--strategy <strategy>', 'Dispatch strategy: balanced, skill-first, load-first', 'balanced')
|
|
80
|
+
.option('--dry-run', 'Preview dispatch plan without executing')
|
|
81
|
+
.option('-y, --yes', 'Skip confirmation prompts')
|
|
82
|
+
.action(dispatchCommand);
|
|
83
|
+
|
|
84
|
+
program
|
|
85
|
+
.command('learn')
|
|
86
|
+
.description('Learn and accumulate project-specific patterns')
|
|
87
|
+
.option('-w, --workspace <dir>', 'Workspace directory', '.')
|
|
88
|
+
.option('--from <phase>', 'Learn from: summaries, plan, dispatch', 'summaries')
|
|
89
|
+
.option('--pattern <name>', 'Pattern name to learn')
|
|
90
|
+
.option('--rule <value>', 'Pattern rule value')
|
|
91
|
+
.option('--list', 'List all learned patterns')
|
|
92
|
+
.option('--export <path>', 'Export patterns to JSON file')
|
|
93
|
+
.option('--apply', 'Apply learned patterns to current workspace')
|
|
94
|
+
.action(learnCommand);
|
|
95
|
+
|
|
96
|
+
program
|
|
97
|
+
.command('pipeline')
|
|
98
|
+
.description('Run full pipeline: scan → analyze → merge → plan → dispatch')
|
|
99
|
+
.option('-i, --input <path>', 'Input file or directory')
|
|
100
|
+
.option('-o, --output <dir>', 'Output directory', 'workspace')
|
|
101
|
+
.option('-a, --agents <count>', 'Max parallel agents', 'auto')
|
|
102
|
+
.option('-c, --chunk-size <size>', 'Max chunk size for LLM analysis (default: 200kb)', '200kb')
|
|
103
|
+
.option('--min-chunk-size <size>', 'Minimum chunk size - smaller chunks will be merged (default: 10kb)', '10kb')
|
|
104
|
+
.option('--analyze-retries <count>', 'Retry attempts per failed analyze chunk (default: 1)', '1')
|
|
105
|
+
.option('--analyze-budget-tokens <count>', 'Max total tokens for analyze in pipeline (0 = unlimited)', '0')
|
|
106
|
+
.option('--strict-llm', 'Fail if LLM chunking fails (no fallback)')
|
|
107
|
+
.option('--framework <fw>', 'Target framework', 'vue3')
|
|
108
|
+
.option('--stop-at <phase>', 'Stop after phase: scan, analyze, merge, plan, dispatch')
|
|
109
|
+
.option('--from <phase>', 'Resume from phase: scan, analyze, merge, plan, dispatch')
|
|
110
|
+
.option('--dry-run', 'Preview full pipeline without executing')
|
|
111
|
+
.option('-y, --yes', 'Skip all confirmation prompts')
|
|
112
|
+
.action(pipelineCommand);
|
|
113
|
+
|
|
114
|
+
program
|
|
115
|
+
.command('status')
|
|
116
|
+
.description('Check workspace status')
|
|
117
|
+
.option('-w, --workspace <dir>', 'Workspace directory', '.')
|
|
118
|
+
.option('--format <format>', 'Output format: text, json', 'text')
|
|
119
|
+
.action(statusCommand);
|
|
120
|
+
|
|
121
|
+
program
|
|
122
|
+
.command('clean')
|
|
123
|
+
.description('Clean workspace intermediate files')
|
|
124
|
+
.option('-w, --workspace <dir>', 'Workspace directory', '.')
|
|
125
|
+
.option('--dry-run', 'Preview what would be cleaned')
|
|
126
|
+
.option('-y, --yes', 'Skip confirmation')
|
|
127
|
+
.action(cleanCommand);
|
|
128
|
+
|
|
129
|
+
program
|
|
130
|
+
.command('doctor')
|
|
131
|
+
.description('Run environment and configuration health checks')
|
|
132
|
+
.option('-w, --workspace <dir>', 'Workspace directory', '.')
|
|
133
|
+
.option('--check-llm', 'Run a lightweight real LLM connectivity test')
|
|
134
|
+
.option('--format <format>', 'Output format: text, json', 'text')
|
|
135
|
+
.action(doctorCommand);
|
|
136
|
+
|
|
137
|
+
program.parse();
|
|
@@ -0,0 +1,548 @@
|
|
|
1
|
+
import * as fs from 'fs-extra';
|
|
2
|
+
import * as path from 'path';
|
|
3
|
+
import { execFile } from 'child_process';
|
|
4
|
+
import { promisify } from 'util';
|
|
5
|
+
|
|
6
|
+
// Promise-based wrapper around child_process.execFile; used below to await
// the external `pdftoppm` tool when rendering PDF pages to PNG.
const execFileAsync = promisify(execFile);
|
|
7
|
+
|
|
8
|
+
/**
 * A document normalized to Markdown regardless of its original format.
 * Produced by parseDocument() for .md/.txt/.html/.pdf/.docx inputs.
 */
export interface ParsedDocument {
  // Full document text, normalized to Markdown (base64 images replaced by placeholders).
  content: string;
  // Output format tag. Every parser in this module emits 'markdown'.
  format: 'markdown' | 'text' | 'html';
  // Base64 images lifted out of the content; each has a matching placeholder in `content`.
  images?: EmbeddedImage[];
  // Best-effort metadata; fields are filled only when the parser can derive them.
  metadata: {
    // First H1 heading, when one exists (set by the DOCX parser only).
    title?: string;
    // NOTE(review): never populated by the parsers in this file — confirm callers expect it.
    author?: string;
    // Page count (set by the PDF parser only).
    pages?: number;
    // Whitespace-delimited token count of the raw text.
    wordCount?: number;
  };
}

/**
 * A base64-encoded image extracted from document content and replaced in the
 * text by a placeholder referencing `id`.
 */
export interface EmbeddedImage {
  // Sequential id, e.g. 'IMG0001' (inline images) or 'PDFIMG0001' (PDF page renders).
  id: string;
  // Alt text from the source markup; '' when absent.
  alt: string;
  // MIME type, e.g. 'image/png'.
  mimeType: string;
  // Approximate decoded binary size in bytes (base64 length * 0.75).
  estimatedSize: number;
  // Original data: URI including the base64 payload.
  dataUri: string;
}
|
|
27
|
+
|
|
28
|
+
/**
|
|
29
|
+
* Parse a document file and extract text content as Markdown
|
|
30
|
+
* Supports: .md, .txt, .html, .pdf, .docx
|
|
31
|
+
*
|
|
32
|
+
* All formats are normalized to Markdown for consistent chunking:
|
|
33
|
+
* - Headings become # ## ###
|
|
34
|
+
* - Lists become - or 1.
|
|
35
|
+
* - Tables become Markdown tables
|
|
36
|
+
*/
|
|
37
|
+
export async function parseDocument(filePath: string): Promise<ParsedDocument> {
|
|
38
|
+
const ext = path.extname(filePath).toLowerCase();
|
|
39
|
+
const buffer = await fs.readFile(filePath);
|
|
40
|
+
|
|
41
|
+
switch (ext) {
|
|
42
|
+
case '.md':
|
|
43
|
+
return parseMarkdownFile(buffer);
|
|
44
|
+
case '.txt':
|
|
45
|
+
return parseTextFile(buffer);
|
|
46
|
+
case '.html':
|
|
47
|
+
case '.htm':
|
|
48
|
+
return parseHtmlToMarkdown(buffer);
|
|
49
|
+
case '.pdf':
|
|
50
|
+
return parsePdfToMarkdown(buffer, filePath);
|
|
51
|
+
case '.docx':
|
|
52
|
+
return parseDocxToMarkdown(buffer);
|
|
53
|
+
default:
|
|
54
|
+
// Try to read as text
|
|
55
|
+
return parseTextFile(buffer);
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
function parseMarkdownFile(buffer: Buffer): ParsedDocument {
|
|
60
|
+
const content = buffer.toString('utf-8');
|
|
61
|
+
const normalized = normalizeMarkdown(content);
|
|
62
|
+
const extracted = extractEmbeddedImages(normalized);
|
|
63
|
+
return {
|
|
64
|
+
content: extracted.content,
|
|
65
|
+
format: 'markdown',
|
|
66
|
+
images: extracted.images,
|
|
67
|
+
metadata: {
|
|
68
|
+
wordCount: content.split(/\s+/).length,
|
|
69
|
+
},
|
|
70
|
+
};
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
function parseTextFile(buffer: Buffer): ParsedDocument {
|
|
74
|
+
const content = buffer.toString('utf-8');
|
|
75
|
+
const normalized = normalizeMarkdown(content);
|
|
76
|
+
const extracted = extractEmbeddedImages(normalized);
|
|
77
|
+
return {
|
|
78
|
+
content: extracted.content,
|
|
79
|
+
format: 'markdown',
|
|
80
|
+
images: extracted.images,
|
|
81
|
+
metadata: {
|
|
82
|
+
wordCount: content.split(/\s+/).length,
|
|
83
|
+
},
|
|
84
|
+
};
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
/**
|
|
88
|
+
* Normalize content to proper Markdown format
|
|
89
|
+
* - Ensures consistent heading syntax
|
|
90
|
+
* - Normalizes list markers
|
|
91
|
+
* - Fixes spacing around headers
|
|
92
|
+
* - Removes base64 images (replaces with placeholder)
|
|
93
|
+
*/
|
|
94
|
+
function normalizeMarkdown(content: string): string {
|
|
95
|
+
return content
|
|
96
|
+
// Ensure space after # for headers
|
|
97
|
+
.replace(/^(#{1,6})([^\s#])/gm, '$1 $2')
|
|
98
|
+
// Normalize list markers (convert * to -)
|
|
99
|
+
.replace(/^(\s*)\*[ \t]/gm, '$1- ')
|
|
100
|
+
// Ensure blank line before headers
|
|
101
|
+
.replace(/([^\n])\n(#{1,6}\s)/g, '$1\n\n$2')
|
|
102
|
+
// Remove excessive blank lines (max 2)
|
|
103
|
+
.replace(/\n{4,}/g, '\n\n\n')
|
|
104
|
+
.trim();
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
export function extractEmbeddedImages(content: string): { content: string; images: EmbeddedImage[] } {
|
|
108
|
+
const images: EmbeddedImage[] = [];
|
|
109
|
+
let imageIndex = 1;
|
|
110
|
+
|
|
111
|
+
const withMarkdownImages = content.replace(
|
|
112
|
+
/!\[([^\]]*)\]\((data:image\/([^;]+);base64,([A-Za-z0-9+/=]+))\)/g,
|
|
113
|
+
(_match, altText: string, dataUri: string, mimeSubType: string, base64Data: string) => {
|
|
114
|
+
const id = `IMG${String(imageIndex++).padStart(4, '0')}`;
|
|
115
|
+
const estimatedSize = Math.round(base64Data.length * 0.75);
|
|
116
|
+
const mimeType = `image/${mimeSubType}`;
|
|
117
|
+
const alt = (altText || '').trim();
|
|
118
|
+
images.push({ id, alt, mimeType, estimatedSize, dataUri });
|
|
119
|
+
return `\n[图片引用 ${id} | alt="${alt || '无'}" | ${mimeType} | ${estimatedSize} bytes]\n`;
|
|
120
|
+
}
|
|
121
|
+
);
|
|
122
|
+
|
|
123
|
+
const withHtmlImages = withMarkdownImages.replace(
|
|
124
|
+
/<img[^>]*src="(data:image\/([^;]+);base64,([A-Za-z0-9+/=]+))"[^>]*>/gi,
|
|
125
|
+
(match, dataUri: string, mimeSubType: string, base64Data: string) => {
|
|
126
|
+
const altMatch = match.match(/\salt="([^"]*)"/i);
|
|
127
|
+
const alt = altMatch?.[1]?.trim() || '';
|
|
128
|
+
const id = `IMG${String(imageIndex++).padStart(4, '0')}`;
|
|
129
|
+
const estimatedSize = Math.round(base64Data.length * 0.75);
|
|
130
|
+
const mimeType = `image/${mimeSubType}`;
|
|
131
|
+
images.push({ id, alt, mimeType, estimatedSize, dataUri });
|
|
132
|
+
return `\n[图片引用 ${id} | alt="${alt || '无'}" | ${mimeType} | ${estimatedSize} bytes]\n`;
|
|
133
|
+
}
|
|
134
|
+
);
|
|
135
|
+
|
|
136
|
+
return {
|
|
137
|
+
content: withHtmlImages,
|
|
138
|
+
images,
|
|
139
|
+
};
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* Check if content contains base64 images and estimate their size
|
|
144
|
+
*/
|
|
145
|
+
export function analyzeBase64Images(content: string): { count: number; estimatedSize: number } {
|
|
146
|
+
const base64Pattern = /data:image\/[^;]+;base64,([A-Za-z0-9+/=]+)/g;
|
|
147
|
+
let match;
|
|
148
|
+
let count = 0;
|
|
149
|
+
let totalLength = 0;
|
|
150
|
+
|
|
151
|
+
while ((match = base64Pattern.exec(content)) !== null) {
|
|
152
|
+
count++;
|
|
153
|
+
totalLength += match[1].length;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
// Base64 is ~4/3 of binary size, so multiply by 0.75 to get approximate binary size
|
|
157
|
+
const estimatedSize = Math.round(totalLength * 0.75);
|
|
158
|
+
|
|
159
|
+
return { count, estimatedSize };
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
async function parseHtmlToMarkdown(buffer: Buffer): Promise<ParsedDocument> {
|
|
163
|
+
const html = buffer.toString('utf-8');
|
|
164
|
+
|
|
165
|
+
// Convert HTML to Markdown with structure preservation
|
|
166
|
+
const markdown = convertHtmlToMarkdown(html);
|
|
167
|
+
|
|
168
|
+
const extracted = extractEmbeddedImages(markdown);
|
|
169
|
+
return {
|
|
170
|
+
content: extracted.content,
|
|
171
|
+
format: 'markdown',
|
|
172
|
+
images: extracted.images,
|
|
173
|
+
metadata: {
|
|
174
|
+
wordCount: markdown.split(/\s+/).length,
|
|
175
|
+
},
|
|
176
|
+
};
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
/**
|
|
180
|
+
* Convert HTML to Markdown while preserving document structure
|
|
181
|
+
*/
|
|
182
|
+
function convertHtmlToMarkdown(html: string): string {
|
|
183
|
+
let md = html;
|
|
184
|
+
|
|
185
|
+
// Remove script and style tags with content
|
|
186
|
+
md = md.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
|
|
187
|
+
md = md.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
|
|
188
|
+
|
|
189
|
+
// Convert headings
|
|
190
|
+
md = md.replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, '\n\n# $1\n\n');
|
|
191
|
+
md = md.replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, '\n\n## $1\n\n');
|
|
192
|
+
md = md.replace(/<h3[^>]*>([\s\S]*?)<\/h3>/gi, '\n\n### $1\n\n');
|
|
193
|
+
md = md.replace(/<h4[^>]*>([\s\S]*?)<\/h4>/gi, '\n\n#### $1\n\n');
|
|
194
|
+
md = md.replace(/<h5[^>]*>([\s\S]*?)<\/h5>/gi, '\n\n##### $1\n\n');
|
|
195
|
+
md = md.replace(/<h6[^>]*>([\s\S]*?)<\/h6>/gi, '\n\n###### $1\n\n');
|
|
196
|
+
|
|
197
|
+
// Convert paragraphs
|
|
198
|
+
md = md.replace(/<p[^>]*>([\s\S]*?)<\/p>/gi, '\n\n$1\n\n');
|
|
199
|
+
|
|
200
|
+
// Convert line breaks
|
|
201
|
+
md = md.replace(/<br\s*\/?>/gi, '\n');
|
|
202
|
+
|
|
203
|
+
// Convert strong/b and em/i
|
|
204
|
+
md = md.replace(/<(strong|b)[^>]*>([\s\S]*?)<\/(strong|b)>/gi, '**$2**');
|
|
205
|
+
md = md.replace(/<(em|i)[^>]*>([\s\S]*?)<\/(em|i)>/gi, '*$2*');
|
|
206
|
+
|
|
207
|
+
// Convert code
|
|
208
|
+
md = md.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, '`$1`');
|
|
209
|
+
md = md.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, '\n\n```\n$1\n```\n\n');
|
|
210
|
+
|
|
211
|
+
// Convert unordered lists
|
|
212
|
+
md = md.replace(/<ul[^>]*>([\s\S]*?)<\/ul>/gi, (match, content) => {
|
|
213
|
+
return '\n\n' + content.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n') + '\n';
|
|
214
|
+
});
|
|
215
|
+
|
|
216
|
+
// Convert ordered lists
|
|
217
|
+
md = md.replace(/<ol[^>]*>([\s\S]*?)<\/ol>/gi, (match, content) => {
|
|
218
|
+
let index = 1;
|
|
219
|
+
return '\n\n' + content.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_liMatch: string, itemContent: string) => `${index++}. ${itemContent.trim()}\n`) + '\n';
|
|
220
|
+
});
|
|
221
|
+
|
|
222
|
+
// Convert tables
|
|
223
|
+
md = md.replace(/<table[^>]*>([\s\S]*?)<\/table>/gi, (match, content) => {
|
|
224
|
+
let tableMd = '\n\n';
|
|
225
|
+
const rows = content.match(/<tr[^>]*>([\s\S]*?)<\/tr>/gi) || [];
|
|
226
|
+
|
|
227
|
+
rows.forEach((row: string, rowIndex: number) => {
|
|
228
|
+
const cells = row.match(/<t[dh][^>]*>([\s\S]*?)<\/t[dh]>/gi) || [];
|
|
229
|
+
const cellContents = cells.map((cell: string) =>
|
|
230
|
+
cell.replace(/<[^>]+>/g, '').trim()
|
|
231
|
+
);
|
|
232
|
+
|
|
233
|
+
if (cellContents.length > 0) {
|
|
234
|
+
tableMd += '| ' + cellContents.join(' | ') + ' |\n';
|
|
235
|
+
// Add separator after header row
|
|
236
|
+
if (rowIndex === 0) {
|
|
237
|
+
tableMd += '|' + cellContents.map(() => ' --- |').join('') + '\n';
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
});
|
|
241
|
+
|
|
242
|
+
return tableMd + '\n';
|
|
243
|
+
});
|
|
244
|
+
|
|
245
|
+
// Convert links
|
|
246
|
+
md = md.replace(/<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/gi, '[$2]($1)');
|
|
247
|
+
|
|
248
|
+
// Convert images
|
|
249
|
+
md = md.replace(/<img[^>]+src="([^"]+)"[^>]*alt="([^"]*)"[^>]*\/?>/gi, '');
|
|
250
|
+
md = md.replace(/<img[^>]+alt="([^"]*)"[^>]*src="([^"]+)"[^>]*\/?>/gi, '');
|
|
251
|
+
md = md.replace(/<img[^>]+src="([^"]+)"[^>]*\/?>/gi, '');
|
|
252
|
+
|
|
253
|
+
// Remove remaining HTML tags but keep content
|
|
254
|
+
md = md.replace(/<[^>]+>/g, '');
|
|
255
|
+
|
|
256
|
+
// Decode HTML entities
|
|
257
|
+
md = md.replace(/&/g, '&');
|
|
258
|
+
md = md.replace(/</g, '<');
|
|
259
|
+
md = md.replace(/>/g, '>');
|
|
260
|
+
md = md.replace(/"/g, '"');
|
|
261
|
+
md = md.replace(/'/g, "'");
|
|
262
|
+
md = md.replace(/ /g, ' ');
|
|
263
|
+
md = md.replace(/—/g, '—');
|
|
264
|
+
md = md.replace(/–/g, '–');
|
|
265
|
+
md = md.replace(/…/g, '...');
|
|
266
|
+
|
|
267
|
+
// Clean up excessive whitespace
|
|
268
|
+
md = md.replace(/\n{4,}/g, '\n\n\n');
|
|
269
|
+
|
|
270
|
+
return md.trim();
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
function parseEnvInt(name: string, fallback: number): number {
|
|
274
|
+
const raw = process.env[name];
|
|
275
|
+
if (!raw) return fallback;
|
|
276
|
+
const parsed = parseInt(raw, 10);
|
|
277
|
+
return Number.isFinite(parsed) ? parsed : fallback;
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
/**
 * Render the first `maxPages` pages of a PDF to PNG images using the external
 * `pdftoppm` tool (poppler-utils), returning them as base64 data URIs.
 *
 * Best-effort: returns [] when maxPages <= 0 or when pdftoppm is missing or
 * the conversion fails. Temporary PNGs are written under a unique directory in
 * .spec-agent-tmp/ and removed in the finally block regardless of outcome.
 */
async function extractPdfPageImages(pdfPath: string, maxPages: number): Promise<EmbeddedImage[]> {
  if (maxPages <= 0) {
    return [];
  }

  // Unique temp dir per invocation so concurrent runs don't collide.
  const tempRoot = path.join(
    process.cwd(),
    '.spec-agent-tmp',
    'pdf-pages',
    `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
  );
  const outPrefix = path.join(tempRoot, 'page');
  await fs.ensureDir(tempRoot);

  try {
    // pdftoppm writes files named <prefix>-<page>.png for pages 1..maxPages.
    await execFileAsync('pdftoppm', [
      '-png',
      '-f', '1',
      '-l', String(maxPages),
      pdfPath,
      outPrefix,
    ]);

    // Collect the generated PNGs in ascending page order (numeric sort, since
    // lexicographic order would put page-10 before page-2).
    const files = await fs.readdir(tempRoot);
    const pngFiles = files
      .filter(f => /^page-\d+\.png$/i.test(f))
      .sort((a, b) => {
        const ai = parseInt(a.match(/\d+/)?.[0] || '0', 10);
        const bi = parseInt(b.match(/\d+/)?.[0] || '0', 10);
        return ai - bi;
      });

    const images: EmbeddedImage[] = [];
    let imageIdx = 1;
    for (const file of pngFiles) {
      const imagePath = path.join(tempRoot, file);
      const buffer = await fs.readFile(imagePath);
      const base64 = buffer.toString('base64');
      // Page number comes from the filename; falls back to the running index.
      const pageNo = parseInt(file.match(/\d+/)?.[0] || String(imageIdx), 10);
      images.push({
        id: `PDFIMG${String(imageIdx++).padStart(4, '0')}`,
        alt: `PDF第${pageNo}页`,
        mimeType: 'image/png',
        estimatedSize: buffer.length,
        dataUri: `data:image/png;base64,${base64}`,
      });
    }
    return images;
  } catch {
    // pdftoppm unavailable or conversion failed.
    return [];
  } finally {
    // Best-effort cleanup; a failed removal must not mask the real result.
    await fs.remove(tempRoot).catch(() => undefined);
  }
}
|
|
335
|
+
|
|
336
|
+
/**
 * Parse a PDF buffer into Markdown via pdf-parse, optionally attaching PNG
 * renders of the first pages (when `filePath` is given and pdftoppm works).
 *
 * On pdf-parse failure, falls back to decoding the raw bytes as UTF-8 when
 * that yields more than 100 characters; otherwise rethrows with context.
 * NOTE(review): decoding raw PDF bytes as UTF-8 typically yields mojibake —
 * confirm this threshold-based fallback is intended.
 */
async function parsePdfToMarkdown(buffer: Buffer, filePath?: string): Promise<ParsedDocument> {
  try {
    // Dynamic import to avoid loading if not needed
    const pdfParse = await import('pdf-parse');
    const result = await pdfParse.default(buffer);

    // Convert PDF text to structured Markdown
    // PDF text often has page breaks and layout artifacts that need cleaning
    const structuredContent = structurePdfContent(result.text);

    // Page-image rendering is capped by the PDF_IMAGE_PAGE_LIMIT env var (default 8).
    const maxPdfImagePages = Math.max(0, parseEnvInt('PDF_IMAGE_PAGE_LIMIT', 8));
    const pageImages = filePath
      ? await extractPdfPageImages(filePath, Math.min(result.numpages, maxPdfImagePages))
      : [];

    // Placeholder lines referencing each rendered page are prepended to the text.
    const imageHeaders = pageImages.map(
      image => `[图片引用 ${image.id} | alt="${image.alt || '无'}" | ${image.mimeType} | ${image.estimatedSize} bytes]`
    );
    const contentWithPageImages = imageHeaders.length > 0
      ? `${imageHeaders.join('\n')}\n\n${structuredContent}`
      : structuredContent;

    return {
      content: extractEmbeddedImages(contentWithPageImages).content,
      format: 'markdown',
      images: pageImages,
      metadata: {
        pages: result.numpages,
        wordCount: result.text.split(/\s+/).length,
      },
    };
  } catch (error) {
    // Fallback: try to extract text as-is
    const text = buffer.toString('utf-8');
    if (text.length > 100) {
      return {
        content: extractEmbeddedImages(normalizeMarkdown(text)).content,
        format: 'markdown',
        images: [],
        metadata: {},
      };
    }
    throw new Error(`Failed to parse PDF: ${error instanceof Error ? error.message : String(error)}`);
  }
}
|
|
381
|
+
|
|
382
|
+
/**
 * Structure raw PDF-extracted text into Markdown.
 *
 * PDF text extraction loses heading markup, so this applies order-sensitive
 * heuristics: standalone page numbers are removed, then short lines that
 * follow a blank line are promoted to headings when they look like a chapter
 * title, a numbered section, an ALL-CAPS label, or a Title Case phrase.
 * The result is passed through normalizeMarkdown().
 */
function structurePdfContent(text: string): string {
  let md = text;

  // Remove page number lines (standalone numbers)
  md = md.replace(/\n\s*\d+\s*\n/g, '\n\n');

  // Detect and convert potential headers
  // Short lines at the start of paragraphs that are all caps or title case
  const lines = md.split('\n');
  const processedLines: string[] = [];
  let prevLineEmpty = true;

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim();
    const nextLine = lines[i + 1]?.trim() || '';

    // Skip empty lines but track them
    if (!line) {
      processedLines.push('');
      prevLineEmpty = true;
      continue;
    }

    // Detect headers based on various heuristics
    const isShortLine = line.length < 100;
    const isAllCaps = line === line.toUpperCase() && line.length > 3 && /[A-Z]/.test(line);
    const isTitleCase = /^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*$/.test(line) && line.length > 3;
    // e.g. "1.", "2.3", "4.5.6 Title"
    const looksLikeNumberedHeader = /^\d+(?:\.\d+)*\.?\s+\S/.test(line);
    // English "Chapter" or Chinese "第N章" chapter markers
    const looksLikeChapter = /^(Chapter|CHAPTER|第[一二三四五六七八九十\d]+章)/.test(line);

    // Only lines that open a paragraph (previous line blank) and are short
    // are eligible for heading promotion; order here sets precedence.
    if (prevLineEmpty && isShortLine) {
      if (looksLikeChapter) {
        // Chapter header - H1
        processedLines.push(`# ${line}`);
      } else if (looksLikeNumberedHeader) {
        // Numbered section - could be H2 or H3
        // Depth = dot count + 2, capped at H6 ("1." -> H3, "1.2" -> H3, "1.2.3" -> H4).
        const level = (line.match(/\./g) || []).length + 2;
        processedLines.push(`${'#'.repeat(Math.min(level, 6))} ${line}`);
      } else if (isAllCaps && line.length < 50) {
        // ALL CAPS short line - likely a section header
        processedLines.push(`## ${line}`);
      } else if (isTitleCase && !nextLine.startsWith(line.substring(0, 10))) {
        // Title case that doesn't continue - likely a header
        processedLines.push(`### ${line}`);
      } else {
        processedLines.push(line);
      }
    } else {
      processedLines.push(line);
    }

    prevLineEmpty = false;
  }

  md = processedLines.join('\n');

  // Clean up excessive whitespace
  md = md.replace(/\n{4,}/g, '\n\n\n');

  return normalizeMarkdown(md);
}
|
|
447
|
+
|
|
448
|
+
/**
 * Parse a DOCX buffer into Markdown via mammoth.
 *
 * Primary path: DOCX -> HTML (with a style map so Word heading styles become
 * h1–h6) -> Markdown. First fallback: mammoth raw-text extraction. Last
 * resort: decode the buffer as UTF-8 when that yields more than 100 chars.
 * NOTE(review): raw DOCX bytes are a zip archive, so the last resort usually
 * produces mojibake — confirm the threshold-based fallback is intended.
 *
 * @throws Error when every strategy fails.
 */
async function parseDocxToMarkdown(buffer: Buffer): Promise<ParsedDocument> {
  try {
    // Dynamic import to avoid loading if not needed
    const mammoth = await import('mammoth');

    // Use mammoth's HTML conversion to preserve structure, then convert to Markdown
    const htmlResult = await mammoth.convertToHtml({ buffer }, {
      // Map Word paragraph styles onto HTML heading levels so the downstream
      // HTML->Markdown pass produces # headings instead of plain paragraphs.
      styleMap: [
        "p[style-name='Heading 1'] => h1:fresh",
        "p[style-name='Heading 2'] => h2:fresh",
        "p[style-name='Heading 3'] => h3:fresh",
        "p[style-name='Heading 4'] => h4:fresh",
        "p[style-name='Heading 5'] => h5:fresh",
        "p[style-name='Heading 6'] => h6:fresh",
        "p[style-name='Title'] => h1.title:fresh",
        "p[style-name='Subtitle'] => h2.subtitle:fresh",
      ]
    });

    // Convert HTML to Markdown
    const markdown = convertHtmlToMarkdown(htmlResult.value);
    const extracted = extractEmbeddedImages(markdown);

    // Extract metadata from document
    const metadata: ParsedDocument['metadata'] = {
      wordCount: markdown.split(/\s+/).length,
    };

    // Try to extract title from the first heading
    const titleMatch = markdown.match(/^#\s+(.+)$/m);
    if (titleMatch) {
      metadata.title = titleMatch[1].trim();
    }

    return {
      content: extracted.content,
      format: 'markdown',
      images: extracted.images,
      metadata,
    };
  } catch (error) {
    // Fallback: try raw text extraction
    try {
      const mammoth = await import('mammoth');
      const result = await mammoth.extractRawText({ buffer });
      return {
        content: extractEmbeddedImages(normalizeMarkdown(result.value)).content,
        format: 'markdown',
        images: [],
        metadata: {
          wordCount: result.value.split(/\s+/).length,
        },
      };
    } catch {
      // Last resort: read as plain text
      const text = buffer.toString('utf-8');
      if (text.length > 100) {
        return {
          content: extractEmbeddedImages(normalizeMarkdown(text)).content,
          format: 'markdown',
          images: [],
          metadata: {},
        };
      }
    }
    // Rethrow with the ORIGINAL (convertToHtml) error for context.
    throw new Error(`Failed to parse DOCX: ${error instanceof Error ? error.message : String(error)}`);
  }
}
|
|
516
|
+
|
|
517
|
+
/**
|
|
518
|
+
* Read and concatenate multiple document files
|
|
519
|
+
* All content is normalized to Markdown format
|
|
520
|
+
*/
|
|
521
|
+
export async function readChunkContent(filePaths: string[]): Promise<string> {
|
|
522
|
+
const contents: string[] = [];
|
|
523
|
+
|
|
524
|
+
for (const filePath of filePaths) {
|
|
525
|
+
try {
|
|
526
|
+
const parsed = await parseDocument(filePath);
|
|
527
|
+
contents.push(`=== ${path.basename(filePath)} ===\n${parsed.content}`);
|
|
528
|
+
} catch (error) {
|
|
529
|
+
// If parsing fails, try to read as plain text
|
|
530
|
+
try {
|
|
531
|
+
const text = await fs.readFile(filePath, 'utf-8');
|
|
532
|
+
contents.push(`=== ${path.basename(filePath)} ===\n${normalizeMarkdown(text)}`);
|
|
533
|
+
} catch {
|
|
534
|
+
contents.push(`=== ${path.basename(filePath)} ===\n[Error reading file: ${error instanceof Error ? error.message : String(error)}]`);
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
return contents.join('\n\n---\n\n');
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
/**
|
|
543
|
+
* Check if a file format is supported
|
|
544
|
+
*/
|
|
545
|
+
export function isSupportedFormat(filePath: string): boolean {
|
|
546
|
+
const ext = path.extname(filePath).toLowerCase();
|
|
547
|
+
return ['.md', '.txt', '.html', '.htm', '.pdf', '.docx'].includes(ext);
|
|
548
|
+
}
|