deepfish-ai 1.0.21 → 1.0.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/AgentRobot/AgentRobotFactory/MainAgentRobot.js +4 -7
- package/src/AgentRobot/AgentRobotFactory/SubAgentRobot.js +3 -8
- package/src/AgentRobot/AgentRobotFactory/SubSkillAgentRobot.js +3 -8
- package/src/AgentRobot/BaseAgentRobot/Brain.js +8 -8
- package/src/AgentRobot/BaseAgentRobot/Hand.js +3 -3
- package/src/AgentRobot/BaseAgentRobot/index.js +25 -95
- package/src/AgentRobot/BaseAgentRobot/lazy-tools/doc-transform.js +204 -0
- package/src/AgentRobot/BaseAgentRobot/lazy-tools/docx.js +552 -1
- package/src/AgentRobot/BaseAgentRobot/lazy-tools/embedding.js +763 -0
- package/src/AgentRobot/BaseAgentRobot/lazy-tools/img.js +1 -0
- package/src/AgentRobot/BaseAgentRobot/lazy-tools/pdf.js +1 -0
- package/src/AgentRobot/BaseAgentRobot/lazy-tools/pptx.js +1 -0
- package/src/AgentRobot/BaseAgentRobot/lazy-tools/xlsx.js +1 -0
- package/src/AgentRobot/BaseAgentRobot/tools/BaseTools.js +1 -0
- package/src/AgentRobot/BaseAgentRobot/tools/CreateAgentTools.js +3 -2
- package/src/AgentRobot/BaseAgentRobot/tools/FileTools.js +1 -0
- package/src/AgentRobot/BaseAgentRobot/tools/GenerateTools.js +4 -2
- package/src/AgentRobot/BaseAgentRobot/tools/InquirerTools.js +1 -0
- package/src/AgentRobot/BaseAgentRobot/tools/SystemTools.js +3 -4
- package/src/AgentRobot/BaseAgentRobot/tools/TaskTools.js +1 -0
- package/src/AgentRobot/BaseAgentRobot/tools/TestTools.js +1 -0
- package/src/AgentRobot/BaseAgentRobot/tools/UserTool.js +87 -0
- package/src/AgentRobot/BaseAgentRobot/tools/WebTools.js +257 -0
- package/src/AgentRobot/BaseAgentRobot/utils/AIRequest.js +9 -2
- package/src/AgentRobot/BaseAgentRobot/utils/AIToolManager.js +128 -0
- package/src/AgentRobot/BaseAgentRobot/utils/AttachmentToolScanner.js +4 -5
- package/src/cli/DefaultConfig.js +3 -0
|
@@ -8,6 +8,7 @@ const mammoth = require('mammoth')
|
|
|
8
8
|
const docx = require('docx')
|
|
9
9
|
const PizZip = require('pizzip')
|
|
10
10
|
const Docxtemplater = require('docxtemplater')
|
|
11
|
+
const cheerio = require('cheerio')
|
|
11
12
|
|
|
12
13
|
// ─── 统一返回结构 ─────────────────────────────────────────────────────────────
|
|
13
14
|
|
|
@@ -23,6 +24,204 @@ function resolvePath(filePath) {
|
|
|
23
24
|
return path.resolve(process.cwd(), filePath)
|
|
24
25
|
}
|
|
25
26
|
|
|
27
|
+
// ─── 格式转换辅助函数 ────────────────────────────────────────────────────────
|
|
28
|
+
|
|
29
|
+
/**
 * Render an HTML string to a PDF file with puppeteer.
 *
 * puppeteer is required lazily inside the function so the module can be
 * loaded even when puppeteer is not installed.
 *
 * @param {string} html - HTML markup to load into the page.
 * @param {string} outputPath - Destination path of the PDF; the parent
 *   directory is created when missing.
 * @returns {Promise<void>} Resolves once the PDF is written and the browser is closed.
 * @throws {Error} When puppeteer cannot be loaded (the underlying load error
 *   is attached as `cause`), or when launching/rendering fails.
 */
async function htmlStringToPdf(html, outputPath) {
  let puppeteer
  try {
    puppeteer = require('puppeteer')
  } catch (loadError) {
    // Keep the original failure: a broken install surfaces here too, and the
    // install hint alone would hide what actually went wrong.
    throw new Error('puppeteer 未安装,请先执行 npm install puppeteer', { cause: loadError })
  }
  // NOTE(review): the Chromium sandbox is disabled here. That is risky if the
  // HTML can come from untrusted documents — confirm this is intentional.
  const browser = await puppeteer.launch({ args: ['--no-sandbox', '--disable-setuid-sandbox'] })
  try {
    const page = await browser.newPage()
    await page.setContent(html, { waitUntil: 'networkidle0' })
    fs.ensureDirSync(path.dirname(outputPath))
    await page.pdf({ path: outputPath, format: 'A4', printBackground: true })
  } finally {
    // Always release the browser process, even when rendering fails.
    await browser.close()
  }
}
|
|
49
|
+
|
|
50
|
+
/**
 * Escape the characters that are significant in HTML/XML text content.
 *
 * `&` is replaced first so the entities produced for `<` and `>` are not
 * escaped a second time.
 *
 * @param {string} str - Raw text.
 * @returns {string} Text with `&`, `<` and `>` replaced by entities.
 */
function escapeHtml(str) {
  return str
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
}
|
|
56
|
+
|
|
57
|
+
/**
 * Decode the predefined XML entities in a string.
 *
 * All entities are decoded in a single pass, so text such as `&amp;lt;`
 * becomes the literal `&lt;` rather than being decoded twice into `<`.
 *
 * @param {string} str - XML-escaped text (e.g. the content of a `<w:t>` element).
 * @returns {string} The decoded text.
 */
function unescapeXml(str) {
  const entities = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
    '#39': "'",
  }
  return str.replace(/&(amp|lt|gt|quot|apos|#39);/g, (_, name) => entities[name])
}
|
|
65
|
+
|
|
66
|
+
/**
 * Build the XML of a single WordprocessingML run (`<w:r>`) holding one text element.
 *
 * @param {string} rPr - Run-properties XML placed verbatim before the text element (may be empty).
 * @param {string} text - Plain text for the run; it is passed through escapeHtml.
 * @returns {string} The run XML, or an empty string when `text` is falsy.
 */
function buildRunXml(rPr, text) {
  if (text) {
    // Leading/trailing whitespace needs xml:space="preserve" on the text element.
    const hasEdgeWhitespace = /^\s|\s$/.test(text)
    const openTag = hasEdgeWhitespace ? '<w:t xml:space="preserve">' : '<w:t>'
    return `<w:r>${rPr}${openTag}${escapeHtml(text)}</w:t></w:r>`
  }
  return ''
}
|
|
71
|
+
|
|
72
|
+
/**
 * Convert Markdown text to a complete, styled HTML document string.
 *
 * This is a lightweight regex-based converter (not a full CommonMark
 * implementation). Supported: fenced and inline code, ATX headings,
 * horizontal rules, bold/italic/strikethrough, images, links, flat bullet and
 * ordered lists, and single-line blockquotes.
 *
 * Processing order matters:
 *  1. Code and URLs are rendered first and parked in a stash behind NUL-delimited
 *     tokens, so later passes cannot rewrite `*`, `_` or line starts inside them.
 *  2. Block-level constructs run before inline emphasis, so a `* ` bullet marker
 *     is never mistaken for an emphasis delimiter.
 *  3. Remaining text lines are wrapped in `<p>`, including lines that start with
 *     inline markup (previously left unwrapped).
 *
 * @param {string} md - Markdown source.
 * @returns {string} A full HTML document with the rendered Markdown in `<body>`.
 */
function markdownToHtmlString(md) {
  const stash = []
  // 'B' marks a block-level fragment (fenced code), 'I' an inline one.
  const hold = (kind, fragment) => `\u0000${kind}${stash.push(fragment) - 1}\u0000`
  // A fragment can only contain tokens of fragments stashed before it, so the recursion terminates.
  const restore = (text) =>
    text.replace(/\u0000[BI](\d+)\u0000/g, (_, index) => restore(stash[Number(index)]))
  // Turn a run of consecutive list lines into one <ul>/<ol>, keeping the trailing line break.
  const wrapList = (tag, marker) => (run) => {
    const eol = (run.match(/\r?\n$/) || [''])[0]
    const lines = (eol ? run.slice(0, -eol.length) : run).split(/\r?\n/)
    const items = lines.map((line) => `<li>${line.replace(marker, '')}</li>`)
    return `<${tag}>${items.join('\n')}</${tag}>${eol}`
  }

  let html = md
    // NUL delimits stash tokens; replace any in the input (as CommonMark does) so tokens cannot be forged.
    .replace(/\u0000/g, '\uFFFD')
    .replace(/```(\w*)\n([\s\S]*?)```/g, (_, lang, code) =>
      hold('B', `<pre><code class="language-${lang}">${escapeHtml(code.trimEnd())}</code></pre>`))
    .replace(/`([^`]+)`/g, (_, c) => hold('I', `<code>${escapeHtml(c)}</code>`))
    // Only the URL is stashed, so alt text and link labels still receive inline formatting.
    .replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (_, alt, src) => `<img alt="${alt}" src="${hold('I', src)}">`)
    .replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_, label, href) => `<a href="${hold('I', href)}">${label}</a>`)
    .replace(/^###### (.+)$/gm, '<h6>$1</h6>')
    .replace(/^##### (.+)$/gm, '<h5>$1</h5>')
    .replace(/^#### (.+)$/gm, '<h4>$1</h4>')
    .replace(/^### (.+)$/gm, '<h3>$1</h3>')
    .replace(/^## (.+)$/gm, '<h2>$1</h2>')
    .replace(/^# (.+)$/gm, '<h1>$1</h1>')
    .replace(/^[-*_]{3,}$/gm, '<hr>')
    .replace(/(?:^[ \t]*[-*+] .+$(?:\r?\n)?)+/gm, wrapList('ul', /^[ \t]*[-*+] /))
    .replace(/(?:^[ \t]*\d+\. .+$(?:\r?\n)?)+/gm, wrapList('ol', /^[ \t]*\d+\. /))
    .replace(/^> (.+)$/gm, '<blockquote>$1</blockquote>')
    .replace(/\*\*\*(.+?)\*\*\*/g, '<strong><em>$1</em></strong>')
    .replace(/\*\*(.+?)\*\*/g, '<strong>$1</strong>')
    // Underscore delimiters must not sit inside a word, so snake_case stays intact.
    .replace(/(?<![A-Za-z0-9])__(.+?)__(?![A-Za-z0-9])/g, '<strong>$1</strong>')
    .replace(/\*(.+?)\*/g, '<em>$1</em>')
    .replace(/(?<![A-Za-z0-9])_(.+?)_(?![A-Za-z0-9])/g, '<em>$1</em>')
    .replace(/~~(.+?)~~/g, '<del>$1</del>')

  html = html.replace(/^.+$/gm, (line) => {
    // A fenced code block is already block-level.
    if (/^\u0000B\d+\u0000/.test(line)) return line
    // Lines starting with a tag are left alone unless it is inline markup this converter emits.
    if (/^<[a-z]/.test(line) && !/^<(?:strong|em|del|a|img)\b/.test(line)) return line
    return `<p>${line}</p>`
  })
  html = restore(html)

  return `<!DOCTYPE html><html><head><meta charset="utf-8"><style>
body{font-family:sans-serif;line-height:1.7;max-width:900px;margin:40px auto;padding:0 20px;color:#333}
h1,h2,h3,h4,h5,h6{margin-top:1.2em}
pre{background:#f5f5f5;padding:12px;border-radius:4px;overflow:auto}
code{background:#f0f0f0;padding:2px 4px;border-radius:3px}
blockquote{border-left:4px solid #ddd;margin:0;padding-left:1em;color:#666}
table{border-collapse:collapse;width:100%}td,th{border:1px solid #ddd;padding:6px 10px}
</style></head><body>${html}</body></html>`
}
|
|
109
|
+
|
|
110
|
+
/**
 * Convert an HTML string to Markdown text.
 *
 * The HTML is parsed with cheerio and walked recursively. Unknown elements
 * contribute only the Markdown of their children; `<head>`, `<style>` and
 * `<script>` are dropped. Tables are flattened to pipe tables using the
 * trimmed text of each cell.
 *
 * @param {string} html - HTML markup (fragment or full document).
 * @returns {string} Markdown text, with runs of 3+ newlines collapsed and outer whitespace trimmed.
 */
function htmlStringToMarkdown(html) {
  const $ = cheerio.load(html)
  const body = $('body').length ? $('body') : $.root()

  // Markdown of all child nodes (text and elements) of a cheerio selection.
  const renderChildren = (node) =>
    node.contents().toArray().map((child) => nodeToMd(child)).join('')

  function nodeToMd(el) {
    if (el.type === 'text') return el.data || ''
    const node = $(el)
    const tag = (el.name || '').toLowerCase()
    const inner = () => renderChildren(node)
    switch (tag) {
      case 'h1': return `# ${inner()}\n\n`
      case 'h2': return `## ${inner()}\n\n`
      case 'h3': return `### ${inner()}\n\n`
      case 'h4': return `#### ${inner()}\n\n`
      case 'h5': return `##### ${inner()}\n\n`
      case 'h6': return `###### ${inner()}\n\n`
      case 'p': return `${inner()}\n\n`
      case 'br': return '\n'
      case 'hr': return '---\n\n'
      case 'strong':
      case 'b': return `**${inner()}**`
      case 'em':
      case 'i': return `*${inner()}*`
      case 'del':
      case 's': return `~~${inner()}~~`
      case 'code': return `\`${inner()}\``
      case 'pre': {
        const codeEl = node.find('code')
        const lang = (codeEl.attr('class') || '').replace('language-', '')
        const content = codeEl.length ? codeEl.text() : node.text()
        return `\`\`\`${lang}\n${content}\n\`\`\`\n\n`
      }
      case 'blockquote': return inner().split('\n').map(l => l ? `> ${l}` : '').join('\n') + '\n\n'
      case 'a': return `[${inner()}](${node.attr('href') || ''})`
      case 'img': return `![${node.attr('alt') || ''}](${node.attr('src') || ''})`
      case 'ul': return inner() + '\n'
      case 'ol': {
        // Number the direct <li> children so the list stays ordered in Markdown.
        let counter = 0
        const items = node.contents().toArray().map((child) => {
          const isItem = child.type === 'tag' && (child.name || '').toLowerCase() === 'li'
          return isItem ? `${++counter}. ${renderChildren($(child))}\n` : nodeToMd(child)
        })
        return items.join('') + '\n'
      }
      case 'li': return `- ${inner()}\n`
      case 'table': {
        const rows = node.find('tr').toArray()
        if (!rows.length) return ''
        return rows.map((row, i) => {
          const cells = $(row).find('th,td').toArray().map(c => $(c).text().trim())
          const line = `| ${cells.join(' | ')} |`
          // The first row is treated as the header and followed by the separator row.
          return i === 0 ? `${line}\n| ${cells.map(() => '---').join(' | ')} |` : line
        }).join('\n') + '\n\n'
      }
      case 'head':
      case 'style':
      case 'script': return ''
      default: return inner()
    }
  }

  return renderChildren(body).replace(/\n{3,}/g, '\n\n').trim()
}
|
|
169
|
+
|
|
170
|
+
/**
 * Parse an HTML string into the flat array of section descriptors used to build a .docx.
 *
 * Recognised block elements map to descriptors:
 *  - h1–h6            → { type: 'heading', level, text }
 *  - p, pre           → { type: 'paragraph', text }
 *  - ul / ol          → { type: 'list' | 'numberedList', items }
 *  - table            → { type: 'table', rows }
 *  - hr               → { type: 'horizontalRule' }
 *  - blockquote       → one paragraph per quoted paragraph, prefixed with "> "
 *
 * Any other element is treated as a container. Text and inline elements found
 * directly inside a container (or `<body>`) are gathered into paragraph
 * descriptors instead of being dropped.
 *
 * @param {string} html - HTML markup (fragment or full document).
 * @returns {Array<object>} Section descriptors in document order.
 */
function htmlStringToDocxSections(html) {
  const $ = cheerio.load(html)
  const sections = []
  const BLOCK_SELECTOR = 'h1,h2,h3,h4,h5,h6,p,ul,ol,table,hr,pre,blockquote'

  const pushParagraph = (text) => {
    const trimmed = text.trim()
    if (trimmed) sections.push({ type: 'paragraph', text: trimmed })
  }

  // Walk the child nodes of a container, emitting loose text/inline runs as paragraphs.
  function processChildren(parent) {
    let loose = ''
    parent.contents().each((_, child) => {
      if (child.type === 'text') {
        loose += child.data || ''
        return
      }
      // Skip anything that is not a plain element.
      // NOTE(review): relies on cheerio typing <script>/<style> nodes as 'script'/'style' rather than 'tag' — confirm.
      if (child.type !== 'tag') return
      const el = $(child)
      const holdsBlocks = el.is(BLOCK_SELECTOR) || el.find(BLOCK_SELECTOR).length > 0
      if (!holdsBlocks) {
        loose += el.text()
        return
      }
      pushParagraph(loose)
      loose = ''
      processNode(child)
    })
    pushParagraph(loose)
  }

  function processNode(el) {
    const node = $(el)
    const tag = (el.name || '').toLowerCase()
    const headingMatch = tag.match(/^h([1-6])$/)
    if (headingMatch) {
      sections.push({ type: 'heading', level: parseInt(headingMatch[1], 10), text: node.text().trim() })
      return
    }
    switch (tag) {
      case 'p':
        if (node.text().trim()) sections.push({ type: 'paragraph', text: node.text().trim() })
        break
      case 'ul':
        sections.push({ type: 'list', items: node.find('li').toArray().map(li => $(li).text().trim()) })
        break
      case 'ol':
        sections.push({ type: 'numberedList', items: node.find('li').toArray().map(li => $(li).text().trim()) })
        break
      case 'table': {
        const rows = node.find('tr').toArray().map(row =>
          $(row).find('th,td').toArray().map(c => $(c).text().trim()))
        if (rows.length) sections.push({ type: 'table', rows })
        break
      }
      case 'hr':
        sections.push({ type: 'horizontalRule' })
        break
      case 'pre':
        // Not trimmed, so the code's own whitespace is kept.
        sections.push({ type: 'paragraph', text: node.text() })
        break
      case 'blockquote':
        node.find('p').each((_, pEl) => {
          const t = $(pEl).text().trim()
          if (t) sections.push({ type: 'paragraph', text: `> ${t}` })
        })
        // A blockquote without inner <p> contributes its whole text as one quoted paragraph.
        if (!node.find('p').length && node.text().trim()) {
          sections.push({ type: 'paragraph', text: `> ${node.text().trim()}` })
        }
        break
      default:
        processChildren(node)
    }
  }

  processChildren($('body'))
  return sections
}
|
|
224
|
+
|
|
26
225
|
// ─── 内部辅助:将 sections 描述转换为 docx children ──────────────────────────
|
|
27
226
|
/**
|
|
28
227
|
* sections 数组每项结构:
|
|
@@ -439,6 +638,242 @@ async function overwriteDocx(filePath, sections = []) {
|
|
|
439
638
|
}
|
|
440
639
|
}
|
|
441
640
|
|
|
641
|
+
// ─── Word 格式转换函数 ───────────────────────────────────────────────────────
|
|
642
|
+
|
|
643
|
+
/**
 * Convert a Word document (.docx) to PDF.
 *
 * The document is converted to HTML with mammoth, wrapped in a minimal styled
 * page, and rendered to PDF through htmlStringToPdf.
 *
 * @param {string} inputPath - Source .docx path (resolved through resolvePath).
 * @param {string} outputPath - Destination .pdf path (resolved through resolvePath).
 * @returns {Promise<object>} ok(...) with the resolved paths on success, otherwise fail(...); errors are not thrown.
 */
async function wordToPdf(inputPath, outputPath) {
  try {
    const source = resolvePath(inputPath)
    const target = resolvePath(outputPath)
    if (fs.existsSync(source)) {
      const conversion = await mammoth.convertToHtml({ path: source })
      const page = `<!DOCTYPE html><html><head><meta charset="utf-8"><style>
body{font-family:sans-serif;line-height:1.7;margin:40px;color:#333}
table{border-collapse:collapse;width:100%}td,th{border:1px solid #ddd;padding:6px 10px}
</style></head><body>${conversion.value}</body></html>`
      await htmlStringToPdf(page, target)
      return ok({ inputPath: source, outputPath: target })
    }
    return fail(`File does not exist: ${source}`, { inputPath: source })
  } catch (err) {
    return fail(err, { inputPath, outputPath })
  }
}
|
|
664
|
+
|
|
665
|
+
/**
 * Convert a Word document (.docx) to a standalone HTML file.
 *
 * mammoth produces the body markup, which is wrapped in a minimal styled page
 * and written as UTF-8. The output directory is created when missing.
 *
 * @param {string} inputPath - Source .docx path (resolved through resolvePath).
 * @param {string} outputPath - Destination .html path (resolved through resolvePath).
 * @returns {Promise<object>} ok(...) with the resolved paths and mammoth's messages on success, otherwise fail(...); errors are not thrown.
 */
async function wordToHtml(inputPath, outputPath) {
  try {
    const source = resolvePath(inputPath)
    const target = resolvePath(outputPath)
    if (fs.existsSync(source)) {
      const conversion = await mammoth.convertToHtml({ path: source })
      const page = `<!DOCTYPE html><html><head><meta charset="utf-8"><style>
body{font-family:sans-serif;line-height:1.7;max-width:900px;margin:40px auto;padding:0 20px;color:#333}
table{border-collapse:collapse;width:100%}td,th{border:1px solid #ddd;padding:6px 10px}
</style></head><body>${conversion.value}</body></html>`
      fs.ensureDirSync(path.dirname(target))
      fs.writeFileSync(target, page, 'utf8')
      return ok({ inputPath: source, outputPath: target, messages: conversion.messages })
    }
    return fail(`File does not exist: ${source}`, { inputPath: source })
  } catch (err) {
    return fail(err, { inputPath, outputPath })
  }
}
|
|
687
|
+
|
|
688
|
+
/**
 * Convert a Word document (.docx) to a Markdown file.
 *
 * HTML is the intermediate format: mammoth converts the document to HTML, and
 * htmlStringToMarkdown turns that into Markdown, written as UTF-8. The output
 * directory is created when missing.
 *
 * @param {string} inputPath - Source .docx path (resolved through resolvePath).
 * @param {string} outputPath - Destination .md path (resolved through resolvePath).
 * @returns {Promise<object>} ok(...) with the resolved paths on success, otherwise fail(...); errors are not thrown.
 */
async function wordToMarkdown(inputPath, outputPath) {
  try {
    const source = resolvePath(inputPath)
    const target = resolvePath(outputPath)
    if (fs.existsSync(source)) {
      const conversion = await mammoth.convertToHtml({ path: source })
      const markdown = htmlStringToMarkdown(conversion.value)
      fs.ensureDirSync(path.dirname(target))
      fs.writeFileSync(target, markdown, 'utf8')
      return ok({ inputPath: source, outputPath: target })
    }
    return fail(`File does not exist: ${source}`, { inputPath: source })
  } catch (err) {
    return fail(err, { inputPath, outputPath })
  }
}
|
|
707
|
+
|
|
708
|
+
/**
 * Convert a Markdown file to a Word document (.docx).
 *
 * Pipeline: Markdown → HTML (markdownToHtmlString) → section descriptors
 * (htmlStringToDocxSections) → docx children (buildChildren) → packed buffer.
 * The output directory is created when missing.
 *
 * @param {string} inputPath - Source .md path (resolved through resolvePath).
 * @param {string} outputPath - Destination .docx path (resolved through resolvePath).
 * @returns {Promise<object>} ok(...) with the resolved paths and the section count on success, otherwise fail(...); errors are not thrown.
 */
async function markdownToWord(inputPath, outputPath) {
  try {
    const source = resolvePath(inputPath)
    const target = resolvePath(outputPath)
    if (fs.existsSync(source)) {
      const markdown = fs.readFileSync(source, 'utf8')
      const sections = htmlStringToDocxSections(markdownToHtmlString(markdown))
      fs.ensureDirSync(path.dirname(target))
      const document = new docx.Document({
        sections: [{ properties: {}, children: buildChildren(sections, docx) }],
      })
      fs.writeFileSync(target, await docx.Packer.toBuffer(document))
      return ok({ inputPath: source, outputPath: target, sectionCount: sections.length })
    }
    return fail(`File does not exist: ${source}`, { inputPath: source })
  } catch (err) {
    return fail(err, { inputPath, outputPath })
  }
}
|
|
732
|
+
|
|
733
|
+
/**
 * Convert an HTML file to a Word document (.docx).
 *
 * The HTML is parsed into section descriptors (htmlStringToDocxSections),
 * turned into docx children (buildChildren), and packed into a buffer that is
 * written to disk. The output directory is created when missing.
 *
 * @param {string} inputPath - Source .html path (resolved through resolvePath).
 * @param {string} outputPath - Destination .docx path (resolved through resolvePath).
 * @returns {Promise<object>} ok(...) with the resolved paths and the section count on success, otherwise fail(...); errors are not thrown.
 */
async function htmlToWord(inputPath, outputPath) {
  try {
    const source = resolvePath(inputPath)
    const target = resolvePath(outputPath)
    if (fs.existsSync(source)) {
      const markup = fs.readFileSync(source, 'utf8')
      const sections = htmlStringToDocxSections(markup)
      fs.ensureDirSync(path.dirname(target))
      const document = new docx.Document({
        sections: [{ properties: {}, children: buildChildren(sections, docx) }],
      })
      fs.writeFileSync(target, await docx.Packer.toBuffer(document))
      return ok({ inputPath: source, outputPath: target, sectionCount: sections.length })
    }
    return fail(`File does not exist: ${source}`, { inputPath: source })
  } catch (err) {
    return fail(err, { inputPath, outputPath })
  }
}
|
|
756
|
+
|
|
757
|
+
// ─── Format-preserving text replacement ───────────────────────────────────────
|
|
758
|
+
|
|
759
|
+
/**
 * Replace every occurrence of `search` inside one paragraph's XML, matching
 * across run (`<w:r>`) boundaries.
 *
 * Only the `<w:t>` element of runs whose text actually changes is rewritten.
 * Everything else in the paragraph stays byte-identical: run properties,
 * non-text runs such as tabs, breaks and drawings, and wrapper elements such
 * as hyperlinks. The replacement text is placed in the run where the match
 * starts, so it takes that run's formatting.
 *
 * @param {string} para - Full `<w:p>…</w:p>` XML of one paragraph.
 * @param {string} search - Non-empty plain text to look for.
 * @param {string} replaceStr - Plain replacement text (null/undefined deletes the match).
 * @returns {{xml: string, count: number}} Patched paragraph XML and the number of replacements made.
 */
function patchParagraphText(para, search, replaceStr) {
  const runRe = /<w:r(?:\s[^>]*)?>[\s\S]*?<\/w:r>/g
  const textRe = /<w:t(?:\s[^>]*)?>([^<]*)<\/w:t>/

  // Collect the first <w:t> of every run, with its absolute position in the paragraph.
  const runs = []
  let runMatch
  while ((runMatch = runRe.exec(para)) !== null) {
    const textMatch = textRe.exec(runMatch[0])
    if (!textMatch) continue // runs without text (field chars, drawings, …) are left alone
    runs.push({
      at: runMatch.index + textMatch.index,
      size: textMatch[0].length,
      text: unescapeXml(textMatch[1]),
    })
  }

  const untouched = { xml: para, count: 0 }
  if (!runs.length) return untouched
  const fullText = runs.map((r) => r.text).join('')
  if (!fullText.includes(search)) return untouched

  // Character span of each run inside the concatenated paragraph text.
  let offset = 0
  const spans = runs.map((r) => {
    const start = offset
    offset += r.text.length
    return { start, end: offset }
  })
  const runIndexAt = (pos) => spans.findIndex((s) => pos >= s.start && pos < s.end)

  // All non-overlapping match positions.
  const occurrences = []
  for (let pos = fullText.indexOf(search); pos !== -1; pos = fullText.indexOf(search, pos + search.length)) {
    occurrences.push(pos)
  }

  const replacement = replaceStr == null ? '' : String(replaceStr)
  const newTexts = runs.map(() => '')
  // Copy the unmatched characters in [from, to) back into the runs they came from.
  const keep = (from, to) => {
    let p = from
    while (p < to) {
      const ri = runIndexAt(p)
      if (ri === -1) break // unreachable: every position below fullText.length lies in a non-empty run
      const stop = Math.min(spans[ri].end, to)
      newTexts[ri] += fullText.slice(p, stop)
      p = stop
    }
  }
  let cursor = 0
  for (const matchPos of occurrences) {
    keep(cursor, matchPos)
    newTexts[runIndexAt(matchPos)] += replacement
    cursor = matchPos + search.length
  }
  keep(cursor, fullText.length)

  // Rewrite changed <w:t> elements back to front so earlier offsets stay valid.
  let xml = para
  for (let i = runs.length - 1; i >= 0; i--) {
    if (newTexts[i] === runs[i].text) continue
    const { at, size } = runs[i]
    // Always preserve whitespace: a split can leave a space at a run edge.
    const element = `<w:t xml:space="preserve">${escapeHtml(newTexts[i])}</w:t>`
    xml = xml.slice(0, at) + element + xml.slice(at + size)
  }
  return { xml, count: occurrences.length }
}

/**
 * Replace text in a Word document in place while keeping its formatting.
 *
 * Matches may span several runs. word/document.xml is patched paragraph by
 * paragraph and the archive is written back to the same path.
 *
 * @param {string} filePath - Path of the .docx file (resolved through resolvePath).
 * @param {Array<{search:string, replace:string}>} replacements - Replacement rules, applied in order; rules with an empty `search` are skipped.
 * @returns {Promise<object>} ok(...) with the resolved path and `totalReplacements` on success, otherwise fail(...); errors are not thrown.
 */
async function patchDocxText(filePath, replacements) {
  try {
    const fullPath = resolvePath(filePath)
    if (!fs.existsSync(fullPath)) {
      return fail(`File does not exist: ${fullPath}`, { filePath: fullPath })
    }
    const content = fs.readFileSync(fullPath, 'binary')
    const zip = new PizZip(content)
    const docFile = zip.file('word/document.xml')
    if (!docFile) return fail('word/document.xml not found', { filePath: fullPath })
    let xml = docFile.asText()
    let totalCount = 0

    for (const { search, replace: replaceStr } of (replacements || [])) {
      if (!search) continue
      xml = xml.replace(/<w:p(?:\s[^>]*)?>([\s\S]*?)<\/w:p>/g, (para) => {
        const patched = patchParagraphText(para, search, replaceStr)
        totalCount += patched.count
        return patched.xml
      })
    }

    zip.file('word/document.xml', xml)
    const buf = zip.generate({ type: 'nodebuffer', compression: 'DEFLATE' })
    fs.writeFileSync(fullPath, buf)
    return ok({ filePath: fullPath, totalReplacements: totalCount })
  } catch (error) {
    return fail(error, { filePath })
  }
}
|
|
876
|
+
|
|
442
877
|
// ─── 工具描述 ─────────────────────────────────────────────────────────────────
|
|
443
878
|
|
|
444
879
|
const descriptions = [
|
|
@@ -676,6 +1111,114 @@ const descriptions = [
|
|
|
676
1111
|
},
|
|
677
1112
|
},
|
|
678
1113
|
},
|
|
1114
|
+
{
|
|
1115
|
+
type: 'function',
|
|
1116
|
+
function: {
|
|
1117
|
+
name: 'wordToPdf',
|
|
1118
|
+
description:
|
|
1119
|
+
'将 Word 文档(.docx)转换为 PDF 文件。依赖 puppeteer,转换时保留基础格式。参数:inputPath 为源 .docx 路径;outputPath 为输出 .pdf 路径。返回值:对象,包含 success、data(含 inputPath、outputPath)、error。',
|
|
1120
|
+
parameters: {
|
|
1121
|
+
type: 'object',
|
|
1122
|
+
properties: {
|
|
1123
|
+
inputPath: { type: 'string', description: '源 .docx 文件路径。' },
|
|
1124
|
+
outputPath: { type: 'string', description: '输出 .pdf 文件路径。' },
|
|
1125
|
+
},
|
|
1126
|
+
required: ['inputPath', 'outputPath'],
|
|
1127
|
+
},
|
|
1128
|
+
},
|
|
1129
|
+
},
|
|
1130
|
+
{
|
|
1131
|
+
type: 'function',
|
|
1132
|
+
function: {
|
|
1133
|
+
name: 'wordToHtml',
|
|
1134
|
+
description:
|
|
1135
|
+
'将 Word 文档(.docx)转换为 HTML 文件,保留格式信息(粗体、斜体、表格等)。参数:inputPath 为源 .docx 路径;outputPath 为输出 .html 路径。返回值:对象,包含 success、data(含 inputPath、outputPath、messages)、error。',
|
|
1136
|
+
parameters: {
|
|
1137
|
+
type: 'object',
|
|
1138
|
+
properties: {
|
|
1139
|
+
inputPath: { type: 'string', description: '源 .docx 文件路径。' },
|
|
1140
|
+
outputPath: { type: 'string', description: '输出 .html 文件路径。' },
|
|
1141
|
+
},
|
|
1142
|
+
required: ['inputPath', 'outputPath'],
|
|
1143
|
+
},
|
|
1144
|
+
},
|
|
1145
|
+
},
|
|
1146
|
+
{
|
|
1147
|
+
type: 'function',
|
|
1148
|
+
function: {
|
|
1149
|
+
name: 'wordToMarkdown',
|
|
1150
|
+
description:
|
|
1151
|
+
'将 Word 文档(.docx)转换为 Markdown 文件(.md)。通过 HTML 中间格式进行转换,保留标题、段落、表格、链接等结构。参数:inputPath 为源 .docx 路径;outputPath 为输出 .md 路径。返回值:对象,包含 success、data(含 inputPath、outputPath)、error。',
|
|
1152
|
+
parameters: {
|
|
1153
|
+
type: 'object',
|
|
1154
|
+
properties: {
|
|
1155
|
+
inputPath: { type: 'string', description: '源 .docx 文件路径。' },
|
|
1156
|
+
outputPath: { type: 'string', description: '输出 .md 文件路径。' },
|
|
1157
|
+
},
|
|
1158
|
+
required: ['inputPath', 'outputPath'],
|
|
1159
|
+
},
|
|
1160
|
+
},
|
|
1161
|
+
},
|
|
1162
|
+
{
|
|
1163
|
+
type: 'function',
|
|
1164
|
+
function: {
|
|
1165
|
+
name: 'markdownToWord',
|
|
1166
|
+
description:
|
|
1167
|
+
'将 Markdown 文件(.md)转换为 Word 文档(.docx)。支持标题、段落、表格、列表、代码块、引用等元素。参数:inputPath 为源 .md 路径;outputPath 为输出 .docx 路径。返回值:对象,包含 success、data(含 inputPath、outputPath、sectionCount)、error。',
|
|
1168
|
+
parameters: {
|
|
1169
|
+
type: 'object',
|
|
1170
|
+
properties: {
|
|
1171
|
+
inputPath: { type: 'string', description: '源 .md 文件路径。' },
|
|
1172
|
+
outputPath: { type: 'string', description: '输出 .docx 文件路径。' },
|
|
1173
|
+
},
|
|
1174
|
+
required: ['inputPath', 'outputPath'],
|
|
1175
|
+
},
|
|
1176
|
+
},
|
|
1177
|
+
},
|
|
1178
|
+
{
|
|
1179
|
+
type: 'function',
|
|
1180
|
+
function: {
|
|
1181
|
+
name: 'htmlToWord',
|
|
1182
|
+
description:
|
|
1183
|
+
'将 HTML 文件转换为 Word 文档(.docx),解析标题、段落、表格、列表等元素。参数:inputPath 为源 .html 路径;outputPath 为输出 .docx 路径。返回值:对象,包含 success、data(含 inputPath、outputPath、sectionCount)、error。',
|
|
1184
|
+
parameters: {
|
|
1185
|
+
type: 'object',
|
|
1186
|
+
properties: {
|
|
1187
|
+
inputPath: { type: 'string', description: '源 .html 文件路径。' },
|
|
1188
|
+
outputPath: { type: 'string', description: '输出 .docx 文件路径。' },
|
|
1189
|
+
},
|
|
1190
|
+
required: ['inputPath', 'outputPath'],
|
|
1191
|
+
},
|
|
1192
|
+
},
|
|
1193
|
+
},
|
|
1194
|
+
{
|
|
1195
|
+
type: 'function',
|
|
1196
|
+
function: {
|
|
1197
|
+
name: 'patchDocxText',
|
|
1198
|
+
description: `保留原格式修改 Word 文档内容(支持跨 run 文本替换)。与 replaceDocxText 的区别:本函数在替换时智能处理 Word XML 的 run 拆分问题,替换文本会沿用匹配位置的原有字符样式(字体、字号、颜色等),不重建文档结构,最大程度保留原有格式。
|
|
1199
|
+
参数:filePath 为目标 .docx 路径;replacements 为替换规则数组,每项格式 { search: '查找文本', replace: '替换文本' }。
|
|
1200
|
+
返回值:对象,包含 success、data(含 filePath、totalReplacements)、error。`,
|
|
1201
|
+
parameters: {
|
|
1202
|
+
type: 'object',
|
|
1203
|
+
properties: {
|
|
1204
|
+
filePath: { type: 'string', description: '目标 .docx 文件路径。' },
|
|
1205
|
+
replacements: {
|
|
1206
|
+
type: 'array',
|
|
1207
|
+
description: '替换规则数组,每项包含 search(查找文本)和 replace(替换文本)。',
|
|
1208
|
+
items: {
|
|
1209
|
+
type: 'object',
|
|
1210
|
+
properties: {
|
|
1211
|
+
search: { type: 'string', description: '要查找的文本。' },
|
|
1212
|
+
replace: { type: 'string', description: '替换后的文本。' },
|
|
1213
|
+
},
|
|
1214
|
+
required: ['search', 'replace'],
|
|
1215
|
+
},
|
|
1216
|
+
},
|
|
1217
|
+
},
|
|
1218
|
+
required: ['filePath', 'replacements'],
|
|
1219
|
+
},
|
|
1220
|
+
},
|
|
1221
|
+
},
|
|
679
1222
|
]
|
|
680
1223
|
|
|
681
1224
|
// ─── 导出 ──────────────────────────────────────────────────────────────────────
|
|
@@ -695,14 +1238,22 @@ const functions = {
|
|
|
695
1238
|
mergeDocx,
|
|
696
1239
|
extractDocxLinks,
|
|
697
1240
|
getDocxParagraphStats,
|
|
1241
|
+
wordToPdf,
|
|
1242
|
+
wordToHtml,
|
|
1243
|
+
wordToMarkdown,
|
|
1244
|
+
markdownToWord,
|
|
1245
|
+
htmlToWord,
|
|
1246
|
+
patchDocxText,
|
|
698
1247
|
}
|
|
699
1248
|
|
|
700
1249
|
// Tool definition exported by this module: bundles the tool's name and summary
// with the function schemas (`descriptions`) and their implementations (`functions`).
const DocxTool = {
  name: 'DocxTool',
  description: '提供 Word 文档(.docx)的创建、读取、搜索、替换、模板填充、格式转换、合并,以及 Word/Markdown/HTML/PDF 格式互转等全面处理能力',
  // Format conversion: wordToPdf, wordToHtml, wordToMarkdown, markdownToWord, htmlToWord
  platform: 'all',
  descriptions,
  functions,
  isSystem: true
}
|
|
707
1258
|
|
|
708
1259
|
module.exports = DocxTool
|