@incremark/core 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/detector/index.d.ts +1 -1
- package/dist/detector/index.js +9 -1
- package/dist/detector/index.js.map +1 -1
- package/dist/index-3rgnFbip.d.ts +396 -0
- package/dist/index.d.ts +53 -2
- package/dist/index.js +1564 -35
- package/dist/index.js.map +1 -1
- package/dist/utils/index.d.ts +6 -1
- package/dist/utils/index.js +7 -1
- package/dist/utils/index.js.map +1 -1
- package/package.json +7 -1
- package/src/__tests__/footnote.test.ts +214 -0
- package/src/detector/index.ts +30 -0
- package/src/extensions/html-extension/index.test.ts +409 -0
- package/src/extensions/html-extension/index.ts +792 -0
- package/src/extensions/micromark-gfm-footnote-incremental.ts +275 -0
- package/src/extensions/micromark-reference-extension.ts +724 -0
- package/src/index.ts +33 -0
- package/src/parser/IncremarkParser.footnote.test.ts +334 -0
- package/src/parser/IncremarkParser.ts +374 -14
- package/src/types/index.ts +29 -1
- package/src/utils/index.ts +9 -0
- package/dist/index-ChNeZ1wr.d.ts +0 -217
|
@@ -0,0 +1,792 @@
|
|
|
1
|
+
import type { Parent, RootContent, Root, PhrasingContent, HTML } from 'mdast'
|
|
2
|
+
import type { Extension as MdastExtension } from 'mdast-util-from-markdown'
|
|
3
|
+
|
|
4
|
+
// Register the custom `htmlElement` node with mdast's content maps so that it
// is accepted wherever root-level or phrasing content is expected.
declare module 'mdast' {
  interface RootContentMap {
    htmlElement: HtmlElementNode
  }

  interface PhrasingContentMap {
    htmlElement: HtmlElementNode
  }
}
|
|
12
|
+
|
|
13
|
+
// ============ 类型定义 ============
|
|
14
|
+
|
|
15
|
+
/**
 * Structured HTML element node produced by this extension.
 *
 * Extends the mdast `Parent` shape with the element's tag name and its
 * attribute map.
 */
export interface HtmlElementNode extends Parent {
  type: 'htmlElement'
  /** Tag name of the element. */
  tagName: string
  /** Attribute name to attribute value. */
  attrs: Record<string, string>
  /** Nested content of the element. */
  children: RootContent[]
  data?: {
    /** Raw HTML text the node was built from. */
    rawHtml?: string
    /** Marker set by the code in this file when it builds the node. */
    parsed?: boolean
    /** Type of the node this element replaced (this file writes `'html'`). */
    originalType?: string
  }
}
|
|
29
|
+
|
|
30
|
+
/**
 * A single HTML attribute as a name/value pair.
 */
export interface HtmlAttrInfo {
  /** Attribute name. */
  name: string
  /** Attribute value. */
  value: string
}
|
|
37
|
+
|
|
38
|
+
/**
 * Result of parsing one HTML tag (opening, closing or self-closing).
 */
export interface ParsedHtmlTag {
  /** Tag name. */
  tagName: string
  /** Attributes found on the tag; empty for closing tags. */
  attrs: Record<string, string>
  /** True for a closing tag such as `</span>`. */
  isClosing: boolean
  /** True for a tag written with a trailing `/` or for a void element. */
  isSelfClosing: boolean
  /** The tag text exactly as it was passed to the parser. */
  rawHtml: string
}
|
|
48
|
+
|
|
49
|
+
/**
 * Options for the HTML tree extension.
 */
export interface HtmlTreeExtensionOptions {
  /**
   * Tag blacklist (XSS protection): elements with these tag names are dropped.
   * When omitted, `DEFAULT_TAG_BLACKLIST` is used; a supplied list replaces
   * the default rather than extending it.
   */
  tagBlacklist?: string[]

  /**
   * Attribute blacklist (XSS protection): attributes with these names are
   * removed. When omitted, `DEFAULT_ATTR_BLACKLIST` is used. Attributes whose
   * name starts with `on` are removed regardless of this list.
   */
  attrBlacklist?: string[]

  /**
   * Protocol blacklist: a URL attribute is removed when its value starts with
   * one of these protocols. When omitted, `DEFAULT_PROTOCOL_BLACKLIST` is used.
   * Values starting with `data:image/` are allowed even though `data:` is
   * listed.
   */
  protocolBlacklist?: string[]

  /**
   * Whether to keep the raw HTML in the node's `data`.
   * Raw HTML is kept unless this is explicitly `false`.
   */
  preserveRawHtml?: boolean

  /**
   * Custom per-tag handlers, keyed by tag name.
   * NOTE(review): nothing in this file reads this option; confirm where (or
   * whether) handlers are applied.
   */
  tagHandlers?: Record<string, (node: HtmlElementNode) => HtmlElementNode | null>
}
|
|
83
|
+
|
|
84
|
+
// ============ 默认配置 ============
|
|
85
|
+
|
|
86
|
+
/**
 * Tags removed by default (XSS protection).
 */
export const DEFAULT_TAG_BLACKLIST = [
  // Script execution and styling
  'script', 'style',
  // Embedded content
  'iframe', 'object', 'embed',
  // Forms and form controls
  'form', 'input', 'button', 'textarea', 'select',
  // Document-level metadata
  'meta', 'link', 'base',
  // Legacy frames and applets
  'frame', 'frameset', 'applet',
  // Inert / fallback containers
  'noscript', 'template'
]
|
|
109
|
+
|
|
110
|
+
/**
 * Attributes removed by default (XSS protection).
 *
 * `on*` event-handler attributes are not listed here; they are rejected by a
 * prefix check in `isAttrBlacklisted`.
 */
export const DEFAULT_ATTR_BLACKLIST = ['formaction', 'xlink:href', 'xmlns', 'srcdoc']
|
|
121
|
+
|
|
122
|
+
/**
 * URL protocols rejected by default.
 *
 * `data:` is listed, but `isProtocolDangerous` makes an exception for values
 * that start with `data:image/`.
 */
export const DEFAULT_PROTOCOL_BLACKLIST = ['javascript:', 'vbscript:', 'data:']
|
|
130
|
+
|
|
131
|
+
/**
 * Attributes whose value is a URL and therefore gets a protocol check.
 */
const URL_ATTRS = [
  'href',
  'src',
  'action',
  'formaction',
  'poster',
  'background'
]
|
|
135
|
+
|
|
136
|
+
// ============ HTML 解析工具 ============
|
|
137
|
+
|
|
138
|
+
/**
 * Classification of a piece of raw HTML, as returned by
 * `detectHtmlContentType`.
 */
export type HtmlContentType =
  | 'opening'
  | 'closing'
  | 'self-closing'
  | 'fragment'
  | 'unknown'
|
|
142
|
+
|
|
143
|
+
/**
 * Void elements: tags treated as self-closing even without a trailing `/`.
 */
const VOID_ELEMENTS = [
  'br', 'hr', 'img', 'input', 'meta', 'link', 'area',
  'base', 'col', 'embed', 'source', 'track', 'wbr'
]
|
|
147
|
+
|
|
148
|
+
/**
 * Classify a piece of raw HTML.
 *
 * - `opening`: a single opening tag, e.g. `<span class="foo">`
 * - `closing`: a single closing tag, e.g. `</span>`
 * - `self-closing`: a tag ending in `/>` or a void element, e.g. `<br />`, `<img src="...">`
 * - `fragment`: text containing more than one `<`
 * - `unknown`: anything else, including empty input and text that does not
 *   start with `<`
 *
 * @param html Raw HTML text; surrounding whitespace is ignored.
 * @returns The detected content type.
 */
export function detectHtmlContentType(html: string): HtmlContentType {
  const source = html.trim()

  // Empty input, or input that does not begin with a tag.
  if (source === '' || !source.startsWith('<')) return 'unknown'

  // A lone closing tag: </tagName>
  if (/^<\/([a-zA-Z][a-zA-Z0-9-]*)\s*>$/.test(source)) return 'closing'

  // A lone opening or self-closing tag. The lazy attribute group can still
  // swallow whole child tags when the text ends with '>', so the attribute
  // text is re-checked below.
  const single = source.match(/^<([a-zA-Z][a-zA-Z0-9-]*)(\s[^]*?)?(\/?)>$/)
  if (single) {
    const [, name, attrText, slash] = single

    if (attrText) {
      // A '<' outside of quotes means the "attributes" actually contain
      // another tag, so this is a fragment rather than a single tag.
      let quote = ''
      for (const ch of attrText) {
        if (quote) {
          if (ch === quote) quote = ''
          continue
        }
        if (ch === '"' || ch === "'") {
          quote = ch
        } else if (ch === '<') {
          return 'fragment'
        }
      }
    }

    if (slash === '/' || VOID_ELEMENTS.includes(name.toLowerCase())) {
      return 'self-closing'
    }
    return 'opening'
  }

  // More than one '<' means several tags.
  const bracketCount = source.split('<').length - 1
  return bracketCount > 1 ? 'fragment' : 'unknown'
}
|
|
218
|
+
|
|
219
|
+
/**
 * Parse a single HTML tag (opening, closing or self-closing).
 *
 * Complete fragments are not handled here.
 *
 * @param html Raw text of one tag.
 * @returns The parsed tag, or `null` when `html` is not a single tag.
 */
export function parseHtmlTag(html: string): ParsedHtmlTag | null {
  const source = html.trim()

  switch (detectHtmlContentType(source)) {
    case 'closing': {
      const closing = source.match(/^<\/([a-zA-Z][a-zA-Z0-9-]*)\s*>$/)
      if (!closing) return null
      return {
        tagName: closing[1].toLowerCase(),
        attrs: {},
        isClosing: true,
        isSelfClosing: false,
        rawHtml: html
      }
    }
    case 'opening':
    case 'self-closing':
      break
    default:
      // Fragments and unrecognised input are not single tags.
      return null
  }

  const opening = source.match(/^<([a-zA-Z][a-zA-Z0-9-]*)(\s[^]*?)?(\/?)>$/)
  if (!opening) return null

  const [, rawName, attrText, slash] = opening
  const tagName = rawName.toLowerCase()

  // Attribute forms: name="value", name='value', name=value, name
  const attrs: Record<string, string> = {}
  if (attrText) {
    const attrPattern = /([a-zA-Z_:][-a-zA-Z0-9_:.]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g
    for (const found of attrText.matchAll(attrPattern)) {
      const rawValue = found[2] ?? found[3] ?? found[4] ?? ''
      attrs[found[1].toLowerCase()] = decodeHtmlEntities(rawValue)
    }
  }

  return {
    tagName,
    attrs,
    isClosing: false,
    isSelfClosing: slash === '/' || VOID_ELEMENTS.includes(tagName),
    rawHtml: html
  }
}
|
|
273
|
+
|
|
274
|
+
/**
 * Decode HTML character references in `text`.
 *
 * Handles decimal (`&#39;`) and hexadecimal (`&#x27;`) numeric references
 * plus a small set of named references: `amp`, `lt`, `gt`, `quot`, `apos`
 * and `nbsp`. Anything else is left untouched. Decoding is a single pass, so
 * `&amp;lt;` becomes `&lt;`, not `<`.
 *
 * @param text Text that may contain character references.
 * @returns The text with recognised references replaced.
 */
function decodeHtmlEntities(text: string): string {
  // A Map (not an object literal) so that names such as "constructor" cannot
  // resolve to inherited members.
  const namedEntities = new Map<string, string>([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    // U+00A0, the character `nbsp` stands for in HTML.
    ['nbsp', '\u00A0']
  ])

  return text.replace(
    /&(?:#(\d+)|#[xX]([a-fA-F0-9]+)|([a-zA-Z]+));/g,
    (match: string, dec?: string, hex?: string, name?: string): string => {
      if (dec !== undefined || hex !== undefined) {
        const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex as string, 16)
        // Outside the Unicode range: leave the reference as written instead of
        // letting String.fromCodePoint throw.
        if (codePoint > 0x10ffff) return match
        // fromCodePoint, unlike fromCharCode, handles code points above U+FFFF.
        return String.fromCodePoint(codePoint)
      }
      return (name !== undefined ? namedEntities.get(name) : undefined) ?? match
    }
  )
}
|
|
294
|
+
|
|
295
|
+
/**
 * Internal: parse one HTML tag directly, without first classifying it.
 *
 * Used by `parseHtmlFragment` for tags it has already isolated with a regex.
 *
 * @param tag Raw text of one tag.
 * @returns The parsed tag, or `null` when `tag` matches neither the closing
 *   nor the opening tag pattern.
 */
function parseTagDirect(tag: string): ParsedHtmlTag | null {
  const source = tag.trim()

  // Closing tag
  const closing = /^<\/([a-zA-Z][a-zA-Z0-9-]*)\s*>$/.exec(source)
  if (closing) {
    return {
      tagName: closing[1].toLowerCase(),
      attrs: {},
      isClosing: true,
      isSelfClosing: false,
      rawHtml: tag
    }
  }

  // Opening or self-closing tag; [\s\S] lets attributes span several lines.
  const opening = /^<([a-zA-Z][a-zA-Z0-9-]*)([\s\S]*?)(\/?)>$/.exec(source)
  if (opening === null) return null

  const tagName = opening[1].toLowerCase()
  const attrText = opening[2]

  // Attribute forms: name="value", name='value', name=value, name
  const attrs: Record<string, string> = {}
  if (attrText) {
    const attrPattern = /([a-zA-Z_:][-a-zA-Z0-9_:.]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g
    for (const found of attrText.matchAll(attrPattern)) {
      const rawValue = found[2] ?? found[3] ?? found[4] ?? ''
      attrs[found[1].toLowerCase()] = decodeHtmlEntities(rawValue)
    }
  }

  return {
    tagName,
    attrs,
    isClosing: false,
    isSelfClosing: opening[3] === '/' || VOID_ELEMENTS.includes(tagName),
    rawHtml: tag
  }
}
|
|
342
|
+
|
|
343
|
+
/**
 * Parse a complete HTML fragment into a tree of `HtmlElementNode`s.
 *
 * Blacklisted tags are skipped and attributes are sanitized. Text outside any
 * element is dropped (per the original note, the markdown parser is expected
 * to have handled it). Elements still open at the end of the input are closed
 * implicitly.
 *
 * A closing tag closes the nearest open element with the same name and every
 * element opened inside it. For `<div><span>a</div><p>b</p>` the `<p>` thus
 * becomes a sibling of the `<div>`, not a child of it.
 *
 * @param html Raw HTML fragment.
 * @param options Blacklists and raw-HTML preservation settings.
 * @returns The top-level element nodes found in the fragment.
 */
export function parseHtmlFragment(html: string, options: HtmlTreeExtensionOptions = {}): HtmlElementNode[] {
  const roots: HtmlElementNode[] = []
  const openElements: HtmlElementNode[] = []

  // Attach a finished node to the innermost open element, or to the result
  // when nothing is open.
  const attach = (node: HtmlElementNode): void => {
    const parent = openElements[openElements.length - 1]
    if (parent) {
      parent.children.push(node)
    } else {
      roots.push(node)
    }
  }

  // Pop the innermost open element and attach it to its parent.
  const closeInnermost = (): void => {
    const node = openElements.pop()
    if (node) attach(node)
  }

  // Tokenize into tags and runs of text.
  const tokenRegex = /(<\/?[a-zA-Z][^>]*>)|([^<]+)/g
  let match: RegExpExecArray | null

  while ((match = tokenRegex.exec(html)) !== null) {
    const [, tag, text] = match

    if (tag) {
      // parseTagDirect skips content-type detection, which could misclassify
      // an already isolated tag.
      const parsed = parseTagDirect(tag)
      if (!parsed) continue

      if (isTagBlacklisted(parsed.tagName, options)) continue

      if (parsed.isClosing) {
        // Find the nearest open element with the same name.
        let openIndex = -1
        for (let i = openElements.length - 1; i >= 0; i--) {
          if (openElements[i].tagName === parsed.tagName) {
            openIndex = i
            break
          }
        }

        // A closing tag without a matching opening tag is ignored.
        if (openIndex === -1) continue

        // Close the matched element and everything opened inside it. Popping
        // only the top of the stack would leave the matched element open.
        while (openElements.length > openIndex) {
          closeInnermost()
        }
        continue
      }

      const node: HtmlElementNode = {
        type: 'htmlElement',
        tagName: parsed.tagName,
        attrs: sanitizeAttrs(parsed.attrs, options),
        children: [],
        data: options.preserveRawHtml !== false ? {
          rawHtml: tag,
          parsed: true
        } : undefined
      }

      if (parsed.isSelfClosing) {
        // Self-closing tags never go on the stack.
        attach(node)
      } else {
        openElements.push(node)
      }
    } else if (text && text.trim()) {
      // Text is kept only inside an element; top-level text is dropped.
      const parent = openElements[openElements.length - 1]
      if (parent) {
        parent.children.push({ type: 'text', value: text } as RootContent)
      }
    }
  }

  // Implicitly close whatever is still open.
  while (openElements.length > 0) {
    closeInnermost()
  }

  return roots
}
|
|
438
|
+
|
|
439
|
+
// ============ XSS 防护 ============
|
|
440
|
+
|
|
441
|
+
/**
 * Tell whether a tag is blacklisted.
 *
 * @param tagName Tag name; compared in lower case.
 * @param options Supplies `tagBlacklist`; `DEFAULT_TAG_BLACKLIST` is used when
 *   it is absent.
 * @returns `true` when the lower-cased tag name is in the effective blacklist.
 */
function isTagBlacklisted(tagName: string, options: HtmlTreeExtensionOptions): boolean {
  return (options.tagBlacklist ?? DEFAULT_TAG_BLACKLIST).includes(tagName.toLowerCase())
}
|
|
448
|
+
|
|
449
|
+
/**
 * Tell whether an attribute is blacklisted.
 *
 * @param attrName Attribute name; compared in lower case.
 * @param options Supplies `attrBlacklist`; `DEFAULT_ATTR_BLACKLIST` is used
 *   when it is absent.
 * @returns `true` for any name starting with `on` (event handlers) and for
 *   names in the effective blacklist.
 */
function isAttrBlacklisted(attrName: string, options: HtmlTreeExtensionOptions): boolean {
  const lowered = attrName.toLowerCase()

  // on* event-handler attributes are always rejected.
  return lowered.startsWith('on') ||
    (options.attrBlacklist ?? DEFAULT_ATTR_BLACKLIST).includes(lowered)
}
|
|
461
|
+
|
|
462
|
+
/**
 * Tell whether a URL uses a blacklisted protocol.
 *
 * Before comparison the URL is normalised the way URL parsers treat input:
 * leading and trailing whitespace and C0 control characters are stripped, and
 * ASCII tab, line feed and carriage return are removed everywhere. Without
 * this, `java\tscript:alert(1)` would pass the prefix check. The comparison
 * is case-insensitive on both the URL and the blacklist entries.
 *
 * @param url Attribute value to check.
 * @param options Supplies `protocolBlacklist`; `DEFAULT_PROTOCOL_BLACKLIST` is
 *   used when it is absent.
 * @returns `true` when the normalised URL starts with a blacklisted protocol,
 *   except that `data:image/` URLs are allowed when the matching entry is
 *   `data:`.
 */
function isProtocolDangerous(url: string, options: HtmlTreeExtensionOptions): boolean {
  const protocolBlacklist = options.protocolBlacklist ?? DEFAULT_PROTOCOL_BLACKLIST
  const normalizedUrl = url
    .trim()
    // Leading/trailing C0 control characters and spaces.
    .replace(/^[\u0000-\u0020]+|[\u0000-\u0020]+$/g, '')
    // Tab, LF and CR anywhere in the value.
    .replace(/[\t\n\r]/g, '')
    .toLowerCase()

  for (const entry of protocolBlacklist) {
    const protocol = entry.toLowerCase()
    if (!normalizedUrl.startsWith(protocol)) continue

    // Special case: data:image/ URLs are allowed.
    // NOTE(review): this also admits data:image/svg+xml; confirm that is intended.
    if (protocol === 'data:' && normalizedUrl.startsWith('data:image/')) {
      return false
    }
    return true
  }

  return false
}
|
|
481
|
+
|
|
482
|
+
/**
 * Return a copy of `attrs` without dangerous attributes.
 *
 * An attribute is dropped when its name is blacklisted, or when it is a URL
 * attribute (see `URL_ATTRS`) whose value uses a dangerous protocol.
 *
 * @param attrs Attribute map to filter; it is not modified.
 * @param options Blacklist settings passed on to the individual checks.
 * @returns A new attribute map containing only the accepted attributes.
 */
function sanitizeAttrs(
  attrs: Record<string, string>,
  options: HtmlTreeExtensionOptions
): Record<string, string> {
  const kept: Record<string, string> = {}

  Object.keys(attrs).forEach((name) => {
    const value = attrs[name]
    const rejected =
      isAttrBlacklisted(name, options) ||
      (URL_ATTRS.includes(name.toLowerCase()) && isProtocolDangerous(value, options))

    if (!rejected) {
      kept[name] = value
    }
  })

  return kept
}
|
|
505
|
+
|
|
506
|
+
// ============ AST 转换器 ============
|
|
507
|
+
|
|
508
|
+
/**
 * Type guard: is `node` an mdast `html` node?
 */
function isHtmlNode(node: RootContent): node is HTML {
  const { type } = node
  return type === 'html'
}
|
|
514
|
+
|
|
515
|
+
/**
 * Type guard: does `node` carry a `children` array?
 */
function hasChildren(node: RootContent | Root): node is Parent & RootContent {
  // A missing `children` property reads as undefined, which is not an array.
  const maybeChildren: unknown = (node as Partial<Parent>).children
  return Array.isArray(maybeChildren)
}
|
|
521
|
+
|
|
522
|
+
/**
 * Rewrite the `html` nodes in a node list into structured `htmlElement`
 * nodes, merging an opening tag, the nodes that follow it and its closing
 * tag into one element.
 *
 * Handling per `html` node, based on `detectHtmlContentType`:
 * - fragment: parsed with `parseHtmlFragment`; its nodes replace the original.
 * - self-closing: becomes a childless element, unless the tag is blacklisted.
 * - closing: dropped (a stray closing tag).
 * - opening: following nodes are collected as children up to the matching
 *   closing tag (nested same-name tags are counted). If no closing tag is
 *   found, all remaining nodes become children.
 * - unknown: kept as is.
 *
 * Raw `html` nodes are kept as a fallback (empty fragment result, unknown
 * type) only when they mention no blacklisted tag. Otherwise a fragment such
 * as `<script>...</script>`, which parses to nothing because everything in it
 * was filtered, would be put back unfiltered.
 *
 * Nodes that are not `html` are copied with their children processed
 * recursively.
 *
 * @param nodes Sibling nodes to process; the array is not modified.
 * @param options Blacklists and raw-HTML preservation settings.
 * @returns A new node list.
 */
function processHtmlNodesInArray(
  nodes: RootContent[],
  options: HtmlTreeExtensionOptions
): RootContent[] {
  const result: RootContent[] = []

  // Build a structured element from a parsed tag with sanitized attributes.
  const toElement = (
    parsed: ParsedHtmlTag,
    rawHtml: string,
    children: RootContent[]
  ): HtmlElementNode => ({
    type: 'htmlElement',
    tagName: parsed.tagName,
    attrs: sanitizeAttrs(parsed.attrs, options),
    children,
    data: options.preserveRawHtml !== false ? {
      rawHtml,
      parsed: true,
      originalType: 'html'
    } : undefined
  })

  // True when the raw HTML contains an opening or closing blacklisted tag.
  const mentionsBlacklistedTag = (html: string): boolean => {
    for (const found of html.matchAll(/<\/?([a-zA-Z][a-zA-Z0-9-]*)/g)) {
      if (isTagBlacklisted(found[1], options)) return true
    }
    return false
  }

  // Keep a raw html node as a fallback, but never one that mentions a
  // blacklisted tag.
  // NOTE(review): raw nodes kept here bypass attribute sanitization.
  const keepRawIfSafe = (node: HTML): void => {
    if (!mentionsBlacklistedTag(node.value)) {
      result.push(node)
    }
  }

  let i = 0
  while (i < nodes.length) {
    const node = nodes[i]

    if (!isHtmlNode(node)) {
      // Not HTML: process the children recursively, if any.
      if (hasChildren(node)) {
        const processed = processHtmlNodesInArray(
          (node as Parent).children as RootContent[],
          options
        )
        result.push({
          ...node,
          children: processed
        } as RootContent)
      } else {
        result.push(node)
      }
      i++
      continue
    }

    const contentType = detectHtmlContentType(node.value)

    if (contentType === 'fragment') {
      // A complete fragment: parse it into an element tree.
      const fragmentNodes = parseHtmlFragment(node.value, options)
      if (fragmentNodes.length > 0) {
        result.push(...fragmentNodes)
      } else {
        // Nothing came out of the parser.
        keepRawIfSafe(node)
      }
      i++
    } else if (contentType === 'self-closing') {
      const parsed = parseHtmlTag(node.value)
      if (parsed && !isTagBlacklisted(parsed.tagName, options)) {
        result.push(toElement(parsed, node.value, []))
      }
      i++
    } else if (contentType === 'closing') {
      // Stray closing tag; matched ones are consumed by the opening-tag branch.
      i++
    } else if (contentType === 'opening') {
      const parsed = parseHtmlTag(node.value)
      if (!parsed || isTagBlacklisted(parsed.tagName, options)) {
        i++
        continue
      }

      // Collect following siblings up to the matching closing tag.
      const tagName = parsed.tagName
      const contentNodes: RootContent[] = []
      let depth = 1
      let j = i + 1
      let foundClosing = false

      while (j < nodes.length && depth > 0) {
        const nextNode = nodes[j]

        if (isHtmlNode(nextNode)) {
          const nextType = detectHtmlContentType(nextNode.value)

          if (nextType === 'closing') {
            const nextParsed = parseHtmlTag(nextNode.value)
            if (nextParsed && nextParsed.tagName === tagName) {
              depth--
              if (depth === 0) {
                foundClosing = true
                break
              }
            }
          } else if (nextType === 'opening') {
            const nextParsed = parseHtmlTag(nextNode.value)
            if (nextParsed && nextParsed.tagName === tagName) {
              depth++
            }
          }
          // Fragments and self-closing tags do not change the depth.
        }

        contentNodes.push(nextNode)
        j++
      }

      result.push(
        toElement(parsed, node.value, processHtmlNodesInArray(contentNodes, options))
      )
      // Skip past the closing tag when one was found.
      i = foundClosing ? j + 1 : j
    } else {
      // Unknown content (comments, doctype, text after a tag, ...).
      keepRawIfSafe(node)
      i++
    }
  }

  return result
}
|
|
652
|
+
|
|
653
|
+
/**
 * Transform a whole AST, processing every HTML node in it.
 *
 * @param ast Root of the tree; it is not modified.
 * @param options Extension options passed on to the node processing.
 * @returns A copy of the root whose children have been processed.
 */
export function transformHtmlNodes(ast: Root, options: HtmlTreeExtensionOptions = {}): Root {
  const children = processHtmlNodesInArray(ast.children, options) as Root['children']
  return { ...ast, children }
}
|
|
662
|
+
|
|
663
|
+
/**
 * Create an HTML tree transformer with the given options bound.
 *
 * The returned function takes a tree and returns the transformed tree, a
 * shape the original comment describes as unified-compatible.
 *
 * @param options Extension options applied on every call.
 * @returns A function that applies `transformHtmlNodes` to a tree.
 */
export function createHtmlTreeTransformer(options: HtmlTreeExtensionOptions = {}) {
  const transformer = (tree: Root): Root => transformHtmlNodes(tree, options)
  return transformer
}
|
|
672
|
+
|
|
673
|
+
// ============ mdast 扩展(用于 fromMarkdown) ============
|
|
674
|
+
|
|
675
|
+
/**
 * mdast-util-from-markdown extension.
 *
 * It registers no enter/exit handlers. Per the original note it exists mainly
 * for the type declarations, and the actual conversion happens in a
 * post-processing step.
 */
export const htmlTreeExtension: MdastExtension = { enter: {}, exit: {} }
|
|
683
|
+
|
|
684
|
+
// ============ 便捷工具函数 ============
|
|
685
|
+
|
|
686
|
+
/**
 * Type guard: is `node` an `HtmlElementNode`?
 */
export function isHtmlElementNode(node: RootContent): node is HtmlElementNode {
  const { type } = node
  return type === 'htmlElement'
}
|
|
692
|
+
|
|
693
|
+
/**
 * Visit every HTML element node in a tree, depth first, parents before
 * children.
 *
 * @param node Node (or root) to start from.
 * @param callback Called for each `htmlElement` node with the node and its parent.
 * @param parent Parent of `node`; `null` for the starting node by default.
 */
export function walkHtmlElements(
  node: RootContent | Root,
  callback: (node: HtmlElementNode, parent: Parent | Root | null) => void,
  parent: Parent | Root | null = null
): void {
  const current = node as RootContent

  if (isHtmlElementNode(current)) {
    callback(current, parent)
  }

  // Leaves have nothing to descend into.
  if (!hasChildren(current) && node.type !== 'root') return

  const container = node as Parent | Root
  const children = container.children
  for (let index = 0; index < children.length; index++) {
    walkHtmlElements(children[index], callback, container)
  }
}
|
|
712
|
+
|
|
713
|
+
/**
 * Collect every HTML element node with the given tag name.
 *
 * @param root Tree to search.
 * @param tagName Tag name to look for; lower-cased before comparison.
 * @returns Matching nodes in traversal order.
 */
export function findHtmlElementsByTag(
  root: Root,
  tagName: string
): HtmlElementNode[] {
  const wanted = tagName.toLowerCase()
  const matches: HtmlElementNode[] = []

  walkHtmlElements(root, (element) => {
    if (element.tagName === wanted) matches.push(element)
  })

  return matches
}
|
|
730
|
+
|
|
731
|
+
/**
 * Serialize an `HtmlElementNode` back to an HTML string.
 *
 * Attribute values are escaped; an attribute with an empty value is written
 * as a bare name. Text children are emitted verbatim, element children are
 * serialized recursively, and all other child types are skipped.
 *
 * @param node Element to serialize.
 * @returns The HTML text of the element.
 */
export function htmlElementToString(node: HtmlElementNode): string {
  const { tagName, attrs, children } = node

  // Attribute text
  const attrParts: string[] = []
  for (const [name, value] of Object.entries(attrs)) {
    attrParts.push(value === '' ? name : `${name}="${escapeHtml(value)}"`)
  }
  const attrsStr = attrParts.join(' ')
  const tagHead = attrsStr ? `${tagName} ${attrsStr}` : tagName

  // A void element without children is written in self-closing form.
  if (children.length === 0 && isSelfClosingTag(tagName)) {
    return `<${tagHead} />`
  }

  let inner = ''
  for (const child of children) {
    if (child.type === 'text') {
      inner += (child as { value: string }).value
    } else if (isHtmlElementNode(child)) {
      inner += htmlElementToString(child)
    }
    // Other node types produce no output (may need extending in practice).
  }

  return `<${tagHead}>${inner}</${tagName}>`
}
|
|
766
|
+
|
|
767
|
+
/**
 * Tell whether a tag name is a void (self-closing) element.
 *
 * @param tagName Tag name; compared in lower case.
 */
function isSelfClosingTag(tagName: string): boolean {
  const voidTags = new Set([
    'br', 'hr', 'img', 'input', 'meta', 'link', 'area',
    'base', 'col', 'embed', 'source', 'track', 'wbr'
  ])
  return voidTags.has(tagName.toLowerCase())
}
|
|
773
|
+
|
|
774
|
+
/**
 * Escape text for use inside a quoted HTML attribute value.
 *
 * Replaces `&`, `<`, `>`, `"` and `'` with character references. `&` is
 * replaced first so the references produced by the later steps are not
 * escaped a second time.
 *
 * @param text Text to escape.
 * @returns The escaped text.
 */
function escapeHtml(text: string): string {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
}
|
|
785
|
+
|
|
786
|
+
// ============ 导出 ============
|
|
787
|
+
|
|
788
|
+
// Re-export the default blacklists under their HTML_* aliases.
export { DEFAULT_TAG_BLACKLIST as HTML_TAG_BLACKLIST }
export { DEFAULT_ATTR_BLACKLIST as HTML_ATTR_BLACKLIST }
export { DEFAULT_PROTOCOL_BLACKLIST as HTML_PROTOCOL_BLACKLIST }
|