@incremark/core 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,792 @@
1
+ import type { Parent, RootContent, Root, PhrasingContent, HTML } from 'mdast'
2
+ import type { Extension as MdastExtension } from 'mdast-util-from-markdown'
3
+
4
/**
 * Module augmentation: adds the custom `htmlElement` node to mdast's
 * `RootContentMap` and `PhrasingContentMap` registries so the node type is
 * part of the `RootContent` and `PhrasingContent` unions.
 */
declare module 'mdast' {
  interface RootContentMap {
    htmlElement: HtmlElementNode
  }

  interface PhrasingContentMap {
    htmlElement: HtmlElementNode
  }
}
12
+
13
+ // ============ 类型定义 ============
14
+
15
/**
 * Custom mdast node describing a single HTML element.
 */
export interface HtmlElementNode extends Parent {
  /** Node discriminator. */
  type: 'htmlElement'
  /** Element name; the parsers in this file store it lowercased. */
  tagName: string
  /** Attribute name to attribute value. */
  attrs: Record<string, string>
  /** Nodes nested inside the element. */
  children: RootContent[]
  /** Optional bookkeeping filled in by the parsers in this file. */
  data?: {
    /** Source text of the tag the node was built from. */
    rawHtml?: string
    /** Set to `true` on nodes produced by the parsers in this file. */
    parsed?: boolean
    /** Type of the node this element replaced (`'html'` in this file). */
    originalType?: string
  }
}
29
+
30
/**
 * A single HTML attribute as a name/value pair.
 */
export interface HtmlAttrInfo {
  /** Attribute name. */
  name: string
  /** Attribute value. */
  value: string
}
37
+
38
/**
 * Result of parsing one HTML tag (opening, closing or self-closing).
 */
export interface ParsedHtmlTag {
  /** Tag name, lowercased by the parsers in this file. */
  tagName: string
  /** Parsed attributes; empty for closing tags. */
  attrs: Record<string, string>
  /** `true` for a closing tag such as `</span>`. */
  isClosing: boolean
  /** `true` for `<x />` syntax or a void element such as `<br>`. */
  isSelfClosing: boolean
  /** The tag text exactly as it was passed to the parser. */
  rawHtml: string
}
48
+
49
/**
 * Options controlling how raw HTML is filtered and converted into
 * `htmlElement` nodes.
 */
export interface HtmlTreeExtensionOptions {
  /**
   * Tag names that are dropped (XSS protection).
   * When omitted, `DEFAULT_TAG_BLACKLIST` is used.
   */
  tagBlacklist?: string[]

  /**
   * Attribute names that are stripped (XSS protection).
   * When omitted, `DEFAULT_ATTR_BLACKLIST` is used. Attributes whose name
   * starts with `on` are stripped regardless of this list.
   */
  attrBlacklist?: string[]

  /**
   * URL prefixes that are rejected in URL-valued attributes.
   * When omitted, `DEFAULT_PROTOCOL_BLACKLIST` is used; URLs starting with
   * `data:image/` are exempt from the `data:` entry.
   */
  protocolBlacklist?: string[]

  /**
   * Whether the source HTML is kept on each node's `data`.
   * Treated as enabled unless explicitly set to `false`.
   */
  preserveRawHtml?: boolean

  /**
   * Custom per-tag handlers, keyed by tag name.
   * NOTE(review): no code visible in this file invokes these handlers -
   * confirm they are consumed elsewhere.
   */
  tagHandlers?: Record<string, (node: HtmlElementNode) => HtmlElementNode | null>
}
83
+
84
+ // ============ 默认配置 ============
85
+
86
/**
 * Default list of dangerous tag names that are dropped (XSS protection).
 */
export const DEFAULT_TAG_BLACKLIST = [
  // Script and style carriers
  'script', 'style',
  // Embedded or external content
  'iframe', 'object', 'embed',
  // Form controls
  'form', 'input', 'button', 'textarea', 'select',
  // Document-level metadata
  'meta', 'link', 'base',
  // Legacy framing and plugin elements
  'frame', 'frameset', 'applet',
  // Inert or fallback containers
  'noscript', 'template'
]
109
+
110
/**
 * Default list of dangerous attribute names that are stripped (XSS protection).
 * Event-handler attributes (`on*`) are not listed here; they are matched by
 * prefix in `isAttrBlacklisted`.
 */
export const DEFAULT_ATTR_BLACKLIST = ['formaction', 'xlink:href', 'xmlns', 'srcdoc']
121
+
122
/**
 * Default list of URL prefixes rejected in URL-valued attributes.
 * The `data:` entry has one exemption, applied in `isProtocolDangerous`:
 * URLs starting with `data:image/` are allowed.
 */
export const DEFAULT_PROTOCOL_BLACKLIST = ['javascript:', 'vbscript:', 'data:']
130
+
131
/**
 * Attributes whose value is treated as a URL and checked against the
 * protocol blacklist.
 */
const URL_ATTRS = [
  'href',
  'src',
  'action',
  'formaction',
  'poster',
  'background'
]
135
+
136
+ // ============ HTML 解析工具 ============
137
+
138
/**
 * Classification of the text carried by an mdast `html` node.
 */
export type HtmlContentType =
  | 'opening'
  | 'closing'
  | 'self-closing'
  | 'fragment'
  | 'unknown'
142
+
143
/**
 * Tag names that are treated as self-closing even without a trailing `/`.
 */
const VOID_ELEMENTS = [
  'br', 'hr', 'img', 'input', 'meta', 'link', 'area',
  'base', 'col', 'embed', 'source', 'track', 'wbr'
]
147
+
148
/**
 * Matches one opening tag anchored at the start of the input. Quoted
 * attribute values may contain `>`; the first unquoted `>` ends the tag.
 */
const LEADING_OPEN_TAG_PATTERN =
  /^<[a-zA-Z][a-zA-Z0-9-]*(?:\s(?:[^"'>]|"[^"]*"|'[^']*')*)?\/?>/

/**
 * Reports whether `text` contains `<` or `>` outside of single- or
 * double-quoted runs.
 */
function hasUnquotedAngleBracket(text: string): boolean {
  let openQuote = ''
  for (const char of text) {
    if (openQuote) {
      if (char === openQuote) openQuote = ''
    } else if (char === '"' || char === "'") {
      openQuote = char
    } else if (char === '<' || char === '>') {
      return true
    }
  }
  return false
}

/**
 * Classifies a piece of HTML text.
 *
 * - `opening`: one opening tag, e.g. `<span class="foo">`
 * - `closing`: one closing tag, e.g. `</span>`
 * - `self-closing`: `<br />`, or a void element such as `<img src="...">`
 * - `fragment`: anything containing more than a single tag, including one tag
 *   followed by trailing content
 * - `unknown`: empty input, input not starting with `<`, or anything else
 *
 * @param html Text to classify; surrounding whitespace is ignored.
 * @returns The detected content type.
 */
export function detectHtmlContentType(html: string): HtmlContentType {
  const trimmed = html.trim()

  // Covers empty input as well: '' does not start with '<'.
  if (!trimmed.startsWith('<')) return 'unknown'

  // A lone closing tag: </tagName>
  if (/^<\/[a-zA-Z][a-zA-Z0-9-]*\s*>$/.test(trimmed)) return 'closing'

  // Candidate single tag: starts with a tag name and ends with '>'.
  const singleTagMatch = trimmed.match(/^<([a-zA-Z][a-zA-Z0-9-]*)(\s[^]*?)?(\/?)>$/)
  if (singleTagMatch) {
    const [, tagName, attrsString, selfClosingSlash] = singleTagMatch

    // The lazy attribute group can swallow further markup. An unquoted '<'
    // means another tag follows; an unquoted '>' means the first tag already
    // ended and the rest is trailing content. Neither is a single tag, and
    // treating them as one would parse the trailing text as attributes.
    if (attrsString && hasUnquotedAngleBracket(attrsString)) return 'fragment'

    const isSelfClosing = selfClosingSlash === '/' || VOID_ELEMENTS.includes(tagName.toLowerCase())
    return isSelfClosing ? 'self-closing' : 'opening'
  }

  // A complete opening tag followed by more content (e.g. `<div>text`) is a
  // fragment even when it contains only one '<'.
  if (LEADING_OPEN_TAG_PATTERN.test(trimmed)) return 'fragment'

  // Otherwise, more than one '<' is taken to mean several tags.
  let bracketCount = 0
  for (const char of trimmed) {
    if (char === '<') bracketCount++
  }
  return bracketCount > 1 ? 'fragment' : 'unknown'
}
218
+
219
/**
 * Parses exactly one HTML tag (opening, closing or self-closing).
 * Input classified as anything else, such as a multi-tag fragment, yields `null`.
 *
 * @param html Tag text; surrounding whitespace is ignored when matching.
 * @returns The parsed tag, with `rawHtml` set to the untrimmed input, or `null`.
 */
export function parseHtmlTag(html: string): ParsedHtmlTag | null {
  const source = html.trim()

  switch (detectHtmlContentType(source)) {
    case 'closing': {
      const closing = /^<\/([a-zA-Z][a-zA-Z0-9-]*)\s*>$/.exec(source)
      if (closing === null) return null
      return {
        tagName: closing[1].toLowerCase(),
        attrs: {},
        isClosing: true,
        isSelfClosing: false,
        rawHtml: html
      }
    }
    case 'opening':
    case 'self-closing':
      break
    default:
      // Fragments and unknown content are not single tags.
      return null
  }

  const opening = /^<([a-zA-Z][a-zA-Z0-9-]*)(\s[^]*?)?(\/?)>$/.exec(source)
  if (opening === null) return null

  const rawName = opening[1]
  const rawAttrs = opening[2]
  const trailingSlash = opening[3]

  const attrs: Record<string, string> = {}
  if (rawAttrs) {
    // Accepts name="value", name='value', name=value and bare name.
    const attrPattern = /([a-zA-Z_:][-a-zA-Z0-9_:.]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g
    for (const found of rawAttrs.matchAll(attrPattern)) {
      const rawValue = found[2] ?? found[3] ?? found[4] ?? ''
      attrs[found[1].toLowerCase()] = decodeHtmlEntities(rawValue)
    }
  }

  return {
    tagName: rawName.toLowerCase(),
    attrs,
    isClosing: false,
    isSelfClosing: trailingSlash === '/' || VOID_ELEMENTS.includes(rawName.toLowerCase()),
    rawHtml: html
  }
}
273
+
274
/**
 * Decodes HTML character references in `text`.
 *
 * Handles decimal (`&#38;`) and hexadecimal (`&#x26;` / `&#X26;`) numeric
 * references plus a small set of named ones (`amp`, `lt`, `gt`, `quot`,
 * `apos`, `nbsp`). Unrecognised named references are left untouched.
 *
 * @param text Text that may contain character references.
 * @returns The text with recognised references replaced.
 */
function decodeHtmlEntities(text: string): string {
  const named = new Map<string, string>([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    ['nbsp', ' ']
  ])

  // `String.fromCharCode` keeps only the low 16 bits, which corrupts code
  // points above U+FFFF (emoji and the like). `String.fromCodePoint` handles
  // the whole Unicode range; values beyond it would throw, so they become
  // U+FFFD instead.
  const fromCodePoint = (codePoint: number): string =>
    codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : '\uFFFD'

  return text.replace(
    /&(?:#(\d+)|#[xX]([a-fA-F0-9]+)|([a-zA-Z]+));/g,
    (match: string, dec?: string, hex?: string, name?: string) => {
      if (dec !== undefined) return fromCodePoint(parseInt(dec, 10))
      if (hex !== undefined) return fromCodePoint(parseInt(hex, 16))
      return (name !== undefined ? named.get(name) : undefined) ?? match
    }
  )
}
294
+
295
/**
 * Internal helper: parses one tag without running content-type detection
 * first. Used by `parseHtmlFragment` on tags its tokenizer has already isolated.
 *
 * @param tag Text of a single tag.
 * @returns The parsed tag, with `rawHtml` set to the untrimmed input, or
 *   `null` when the text is neither a closing nor an opening tag.
 */
function parseTagDirect(tag: string): ParsedHtmlTag | null {
  const source = tag.trim()

  // Closing tag: </name>
  const closing = /^<\/([a-zA-Z][a-zA-Z0-9-]*)\s*>$/.exec(source)
  if (closing !== null) {
    return {
      tagName: closing[1].toLowerCase(),
      attrs: {},
      isClosing: true,
      isSelfClosing: false,
      rawHtml: tag
    }
  }

  // Opening or self-closing tag; the attribute part may span several lines.
  const opening = /^<([a-zA-Z][a-zA-Z0-9-]*)([\s\S]*?)(\/?)>$/.exec(source)
  if (opening === null) return null

  const rawName = opening[1]
  const rawAttrs = opening[2]
  const trailingSlash = opening[3]

  const attrs: Record<string, string> = {}
  if (rawAttrs) {
    // Accepts name="value", name='value', name=value and bare name.
    const attrPattern = /([a-zA-Z_:][-a-zA-Z0-9_:.]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g
    for (const found of rawAttrs.matchAll(attrPattern)) {
      const rawValue = found[2] ?? found[3] ?? found[4] ?? ''
      attrs[found[1].toLowerCase()] = decodeHtmlEntities(rawValue)
    }
  }

  return {
    tagName: rawName.toLowerCase(),
    attrs,
    isClosing: false,
    isSelfClosing: trailingSlash === '/' || VOID_ELEMENTS.includes(rawName.toLowerCase()),
    rawHtml: tag
  }
}
342
+
343
/**
 * Parses an HTML fragment into a tree of `htmlElement` nodes.
 *
 * Blacklisted tags are skipped and attributes are sanitised. Only the
 * blacklisted tag itself is dropped: text and elements between a blacklisted
 * open/close pair are still attached to the surrounding element.
 * Whitespace-only text and text outside any element are discarded. Elements
 * that are never closed are closed implicitly.
 *
 * @param html Fragment source.
 * @param options Sanitising options; `preserveRawHtml` controls whether each
 *   node records its source tag in `data`.
 * @returns The top-level elements in document order.
 */
export function parseHtmlFragment(html: string, options: HtmlTreeExtensionOptions = {}): HtmlElementNode[] {
  const result: HtmlElementNode[] = []
  // Elements that have been opened but not yet closed, innermost last.
  const stack: HtmlElementNode[] = []

  // Attaches a finished element to the innermost open element, or to the
  // top-level result when nothing is open.
  const attach = (node: HtmlElementNode): void => {
    const parent = stack[stack.length - 1]
    if (parent !== undefined) {
      parent.children.push(node)
    } else {
      result.push(node)
    }
  }

  // Closes the innermost open element.
  const closeInnermost = (): void => {
    const node = stack.pop()
    if (node !== undefined) attach(node)
  }

  // Tokenize into tags and runs of text.
  const tokenRegex = /(<\/?[a-zA-Z][^>]*>)|([^<]+)/g
  let match: RegExpExecArray | null

  while ((match = tokenRegex.exec(html)) !== null) {
    const [, tag, text] = match

    if (tag) {
      // parseTagDirect skips content-type detection, which could misclassify
      // a tag that the tokenizer has already isolated.
      const parsed = parseTagDirect(tag)
      if (!parsed) continue

      if (isTagBlacklisted(parsed.tagName, options)) continue

      if (parsed.isClosing) {
        // Find the nearest open element with the same name.
        let openIndex = stack.length - 1
        while (openIndex >= 0 && stack[openIndex].tagName !== parsed.tagName) {
          openIndex--
        }
        // Stray closing tag: ignore it.
        if (openIndex < 0) continue

        // Close the matched element together with every unclosed element
        // nested inside it. Popping only the top of the stack would leave the
        // matched element open and nest later siblings inside it.
        while (stack.length > openIndex) closeInnermost()
      } else {
        const node: HtmlElementNode = {
          type: 'htmlElement',
          tagName: parsed.tagName,
          attrs: sanitizeAttrs(parsed.attrs, options),
          children: [],
          data: options.preserveRawHtml !== false ? {
            rawHtml: tag,
            parsed: true
          } : undefined
        }

        if (parsed.isSelfClosing) {
          // Self-closing elements never hold children; attach immediately.
          attach(node)
        } else {
          stack.push(node)
        }
      }
    } else if (text && text.trim()) {
      const textNode: RootContent = {
        type: 'text',
        value: text
      } as RootContent

      // Top-level text is dropped; it is expected to have been handled by the
      // markdown parser already.
      if (stack.length > 0) {
        stack[stack.length - 1].children.push(textNode)
      }
    }
  }

  // Implicitly close whatever is still open.
  while (stack.length > 0) closeInnermost()

  return result
}
438
+
439
+ // ============ XSS 防护 ============
440
+
441
/**
 * Reports whether a tag name is on the tag blacklist.
 *
 * Uses `options.tagBlacklist` when provided, otherwise
 * `DEFAULT_TAG_BLACKLIST`. The comparison is case-insensitive on both sides,
 * so a caller-supplied entry such as `'SCRIPT'` still matches.
 *
 * @param tagName Tag name in any letter case.
 * @param options Extension options.
 * @returns `true` when the tag must be dropped.
 */
function isTagBlacklisted(tagName: string, options: HtmlTreeExtensionOptions): boolean {
  const target = tagName.toLowerCase()
  const blacklist = options.tagBlacklist ?? DEFAULT_TAG_BLACKLIST
  return blacklist.some((entry) => entry.toLowerCase() === target)
}
448
+
449
/**
 * Reports whether an attribute must be stripped.
 *
 * Any attribute whose name starts with `on` (event handlers) is rejected
 * unconditionally. Other names are checked against `options.attrBlacklist`,
 * or `DEFAULT_ATTR_BLACKLIST` when that is not provided. The comparison is
 * case-insensitive on both sides.
 *
 * @param attrName Attribute name in any letter case.
 * @param options Extension options.
 * @returns `true` when the attribute must be removed.
 */
function isAttrBlacklisted(attrName: string, options: HtmlTreeExtensionOptions): boolean {
  const name = attrName.toLowerCase()

  // Event-handler attributes are always unsafe.
  if (name.startsWith('on')) return true

  const blacklist = options.attrBlacklist ?? DEFAULT_ATTR_BLACKLIST
  return blacklist.some((entry) => entry.toLowerCase() === name)
}
461
+
462
/**
 * Reports whether a URL starts with a blacklisted protocol.
 *
 * Uses `options.protocolBlacklist` when provided, otherwise
 * `DEFAULT_PROTOCOL_BLACKLIST`. Before comparing, the URL is normalised the
 * way browsers treat it: ASCII tab and newline characters are removed from
 * anywhere in the string and leading control characters and spaces are
 * stripped, so `java\tscript:` cannot slip past the prefix check. Matching
 * is case-insensitive.
 *
 * URLs starting with `data:image/` are exempt from a `data:` entry, but are
 * still checked against the remaining entries.
 * NOTE(review): the exemption also admits `data:image/svg+xml`; confirm that
 * is acceptable for every attribute listed in `URL_ATTRS`.
 *
 * @param url Attribute value to inspect.
 * @param options Extension options.
 * @returns `true` when the URL must be rejected.
 */
function isProtocolDangerous(url: string, options: HtmlTreeExtensionOptions): boolean {
  const protocolBlacklist = options.protocolBlacklist ?? DEFAULT_PROTOCOL_BLACKLIST
  const normalizedUrl = url
    .replace(/[\t\n\r]/g, '')
    .replace(/^[\u0000-\u0020]+/, '')
    .trim()
    .toLowerCase()

  for (const entry of protocolBlacklist) {
    const protocol = entry.toLowerCase()
    if (!normalizedUrl.startsWith(protocol)) continue

    // Inline images are allowed even though `data:` in general is not.
    if (protocol === 'data:' && normalizedUrl.startsWith('data:image/')) continue

    return true
  }

  return false
}
481
+
482
/**
 * Returns a copy of `attrs` without unsafe entries.
 *
 * An attribute is dropped when its name is blacklisted, or when it is a
 * URL-valued attribute (see `URL_ATTRS`) whose value uses a dangerous protocol.
 *
 * @param attrs Parsed attributes.
 * @param options Extension options.
 * @returns A new object holding only the attributes that passed both checks.
 */
function sanitizeAttrs(
  attrs: Record<string, string>,
  options: HtmlTreeExtensionOptions
): Record<string, string> {
  const kept: Record<string, string> = {}

  Object.keys(attrs).forEach((attrName) => {
    const attrValue = attrs[attrName]

    const unsafe =
      isAttrBlacklisted(attrName, options) ||
      (URL_ATTRS.includes(attrName.toLowerCase()) && isProtocolDangerous(attrValue, options))

    if (!unsafe) {
      kept[attrName] = attrValue
    }
  })

  return kept
}
505
+
506
+ // ============ AST 转换器 ============
507
+
508
/**
 * Type guard: `true` when the node is a raw mdast `html` node.
 */
function isHtmlNode(node: RootContent): node is HTML {
  const { type } = node
  return type === 'html'
}
514
+
515
/**
 * Type guard: `true` when the node has a `children` property holding an array.
 */
function hasChildren(node: RootContent | Root): node is Parent & RootContent {
  if (!('children' in node)) return false
  return Array.isArray((node as Parent).children)
}
521
+
522
/**
 * Reports whether `html` contains the start of an opening or closing tag
 * whose name is blacklisted.
 */
function containsBlacklistedTag(html: string, options: HtmlTreeExtensionOptions): boolean {
  const tagStartPattern = /<\/?([a-zA-Z][a-zA-Z0-9-]*)/g
  let tagStart: RegExpExecArray | null
  while ((tagStart = tagStartPattern.exec(html)) !== null) {
    if (isTagBlacklisted(tagStart[1], options)) return true
  }
  return false
}

/**
 * Converts the raw `html` nodes in a sibling list into structured
 * `htmlElement` nodes, merging an opening tag, the siblings that follow it and
 * the matching closing tag into one element.
 *
 * Per `html` node, depending on its detected content type:
 * - `fragment`: parsed with `parseHtmlFragment`; its top-level elements
 *   replace the node.
 * - `self-closing`: becomes a childless element, unless the tag is blacklisted.
 * - `closing`: dropped (an unmatched closing tag).
 * - `opening`: following siblings are collected until the matching closing
 *   tag, or the end of the list, and processed recursively as the children.
 *   A blacklisted opening tag is dropped on its own; its content stays in place.
 * - `unknown`: kept as-is.
 *
 * Raw `html` nodes that would otherwise be kept unchanged (a fragment that
 * produced no elements, or unknown content) are dropped when they mention a
 * blacklisted tag, so filtered markup such as `<script>...</script>` cannot
 * reach the output through the fallback path.
 *
 * Non-`html` nodes are copied, with their children processed recursively.
 *
 * @param nodes Sibling nodes to process; the array is not modified.
 * @param options Extension options.
 * @returns A new list of processed nodes.
 */
function processHtmlNodesInArray(
  nodes: RootContent[],
  options: HtmlTreeExtensionOptions
): RootContent[] {
  const result: RootContent[] = []
  let i = 0

  while (i < nodes.length) {
    const node = nodes[i]

    if (!isHtmlNode(node)) {
      // Not HTML: recurse into children, if any.
      if (hasChildren(node)) {
        const processed = processHtmlNodesInArray(
          (node as Parent).children as RootContent[],
          options
        )
        result.push({
          ...node,
          children: processed
        } as RootContent)
      } else {
        result.push(node)
      }
      i++
      continue
    }

    const contentType = detectHtmlContentType(node.value)

    if (contentType === 'fragment') {
      const fragmentNodes = parseHtmlFragment(node.value, options)
      if (fragmentNodes.length > 0) {
        result.push(...fragmentNodes)
      } else if (!containsBlacklistedTag(node.value, options)) {
        // Nothing parseable (e.g. only comments): keep the raw node. When the
        // fragment is empty because its tags were blacklisted, re-emitting the
        // raw HTML would undo the filtering, so the node is dropped instead.
        result.push(node)
      }
      i++
    } else if (contentType === 'self-closing') {
      const parsed = parseHtmlTag(node.value)
      if (parsed && !isTagBlacklisted(parsed.tagName, options)) {
        const elementNode: HtmlElementNode = {
          type: 'htmlElement',
          tagName: parsed.tagName,
          attrs: sanitizeAttrs(parsed.attrs, options),
          children: [],
          data: options.preserveRawHtml !== false ? {
            rawHtml: node.value,
            parsed: true,
            originalType: 'html'
          } : undefined
        }
        result.push(elementNode)
      }
      i++
    } else if (contentType === 'closing') {
      // Unmatched closing tag; matched ones are consumed by the opening-tag
      // branch below.
      i++
    } else if (contentType === 'opening') {
      const parsed = parseHtmlTag(node.value)
      if (!parsed || isTagBlacklisted(parsed.tagName, options)) {
        i++
        continue
      }

      // Collect siblings up to the matching closing tag. `depth` counts
      // nested openings of the same tag name so the correct closer is chosen.
      const tagName = parsed.tagName
      const contentNodes: RootContent[] = []
      let depth = 1
      let j = i + 1
      let foundClosing = false

      while (j < nodes.length && depth > 0) {
        const nextNode = nodes[j]

        if (isHtmlNode(nextNode)) {
          const nextType = detectHtmlContentType(nextNode.value)

          if (nextType === 'closing') {
            const nextParsed = parseHtmlTag(nextNode.value)
            if (nextParsed && nextParsed.tagName === tagName) {
              depth--
              if (depth === 0) {
                foundClosing = true
                break
              }
            }
          } else if (nextType === 'opening') {
            const nextParsed = parseHtmlTag(nextNode.value)
            if (nextParsed && nextParsed.tagName === tagName) {
              depth++
            }
          }
          // Fragments and self-closing tags do not affect the depth.
        }

        contentNodes.push(nextNode)
        j++
      }

      const elementNode: HtmlElementNode = {
        type: 'htmlElement',
        tagName: parsed.tagName,
        attrs: sanitizeAttrs(parsed.attrs, options),
        children: processHtmlNodesInArray(contentNodes, options),
        data: options.preserveRawHtml !== false ? {
          rawHtml: node.value,
          parsed: true,
          originalType: 'html'
        } : undefined
      }

      result.push(elementNode)
      // Skip the closing tag too when one was found.
      i = foundClosing ? j + 1 : j
    } else {
      // Unknown content is kept raw unless it mentions a blacklisted tag.
      if (!containsBlacklistedTag(node.value, options)) {
        result.push(node)
      }
      i++
    }
  }

  return result
}
652
+
653
/**
 * Transforms a whole AST, converting raw `html` nodes into `htmlElement` nodes.
 *
 * @param ast Root of the tree; it is not modified.
 * @param options Extension options.
 * @returns A shallow copy of `ast` whose `children` are the processed nodes.
 */
export function transformHtmlNodes(ast: Root, options: HtmlTreeExtensionOptions = {}): Root {
  const children = processHtmlNodesInArray(ast.children, options) as Root['children']
  return { ...ast, children }
}
662
+
663
/**
 * Creates an HTML-tree transformer with the given options bound, for use as
 * a unified-style transformer.
 *
 * @param options Extension options applied on every call.
 * @returns A function that takes a `Root` and returns the transformed `Root`.
 */
export function createHtmlTreeTransformer(options: HtmlTreeExtensionOptions = {}) {
  function transformer(tree: Root): Root {
    return transformHtmlNodes(tree, options)
  }

  return transformer
}
672
+
673
+ // ============ mdast 扩展(用于 fromMarkdown) ============
674
+
675
/**
 * Extension object for `mdast-util-from-markdown`.
 * It registers no `enter` or `exit` handlers; the actual conversion happens
 * in the post-processing step (`transformHtmlNodes`).
 */
export const htmlTreeExtension: MdastExtension = { enter: {}, exit: {} }
683
+
684
+ // ============ 便捷工具函数 ============
685
+
686
/**
 * Type guard: `true` when the node is an `htmlElement` node.
 */
export function isHtmlElementNode(node: RootContent): node is HtmlElementNode {
  const { type } = node
  return type === 'htmlElement'
}
692
+
693
/**
 * Walks a tree depth-first and calls `callback` for every `htmlElement` node.
 * A node is reported before its descendants.
 *
 * @param node Node or root to start from.
 * @param callback Receives each element together with its parent; the parent
 *   is `null` for the start node unless `parent` is supplied.
 * @param parent Parent of `node`; defaults to `null`.
 */
export function walkHtmlElements(
  node: RootContent | Root,
  callback: (node: HtmlElementNode, parent: Parent | Root | null) => void,
  parent: Parent | Root | null = null
): void {
  if (isHtmlElementNode(node as RootContent)) {
    callback(node as HtmlElementNode, parent)
  }

  const canDescend = hasChildren(node as RootContent) || node.type === 'root'
  if (!canDescend) return

  const self = node as Parent | Root
  for (const child of self.children) {
    walkHtmlElements(child, callback, self)
  }
}
712
+
713
/**
 * Collects every `htmlElement` node with the given tag name.
 *
 * @param root Tree to search.
 * @param tagName Tag name to look for; it is lowercased before comparison.
 * @returns Matching nodes in the order `walkHtmlElements` visits them.
 */
export function findHtmlElementsByTag(
  root: Root,
  tagName: string
): HtmlElementNode[] {
  const matches: HtmlElementNode[] = []

  const collect = (candidate: HtmlElementNode): void => {
    if (candidate.tagName !== tagName.toLowerCase()) return
    matches.push(candidate)
  }

  walkHtmlElements(root, collect)
  return matches
}
730
+
731
/**
 * Serialises an `htmlElement` node back into an HTML string.
 *
 * Attribute values are escaped with `escapeHtml`; an attribute whose value is
 * the empty string is written as a bare name. Void elements without children
 * are written as `<tag />`. Only `text` and `htmlElement` children are
 * serialised; any other child node type contributes nothing.
 *
 * In text children `<` and `>` are escaped so text can never be re-read as
 * markup. `&` is deliberately left alone: text produced by
 * `parseHtmlFragment` keeps its original, still-encoded character references,
 * and escaping `&` would double-encode them.
 *
 * @param node Element to serialise.
 * @returns The HTML source for the element and its supported descendants.
 */
export function htmlElementToString(node: HtmlElementNode): string {
  const { tagName, attrs, children } = node

  const attrsStr = Object.entries(attrs)
    .map(([name, value]) => (value === '' ? name : `${name}="${escapeHtml(value)}"`))
    .join(' ')

  const tagStart = attrsStr ? `<${tagName} ${attrsStr}` : `<${tagName}`

  // Void element without content.
  if (children.length === 0 && isSelfClosingTag(tagName)) {
    return `${tagStart} />`
  }

  const childrenStr = children.map(child => {
    if (child.type === 'text') {
      // Text is data, not markup: neutralise angle brackets.
      return (child as { value: string }).value.replace(
        /[<>]/g,
        (bracket) => (bracket === '<' ? '&lt;' : '&gt;')
      )
    }
    if (isHtmlElementNode(child)) {
      return htmlElementToString(child)
    }
    // Other node types are not serialised (may need extending in practice).
    return ''
  }).join('')

  return `${tagStart}>${childrenStr}</${tagName}>`
}
766
+
767
/**
 * Reports whether a tag name is a void (self-closing) element, according to
 * the shared `VOID_ELEMENTS` list. The check is case-insensitive.
 */
function isSelfClosingTag(tagName: string): boolean {
  const normalized = tagName.toLowerCase()
  return VOID_ELEMENTS.includes(normalized)
}
773
+
774
/**
 * Escapes the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
 * the text is safe inside markup or a double-quoted attribute value.
 */
function escapeHtml(text: string): string {
  return text.replace(/[&<>"']/g, (char) => {
    switch (char) {
      case '&':
        return '&amp;'
      case '<':
        return '&lt;'
      case '>':
        return '&gt;'
      case '"':
        return '&quot;'
      default:
        // Only the single quote is left.
        return '&#39;'
    }
  })
}
785
+
786
+ // ============ 导出 ============
787
+
788
+ export {
789
+ DEFAULT_TAG_BLACKLIST as HTML_TAG_BLACKLIST,
790
+ DEFAULT_ATTR_BLACKLIST as HTML_ATTR_BLACKLIST,
791
+ DEFAULT_PROTOCOL_BLACKLIST as HTML_PROTOCOL_BLACKLIST
792
+ }