suneditor 3.0.6 → 3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/dist/suneditor.min.css +1 -1
  2. package/dist/suneditor.min.js +1 -1
  3. package/package.json +1 -1
  4. package/src/assets/suneditor.css +2 -2
  5. package/src/core/editor.js +20 -3
  6. package/src/core/event/eventOrchestrator.js +2 -1
  7. package/src/core/event/handlers/handler_ww_key.js +2 -2
  8. package/src/core/event/rules/keydown.rule.enter.js +2 -2
  9. package/src/core/logic/dom/format.js +5 -1
  10. package/src/core/logic/dom/html.js +23 -1
  11. package/src/core/logic/dom/offset.js +24 -1
  12. package/src/core/logic/panel/menu.js +74 -3
  13. package/src/core/logic/panel/viewer.js +6 -4
  14. package/src/core/logic/shell/shortcuts.js +1 -1
  15. package/src/core/schema/options.js +1 -1
  16. package/src/core/section/constructor.js +2 -2
  17. package/src/helper/index.js +3 -0
  18. package/src/helper/msOffice.js +849 -0
  19. package/src/interfaces/plugins.js +1 -1
  20. package/src/langs/ckb.js +1 -0
  21. package/src/langs/cs.js +1 -0
  22. package/src/langs/da.js +1 -0
  23. package/src/langs/de.js +1 -0
  24. package/src/langs/en.js +1 -1
  25. package/src/langs/es.js +1 -0
  26. package/src/langs/fa.js +1 -0
  27. package/src/langs/fr.js +1 -0
  28. package/src/langs/he.js +1 -0
  29. package/src/langs/hu.js +1 -0
  30. package/src/langs/it.js +1 -0
  31. package/src/langs/ja.js +1 -0
  32. package/src/langs/km.js +1 -0
  33. package/src/langs/ko.js +1 -0
  34. package/src/langs/lv.js +1 -0
  35. package/src/langs/nl.js +1 -0
  36. package/src/langs/pl.js +1 -0
  37. package/src/langs/pt_br.js +1 -0
  38. package/src/langs/ro.js +1 -0
  39. package/src/langs/ru.js +1 -0
  40. package/src/langs/se.js +1 -0
  41. package/src/langs/tr.js +1 -0
  42. package/src/langs/uk.js +1 -0
  43. package/src/langs/ur.js +1 -0
  44. package/src/langs/zh_cn.js +1 -0
  45. package/src/modules/contract/Browser.js +1 -0
  46. package/src/plugins/dropdown/layout.js +1 -1
  47. package/src/plugins/dropdown/template.js +2 -1
  48. package/src/plugins/field/autocomplete.js +383 -0
  49. package/src/plugins/index.js +3 -3
  50. package/src/typedef.js +1 -1
  51. package/types/core/logic/shell/shortcuts.d.ts +2 -2
  52. package/types/core/schema/options.d.ts +2 -2
  53. package/types/helper/index.d.ts +4 -0
  54. package/types/helper/msOffice.d.ts +11 -0
  55. package/types/interfaces/plugins.d.ts +1 -1
  56. package/types/langs/_Lang.d.ts +1 -2
  57. package/types/plugins/field/autocomplete.d.ts +251 -0
  58. package/types/plugins/index.d.ts +3 -3
  59. package/types/typedef.d.ts +1 -1
  60. package/src/plugins/field/mention.js +0 -251
  61. package/types/plugins/field/mention.d.ts +0 -104
@@ -0,0 +1,849 @@
1
+ /**
2
+ * @description Converts MS Word/Excel/OneNote clipboard HTML into clean, standards-compliant HTML.
3
+ */
4
+
/**
 * Matches MS conditional comments: <!--[if ...]>...<![endif]-->
 * (applied to the raw HTML string before DOM parsing).
 */
const _RE_CONDITIONAL_COMMENTS = /<!--\[if[^>]*>[\s\S]*?<!\[endif\]-->/gi;

/** Matches Office namespace tags (opening or closing): <o:p>, </o:p>, <w:Sdt>, etc. */
const _RE_OFFICE_TAGS = /<\/?(?:\w+:)\w+[^>]*>/gi;

/**
 * Matches <xml>...</xml> blocks, including openers that carry attributes
 * (e.g. `<xml xmlns:o="...">`). The opener must be exactly `xml` followed by
 * `>` or whitespace, so namespaced tags such as `<xml:foo>` are not treated
 * as the start of a block.
 */
const _RE_XML_BLOCKS = /<xml(?:\s[^>]*)?>[\s\S]*?<\/xml>/gi;

/** Matches <style>...</style> blocks (with or without attributes on the opener) */
const _RE_STYLE_BLOCKS = /<style[^>]*>[\s\S]*?<\/style>/gi;
16
+
/**
 * Matches individual mso-* CSS declarations inside a style attribute value.
 * The value may contain quoted strings (e.g. `mso-fareast-font-family:"Times New Roman"`);
 * a quoted segment is consumed as a whole so the declaration is removed completely
 * instead of being skipped because of the quote character.
 */
const _RE_MSO_STYLE = /\s*mso-[^:;]+:(?:"[^"]*"|'[^']*'|[^;"'])*;?/gi;
19
+
/**
 * Parses an mso-list value of the form `l{id} level{n} lfo{order}`.
 * Capture groups: 1 = list id, 2 = nesting level, 3 = lfo number.
 */
const _RE_MSO_LIST = /l(\d+)\s+level(\d+)\s+lfo(\d+)/i;

/** Detects the `mso-list:Ignore` declaration that Word puts on list-marker spans */
const _RE_MSO_LIST_IGNORE = /mso-list:\s*Ignore/i;

/** Finds every Mso-prefixed class name (MsoNormal, MsoListParagraph, ...) */
const _RE_MSO_CLASS = /\bMso\w+/g;

/** Finds every Excel-generated class name of the form xl<digits> */
const _RE_XL_CLASS = /\bxl\d+/g;

/** Finds `tab-stops` CSS declarations, including leading whitespace and a trailing semicolon */
const _RE_TAB_STOPS = /\s*tab-stops:[^;"]+;?/gi;
34
+
/** Matches bullet-like characters at the start of text */
const _RE_BULLET_CHARS = /^[\s\u00a0]*[\u2022\u00b7\u00a7\u25cf\u25cb\u25aa\u25a0·o§]\s*/;

/**
 * Matches ordered list marker patterns: decimal ("1."), single letter ("a)")
 * and roman numeral ("iv.") markers.
 * The marker must be followed by whitespace OR by the end of the string, so a
 * marker whose trailing padding was trimmed away (e.g. "1.") is still recognized.
 */
const _RE_ORDERED_MARKERS = [/^\s*[0-9]+[.)](?:\s|$)/, /^\s*[a-z][.)](?:\s|$)/i, /^\s*[ivxlmcd]+[.)](?:\s|$)/i];
40
+
/**
 * Matches mso-spacerun spans in the raw HTML string.
 * The span payload may be literal whitespace / U+00A0 or the entity forms
 * (`&nbsp;`, `&#160;`, `&#xa0;`) — the pattern runs before DOM parsing, so
 * entities have not been decoded yet.
 */
const _RE_SPACERUN = /<span\s+style\s*=\s*["']?\s*mso-spacerun:\s*yes\s*;?\s*["']?\s*>((?:[\s\u00a0]|&nbsp;|&#160;|&#xa0;)*)<\/span>/gi;
43
+
/** Finds whole <span> elements whose quoted style contains mso-tab-count (Word tab placeholders) */
const _RE_TAB_COUNT = /<span\s+style\s*=\s*["'][^"']*mso-tab-count:\s*\d+[^"']*["']\s*>[^<]*<\/span>/gi;

/** Captures the numeric mso-outline-level value (group 1) */
const _RE_OUTLINE_LEVEL = /mso-outline-level:\s*(\d+)/i;

/** Captures the mso-highlight value (group 1), up to the next `;` or `"` */
const _RE_MSO_HIGHLIGHT = /mso-highlight:\s*([^;"]+)/i;

/** Captures the mso-level-number-format value (group 1) from an @list rule body */
const _RE_LEVEL_NUMBER_FORMAT = /mso-level-number-format:\s*([^;\s]+)/i;

/** Captures the numeric mso-level-start-at value (group 1) from an @list rule body */
const _RE_LEVEL_START_AT = /mso-level-start-at:\s*(\d+)/i;

/** Captures the mso-level-text value (group 1) from an @list rule body */
const _RE_LEVEL_TEXT = /mso-level-text:\s*([^;\s]+)/i;

/** Lookup table: Word `mso-level-number-format` keyword → CSS `list-style-type` keyword */
const _LEVEL_STYLE_MAP = {
	'decimal': 'decimal',
	'alpha-upper': 'upper-alpha',
	'alpha-lower': 'lower-alpha',
	'roman-upper': 'upper-roman',
	'roman-lower': 'lower-roman',
	'arabic-leading-zero': 'decimal-leading-zero',
};
71
+
/**
 * Matches zero-value margin shorthand (3 values), e.g. `margin:0cm 0cm 0cm;`.
 * The declaration must end right after the third value (at `;` or end of string),
 * so shorthands with a fourth or non-zero value (`margin:0cm 0cm 0cm 36.0pt`,
 * `margin:0in 0in 0.5in`) are left untouched instead of being cut in half.
 */
const _RE_ZERO_MARGIN = /\s*margin:\s*0[a-z]*\s+0[a-z]*\s+0[a-z]*\s*(?:;|$)/gi;
74
+
/** Finds `text-indent` declarations with a numeric (optionally negative) value */
const _RE_TEXT_INDENT = /\s*text-indent:\s*-?[\d.]+[a-z]*\s*;?/gi;

/** Finds `line-height: normal` declarations */
const _RE_LINE_HEIGHT_NORMAL = /\s*line-height:\s*normal\s*;?/gi;

/** Captures the single digit following `MsoHeading` in a class name (group 1) */
const _RE_MSO_HEADING = /MsoHeading(\d)/i;

/** Tests whether a URL starts with the file:/// protocol */
const _RE_FILE_PROTOCOL = /^file:\/\/\//i;

/** Tests whether an anchor name is a Word bookmark (_Toc*, _Ref*, _Hlt*, _Hlk* followed by digits) */
const _RE_BOOKMARK_NAME = /^_(Toc|Ref|Hlt|Hlk)\d+$/i;

/** Tests whether an href targets a Word bookmark (#_Toc*, #_Ref*, #_Hlt*, #_Hlk*) */
const _RE_BOOKMARK_HREF = /^#_(Toc|Ref|Hlt|Hlk)\d+$/i;

/** Detects `page-break-before: always` in a style string */
const _RE_PAGE_BREAK = /page-break-before\s*:\s*always/i;

/** Detects `mso-break-type: section-break` in a style string */
const _RE_SECTION_BREAK = /mso-break-type\s*:\s*section-break/i;

/** Detects `mso-column-break-before: always` in a style string */
const _RE_COLUMN_BREAK = /mso-column-break-before\s*:\s*always/i;

/** Tests whether a class value is exactly SectionN or WordSectionN */
const _RE_SECTION_CLASS = /^(?:Word)?Section\d+$/i;
104
+
105
+ // ---------- internal helpers ----------
106
+
/**
 * @description Extracts `@list` rules from Word's <style> blocks for list type detection.
 * Every <style> block in the markup is scanned (not only the first one); when the same
 * list/level key is defined more than once, the last definition wins.
 * @param {string} html Raw Word HTML
 * @returns {Map<string, Object>} Map of "l{id}:level{n}" -> { type: 'ol'|'ul', listStyleType: string, startAt: number }
 */
function _extractListStyles(html) {
	const map = new Map();
	if (!html) return map;

	for (const [, css] of html.matchAll(/<style[^>]*>([\s\S]*?)<\/style>/gi)) {
		for (const [, id, level, body] of css.matchAll(/@list\s+l(\d+):level(\d+)\s*\{([^}]*)\}/gi)) {
			const fmt = _RE_LEVEL_NUMBER_FORMAT.exec(body);
			const startAt = _RE_LEVEL_START_AT.exec(body);
			const text = _RE_LEVEL_TEXT.exec(body);

			// A rule without an explicit number format is treated as an ordered list.
			let type = 'ol';
			let listStyleType = '';

			if (fmt) {
				const fmtVal = fmt[1].toLowerCase();
				if (fmtVal === 'bullet' || fmtVal === 'image') {
					type = 'ul';
					if (text) {
						// Strip quoting/escape characters around the bullet glyph before comparing.
						const t = text[1].replace(/["%\\]/g, '');
						if (t === '\u00b7' || t === '·') listStyleType = 'disc';
						else if (t === 'o') listStyleType = 'circle';
						else if (t === '\u00a7' || t === '§') listStyleType = 'square';
					}
				} else {
					listStyleType = _LEVEL_STYLE_MAP[fmtVal] || '';
				}
			}

			map.set(`l${id}:level${level}`, {
				type,
				listStyleType,
				startAt: startAt ? parseInt(startAt[1], 10) : 1,
			});
		}
	}

	return map;
}
157
+
/**
 * @description Classifies a list item as ordered or unordered from its marker text.
 * Whitespace (including non-breaking spaces) is collapsed and trimmed first; a leading
 * bullet character means 'ul', a leading ordered marker means 'ol', anything else
 * falls back to 'ul'.
 * @param {string} text The marker text (from mso-list:Ignore span)
 * @returns {'ol'|'ul'}
 */
function _detectListTypeFromText(text) {
	const normalized = text.replace(/[\s\u00a0]+/g, ' ').trim();

	// The bullet check runs first, so it takes precedence over the ordered patterns.
	if (_RE_BULLET_CHARS.test(normalized)) return 'ul';

	const looksOrdered = _RE_ORDERED_MARKERS.some((pattern) => pattern.test(normalized));
	return looksOrdered ? 'ol' : 'ul';
}
171
+
/**
 * @description Removes bullet/number marker spans (mso-list:Ignore) and all comment nodes
 * from a list item element, then trims leading whitespace from a leading text node.
 * The tree walker is created from the element's own document, so the function does not
 * depend on the ambient `document` / `NodeFilter` globals (the element normally belongs
 * to a separately parsed document).
 * @param {Element} el
 */
function _removeListMarkers(el) {
	// 0x80 === NodeFilter.SHOW_COMMENT
	const walker = el.ownerDocument.createTreeWalker(el, 0x80, null);

	// Collect first, remove afterwards: removing during traversal would disturb the walker.
	const commentsToRemove = [];
	while (walker.nextNode()) {
		commentsToRemove.push(walker.currentNode);
	}
	for (const c of commentsToRemove) {
		/** @type {Element} */ (c)?.remove();
	}

	for (const span of el.querySelectorAll('span')) {
		const style = span.getAttribute('style') || '';
		if (_RE_MSO_LIST_IGNORE.test(style)) {
			span.remove();
		}
	}

	// Drop the padding (spaces / non-breaking spaces) left at the start of the item text.
	const first = el.firstChild;
	if (first && first.nodeType === 3) {
		first.nodeValue = first.nodeValue.replace(/^[\s\u00a0]+/, '');
	}
}
198
+
/**
 * @description Normalizes an element's inline style: converts mso-highlight to
 * background-color (when no background is set), strips mso-*, tab-stops, zero margin
 * shorthand, text-indent and `line-height: normal` declarations, and tidies semicolons.
 * The style attribute is removed when nothing is left.
 * @param {Element} el
 */
function _cleanStyles(el) {
	const original = el.getAttribute('style');
	if (!original) return;

	let css = original;

	// The highlight must be read before the mso-* declarations are stripped below.
	const highlight = _RE_MSO_HIGHLIGHT.exec(css);
	const hasBackground = /background-color/i.test(css) || /background\s*:/i.test(css);
	if (highlight && !hasBackground) {
		css = `${css};background-color:${highlight[1].trim()}`;
	}

	const strippedDeclarations = [_RE_MSO_STYLE, _RE_TAB_STOPS, _RE_ZERO_MARGIN, _RE_TEXT_INDENT, _RE_LINE_HEIGHT_NORMAL];
	for (const pattern of strippedDeclarations) {
		css = css.replace(pattern, '');
	}
	css = css.trim();

	// Drop leading/trailing semicolons and collapse runs of semicolons.
	css = css.replace(/^;+|;+$/g, '');
	css = css.replace(/;{2,}/g, ';');
	css = css.trim();

	if (!css) {
		el.removeAttribute('style');
		return;
	}
	el.setAttribute('style', css);
}
229
+
/**
 * @description Strips Mso* and xl<digits> class names from an element's class attribute.
 * The attribute is removed when nothing but whitespace remains.
 * @param {Element} el
 */
function _cleanClasses(el) {
	const classValue = el.getAttribute('class');
	if (!classValue) return;

	const withoutMso = classValue.replace(_RE_MSO_CLASS, '');
	const remaining = withoutMso.replace(_RE_XL_CLASS, '').trim();

	if (!remaining) {
		el.removeAttribute('class');
		return;
	}
	el.setAttribute('class', remaining);
}
245
+
/**
 * @description Unwraps every span that carries neither a style nor a class value
 * (including spans with no attributes at all): its children are moved in front of it
 * and the span itself is removed. Spans are processed from last to first.
 * @param {Element} container
 */
function _unwrapEmptySpans(container) {
	const candidates = Array.from(container.querySelectorAll('span')).reverse();

	for (const span of candidates) {
		const isStyled = span.attributes.length && (span.getAttribute('style') || span.getAttribute('class'));
		if (isStyled) continue;

		const host = span.parentNode;
		if (!host) continue;

		while (span.firstChild) {
			host.insertBefore(span.firstChild, span);
		}
		host.removeChild(span);
	}
}
265
+
266
+ // ---------- list conversion ----------
267
+
/**
 * @description Rebuilds Word's flat list paragraphs as nested ol/ul/li structures.
 * A paragraph or heading counts as a list item when its style carries an mso-list value
 * or its class contains MsoListParagraph. Items that directly follow each other in the
 * DOM are grouped, and each group is converted by `_buildListFromGroup`.
 * @param {Document} doc Parsed DOM document
 * @param {Map<string, Object>} listStyles Extracted @list CSS rules
 */
function _convertLists(doc, listStyles) {
	const listItems = [];

	for (const el of doc.body.querySelectorAll('p, h1, h2, h3, h4, h5, h6')) {
		const inlineStyle = el.getAttribute('style') || '';
		const className = el.getAttribute('class') || '';
		const parsed = _RE_MSO_LIST.exec(inlineStyle);

		if (!parsed && !/MsoListParagraph/i.test(className)) continue;

		// Without an mso-list value, fall back to list "0", level 1, lfo "1".
		listItems.push(
			parsed
				? { el, listId: parsed[1], level: parseInt(parsed[2], 10), lfo: parsed[3] }
				: { el, listId: '0', level: 1, lfo: '1' },
		);
	}

	if (listItems.length === 0) return;

	// Grouping is completed before any list is built, because building mutates the DOM.
	const runs = [];
	for (const item of listItems) {
		const currentRun = runs[runs.length - 1];
		const previousItem = currentRun && currentRun[currentRun.length - 1];
		const followsPrevious = Boolean(previousItem) && _getPrevSiblingElement(item.el) === previousItem.el;

		if (followsPrevious) {
			currentRun.push(item);
		} else {
			runs.push([item]);
		}
	}

	for (const run of runs) {
		_buildListFromGroup(run, listStyles);
	}
}
314
+
/**
 * @description Walks backwards over the preceding siblings and returns the first element.
 * Whitespace-only text nodes and other non-element nodes are skipped; a text node with
 * visible content stops the search and yields null.
 * @param {Element} el
 * @returns {Element|null}
 */
function _getPrevSiblingElement(el) {
	for (let node = el.previousSibling; node; node = node.previousSibling) {
		if (node.nodeType === 1) return /** @type {Element} */ (node);

		const isVisibleText = node.nodeType === 3 && node.nodeValue.trim();
		if (isVisibleText) return null;
	}
	return null;
}
329
+
/**
 * @description Converts one run of consecutive Word list paragraphs into a nested
 * ol/ul structure. Each item's list type comes from the matching @list rule, or is
 * guessed from its marker text when no rule exists. The original paragraphs are removed
 * and the resulting root list is inserted where the run used to end.
 * @param {Array<Object>} group Array of { el, listId, level, lfo }; each entry is annotated in place with type, listStyleType and startAt
 * @param {Map<string, Object>} listStyles
 */
function _buildListFromGroup(group, listStyles) {
	// Pass 1: resolve type / style / start number for every item.
	for (const entry of group) {
		const rule = listStyles.get(`l${entry.listId}:level${entry.level}`);
		if (rule) {
			entry.type = rule.type;
			entry.listStyleType = rule.listStyleType;
			entry.startAt = rule.startAt;
			continue;
		}
		entry.type = _detectListTypeFromText(_extractMarkerText(entry.el));
		entry.listStyleType = '';
		entry.startAt = 1;
	}

	const container = group[0].el.parentNode;
	// Captured up front: the loop below detaches the group's elements from the DOM.
	const insertionPoint = group[group.length - 1].el.nextSibling;

	// Stack of currently open lists, innermost last.
	const openLists = [];
	let root = null;

	// Pass 2: turn each paragraph into an <li> and attach it at the right depth.
	for (const entry of group) {
		const { el, level, type, listStyleType, startAt } = entry;
		const ownerDoc = el.ownerDocument;

		_removeListMarkers(el);
		_cleanStyles(el);
		_cleanClasses(el);

		const li = ownerDoc.createElement('li');
		while (el.firstChild) {
			li.appendChild(el.firstChild);
		}

		const inlineStyle = el.getAttribute('style');
		if (inlineStyle) {
			li.setAttribute('style', inlineStyle);
		}

		const innermost = openLists[openLists.length - 1];

		if (!innermost) {
			root = _createListElement(ownerDoc, type, listStyleType, startAt);
			root.appendChild(li);
			openLists.push({ listEl: root, level });
		} else if (level > innermost.level) {
			// Deeper level: open a nested list inside the last <li> of the current list.
			const hostItem = innermost.listEl.lastElementChild;
			const nested = _createListElement(ownerDoc, type, listStyleType, startAt);
			nested.appendChild(li);
			(hostItem || innermost.listEl).appendChild(nested);
			openLists.push({ listEl: nested, level });
		} else {
			// Same or shallower level: close deeper lists (never the root), then append.
			while (openLists.length > 1 && openLists[openLists.length - 1].level > level) {
				openLists.pop();
			}
			openLists[openLists.length - 1].listEl.appendChild(li);
		}

		el.remove();
	}

	if (root) {
		container.insertBefore(root, insertionPoint);
	}
}
411
+
/**
 * @description Builds an <ol> or <ul> element. A non-empty listStyleType is applied as
 * inline `list-style-type`; a `start` attribute is added only for ordered lists that
 * begin above 1.
 * @param {Document} doc
 * @param {'ol'|'ul'} type
 * @param {string} listStyleType
 * @param {number} startAt
 * @returns {Element}
 */
function _createListElement(doc, type, listStyleType, startAt) {
	const listEl = doc.createElement(type);

	if (listStyleType) listEl.style.listStyleType = listStyleType;

	const needsStart = type === 'ol' && startAt > 1;
	if (needsStart) listEl.setAttribute('start', String(startAt));

	return listEl;
}
430
+
/**
 * @description Returns the marker text of a Word list paragraph: the text of the first
 * span styled with mso-list:Ignore, or — when there is no such span — the first
 * 10 characters of the paragraph text.
 * @param {Element} el
 * @returns {string}
 */
function _extractMarkerText(el) {
	const markerSpan = Array.from(el.querySelectorAll('span')).find((span) => _RE_MSO_LIST_IGNORE.test(span.getAttribute('style') || ''));

	if (markerSpan) {
		return markerSpan.textContent || '';
	}
	return el.textContent?.substring(0, 10) || '';
}
446
+
447
+ // ---------- heading conversion ----------
448
+
/**
 * @description Replaces `<p>` elements that Word marks as headings with semantic
 * `<h1>`-`<h6>` elements. The level is read from `mso-outline-level` in the inline
 * style, or, failing that, from a MsoHeading<N> class name. Levels outside 1-6 are
 * left as paragraphs. Only the children and the style attribute are carried over.
 * @param {Document} doc
 */
function _convertHeadings(doc) {
	for (const para of doc.body.querySelectorAll('p')) {
		const inlineStyle = para.getAttribute('style') || '';
		const className = para.getAttribute('class') || '';

		// Preferred source: mso-outline-level.
		const byOutline = _RE_OUTLINE_LEVEL.exec(inlineStyle);
		let level = byOutline ? parseInt(byOutline[1], 10) : 0;

		// Fallback source: MsoHeading<N> class.
		if (!level) {
			const byClass = _RE_MSO_HEADING.exec(className);
			if (byClass) level = parseInt(byClass[1], 10);
		}

		if (level < 1 || level > 6) continue;

		const heading = doc.createElement('h' + level);
		while (para.firstChild) {
			heading.appendChild(para.firstChild);
		}
		if (inlineStyle) heading.setAttribute('style', inlineStyle);

		para.replaceWith(heading);
	}
}
487
+
488
+ // ---------- table cleanup ----------
489
+
/**
 * @description Cleans Word/Excel table markup.
 * - Tables: strips Mso and xl class names, drops cellspacing/cellpadding, removes all <col>
 *   elements and any <colgroup> left without children.
 * - Rows: drops the height attribute.
 * - Cells: moves a valign attribute into a CSS vertical-align declaration (unless one
 *   is already present) and unwraps a sole child <p> that has no class or a MsoNormal class.
 * @param {Document} doc
 */
function _cleanTables(doc) {
	const root = doc.body;

	for (const table of root.querySelectorAll('table')) {
		_cleanClasses(table);

		// Spacing is left to CSS.
		table.removeAttribute('cellspacing');
		table.removeAttribute('cellpadding');

		for (const col of table.querySelectorAll('col')) {
			col.remove();
		}
		for (const group of table.querySelectorAll('colgroup')) {
			if (group.children.length === 0) group.remove();
		}
	}

	for (const row of root.querySelectorAll('tr')) {
		row.removeAttribute('height');
	}

	for (const cell of root.querySelectorAll('td, th')) {
		const valign = cell.getAttribute('valign');
		if (valign) {
			cell.removeAttribute('valign');
			const css = cell.getAttribute('style') || '';
			if (!/vertical-align/i.test(css)) {
				const prefix = css ? css + ';' : '';
				cell.setAttribute('style', prefix + 'vertical-align:' + valign);
			}
		}

		// Unwrap a single wrapping paragraph so the cell holds its content directly.
		const kids = cell.children;
		if (kids.length !== 1 || kids[0].tagName !== 'P') continue;

		const para = kids[0];
		const className = para.getAttribute('class') || '';
		if (className && !/MsoNormal/i.test(className)) continue;

		while (para.firstChild) {
			cell.insertBefore(para.firstChild, para);
		}
		para.remove();
	}
}
554
+
555
+ // ---------- track changes & comments ----------
556
+
/**
 * @description Strips track-changes and review-comment markup.
 * - `<del>` / `.msoDel`: removed together with their content
 * - `<ins>` / `.msoIns`: unwrapped (the inserted content stays)
 * - comment references, comment text blocks, `mso-element:comment` divs and
 *   `_msocom*` / `_msoanchor*` named anchors: removed
 * @param {Document} doc
 */
function _cleanTrackChanges(doc) {
	const root = doc.body;

	// Deleted content must not appear in the result.
	for (const deleted of root.querySelectorAll('del, .msoDel')) {
		deleted.remove();
	}

	// Inserted content is kept; only the wrapper goes. Processed last-to-first.
	const insertions = root.querySelectorAll('ins, .msoIns');
	for (let idx = insertions.length - 1; idx >= 0; idx--) {
		const wrapper = insertions[idx];
		const host = wrapper.parentNode;
		if (!host) continue;

		while (wrapper.firstChild) {
			host.insertBefore(wrapper.firstChild, wrapper);
		}
		wrapper.remove();
	}

	// Comment machinery, queried and removed one selector at a time, in this order.
	const commentSelectors = [
		'.MsoCommentReference, [style*="mso-comment-reference"]',
		'.MsoCommentText',
		'div[style*="mso-element:comment"]',
		'a[name^="_msocom"], a[name^="_msoanchor"]',
	];
	for (const selector of commentSelectors) {
		for (const node of root.querySelectorAll(selector)) {
			node.remove();
		}
	}
}
609
+
610
+ // ---------- link cleanup ----------
611
+
/**
 * @description Cleans up Word-specific anchors, processing them from last to first.
 * An anchor is unwrapped (children kept, `<a>` removed) when its href uses the
 * `file:///` protocol, when its name is a Word bookmark (_Toc, _Ref, _Hlt, _Hlk), or
 * when its href points at such a bookmark. Every other anchor only loses its
 * `v:shapes` attribute.
 * @param {Document} doc
 */
function _cleanLinks(doc) {
	const anchors = doc.body.querySelectorAll('a');

	for (let idx = anchors.length - 1; idx >= 0; idx--) {
		const link = anchors[idx];
		const href = link.getAttribute('href') || '';
		const name = link.getAttribute('name') || '';

		// file:/// targets are meaningless outside the source machine; bookmarks are Word-internal.
		const isWordOnly = _RE_FILE_PROTOCOL.test(href) || _RE_BOOKMARK_NAME.test(name) || _RE_BOOKMARK_HREF.test(href);

		if (!isWordOnly) {
			link.removeAttribute('v:shapes');
			continue;
		}

		const host = link.parentNode;
		if (!host) continue;

		// Moving zero children is a no-op, so an empty anchor is simply removed.
		while (link.firstChild) {
			host.insertBefore(link.firstChild, link);
		}
		link.remove();
	}
}
669
+
670
+ // ---------- image cleanup ----------
671
+
/**
 * @description Cleans Word-specific image markup, processing images from last to first.
 * Images whose src uses the `file:///` protocol (temp-file paths that cannot resolve)
 * are removed; all other images lose their `v:shapes` attribute.
 * @param {Document} doc
 */
function _cleanImages(doc) {
	const inReverse = Array.from(doc.body.querySelectorAll('img')).reverse();

	for (const image of inReverse) {
		const source = image.getAttribute('src') || '';

		if (_RE_FILE_PROTOCOL.test(source)) {
			image.remove();
		} else {
			image.removeAttribute('v:shapes');
		}
	}
}
694
+
695
+ // ---------- break handling ----------
696
+
/**
 * @description Handles Word page, section and column breaks.
 * - `<br>` styled as a page break, section break or column break: removed
 * - every other `<br>` (manual line break): kept, with its style and clear attributes dropped
 * - `<div>` whose class is exactly SectionN / WordSectionN: unwrapped (children kept)
 * @param {Document} doc
 */
function _cleanBreaks(doc) {
	const root = doc.body;

	const lineBreaks = root.querySelectorAll('br');
	for (let idx = lineBreaks.length - 1; idx >= 0; idx--) {
		const br = lineBreaks[idx];
		const css = br.getAttribute('style') || '';

		const isLayoutBreak = _RE_PAGE_BREAK.test(css) || _RE_SECTION_BREAK.test(css) || _RE_COLUMN_BREAK.test(css);
		if (isLayoutBreak) {
			br.remove();
			continue;
		}

		// Ordinary line break: keep the element, drop Word's decoration.
		br.removeAttribute('style');
		br.removeAttribute('clear');
	}

	// The selector is a coarse pre-filter; the regex below demands an exact class match.
	const sectionCandidates = root.querySelectorAll('div[class*="Section"]');
	for (let idx = sectionCandidates.length - 1; idx >= 0; idx--) {
		const div = sectionCandidates[idx];
		const className = (div.getAttribute('class') || '').trim();
		if (!_RE_SECTION_CLASS.test(className)) continue;

		const host = div.parentNode;
		if (!host) continue;

		while (div.firstChild) {
			host.insertBefore(div.firstChild, div);
		}
		div.remove();
	}
}
744
+
745
+ // ---------- public API ----------
746
+
/**
 * @description Turns MS Word/Excel/OneNote clipboard HTML into clean HTML.
 * Works in three stages: read the `@list` definitions, run regex clean-up on the raw
 * string, then parse the result and run the DOM clean-up passes.
 * @param {string} html Raw HTML string from MS Office clipboard
 * @returns {string} Cleaned HTML string (empty string for empty input)
 */
export function cleanHTML(html) {
	if (!html) return '';

	// 1. List definitions live in <style> blocks, so read them before those are stripped.
	const listStyles = _extractListStyles(html);

	// 2. String-level passes, applied in order: [pattern, replacement].
	const stringPasses = [
		[_RE_STYLE_BLOCKS, ''], // <style> blocks
		[_RE_XML_BLOCKS, ''], // <xml>...</xml> blocks
		[_RE_CONDITIONAL_COMMENTS, ''], // conditional comments
		[_RE_OFFICE_TAGS, ''], // Office namespace tags (<o:p>, <w:Sdt>, ...)
		[/<!--(?:Start|End)Fragment-->/gi, ''], // clipboard fragment markers
		[/<(?:meta|link)[^>]*>/gi, ''], // <meta> and <link> tags
		[_RE_SPACERUN, ' '], // mso-spacerun spans → single space
		[_RE_TAB_COUNT, ' '], // mso-tab-count spans → single space
		[/(&nbsp;){2,}/g, ' '], // runs of &nbsp; → single space
		[/\u00AD/g, ''], // soft hyphens
	];
	const markup = stringPasses.reduce((current, [pattern, replacement]) => current.replace(pattern, replacement), html);

	// 3. DOM-level passes.
	const doc = new DOMParser().parseFromString(markup, 'text/html');
	const root = doc.body;

	_cleanTrackChanges(doc); // first, before any structural change
	_convertLists(doc, listStyles); // fake lists → <ol>/<ul>/<li>
	_convertHeadings(doc); // outline-level paragraphs → headings
	_cleanTables(doc);
	_cleanLinks(doc);
	_cleanImages(doc);
	_cleanBreaks(doc);

	// Generic per-element clean-up: styles, classes, Word-only attributes.
	const keepsDimensions = new Set(['img', 'video', 'iframe', 'table', 'td', 'th']);
	for (const el of root.querySelectorAll('*')) {
		_cleanStyles(el);
		_cleanClasses(el);

		if (el.getAttribute('lang')) el.removeAttribute('lang');
		el.removeAttribute('v:shapes');

		// width/height attributes are kept only on media and table elements.
		if (!keepsDimensions.has(el.tagName.toLowerCase())) {
			el.removeAttribute('width');
			el.removeAttribute('height');
		}
	}

	_unwrapEmptySpans(root);

	// Drop paragraphs that hold neither text nor embedded content.
	for (const para of root.querySelectorAll('p')) {
		const hasText = Boolean(para.textContent.trim());
		if (!hasText && !para.querySelector('img, video, iframe, br, table')) {
			para.remove();
		}
	}

	return root.innerHTML;
}

// Default export: the same API, wrapped in an object.
export default {
	cleanHTML,
};