@lkaopremier/html-to-docx 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
package/src/tree.js ADDED
@@ -0,0 +1,643 @@
1
+ import { parse as htmlParser, TextNode } from 'node-html-parser'
2
+ import {
3
+ pageNodes,
4
+ normalizeHtml,
5
+ trim,
6
+ normalizeMeasure,
7
+ capitalize,
8
+ colorToHex,
9
+ imageBase64ToBuffer,
10
+ splitMeasure,
11
+ convertCssToDocxMeasurement,
12
+ parseCssMargin,
13
+ } from './helper.js'
14
+ import lodash from 'lodash'
15
+ import { AlignmentType, HeadingLevel } from 'docx'
16
+
17
/**
 * Resolves the effective style of a node by combining, from lowest to
 * highest priority: the styles of its ancestors, the style implied by its own
 * tag (bold, italics, underline, strike, heading level) and its own inline
 * `style` attribute.
 *
 * @param {object} [node] - Parsed node. A nullish node yields an empty style
 *   instead of throwing, so callers may pass `node?.parentNode` directly.
 * @returns {object} The merged style object (a fresh object on every call).
 */
function getStyle(node) {
  // Nothing to read and no ancestors to walk.
  if (!node) return {}

  const style = {}

  switch (node.tagName?.toLowerCase()) {
    case 'strong':
    case 'b':
      style.bold = true
      break

    case 'em':
    case 'i':
      style.italics = true
      break

    case 'u':
      style.underline = {}
      break

    case 's':
      style.strike = {}
      break

    case 'h1':
      style.heading = HeadingLevel.HEADING_1
      break

    case 'h2':
      style.heading = HeadingLevel.HEADING_2
      break

    case 'h3':
      style.heading = HeadingLevel.HEADING_3
      break

    case 'h4':
      style.heading = HeadingLevel.HEADING_4
      break

    case 'h5':
      style.heading = HeadingLevel.HEADING_5
      break

    case 'h6':
      style.heading = HeadingLevel.HEADING_6
      break
  }

  // Inline declarations override what the tag alone implies.
  lodash.merge(style, getCurrentNodeStyle(node))

  if (node.parentNode) {
    // The ancestors' style is the merge target, so this node's values win.
    return lodash.merge(getStyle(node.parentNode), style)
  }

  return style
}
72
+
73
/**
 * Translates the inline `style` attribute of a single node into a style
 * object. Only the node's own declarations are read; ancestors are ignored.
 *
 * Handled properties: font-weight (`bold`), font-style (`italic`),
 * text-decoration (`underline`), text-transform, font-family, font-size,
 * color, margin / margin-top / margin-bottom, text-align, text-indent, width
 * and height. Unknown properties and declarations without a value are
 * skipped.
 *
 * @param {object} [node] - Parsed element; may be nullish or have no `style`.
 * @returns {object} Style object holding only the recognised properties.
 */
function getCurrentNodeStyle(node) {
  const style = {}
  const inlineStyle = node?.attributes?.style

  if (!inlineStyle) return style

  const tagName = node.tagName?.toLowerCase()
  const textTransforms = [
    'uppercase',
    'capitalize',
    'lowercase',
    'invertcase',
    'uppercasesentence',
  ]

  for (const declaration of inlineStyle.split(';')) {
    const separatorIndex = declaration.indexOf(':')

    // No `:` means there is no value to hand to the measurement helpers.
    if (separatorIndex === -1) continue

    const key = declaration.slice(0, separatorIndex).trim()
    const value = declaration.slice(separatorIndex + 1).trim()

    if (value === '') continue

    switch (key) {
      case 'font-weight':
        if (value === 'bold') {
          style.bold = true
        }
        break

      case 'font-style':
        if (value === 'italic') {
          style.italics = true
        }
        break

      case 'text-decoration':
        if (value === 'underline') {
          style.underline = {}
        }
        break

      case 'text-transform':
        // The flag name is the CSS value itself.
        if (textTransforms.includes(value)) {
          style[value] = true
        }
        break

      case 'font-family':
        style.font = trim(trim(value, "'"), '"')
        break

      case 'font-size':
        style.size = normalizeMeasure(value)
        break

      case 'color':
        style.color = colorToHex(value)
        break

      case 'margin':
      case 'margin-top':
      case 'margin-bottom': {
        const { top, bottom } = parseCssMargin(value)

        if (key === 'margin') {
          // The shorthand resets both sides.
          style.spacing = { before: top, after: bottom }
        } else if (key === 'margin-top') {
          // Keep a previously declared `after` instead of discarding it.
          style.spacing = { ...style.spacing, before: top }
        } else {
          style.spacing = { ...style.spacing, after: bottom }
        }
        break
      }

      case 'text-align':
        switch (value) {
          case 'left':
            style.alignment = AlignmentType.LEFT
            break

          case 'right':
            style.alignment = AlignmentType.RIGHT
            break

          case 'center':
            style.alignment = AlignmentType.CENTER
            break

          case 'justify':
            style.alignment = AlignmentType.JUSTIFIED
            break
        }
        break

      case 'text-indent':
        style.indent = { left: normalizeMeasure(value) }
        break

      case 'width':
        // Table elements use a different width conversion than other nodes.
        if (['table', 'tr', 'th', 'td'].includes(tagName)) {
          style.width = convertCssToDocxMeasurement(value)
        } else {
          style.width = normalizeMeasure(value)
        }
        break

      case 'height':
        style.height = normalizeMeasure(value)
        break
    }
  }

  return style
}
189
+
190
/**
 * Walks one node (and, recursively, its children) and folds its content into
 * the paragraph currently being built.
 *
 * - Text nodes become `text` runs styled from their parent element.
 * - `<img>` nodes become `image` runs when `imageBase64ToBuffer` yields an
 *   image for the `src` attribute; otherwise they are ignored.
 * - `<br>` closes the current paragraph (pushing it onto `sheet`) and adds a
 *   `break` entry when the previous element sibling is a `br`, `table` or `p`.
 * - Any other node with children is descended into.
 *
 * @param {Array} sheet - Output list; finished paragraphs are pushed here.
 * @param {object} node - Node to process.
 * @param {object|null} [paragraph=null] - Paragraph in progress, if any.
 * @param {boolean} [newParagraphAfterBr=false] - When true, the next run
 *   starts a fresh paragraph instead of extending `paragraph`.
 * @returns {Promise<object|null>} The paragraph in progress after this node.
 */
async function paragraphNodeParse(
  sheet,
  node,
  paragraph = null,
  newParagraphAfterBr = false,
) {
  // Extends the paragraph in progress, or opens a new one when required.
  const appendRun = run => {
    if (!paragraph || newParagraphAfterBr) {
      return { run: [run] }
    }

    paragraph.run.push(run)
    return paragraph
  }

  if (node instanceof TextNode) {
    return appendRun({
      type: 'text',
      style: getStyle(node?.parentNode) ?? {},
      content: node.rawText,
    })
  }

  const tagName = node.tagName?.toLowerCase()

  if (tagName === 'img') {
    const image =
      node.attributes.src && (await imageBase64ToBuffer(node.attributes.src))

    if (!image) return paragraph

    const dimensions = getCurrentNodeStyle(node)

    // With only one dimension given, derive the other from the image ratio.
    if (dimensions.width && !dimensions.height) {
      const [widthValue] = splitMeasure(dimensions.width)
      dimensions.width = widthValue
      dimensions.height = widthValue / image.ratio
    } else if (!dimensions.width && dimensions.height) {
      const [heightValue] = splitMeasure(dimensions.height)
      dimensions.height = heightValue
      dimensions.width = heightValue * image.ratio
    }

    return appendRun({
      type: 'image',
      style: {
        type: image.extension,
        ...getStyle(node),
        transformation: {
          width: image.width,
          height: image.height,
          ...dimensions,
        },
      },
      data: image.buffer,
    })
  }

  if (tagName === 'br') {
    if (paragraph) {
      sheet.push(normalizeParagraph(paragraph))
      paragraph = null
    }

    const previousTag = node.previousElementSibling?.tagName?.toLowerCase()

    if (['br', 'table', 'p'].includes(previousTag)) {
      sheet.push({ type: 'break', run: [] })
    }

    return paragraph
  }

  if (node.childNodes) {
    for (const child of node.childNodes) {
      paragraph = await paragraphNodeParse(
        sheet,
        child,
        paragraph,
        newParagraphAfterBr,
      )
      // Only the first child may be forced into a fresh paragraph.
      newParagraphAfterBr = false
    }
  }

  return paragraph
}
273
+
274
/**
 * Lifts paragraph-level style properties (`alignment`, `indent`, `heading`,
 * `spacing`) from the runs onto the paragraph itself. Run styles are combined
 * in order, so a later run overrides an earlier one. Only truthy values are
 * copied, and `paragraph.style` is created when missing.
 *
 * The paragraph is modified in place.
 *
 * @param {object|null} paragraph - Paragraph with a `run` array.
 * @returns {object|null} The same paragraph; returned untouched when it is
 *   falsy or has no runs.
 */
function reflectStyleToParagraph(paragraph) {
  const runs = paragraph?.run ?? []

  if (runs.length === 0) return paragraph

  const combined = Object.assign({}, ...runs.map(item => item.style ?? {}))

  if (!paragraph.style) {
    paragraph.style = {}
  }

  for (const property of ['alignment', 'indent', 'heading', 'spacing']) {
    const value = combined[property]

    if (value) {
      paragraph.style[property] = value
    }
  }

  return paragraph
}
304
+
305
/**
 * Applies the text-transform flags found on each run's style (`uppercase`,
 * `capitalize`, `lowercase`, `invertcase`, `uppercasesentence`) to the run's
 * content. The first matching flag, in that order, wins.
 *
 * Runs of type `table`, `row`, `cell` and `image` are passed through
 * unchanged. For every other run a copy is returned whose `content` is the
 * transformed text, or `null` when the run has no (or empty) content.
 *
 * `paragraph.run` is replaced in place.
 *
 * @param {object|null} paragraph - Paragraph with a `run` array.
 * @returns {object|null} The same paragraph; returned untouched when it is
 *   falsy or has no runs.
 */
function applyStyle(paragraph) {
  if (!paragraph || (paragraph.run ?? []).length === 0) return paragraph

  paragraph.run = paragraph.run.map(item => {
    if (['table', 'row', 'cell', 'image'].includes(item.type)) return item

    const style = item.style ?? {}
    let content = item?.content?.length > 0 ? item.content : null

    if (content) {
      if (style.uppercase) {
        content = content.toUpperCase()
      } else if (style.capitalize) {
        content = capitalize(content, true)
      } else if (style.lowercase) {
        content = content.toLowerCase()
      } else if (style.invertcase) {
        content = [...content]
          .map(char =>
            char === char.toUpperCase()
              ? char.toLowerCase()
              : char.toUpperCase(),
          )
          .join('')
      } else if (style.uppercasesentence) {
        // Upper-case the first *letter* of every sentence. A Unicode letter
        // match is used so accented initials ("élan") are handled; an ASCII
        // class would skip them and capitalise the next plain letter.
        content = content
          .toLowerCase()
          .split('.')
          .map(sentence =>
            sentence.replace(/\p{L}/u, letter => letter.toUpperCase()),
          )
          .join('.')
      }
    }

    return { ...item, content }
  })

  return paragraph
}
352
+
353
/**
 * Trims a run's text according to its position inside the paragraph.
 *
 * @param {string} content - Text to trim.
 * @param {boolean} isFirst - Run is the first of the paragraph: trim the start.
 * @param {boolean} isLast - Run is the last of the paragraph: trim the end.
 * @param {boolean} isSingle - Run is the only one: trim both ends. Takes
 *   precedence over `isFirst`, which takes precedence over `isLast`.
 * @returns {string} The trimmed text, or `content` as-is for a middle run.
 */
function trimContent(content, isFirst, isLast, isSingle) {
  let trimmed = content

  if (isSingle) {
    trimmed = content.trim()
  } else if (isFirst) {
    trimmed = content.trimStart()
  } else if (isLast) {
    trimmed = content.trimEnd()
  }

  return trimmed
}
359
+
360
/**
 * Removes insignificant whitespace from a paragraph's runs.
 *
 * - Leading and trailing text runs that contain only whitespace are dropped.
 * - In `list` paragraphs every whitespace-only run is dropped and the text of
 *   the remaining runs is trimmed on both sides.
 * - Finally the first run is trimmed at its start and the last run at its end
 *   (both ends when there is a single run).
 *
 * Runs without string content (for example images or table rows) are kept
 * untouched, including inside list paragraphs.
 *
 * @param {object|null} paragraph - Paragraph with a `run` array.
 * @returns {object|null} A shallow copy with the cleaned `run`; the input is
 *   returned as-is when it is falsy or has no runs.
 */
function trimParagraph(paragraph) {
  if (!paragraph || (paragraph.run ?? []).length === 0) return paragraph

  const isList = paragraph.type === 'list'
  const hasText = line => typeof line.content === 'string'
  const isBlank = line => hasText(line) && line.content.trim().length === 0

  const run = []
  let foundNonEmptyText = false

  for (const line of paragraph.run) {
    if (isList && isBlank(line)) {
      continue
    }

    if (!foundNonEmptyText && line.type === 'text' && isBlank(line)) {
      continue
    }

    foundNonEmptyText = true

    // Only string content can be trimmed; other runs pass through unchanged.
    if (isList && hasText(line)) {
      run.push({ ...line, content: line.content.trim() })
    } else {
      run.push(line)
    }
  }

  while (
    run.length > 0 &&
    run[run.length - 1].type === 'text' &&
    isBlank(run[run.length - 1])
  ) {
    run.pop()
  }

  return {
    ...paragraph,
    run: run.map((item, index) => {
      if (item.content === undefined) return item

      const isFirst = index === 0
      const isLast = index === run.length - 1
      const isSingle = run.length === 1

      return {
        ...item,
        content: trimContent(item.content, isFirst, isLast, isSingle),
      }
    }),
  }
}
410
+
411
/**
 * Runs the three paragraph clean-up passes in order: whitespace trimming,
 * lifting paragraph-level style from the runs, then applying text transforms.
 *
 * @param {object|null} paragraph - Paragraph to normalise.
 * @returns {object|null} The result of the last pass.
 */
function normalizeParagraph(paragraph) {
  const trimmed = trimParagraph(paragraph)
  const withParagraphStyle = reflectStyleToParagraph(trimmed)

  return applyStyle(withParagraphStyle)
}
414
+
415
/**
 * Determines the numbering format of a list element.
 *
 * The first `list-style-type` or `list-style` declaration in the node's
 * inline style wins and its value is returned verbatim. Without one, `<ol>`
 * maps to `'decimal'` and everything else to `'bullet'`.
 *
 * @param {object} node - List element.
 * @returns {string|undefined} The format; `undefined` when a matching
 *   declaration has no value.
 */
function getListFormat(node) {
  const declarations = node.attributes?.style?.split(';') ?? []

  for (const rawDeclaration of declarations) {
    const declaration = rawDeclaration.trim()

    if (!declaration.startsWith('list-style')) continue

    const [property, format] = declaration.split(':').map(part => part.trim())

    if (property === 'list-style-type' || property === 'list-style') {
      return format
    }
  }

  return node?.tagName?.toLowerCase() === 'ol' ? 'decimal' : 'bullet'
}
435
+
436
/**
 * Counts how many `<ul>` / `<ol>` ancestors a node has, i.e. its nesting
 * depth inside lists. The node itself is not counted.
 *
 * @param {object} [node] - Node whose ancestors are inspected.
 * @param {number|null} [level=0] - Starting depth; `null` is treated as 0.
 * @returns {number} `level` plus the number of list ancestors.
 */
function getListLevel(node, level = 0) {
  let depth = level ?? 0
  let current = node

  // Climb until a node without a `parentNode` property is reached.
  while (typeof current?.parentNode !== 'undefined') {
    const parent = current.parentNode

    if (['ul', 'ol'].includes(parent?.tagName?.toLowerCase())) {
      depth = depth + 1
    }

    current = parent
  }

  return depth
}
449
+
450
/**
 * Flattens a parsed sheet: every `list` entry is expanded into one `list`
 * entry per item, all other entries are kept as they are.
 *
 * Each expanded item carries the list's style merged with the item's own
 * style, plus `bullet.start` set to the item's number. Numbering begins at
 * the list's `bullet.start` when that is a positive number and at 1
 * otherwise, and increases by one per item.
 *
 * NOTE(review): how `bullet.start` is consumed is not visible here. Explicit
 * starts are numbered consecutively to match the default case; confirm
 * against the document builder.
 *
 * @param {Array<object>} sheet - Entries produced by the tree parser.
 * @returns {Array<object>} New array with lists expanded.
 */
function normalizeSheet(sheet) {
  const items = []

  for (const item of sheet) {
    if (item.type !== 'list') {
      items.push(item)
      continue
    }

    const bullet = item.bullet ?? {}
    const style = {
      ...(item.style ?? {}),
      bullet,
    }
    const firstNumber = bullet.start && bullet.start > 0 ? bullet.start : 1

    item.run.forEach((listItem, position) => {
      items.push({
        type: 'list',
        // Clone first: `merge` mutates its target and `style` is shared.
        style: lodash.merge(lodash.cloneDeep(style), listItem.style ?? {}, {
          bullet: { start: firstNumber + position },
        }),
        run: [
          {
            type: 'text',
            content: listItem.content,
            style: {},
          },
        ],
      })
    })
  }

  return items
}
496
+
497
/**
 * Converts the direct children of a node into a flat "sheet": an ordered
 * list of paragraph, list, table and break entries.
 *
 * Inline content (text, `span`, `strong`, `a`, `img`, ...) accumulates in the
 * paragraph in progress. `p`, headings, `ul` and `ol` first close that
 * paragraph; `ul` / `ol` then open a `list` paragraph. `br` closes the
 * paragraph and adds a `break` entry when it follows a `br`, `table` or `p`.
 * Tables are emitted as their own entry, with each cell parsed recursively.
 * Other tags are ignored.
 *
 * NOTE(review): only rows matching `tbody > tr` and `td` cells are read, so
 * this presumably relies on earlier normalisation adding `<tbody>`; `th`
 * cells are not collected. Confirm against `normalizeHtml`.
 *
 * @param {object} mainNode - Node whose `childNodes` are converted.
 * @returns {Promise<Array<object>>} The sheet, without entries whose `run` is
 *   empty (breaks are always kept).
 */
async function parseTreeNode(mainNode) {
  const sheet = []

  let paragraph = null

  // Closes the paragraph in progress, if there is one.
  const flushParagraph = () => {
    if (paragraph !== null) {
      sheet.push(normalizeParagraph(paragraph))
      paragraph = null
    }
  }

  for (const node of mainNode.childNodes) {
    if (node instanceof TextNode) {
      paragraph = await paragraphNodeParse(sheet, node, paragraph)
      continue
    }

    const tagName = node.tagName?.toLowerCase()

    switch (tagName) {
      case 'span':
      case 'strong':
      case 'a':
      case 'i':
      case 's':
      case 'u':
      case 'b':
      case 'em':
      case 'h1':
      case 'h2':
      case 'h3':
      case 'h4':
      case 'h5':
      case 'h6':
      case 'p':
      case 'ul':
      case 'ol':
      case 'li': {
        // Block-level tags start on a fresh paragraph.
        if (
          ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol'].includes(
            tagName,
          )
        ) {
          flushParagraph()
        }

        if (['ul', 'ol'].includes(tagName) && paragraph === null) {
          const startAttribute = node.attributes?.start

          paragraph = {
            type: 'list',
            bullet: {
              level: getListLevel(node),
              format: getListFormat(node),
              // Explicit radix: the attribute is a decimal number.
              start: startAttribute && Number.parseInt(startAttribute, 10),
            },
            run: [],
          }
        }

        for (const child of node.childNodes) {
          paragraph = await paragraphNodeParse(sheet, child, paragraph)
        }
        break
      }

      case 'br':
        flushParagraph()

        if (
          ['br', 'table', 'p'].includes(
            node.previousElementSibling?.tagName?.toLowerCase(),
          )
        ) {
          sheet.push({ type: 'break', run: [] })
        }
        break

      case 'img':
        paragraph = await paragraphNodeParse(sheet, node, paragraph)
        break

      case 'table': {
        flushParagraph()

        const rows = await Promise.all(
          Array.from(node.querySelectorAll('tbody > tr')).map(
            async trNode => {
              const cells = await Promise.all(
                Array.from(trNode.querySelectorAll('td')).map(
                  async tdNode => ({
                    type: 'cell',
                    style: getCurrentNodeStyle(tdNode),
                    run: await parseTreeNode(tdNode),
                  }),
                ),
              )

              return {
                type: 'row',
                style: getCurrentNodeStyle(trNode),
                run: cells,
              }
            },
          ),
        )

        // Emit the table right away. Leaving it as the paragraph in progress
        // would make following inline content land inside the table's rows.
        sheet.push(
          normalizeParagraph({
            type: 'table',
            style: getCurrentNodeStyle(node),
            run: rows,
          }),
        )
        break
      }

      default:
        break
    }
  }

  flushParagraph()

  // Entries that ended up without any run carry nothing to render.
  return sheet.filter(item => item.type === 'break' || item?.run?.length !== 0)
}
630
+
631
/**
 * Parses an HTML string into sheets, one per page.
 *
 * The HTML is normalised and parsed, its `<body>` is split into pages by
 * `pageNodes`, and each page is converted and normalised in turn.
 *
 * @param {string} content - HTML source.
 * @returns {Promise<object>} Object mapping each page key (as enumerated from
 *   the `pageNodes` result) to its normalised sheet.
 */
export async function nodeTree(content) {
  const root = htmlParser(normalizeHtml(content))
  const pages = pageNodes(root.querySelector('body'))

  const sheets = {}

  // Pages are processed one after another, in enumeration order.
  for (const index in pages) {
    const parsedPage = await parseTreeNode(pages[index])
    sheets[index] = normalizeSheet(parsedPage)
  }

  return sheets
}