@lkaopremier/html-to-docx 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/tree.js ADDED
@@ -0,0 +1,643 @@
1
+ import { parse as htmlParser, TextNode } from 'node-html-parser'
2
+ import {
3
+ pageNodes,
4
+ normalizeHtml,
5
+ trim,
6
+ normalizeMeasure,
7
+ capitalize,
8
+ colorToHex,
9
+ imageBase64ToBuffer,
10
+ splitMeasure,
11
+ convertCssToDocxMeasurement,
12
+ parseCssMargin,
13
+ } from './helper.js'
14
+ import lodash from 'lodash'
15
+ import { AlignmentType, HeadingLevel } from 'docx'
16
+
17
/**
 * Computes the effective style of a node: styles implied by its tag name,
 * overlaid with its own inline CSS, overlaid on everything inherited from
 * its ancestors (values set closer to the node win).
 *
 * @param {object|null|undefined} node - Parsed DOM node; may be nullish,
 *   e.g. when a caller passes `someNode?.parentNode`.
 * @returns {object} Merged style object; empty for a nullish node.
 */
function getStyle(node) {
  // A nullish node has no tag, no inline style and no ancestors. Returning
  // early avoids dereferencing `node.parentNode` on null/undefined below.
  if (!node) return {}

  const style = {}

  switch (node.tagName?.toLowerCase()) {
    case 'strong':
    case 'b':
      style.bold = true
      break

    case 'em':
    case 'i':
      style.italics = true
      break

    case 'u':
      style.underline = {}
      break

    case 's':
      style.strike = {}
      break

    case 'h1':
      style.heading = HeadingLevel.HEADING_1
      break

    case 'h2':
      style.heading = HeadingLevel.HEADING_2
      break

    case 'h3':
      style.heading = HeadingLevel.HEADING_3
      break

    case 'h4':
      style.heading = HeadingLevel.HEADING_4
      break

    case 'h5':
      style.heading = HeadingLevel.HEADING_5
      break

    case 'h6':
      style.heading = HeadingLevel.HEADING_6
      break
  }

  // Inline CSS on the node itself overrides what the tag name implies.
  lodash.merge(style, getCurrentNodeStyle(node))

  if (node.parentNode) {
    // Start from the inherited style so this node's values take precedence.
    return lodash.merge(getStyle(node.parentNode), style)
  }

  return style
}
72
+
73
/**
 * Translates the inline `style` attribute of one node into a style object.
 * Only the node itself is inspected; ancestors are not consulted.
 *
 * Handled properties: font-weight (bold), font-style (italic),
 * text-decoration (underline), text-transform, font-family, font-size,
 * color, margin / margin-top / margin-bottom, text-align, text-indent,
 * width and height. Other properties, and declarations without a value,
 * are skipped.
 *
 * @param {object|null|undefined} node - Parsed element exposing
 *   `attributes.style` and `tagName`.
 * @returns {object} Style object; empty when the node has no inline style.
 */
function getCurrentNodeStyle(node) {
  const style = {}
  const inlineStyle = node?.attributes?.style

  if (!inlineStyle) return style

  const tagName = node?.tagName?.toLowerCase()

  for (const declaration of inlineStyle.split(';')) {
    const [key, value] = declaration.split(':').map(part => part.trim())

    // Skip empty segments (e.g. after a trailing `;`) and declarations that
    // carry no value, so helpers are never called with undefined/''.
    if (!key || !value) continue

    switch (key) {
      case 'font-weight':
        if (value === 'bold') {
          style.bold = true
        }
        break

      case 'font-style':
        if (value === 'italic') {
          style.italics = true
        }
        break

      case 'text-decoration':
        if (value === 'underline') {
          style.underline = {}
        }
        break

      case 'text-transform':
        // The matching flag is later consumed when the text case is applied.
        if (
          [
            'uppercase',
            'capitalize',
            'lowercase',
            'invertcase',
            'uppercasesentence',
          ].includes(value)
        ) {
          style[value] = true
        }
        break

      case 'font-family':
        // Strip surrounding single and double quotes from the family name.
        style.font = trim(trim(value, "'"), '"')
        break

      case 'font-size':
        style.size = normalizeMeasure(value)
        break

      case 'color':
        style.color = colorToHex(value)
        break

      case 'margin':
      case 'margin-top':
      case 'margin-bottom': {
        const { top, bottom } = parseCssMargin(value)

        // Merge into any spacing collected so far: `margin-top` followed by
        // `margin-bottom` must keep both sides rather than the last one only.
        const spacing = { ...(style.spacing ?? {}) }
        if (key !== 'margin-bottom') spacing.before = top
        if (key !== 'margin-top') spacing.after = bottom
        style.spacing = spacing
        break
      }

      case 'text-align':
        switch (value) {
          case 'left':
            style.alignment = AlignmentType.LEFT
            break

          case 'right':
            style.alignment = AlignmentType.RIGHT
            break

          case 'center':
            style.alignment = AlignmentType.CENTER
            break

          case 'justify':
            style.alignment = AlignmentType.JUSTIFIED
            break
        }
        break

      case 'text-indent':
        style.indent = { left: normalizeMeasure(value) }
        break

      case 'width':
        // Table-related elements use a different measurement conversion.
        if (['table', 'tr', 'th', 'td'].includes(tagName)) {
          style.width = convertCssToDocxMeasurement(value)
        } else {
          style.width = normalizeMeasure(value)
        }
        break

      case 'height':
        style.height = normalizeMeasure(value)
        break
    }
  }

  return style
}
189
+
190
/**
 * Converts one DOM node (and, for containers, its descendants) into runs of
 * the paragraph currently being built.
 *
 * - Text nodes become `text` runs styled from their parent element.
 * - `<img>` elements whose `src` can be decoded become `image` runs.
 * - `<br>` closes the open paragraph by pushing it onto `sheet`, and adds an
 *   explicit break entry when the previous element sibling is a `br`,
 *   `table` or `p`.
 * - Any other node with children is walked recursively.
 *
 * @param {Array} sheet - Output list that finished paragraphs are pushed to.
 * @param {object} node - DOM node to convert.
 * @param {object|null} [paragraph=null] - Paragraph under construction.
 * @param {boolean} [newParagraphAfterBr=false] - When true, the next run
 *   opens a new paragraph object instead of extending `paragraph`.
 * @returns {Promise<object|null>} The paragraph still open after this node.
 */
async function paragraphNodeParse(
  sheet,
  node,
  paragraph = null,
  newParagraphAfterBr = false,
) {
  // Extends the open paragraph with a run, or opens a new paragraph when
  // there is none or a fresh one was requested.
  const withRun = run => {
    if (paragraph && !newParagraphAfterBr) {
      paragraph.run.push(run)
      return paragraph
    }
    return { run: [run] }
  }

  if (node instanceof TextNode) {
    return withRun({
      type: 'text',
      style: getStyle(node?.parentNode) ?? {},
      content: node.rawText,
    })
  }

  const tag = node.tagName?.toLowerCase()

  if (tag === 'img') {
    const source = node.attributes.src
    const image = source && (await imageBase64ToBuffer(source))

    // Images without a usable source contribute nothing.
    if (!image) return paragraph

    const ownStyle = getCurrentNodeStyle(node)
    const hasWidth = Boolean(ownStyle.width)
    const hasHeight = Boolean(ownStyle.height)

    // When only one dimension is given, derive the other from image.ratio.
    if (hasWidth && !hasHeight) {
      const [width] = splitMeasure(ownStyle.width)
      ownStyle.width = width
      ownStyle.height = width / image.ratio
    } else if (!hasWidth && hasHeight) {
      const [height] = splitMeasure(ownStyle.height)
      ownStyle.height = height
      ownStyle.width = height * image.ratio
    }

    return withRun({
      type: 'image',
      style: {
        type: image.extension,
        ...getStyle(node),
        transformation: {
          // Decoded dimensions are the defaults; inline CSS overrides them.
          width: image.width,
          height: image.height,
          ...ownStyle,
        },
      },
      data: image.buffer,
    })
  }

  if (tag === 'br') {
    if (paragraph) {
      sheet.push(normalizeParagraph(paragraph))
      paragraph = null
    }

    const previousTag = node.previousElementSibling?.tagName?.toLowerCase()
    if (['br', 'table', 'p'].includes(previousTag)) {
      sheet.push({ type: 'break', run: [] })
    }

    return paragraph
  }

  if (node.childNodes) {
    // Only the first child may honour the "new paragraph" request.
    let startFresh = newParagraphAfterBr
    for (const child of node.childNodes) {
      paragraph = await paragraphNodeParse(sheet, child, paragraph, startFresh)
      startFresh = false
    }
  }

  return paragraph
}
273
+
274
/**
 * Copies paragraph-level properties (alignment, indent, heading, spacing)
 * found on the paragraph's runs up onto `paragraph.style`.
 *
 * Run styles are combined in order, so a later run's value wins. The
 * paragraph is modified in place and returned; a missing paragraph or one
 * without runs is returned untouched.
 *
 * @param {object|null|undefined} paragraph - Paragraph with a `run` array.
 * @returns {object|null|undefined} The same paragraph reference.
 */
function reflectStyleToParagraph(paragraph) {
  const runs = paragraph?.run ?? []
  if (!paragraph || runs.length === 0) return paragraph

  const combined = runs.reduce(
    (acc, entry) => ({ ...acc, ...(entry.style ?? {}) }),
    {},
  )

  if (!paragraph.style) {
    paragraph.style = {}
  }

  // Only truthy values are promoted; absent ones leave the style untouched.
  for (const property of ['alignment', 'indent', 'heading', 'spacing']) {
    const value = combined[property]
    if (value) {
      paragraph.style[property] = value
    }
  }

  return paragraph
}
304
+
305
/**
 * Applies the text-case flags stored in each run's style (uppercase,
 * capitalize, lowercase, invertcase, uppercasesentence) to the run content.
 *
 * Runs of type table, row, cell or image are passed through unchanged. For
 * every other run, empty or missing content is normalised to `null`.
 * `paragraph.run` is replaced in place and the paragraph is returned.
 *
 * @param {object|null|undefined} paragraph - Paragraph with a `run` array.
 * @returns {object|null|undefined} The same paragraph reference.
 */
function applyStyle(paragraph) {
  if (!paragraph || (paragraph.run ?? []).length === 0) return paragraph

  paragraph.run = paragraph.run.map(item => {
    if (['table', 'row', 'cell', 'image'].includes(item.type)) return item

    const style = item.style ?? {}
    let content = item?.content?.length > 0 ? item.content : null

    if (content) {
      if (style.uppercase) {
        content = content.toUpperCase()
      } else if (style.capitalize) {
        // Delegated to the project helper; exact rules are defined there.
        content = capitalize(content, true)
      } else if (style.lowercase) {
        content = content.toLowerCase()
      } else if (style.invertcase) {
        content = [...content]
          .map(char =>
            char === char.toUpperCase()
              ? char.toLowerCase()
              : char.toUpperCase(),
          )
          .join('')
      } else if (style.uppercasesentence) {
        // Lower-case everything, then capitalise the first letter of each
        // '.'-separated sentence. `\p{L}` matches letters of any script, so
        // a sentence starting with e.g. "é" capitalises that letter instead
        // of the first ASCII letter that follows it.
        content = content
          .toLowerCase()
          .split('.')
          .map(sentence =>
            sentence.replace(/\p{L}/u, letter => letter.toUpperCase()),
          )
          .join('.')
      }
    }

    return { ...item, content }
  })

  return paragraph
}
352
+
353
/**
 * Trims a run's text according to its position in the paragraph: both ends
 * when it is the only run, the leading side when it is the first run, the
 * trailing side when it is the last run, and not at all otherwise.
 *
 * @param {string} content - Text of the run.
 * @param {boolean} isFirst - Whether the run is first in its paragraph.
 * @param {boolean} isLast - Whether the run is last in its paragraph.
 * @param {boolean} isSingle - Whether the run is the paragraph's only run.
 * @returns {string} The trimmed text.
 */
function trimContent(content, isFirst, isLast, isSingle) {
  // Precedence matters: single beats first, first beats last.
  return isSingle
    ? content.trim()
    : isFirst
      ? content.trimStart()
      : isLast
        ? content.trimEnd()
        : content
}
359
+
360
/**
 * Removes insignificant whitespace from a paragraph's runs.
 *
 * - Whitespace-only text runs at the start and end are dropped.
 * - In list paragraphs, every whitespace-only run is dropped and each
 *   remaining text is trimmed on both sides.
 * - The surviving runs are then edge-trimmed through `trimContent`.
 *
 * Runs without string content (for example image runs, which carry `data`
 * instead of `content`) are kept as they are; they are never trimmed.
 *
 * @param {object|null|undefined} paragraph - Paragraph with a `run` array.
 * @returns {object|null|undefined} A shallow copy with the cleaned runs, or
 *   the input itself when it is missing or has no runs.
 */
function trimParagraph(paragraph) {
  if (!paragraph || (paragraph.run ?? []).length === 0) return paragraph

  const isList = paragraph.type === 'list'
  const hasText = line => typeof line.content === 'string'
  const isBlank = line => hasText(line) && line.content.trim().length === 0

  const run = []
  let foundNonEmptyText = false

  for (const line of paragraph.run) {
    // List items made only of whitespace are never kept.
    if (isList && isBlank(line)) continue

    // Leading whitespace-only text is dropped until real content appears.
    if (!foundNonEmptyText && line.type === 'text' && isBlank(line)) continue

    foundNonEmptyText = true

    // Guard on hasText: calling .trim() on a run without string content
    // (e.g. an image inside a list) would throw.
    run.push(
      isList && hasText(line) ? { ...line, content: line.content.trim() } : line,
    )
  }

  // Drop trailing whitespace-only text runs.
  while (
    run.length > 0 &&
    run[run.length - 1].type === 'text' &&
    isBlank(run[run.length - 1])
  ) {
    run.pop()
  }

  return {
    ...paragraph,
    run: run.map((item, index) => {
      if (!hasText(item)) return item

      const isFirst = index === 0
      const isLast = index === run.length - 1
      const isSingle = run.length === 1

      return {
        ...item,
        content: trimContent(item.content, isFirst, isLast, isSingle),
      }
    }),
  }
}
410
+
411
/**
 * Runs the paragraph clean-up pipeline: whitespace trimming, then promotion
 * of run styles to the paragraph, then text-case application.
 *
 * @param {object|null|undefined} paragraph - Paragraph to normalise.
 * @returns {object|null|undefined} The normalised paragraph.
 */
function normalizeParagraph(paragraph) {
  const trimmed = trimParagraph(paragraph)
  const withParagraphStyle = reflectStyleToParagraph(trimmed)

  return applyStyle(withParagraphStyle)
}
414
+
415
/**
 * Determines the marker format of a list element.
 *
 * The first inline `list-style-type` or `list-style` declaration that has a
 * value is returned verbatim. Without one, `ol` elements default to
 * `'decimal'` and everything else to `'bullet'`.
 *
 * @param {object|null|undefined} node - List element exposing
 *   `attributes.style` and `tagName`.
 * @returns {string} The declared list style value or the tag default.
 */
function getListFormat(node) {
  const declarations = node?.attributes?.style?.split(';') ?? []

  for (const declaration of declarations) {
    const [key, value] = declaration.split(':').map(part => part.trim())

    // Require a value: `list-style-type:` or a bare `list-style` must fall
    // through to the tag default rather than return ''/undefined.
    if ((key === 'list-style-type' || key === 'list-style') && value) {
      return value
    }
  }

  return node?.tagName?.toLowerCase() === 'ol' ? 'decimal' : 'bullet'
}
435
+
436
/**
 * Counts how many `ul`/`ol` ancestors a node has, added to an optional
 * starting level. A list that is not nested in another list is level 0.
 *
 * @param {object|null|undefined} node - Node whose ancestors are inspected.
 * @param {number|null} [level=0] - Level to start counting from; a nullish
 *   value is treated as 0.
 * @returns {number} The starting level plus the number of list ancestors.
 */
function getListLevel(node, level = 0) {
  let depth = level ?? 0
  let current = node

  // Climb until a node has no `parentNode` property at all. A `null`
  // parent still counts as one more step, after which the walk ends.
  while (typeof current?.parentNode !== 'undefined') {
    const parentTag = current?.parentNode?.tagName?.toLowerCase()
    if (['ul', 'ol'].includes(parentTag)) {
      depth += 1
    }
    current = current.parentNode
  }

  return depth
}
449
+
450
/**
 * Flattens a parsed sheet: every `list` entry is expanded into one `list`
 * entry per run, each with a single text run and its own number in
 * `style.bullet.start`. All other entries are passed through unchanged.
 *
 * Numbering begins at the list's `bullet.start` when that is a positive
 * number, otherwise at 1, and increases by one per item.
 *
 * @param {Array<object>} sheet - Entries produced for one page.
 * @returns {Array<object>} New array with list entries expanded.
 */
function normalizeSheet(sheet) {
  const items = []

  for (const item of sheet) {
    if (item.type !== 'list') {
      items.push(item)
      continue
    }

    const bullet = item.bullet ?? {}
    const style = { ...(item.style ?? {}), bullet }

    // A missing, non-numeric or non-positive start falls back to 1.
    const requestedStart = Number(bullet.start)
    const firstNumber = requestedStart > 0 ? requestedStart : 1

    item.run.forEach((listItem, offset) => {
      items.push({
        type: 'list',
        // Clone first so items never share (or mutate) the list's style.
        style: lodash.merge(lodash.cloneDeep(style), listItem.style ?? {}, {
          // Count up from the first number; previously a list with an
          // explicit start gave every item that same number.
          bullet: { start: firstNumber + offset },
        }),
        run: [
          {
            type: 'text',
            content: listItem.content,
            style: {},
          },
        ],
      })
    })
  }

  return items
}
496
+
497
/**
 * Converts the direct children of a container node into an ordered list of
 * sheet entries (paragraphs, lists, tables and explicit breaks).
 *
 * Inline content is accumulated into an open paragraph. Block-level tags
 * (`p`, `h1`-`h6`, `ul`, `ol`), `br` and `table` close the open paragraph
 * first. Tables are emitted as soon as they are built, so content that
 * follows a table starts a new paragraph instead of being appended to the
 * table's rows. Tags not listed in the switch are ignored.
 *
 * @param {object} mainNode - Node whose `childNodes` are converted.
 * @returns {Promise<Array<object>>} Entries in document order; entries with
 *   an empty `run` are removed unless they are breaks.
 */
async function parseTreeNode(mainNode) {
  const sheet = []
  let paragraph = null

  // Pushes the paragraph under construction (if any) onto the sheet.
  const flushParagraph = () => {
    if (paragraph !== null) {
      sheet.push(normalizeParagraph(paragraph))
      paragraph = null
    }
  }

  for (const node of mainNode.childNodes) {
    if (node instanceof TextNode) {
      paragraph = await paragraphNodeParse(sheet, node, paragraph)
      continue
    }

    const tag = node.tagName?.toLowerCase()

    switch (tag) {
      case 'span':
      case 'strong':
      case 'a':
      case 'i':
      case 's':
      case 'u':
      case 'b':
      case 'em':
      case 'h1':
      case 'h2':
      case 'h3':
      case 'h4':
      case 'h5':
      case 'h6':
      case 'p':
      case 'ul':
      case 'ol':
      case 'li': {
        // Block-level tags always start a new paragraph.
        if (
          ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol'].includes(tag)
        ) {
          flushParagraph()
        }

        if (['ul', 'ol'].includes(tag) && paragraph === null) {
          const startAttribute = node.attributes?.start

          paragraph = {
            type: 'list',
            bullet: {
              level: getListLevel(node),
              format: getListFormat(node),
              // Explicit radix so values such as "010" or "0x10" are read
              // as decimal.
              start: startAttribute && Number.parseInt(startAttribute, 10),
            },
            run: [],
          }
        }

        for (const child of node.childNodes) {
          paragraph = await paragraphNodeParse(sheet, child, paragraph)
        }
        break
      }

      case 'br':
        flushParagraph()

        if (
          ['br', 'table', 'p'].includes(
            node.previousElementSibling?.tagName?.toLowerCase(),
          )
        ) {
          sheet.push({ type: 'break', run: [] })
        }
        break

      case 'img':
        paragraph = await paragraphNodeParse(sheet, node, paragraph)
        break

      case 'table': {
        flushParagraph()

        const table = {
          type: 'table',
          style: getCurrentNodeStyle(node),
          run: await Promise.all(
            Array.from(node.querySelectorAll('tbody > tr')).map(
              async trNode => {
                const cells = await Promise.all(
                  Array.from(trNode.querySelectorAll('td')).map(
                    async tdNode => {
                      return {
                        type: 'cell',
                        style: getCurrentNodeStyle(tdNode),
                        // Each cell is parsed as its own nested sheet.
                        run: await parseTreeNode(tdNode),
                      }
                    },
                  ),
                )

                return {
                  type: 'row',
                  style: getCurrentNodeStyle(trNode),
                  run: cells,
                }
              },
            ),
          ),
        }

        // Emit the table right away. Keeping it as the open paragraph would
        // let following text or images be pushed into its list of rows.
        sheet.push(normalizeParagraph(table))
        break
      }

      default:
        break
    }
  }

  flushParagraph()

  // Drop entries that ended up without any run; breaks are intentionally
  // empty and are kept.
  return sheet.filter(
    item => item.type === 'break' || item?.run?.length !== 0,
  )
}
630
+
631
/**
 * Parses an HTML string into sheets, one per page.
 *
 * The HTML is normalised and parsed, its `body` is split into pages by
 * `pageNodes`, and each page is converted with `parseTreeNode` and then
 * flattened with `normalizeSheet`. Pages are processed one after another.
 *
 * @param {string} content - HTML source.
 * @returns {Promise<object>} Object mapping each page key to its sheet.
 */
export async function nodeTree(content) {
  const body = htmlParser(normalizeHtml(content)).querySelector('body')
  const pages = pageNodes(body)

  const sheets = {}

  for (const index in pages) {
    const parsedPage = await parseTreeNode(pages[index])
    sheets[index] = normalizeSheet(parsedPage)
  }

  return sheets
}