@polotno/pdf-export 0.1.27 → 0.1.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/text.d.ts +5 -1
- package/lib/text.js +43 -30
- package/package.json +1 -1
package/lib/text.d.ts
CHANGED
|
@@ -36,10 +36,14 @@ export interface TextSegment {
|
|
|
36
36
|
* while preserving inline formatting tags.
|
|
37
37
|
*/
|
|
38
38
|
declare function normalizeRichText(text: string): string;
|
|
39
|
+
/**
|
|
40
|
+
* Parse HTML text into styled segments
|
|
41
|
+
*/
|
|
42
|
+
declare function parseHTMLToSegments(html: string, baseElement: TextElement): TextSegment[];
|
|
39
43
|
export declare function getGoogleFontPath(fontFamily: string, fontWeight?: string, italic?: boolean): Promise<string>;
|
|
40
44
|
export declare function loadFontIfNeeded(doc: any, element: TextElement, fonts: Record<string, boolean>): Promise<string>;
|
|
41
45
|
/**
|
|
42
46
|
* Main text rendering function
|
|
43
47
|
*/
|
|
44
48
|
export declare function renderText(doc: PDFKit.PDFDocument, element: TextElement, fonts: Record<string, boolean>, attrs?: RenderAttrs): Promise<void>;
|
|
45
|
-
export { normalizeRichText as __normalizeRichTextForTests };
|
|
49
|
+
export { normalizeRichText as __normalizeRichTextForTests, parseHTMLToSegments as __parseHTMLToSegmentsForTests, };
|
package/lib/text.js
CHANGED
|
@@ -2,6 +2,14 @@ import { parseColor, srcToBuffer } from './utils.js';
|
|
|
2
2
|
import getUrls from 'get-urls';
|
|
3
3
|
import fetch from 'node-fetch';
|
|
4
4
|
import { stripHtml } from 'string-strip-html';
|
|
5
|
+
import { decode as decodeEntities } from 'html-entities';
|
|
6
|
+
function decodeHtmlEntities(text) {
|
|
7
|
+
if (!text) {
|
|
8
|
+
return text;
|
|
9
|
+
}
|
|
10
|
+
const decoded = decodeEntities(text);
|
|
11
|
+
return decoded.replace(/\t/g, ' ');
|
|
12
|
+
}
|
|
5
13
|
/**
|
|
6
14
|
* Check if text contains HTML tags
|
|
7
15
|
*/
|
|
@@ -18,6 +26,8 @@ function normalizeRichText(text) {
|
|
|
18
26
|
return text;
|
|
19
27
|
}
|
|
20
28
|
let normalized = text.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
|
|
29
|
+
// Normalize tab characters into 8 spaces
|
|
30
|
+
normalized = normalized.replace(/\t/g, ' '.repeat(8));
|
|
21
31
|
// Convert explicit HTML break tags into newline characters
|
|
22
32
|
normalized = normalized.replace(/<br\s*\/?>/gi, '\n');
|
|
23
33
|
// Treat paragraph boundaries as newlines and drop opening tags
|
|
@@ -29,6 +39,8 @@ function normalizeRichText(text) {
|
|
|
29
39
|
normalized = normalized.replace(/^\n+/, '').replace(/\n+$/, '');
|
|
30
40
|
// Decode common HTML non-breaking space entities into their unicode counterpart
|
|
31
41
|
normalized = normalized.replace(/&(nbsp|#160|#xA0);/gi, '\u00A0');
|
|
42
|
+
// Strip zero-width characters that can create missing-glyph boxes in PDF output
|
|
43
|
+
normalized = normalized.replace(/[\u200B\u200C\u200D\uFEFF\u2060]/g, '');
|
|
32
44
|
return normalized;
|
|
33
45
|
}
|
|
34
46
|
/**
|
|
@@ -43,7 +55,7 @@ function parseHTMLToSegments(html, baseElement) {
|
|
|
43
55
|
while ((match = regex.exec(html)) !== null) {
|
|
44
56
|
if (match[4]) {
|
|
45
57
|
// Text content
|
|
46
|
-
const text = match[4];
|
|
58
|
+
const text = decodeHtmlEntities(match[4]);
|
|
47
59
|
// Calculate current styles from tag stack
|
|
48
60
|
let bold = false;
|
|
49
61
|
let italic = false;
|
|
@@ -177,9 +189,11 @@ function tokenizeHTML(html) {
|
|
|
177
189
|
while ((match = regex.exec(html)) !== null) {
|
|
178
190
|
if (match[4]) {
|
|
179
191
|
// Text content
|
|
192
|
+
const decodedContent = decodeHtmlEntities(match[4]);
|
|
180
193
|
tokens.push({
|
|
181
194
|
type: 'text',
|
|
182
195
|
content: match[4],
|
|
196
|
+
decodedContent,
|
|
183
197
|
});
|
|
184
198
|
}
|
|
185
199
|
else {
|
|
@@ -539,7 +553,7 @@ function splitTextIntoLines(doc, element, props) {
|
|
|
539
553
|
// Extract plain text for width calculation
|
|
540
554
|
const plainText = tokens
|
|
541
555
|
.filter((t) => t.type === 'text')
|
|
542
|
-
.map((t) => t.content)
|
|
556
|
+
.map((t) => t.decodedContent ?? decodeHtmlEntities(t.content))
|
|
543
557
|
.join('');
|
|
544
558
|
const baseMeta = paragraph.listMeta
|
|
545
559
|
? createListLineMeta(doc, element, props, paragraph.listMeta)
|
|
@@ -563,7 +577,7 @@ function splitTextIntoLines(doc, element, props) {
|
|
|
563
577
|
}
|
|
564
578
|
else {
|
|
565
579
|
// Need to split paragraph into multiple lines
|
|
566
|
-
let
|
|
580
|
+
let currentLineDecoded = '';
|
|
567
581
|
let currentWidth = 0;
|
|
568
582
|
let currentTokens = [];
|
|
569
583
|
let openTags = [];
|
|
@@ -573,34 +587,32 @@ function splitTextIntoLines(doc, element, props) {
|
|
|
573
587
|
continue;
|
|
574
588
|
}
|
|
575
589
|
// Text token - split by words
|
|
576
|
-
const
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
const
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
const
|
|
590
|
+
const rawWords = token.content.split(' ');
|
|
591
|
+
const decodedWords = (token.decodedContent ?? decodeHtmlEntities(token.content)).split(' ');
|
|
592
|
+
for (let i = 0; i < rawWords.length; i++) {
|
|
593
|
+
const rawWord = rawWords[i];
|
|
594
|
+
const decodedWord = decodedWords[i] ?? decodeHtmlEntities(rawWord);
|
|
595
|
+
const separator = i > 0 ? ' ' : '';
|
|
596
|
+
const hasCurrentLine = currentLineDecoded.length > 0;
|
|
597
|
+
const testLineDecoded = hasCurrentLine
|
|
598
|
+
? `${currentLineDecoded}${separator}${decodedWord}`
|
|
599
|
+
: decodedWord;
|
|
600
|
+
const testWidth = doc.widthOfString(testLineDecoded, props);
|
|
583
601
|
if (testWidth <= availableWidth) {
|
|
584
|
-
|
|
602
|
+
currentLineDecoded = testLineDecoded;
|
|
585
603
|
currentWidth = testWidth;
|
|
586
604
|
// Add text token (with space if not first word in token)
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
}
|
|
594
|
-
else {
|
|
595
|
-
currentTokens.push({
|
|
596
|
-
type: 'text',
|
|
597
|
-
content: word,
|
|
598
|
-
});
|
|
599
|
-
}
|
|
605
|
+
const rawContent = separator.length > 0 ? `${separator}${rawWord}` : rawWord;
|
|
606
|
+
const decodedContent = separator.length > 0 ? `${separator}${decodedWord}` : decodedWord;
|
|
607
|
+
currentTokens.push({
|
|
608
|
+
type: 'text',
|
|
609
|
+
content: rawContent,
|
|
610
|
+
decodedContent,
|
|
611
|
+
});
|
|
600
612
|
}
|
|
601
613
|
else {
|
|
602
614
|
// Line is too long, save current line and start new one
|
|
603
|
-
if (
|
|
615
|
+
if (currentLineDecoded.length > 0) {
|
|
604
616
|
const result = tokensToHTML(currentTokens, openTags);
|
|
605
617
|
const listMeta = cloneListMetaForLine(baseMeta, showMarkerForLine);
|
|
606
618
|
lines.push({
|
|
@@ -613,17 +625,18 @@ function splitTextIntoLines(doc, element, props) {
|
|
|
613
625
|
currentTokens = [];
|
|
614
626
|
showMarkerForLine = false;
|
|
615
627
|
}
|
|
616
|
-
|
|
617
|
-
currentWidth = doc.widthOfString(
|
|
628
|
+
currentLineDecoded = decodedWord;
|
|
629
|
+
currentWidth = doc.widthOfString(decodedWord, props);
|
|
618
630
|
currentTokens.push({
|
|
619
631
|
type: 'text',
|
|
620
|
-
content:
|
|
632
|
+
content: rawWord,
|
|
633
|
+
decodedContent: decodedWord,
|
|
621
634
|
});
|
|
622
635
|
}
|
|
623
636
|
}
|
|
624
637
|
}
|
|
625
638
|
// Add the last line
|
|
626
|
-
if (
|
|
639
|
+
if (currentLineDecoded.length > 0) {
|
|
627
640
|
const result = tokensToHTML(currentTokens, openTags);
|
|
628
641
|
const listMeta = cloneListMetaForLine(baseMeta, showMarkerForLine);
|
|
629
642
|
lines.push({
|
|
@@ -1034,4 +1047,4 @@ export async function renderText(doc, element, fonts, attrs = {}) {
|
|
|
1034
1047
|
}
|
|
1035
1048
|
}
|
|
1036
1049
|
// Internal exports for testing
|
|
1037
|
-
export { normalizeRichText as __normalizeRichTextForTests };
|
|
1050
|
+
export { normalizeRichText as __normalizeRichTextForTests, parseHTMLToSegments as __parseHTMLToSegmentsForTests, };
|