@marvalt/wparser 0.1.66 → 0.1.68
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +54 -17
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.esm.js +54 -17
- package/dist/index.esm.js.map +1 -1
- package/dist/utils/contentExtractor.d.ts +1 -0
- package/dist/utils/contentExtractor.d.ts.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -264,6 +264,7 @@ declare function getBlockTextContent(block: WordPressBlock): string;
|
|
|
264
264
|
declare function getImageUrl(block: WordPressBlock): string | null;
|
|
265
265
|
/**
|
|
266
266
|
* Extract image attributes (alt, width, height, alignment) from block
|
|
267
|
+
* Prioritizes Cloudflare URLs over WordPress URLs
|
|
267
268
|
*/
|
|
268
269
|
declare function getImageAttributes(block: WordPressBlock): {
|
|
269
270
|
url: string | null;
|
package/dist/index.esm.js
CHANGED
|
@@ -1517,29 +1517,64 @@ function renderTextWithShortcodes(text, registry) {
|
|
|
1517
1517
|
* Content extraction utilities for WordPress blocks
|
|
1518
1518
|
* Extracts text content from various block formats
|
|
1519
1519
|
*/
|
|
1520
|
+
/**
|
|
1521
|
+
* Decode HTML entities in a string
|
|
1522
|
+
* Handles both named entities (&amp;, &quot;) and numeric entities (&#39;, &#8217;)
|
|
1523
|
+
*/
|
|
1524
|
+
function decodeHtmlEntities(text) {
|
|
1525
|
+
if (!text)
|
|
1526
|
+
return '';
|
|
1527
|
+
// Use browser's built-in decoder if available (most efficient)
|
|
1528
|
+
if (typeof document !== 'undefined') {
|
|
1529
|
+
const textarea = document.createElement('textarea');
|
|
1530
|
+
textarea.innerHTML = text;
|
|
1531
|
+
return textarea.value;
|
|
1532
|
+
}
|
|
1533
|
+
// Fallback for server-side or when document is not available
|
|
1534
|
+
// Decode numeric entities (&#39;, &#8217;, etc.)
|
|
1535
|
+
let decoded = text.replace(/&#(\d+);/g, (match, dec) => {
|
|
1536
|
+
return String.fromCharCode(parseInt(dec, 10));
|
|
1537
|
+
});
|
|
1538
|
+
// Decode hex entities (&#x27;, etc.)
|
|
1539
|
+
decoded = decoded.replace(/&#x([0-9a-fA-F]+);/g, (match, hex) => {
|
|
1540
|
+
return String.fromCharCode(parseInt(hex, 16));
|
|
1541
|
+
});
|
|
1542
|
+
// Decode common named entities
|
|
1543
|
+
const namedEntities = {
|
|
1544
|
+
'&amp;': '&',
|
|
1545
|
+
'&lt;': '<',
|
|
1546
|
+
'&gt;': '>',
|
|
1547
|
+
'&quot;': '"',
|
|
1548
|
+
'&apos;': "'",
|
|
1549
|
+
'&nbsp;': ' ',
|
|
1550
|
+
'&copy;': '©',
|
|
1551
|
+
'&reg;': '®',
|
|
1552
|
+
'&trade;': '™',
|
|
1553
|
+
'&hellip;': '…',
|
|
1554
|
+
'&mdash;': '—',
|
|
1555
|
+
'&ndash;': '–',
|
|
1556
|
+
'&lsquo;': '\u2018', // Left single quotation mark
|
|
1557
|
+
'&rsquo;': '\u2019', // Right single quotation mark
|
|
1558
|
+
'&ldquo;': '\u201C', // Left double quotation mark
|
|
1559
|
+
'&rdquo;': '\u201D', // Right double quotation mark
|
|
1560
|
+
};
|
|
1561
|
+
Object.entries(namedEntities).forEach(([entity, char]) => {
|
|
1562
|
+
decoded = decoded.replace(new RegExp(entity, 'g'), char);
|
|
1563
|
+
});
|
|
1564
|
+
return decoded;
|
|
1565
|
+
}
|
|
1520
1566
|
/**
|
|
1521
1567
|
* Extract text content from a block's innerHTML by stripping HTML tags
|
|
1522
1568
|
*/
|
|
1523
1569
|
function extractTextFromHTML(html) {
|
|
1524
1570
|
if (!html)
|
|
1525
1571
|
return '';
|
|
1526
|
-
// Remove HTML tags
|
|
1527
|
-
let text = html
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
.replace(/’/g, "'") // Replace apostrophe entity
|
|
1531
|
-
.replace(/“/g, '"') // Replace left double quote
|
|
1532
|
-
.replace(/”/g, '"') // Replace right double quote
|
|
1533
|
-
.replace(/…/g, '...') // Replace ellipsis
|
|
1534
|
-
.replace(/&amp;/g, '&') // Replace &amp;
|
|
1535
|
-
.replace(/&lt;/g, '<') // Replace &lt;
|
|
1536
|
-
.replace(/&gt;/g, '>') // Replace &gt;
|
|
1537
|
-
.replace(/&quot;/g, '"') // Replace &quot;
|
|
1538
|
-
.replace(/–/g, '–') // Replace en dash
|
|
1539
|
-
.replace(/—/g, '—') // Replace em dash
|
|
1540
|
-
.trim();
|
|
1572
|
+
// Remove HTML tags first
|
|
1573
|
+
let text = html.replace(/<[^>]*>/g, '');
|
|
1574
|
+
// Decode all HTML entities (comprehensive)
|
|
1575
|
+
text = decodeHtmlEntities(text);
|
|
1541
1576
|
// Clean up extra whitespace
|
|
1542
|
-
text = text.replace(/\s+/g, ' ');
|
|
1577
|
+
text = text.replace(/\s+/g, ' ').trim();
|
|
1543
1578
|
return text;
|
|
1544
1579
|
}
|
|
1545
1580
|
/**
|
|
@@ -1591,10 +1626,12 @@ function getImageUrl(block) {
|
|
|
1591
1626
|
}
|
|
1592
1627
|
/**
|
|
1593
1628
|
* Extract image attributes (alt, width, height, alignment) from block
|
|
1629
|
+
* Prioritizes Cloudflare URLs over WordPress URLs
|
|
1594
1630
|
*/
|
|
1595
1631
|
function getImageAttributes(block) {
|
|
1596
1632
|
const attrs = block.attributes || {};
|
|
1597
|
-
|
|
1633
|
+
// Use extractImageUrlWithFallback to prioritize Cloudflare URLs
|
|
1634
|
+
const url = extractImageUrlWithFallback(block);
|
|
1598
1635
|
// Extract width - can be number or string like "640px"
|
|
1599
1636
|
let width;
|
|
1600
1637
|
const widthAttr = attrs['width'];
|