@marvalt/wparser 0.1.66 → 0.1.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1519,29 +1519,64 @@ function renderTextWithShortcodes(text, registry) {
1519
1519
  * Content extraction utilities for WordPress blocks
1520
1520
  * Extracts text content from various block formats
1521
1521
  */
1522
/**
 * Decode HTML entities in a string.
 *
 * Handles named entities (&amp;, &quot;, ...), decimal numeric entities
 * (&#39;, &#8217;) and hexadecimal numeric entities (&#x27;).
 *
 * @param {string} text - Text that may contain HTML entities.
 * @returns {string} The decoded text, or '' when `text` is falsy.
 */
function decodeHtmlEntities(text) {
    if (!text)
        return '';
    // Use the browser's built-in decoder if available: assigning to a
    // textarea's innerHTML and reading back `.value` yields the decoded text.
    if (typeof document !== 'undefined') {
        const textarea = document.createElement('textarea');
        textarea.innerHTML = text;
        return textarea.value;
    }
    // Fallback for server-side or when document is not available.
    // Common named entities, keyed by the name between '&' and ';'.
    const namedEntities = {
        amp: '&',
        lt: '<',
        gt: '>',
        quot: '"',
        apos: "'",
        // Plain space rather than U+00A0, matching how this module has
        // historically replaced &nbsp; when extracting text.
        nbsp: ' ',
        copy: '\u00A9',
        reg: '\u00AE',
        trade: '\u2122',
        hellip: '\u2026',
        mdash: '\u2014',
        ndash: '\u2013',
        lsquo: '\u2018', // Left single quotation mark
        rsquo: '\u2019', // Right single quotation mark
        ldquo: '\u201C', // Left double quotation mark
        rdquo: '\u201D', // Right double quotation mark
    };
    // String.fromCodePoint (unlike String.fromCharCode) handles astral code
    // points such as emoji, but throws a RangeError above U+10FFFF. NUL,
    // surrogates and out-of-range values become U+FFFD instead.
    const fromCodePoint = (codePoint) => {
        const isValid = codePoint > 0 &&
            codePoint <= 0x10FFFF &&
            !(codePoint >= 0xD800 && codePoint <= 0xDFFF);
        return isValid ? String.fromCodePoint(codePoint) : '\uFFFD';
    };
    // Decode everything in a SINGLE pass so that the output of one
    // replacement is never re-scanned: "&amp;lt;" must become "&lt;", not "<".
    return text.replace(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));/g, (match, dec, hex, name) => {
        if (dec !== undefined) {
            return fromCodePoint(parseInt(dec, 10));
        }
        if (hex !== undefined) {
            return fromCodePoint(parseInt(hex, 16));
        }
        // Unknown named entities are left untouched. The own-property check
        // keeps names like "constructor" from hitting Object.prototype.
        return Object.prototype.hasOwnProperty.call(namedEntities, name)
            ? namedEntities[name]
            : match;
    });
}
1522
1568
  /**
1523
1569
  * Extract text content from a block's innerHTML by stripping HTML tags
1524
1570
  */
1525
1571
  function extractTextFromHTML(html) {
1526
1572
  if (!html)
1527
1573
  return '';
1528
- // Remove HTML tags and decode entities
1529
- let text = html
1530
- .replace(/<[^>]*>/g, '') // Remove HTML tags
1531
- .replace(/&nbsp;/g, ' ') // Replace &nbsp; with space
1532
- .replace(/&#8217;/g, "'") // Replace apostrophe entity
1533
- .replace(/&#8220;/g, '"') // Replace left double quote
1534
- .replace(/&#8221;/g, '"') // Replace right double quote
1535
- .replace(/&#8230;/g, '...') // Replace ellipsis
1536
- .replace(/&amp;/g, '&') // Replace &amp;
1537
- .replace(/&lt;/g, '<') // Replace &lt;
1538
- .replace(/&gt;/g, '>') // Replace &gt;
1539
- .replace(/&quot;/g, '"') // Replace &quot;
1540
- .replace(/&#8211;/g, '–') // Replace en dash
1541
- .replace(/&#8212;/g, '—') // Replace em dash
1542
- .trim();
1574
+ // Remove HTML tags first
1575
+ let text = html.replace(/<[^>]*>/g, '');
1576
+ // Decode all HTML entities (comprehensive)
1577
+ text = decodeHtmlEntities(text);
1543
1578
  // Clean up extra whitespace
1544
- text = text.replace(/\s+/g, ' ');
1579
+ text = text.replace(/\s+/g, ' ').trim();
1545
1580
  return text;
1546
1581
  }
1547
1582
  /**
@@ -1593,10 +1628,12 @@ function getImageUrl(block) {
1593
1628
  }
1594
1629
  /**
1595
1630
  * Extract image attributes (alt, width, height, alignment) from block
1631
+ * Prioritizes Cloudflare URLs over WordPress URLs
1596
1632
  */
1597
1633
  function getImageAttributes(block) {
1598
1634
  const attrs = block.attributes || {};
1599
- const url = getImageUrl(block);
1635
+ // Use extractImageUrlWithFallback to prioritize Cloudflare URLs
1636
+ const url = extractImageUrlWithFallback(block);
1600
1637
  // Extract width - can be number or string like "640px"
1601
1638
  let width;
1602
1639
  const widthAttr = attrs['width'];