@marvalt/wparser 0.1.66 → 0.1.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -264,6 +264,7 @@ declare function getBlockTextContent(block: WordPressBlock): string;
264
264
  declare function getImageUrl(block: WordPressBlock): string | null;
265
265
  /**
266
266
  * Extract image attributes (alt, width, height, alignment) from block
267
+ * Prioritizes Cloudflare URLs over WordPress URLs
267
268
  */
268
269
  declare function getImageAttributes(block: WordPressBlock): {
269
270
  url: string | null;
package/dist/index.esm.js CHANGED
@@ -1517,29 +1517,64 @@ function renderTextWithShortcodes(text, registry) {
1517
1517
  * Content extraction utilities for WordPress blocks
1518
1518
  * Extracts text content from various block formats
1519
1519
  */
/**
 * Decode HTML entities in a string.
 *
 * Handles named entities (e.g. `&amp;`, `&quot;`), decimal numeric entities
 * (e.g. `&#39;`, `&#8217;`) and hexadecimal numeric entities (e.g. `&#x27;`).
 *
 * When a DOM is available the text is decoded by assigning it to a detached
 * `<textarea>`'s `innerHTML` and reading back its `value`. Otherwise a
 * fallback decoder is used that knows the numeric forms plus a small table of
 * common named entities; anything it does not recognize is left untouched.
 *
 * @param text - String that may contain HTML entities.
 * @returns The decoded string, or `''` when `text` is falsy.
 */
function decodeHtmlEntities(text) {
    if (!text)
        return '';
    // Use the browser's built-in decoder if available.
    if (typeof document !== 'undefined') {
        const textarea = document.createElement('textarea');
        textarea.innerHTML = text;
        return textarea.value;
    }
    // Fallback for server-side rendering or any environment without `document`.
    // A Map (rather than a plain object) keeps lookups restricted to the
    // entries listed here, so names like `constructor` cannot match.
    const namedEntities = new Map([
        ['amp', '&'],
        ['lt', '<'],
        ['gt', '>'],
        ['quot', '"'],
        ['apos', "'"],
        ['nbsp', ' '],
        ['copy', '©'],
        ['reg', '®'],
        ['trade', '™'],
        ['hellip', '…'],
        ['mdash', '—'],
        ['ndash', '–'],
        ['lsquo', '\u2018'], // Left single quotation mark
        ['rsquo', '\u2019'], // Right single quotation mark
        ['ldquo', '\u201C'], // Left double quotation mark
        ['rdquo', '\u201D'], // Right double quotation mark
    ]);
    // Decode every entity in ONE pass. Running separate passes per entity
    // kind would re-decode the output of an earlier pass (for example
    // `&amp;lt;` would become `<` instead of the correct `&lt;`).
    return text.replace(/&(#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z][a-zA-Z0-9]*);/g, (match, body) => {
        if (body.charAt(0) !== '#') {
            const named = namedEntities.get(body);
            return named !== undefined ? named : match;
        }
        const isHex = body.charAt(1) === 'x' || body.charAt(1) === 'X';
        const codePoint = isHex
            ? parseInt(body.slice(2), 16)
            : parseInt(body.slice(1), 10);
        // String.fromCodePoint throws a RangeError outside the Unicode range;
        // leave such references as they were written.
        if (!Number.isFinite(codePoint) || codePoint > 0x10FFFF)
            return match;
        // fromCodePoint (not fromCharCode) so characters above U+FFFF, such
        // as emoji, decode to the correct surrogate pair.
        return String.fromCodePoint(codePoint);
    });
}
1520
1566
  /**
1521
1567
  * Extract text content from a block's innerHTML by stripping HTML tags
1522
1568
  */
1523
1569
  function extractTextFromHTML(html) {
1524
1570
  if (!html)
1525
1571
  return '';
1526
- // Remove HTML tags and decode entities
1527
- let text = html
1528
- .replace(/<[^>]*>/g, '') // Remove HTML tags
1529
- .replace(/&nbsp;/g, ' ') // Replace &nbsp; with space
1530
- .replace(/&#8217;/g, "'") // Replace apostrophe entity
1531
- .replace(/&#8220;/g, '"') // Replace left double quote
1532
- .replace(/&#8221;/g, '"') // Replace right double quote
1533
- .replace(/&#8230;/g, '...') // Replace ellipsis
1534
- .replace(/&amp;/g, '&') // Replace &amp;
1535
- .replace(/&lt;/g, '<') // Replace &lt;
1536
- .replace(/&gt;/g, '>') // Replace &gt;
1537
- .replace(/&quot;/g, '"') // Replace &quot;
1538
- .replace(/&#8211;/g, '–') // Replace en dash
1539
- .replace(/&#8212;/g, '—') // Replace em dash
1540
- .trim();
1572
+ // Remove HTML tags first
1573
+ let text = html.replace(/<[^>]*>/g, '');
1574
+ // Decode all HTML entities (comprehensive)
1575
+ text = decodeHtmlEntities(text);
1541
1576
  // Clean up extra whitespace
1542
- text = text.replace(/\s+/g, ' ');
1577
+ text = text.replace(/\s+/g, ' ').trim();
1543
1578
  return text;
1544
1579
  }
1545
1580
  /**
@@ -1591,10 +1626,12 @@ function getImageUrl(block) {
1591
1626
  }
1592
1627
  /**
1593
1628
  * Extract image attributes (alt, width, height, alignment) from block
1629
+ * Prioritizes Cloudflare URLs over WordPress URLs
1594
1630
  */
1595
1631
  function getImageAttributes(block) {
1596
1632
  const attrs = block.attributes || {};
1597
- const url = getImageUrl(block);
1633
+ // Use extractImageUrlWithFallback to prioritize Cloudflare URLs
1634
+ const url = extractImageUrlWithFallback(block);
1598
1635
  // Extract width - can be number or string like "640px"
1599
1636
  let width;
1600
1637
  const widthAttr = attrs['width'];