docgen-utils 1.0.20 → 1.0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,11 +6,37 @@ import { parseHeadingLevel, getTextAlignment, getTextContent, BLOCK_LEVEL_TAGS,
6
6
  import { isInlineOnlyContainer, extractInlineRuns, hasInlineFormatting } from "./parse-inline";
7
7
  import { isGridOrFlexContainer, isHorizontalFlexContainer, isDecorativeSvg, isTwoColumnGridLayout, findTwoColumnChildren, detectFlexEqualColumns, detectGridEqualColumns } from "./parse-layout";
8
8
  import { detectSkillItem, detectLanguageItem, detectProgressBar, detectTimeline } from "./parse-special";
9
/**
 * Build the parsed representation of an <img> element.
 *
 * Reads `src`, `alt`, `width` and `height` from the element's attributes and,
 * when the image has a <figure> ancestor, takes the caption from that
 * figure's <figcaption>.
 *
 * @param {Element} imageEl The <img> element to convert.
 * @param {string} imageKey Key stored unchanged on the returned element.
 * @returns {object|null} An element of type "image", or null when the `src`
 *   attribute is missing or blank.
 */
function createParsedImageElement(imageEl, imageKey) {
  const src = imageEl.getAttribute("src")?.trim();
  if (!src) {
    return null;
  }
  // Percentage sizes cannot be turned into a pixel count here, and zero,
  // negative or non-numeric values are not usable dimensions; all of these
  // are reported as undefined.
  const readPixelAttribute = (name) => {
    const raw = imageEl.getAttribute(name);
    if (!raw || raw.includes("%")) {
      return undefined;
    }
    const pixels = Number.parseInt(raw, 10);
    return pixels > 0 ? pixels : undefined;
  };
  const alt = imageEl.getAttribute("alt") || undefined;
  const width = readPixelAttribute("width");
  const height = readPixelAttribute("height");
  let caption;
  const figcaption = imageEl.closest("figure")?.querySelector("figcaption");
  if (figcaption) {
    // An empty or whitespace-only caption is treated as no caption.
    caption = getTextContent(figcaption).trim() || undefined;
  }
  return { type: "image", imageKey, src, alt, width, height, caption };
}
9
35
  /**
10
36
  * Parse content from a container element (like sidebar or main content).
11
37
  * Handles headings, paragraphs, lists, and nested containers with color inheritance.
12
38
  */
13
- function parseContainerContent(element, cssContext, inheritedColor) {
39
+ function parseContainerContent(element, cssContext, nextImageKey, inheritedColor) {
14
40
  const innerElements = [];
15
41
  function processInnerNode(node, color) {
16
42
  if (node.nodeType === Node.TEXT_NODE) {
@@ -236,7 +262,7 @@ function parseContainerContent(element, cssContext, inheritedColor) {
236
262
  * @param element The blockquote/callout element
237
263
  * @param cssContext The CSS context for resolving styles
238
264
  */
239
- function parseBlockquoteContent(element, cssContext) {
265
+ function parseBlockquoteContent(element, cssContext, nextImageKey) {
240
266
  const innerElements = [];
241
267
  // GENERALIZED: Extract blockquote's font-style from CSS element selector
242
268
  // This handles rules like "blockquote { font-style: italic; }"
@@ -374,7 +400,7 @@ function parseBlockquoteContent(element, cssContext) {
374
400
  const nestedBorderHex = extractBorderColorFromStyle(styles);
375
401
  // Only treat as nested blockquote if it has a visually distinct background or border
376
402
  if (nestedBgColor || nestedBorderHex) {
377
- const nestedContent = parseBlockquoteContent(el, cssContext);
403
+ const nestedContent = parseBlockquoteContent(el, cssContext, nextImageKey);
378
404
  if (nestedContent.length > 0) {
379
405
  let nestedBorderStyle;
380
406
  if (styles.borderLeft && !styles.border) {
@@ -436,29 +462,9 @@ function parseBlockquoteContent(element, cssContext) {
436
462
  }
437
463
  // Handle <img> elements inside blockquotes/callouts
438
464
  if (tagName === "img") {
439
- const src = el.getAttribute("src")?.trim();
440
- if (src) {
441
- const alt = el.getAttribute("alt") || undefined;
442
- let width;
443
- let height;
444
- const widthAttr = el.getAttribute("width");
445
- const heightAttr = el.getAttribute("height");
446
- if (widthAttr && !widthAttr.includes("%")) {
447
- width = parseInt(widthAttr, 10) || undefined;
448
- }
449
- if (heightAttr && !heightAttr.includes("%")) {
450
- height = parseInt(heightAttr, 10) || undefined;
451
- }
452
- // Check if img is inside a figure with figcaption
453
- let caption;
454
- const parentFigure = el.closest("figure");
455
- if (parentFigure) {
456
- const figcaption = parentFigure.querySelector("figcaption");
457
- if (figcaption) {
458
- caption = getTextContent(figcaption).trim() || undefined;
459
- }
460
- }
461
- innerElements.push({ type: "image", src, alt, width, height, caption });
465
+ const imageElement = createParsedImageElement(el, nextImageKey());
466
+ if (imageElement) {
467
+ innerElements.push(imageElement);
462
468
  }
463
469
  return;
464
470
  }
@@ -664,6 +670,8 @@ export function parseHtmlContent(html) {
664
670
  const cssContext = parseCssContext(doc);
665
671
  // Track SVGs that have been processed (to avoid duplicate processing)
666
672
  const processedSvgs = new Set();
673
+ let imageIndex = 0;
674
+ const nextImageKey = () => `image-${imageIndex++}`;
667
675
  const { body } = doc;
668
676
  function processNode(node, inheritedAlignment, inheritedColor) {
669
677
  if (node.nodeType === Node.TEXT_NODE) {
@@ -862,7 +870,7 @@ export function parseHtmlContent(html) {
862
870
  // This handles patterns like: <p class="intro-section"> with border-left: 4px solid ...
863
871
  // Must come BEFORE regular paragraph handling
864
872
  if (tagName === "p" && isBlockquoteOrCallout(element, cssContext)) {
865
- const content = parseBlockquoteContent(element, cssContext);
873
+ const content = parseBlockquoteContent(element, cssContext, nextImageKey);
866
874
  if (content.length > 0) {
867
875
  const elementStyles = getElementStyles(element, cssContext);
868
876
  let borderColor;
@@ -1361,8 +1369,8 @@ export function parseHtmlContent(html) {
1361
1369
  ? extractHexColor(sidebarStyles.color)
1362
1370
  : undefined;
1363
1371
  // Parse sidebar and main content separately
1364
- const sidebarContent = parseContainerContent(sidebarEl, cssContext, sidebarTextColor);
1365
- const mainContent = parseContainerContent(mainEl, cssContext);
1372
+ const sidebarContent = parseContainerContent(sidebarEl, cssContext, nextImageKey, sidebarTextColor);
1373
+ const mainContent = parseContainerContent(mainEl, cssContext, nextImageKey);
1366
1374
  if (sidebarContent.length > 0 || mainContent.length > 0) {
1367
1375
  // Emit two-column layout for documents with sidebar patterns.
1368
1376
  // This produces a DOCX table with sidebar + main content columns,
@@ -1435,7 +1443,7 @@ export function parseHtmlContent(html) {
1435
1443
  // Parse each column's content
1436
1444
  const columnContents = [];
1437
1445
  for (const col of gridColumns) {
1438
- const colContent = parseContainerContent(col, cssContext);
1446
+ const colContent = parseContainerContent(col, cssContext, nextImageKey);
1439
1447
  columnContents.push(colContent);
1440
1448
  }
1441
1449
  // Check if at least one column has content
@@ -1927,32 +1935,9 @@ export function parseHtmlContent(html) {
1927
1935
  }
1928
1936
  // Handle <img> elements - external images that need to be fetched
1929
1937
  if (tagName === "img") {
1930
- const src = element.getAttribute("src")?.trim();
1931
- if (src) {
1932
- const alt = element.getAttribute("alt") || undefined;
1933
- // Extract width and height from attributes only
1934
- // Computed styles are not available in linkedom (Node.js)
1935
- // Actual dimensions will be obtained when the image is fetched
1936
- let width;
1937
- let height;
1938
- const widthAttr = element.getAttribute("width");
1939
- const heightAttr = element.getAttribute("height");
1940
- if (widthAttr && !widthAttr.includes("%")) {
1941
- width = parseInt(widthAttr, 10) || undefined;
1942
- }
1943
- if (heightAttr && !heightAttr.includes("%")) {
1944
- height = parseInt(heightAttr, 10) || undefined;
1945
- }
1946
- // Check if img is inside a figure with figcaption
1947
- let caption;
1948
- const parentFigure = element.closest("figure");
1949
- if (parentFigure) {
1950
- const figcaption = parentFigure.querySelector("figcaption");
1951
- if (figcaption) {
1952
- caption = getTextContent(figcaption).trim() || undefined;
1953
- }
1954
- }
1955
- elements.push({ type: "image", src, alt, width, height, caption });
1938
+ const imageElement = createParsedImageElement(element, nextImageKey());
1939
+ if (imageElement) {
1940
+ elements.push(imageElement);
1956
1941
  }
1957
1942
  return;
1958
1943
  }
@@ -1961,30 +1946,9 @@ export function parseHtmlContent(html) {
1961
1946
  // Find the fallback img inside picture
1962
1947
  const imgEl = element.querySelector("img");
1963
1948
  if (imgEl) {
1964
- const src = imgEl.getAttribute("src")?.trim();
1965
- if (src) {
1966
- const alt = imgEl.getAttribute("alt") || undefined;
1967
- // Extract width and height
1968
- let width;
1969
- let height;
1970
- const widthAttr = imgEl.getAttribute("width");
1971
- const heightAttr = imgEl.getAttribute("height");
1972
- if (widthAttr && !widthAttr.includes("%")) {
1973
- width = parseInt(widthAttr, 10) || undefined;
1974
- }
1975
- if (heightAttr && !heightAttr.includes("%")) {
1976
- height = parseInt(heightAttr, 10) || undefined;
1977
- }
1978
- // Check for figcaption
1979
- let caption;
1980
- const parentFigure = element.closest("figure");
1981
- if (parentFigure) {
1982
- const figcaption = parentFigure.querySelector("figcaption");
1983
- if (figcaption) {
1984
- caption = getTextContent(figcaption).trim() || undefined;
1985
- }
1986
- }
1987
- elements.push({ type: "image", src, alt, width, height, caption });
1949
+ const imageElement = createParsedImageElement(imgEl, nextImageKey());
1950
+ if (imageElement) {
1951
+ elements.push(imageElement);
1988
1952
  }
1989
1953
  }
1990
1954
  return;
@@ -1993,29 +1957,9 @@ export function parseHtmlContent(html) {
1993
1957
  if (tagName === "figure") {
1994
1958
  const imgEl = element.querySelector("img") || element.querySelector("picture img");
1995
1959
  if (imgEl) {
1996
- const src = imgEl.getAttribute("src")?.trim();
1997
- if (src) {
1998
- const alt = imgEl.getAttribute("alt") || undefined;
1999
- // Extract width and height from attributes only
2000
- // Computed styles are not available in linkedom (Node.js)
2001
- // Actual dimensions will be obtained when the image is fetched
2002
- let width;
2003
- let height;
2004
- const widthAttr = imgEl.getAttribute("width");
2005
- const heightAttr = imgEl.getAttribute("height");
2006
- if (widthAttr && !widthAttr.includes("%")) {
2007
- width = parseInt(widthAttr, 10) || undefined;
2008
- }
2009
- if (heightAttr && !heightAttr.includes("%")) {
2010
- height = parseInt(heightAttr, 10) || undefined;
2011
- }
2012
- // Extract caption from figcaption
2013
- let caption;
2014
- const figcaption = element.querySelector("figcaption");
2015
- if (figcaption) {
2016
- caption = getTextContent(figcaption).trim() || undefined;
2017
- }
2018
- elements.push({ type: "image", src, alt, width, height, caption });
1960
+ const imageElement = createParsedImageElement(imgEl, nextImageKey());
1961
+ if (imageElement) {
1962
+ elements.push(imageElement);
2019
1963
  return;
2020
1964
  }
2021
1965
  }
@@ -2031,7 +1975,7 @@ export function parseHtmlContent(html) {
2031
1975
  // Check for blockquote/callout before generic container handling
2032
1976
  // Uses style-based detection, NOT class names
2033
1977
  if (isBlockquoteOrCallout(element, cssContext)) {
2034
- const content = parseBlockquoteContent(element, cssContext);
1978
+ const content = parseBlockquoteContent(element, cssContext, nextImageKey);
2035
1979
  if (content.length > 0) {
2036
1980
  // Extract styling from CSS classes and inline styles (generalized approach)
2037
1981
  const elementStyles = getElementStyles(element, cssContext);