npm - @absolutejs/absolute - Versions diffs - 0.19.0-beta.618 → 0.19.0-beta.619 - Mend

@absolutejs/absolute 0.19.0-beta.618 → 0.19.0-beta.619

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

package/dist/ai/client/index.js +96 -22
package/dist/ai/client/index.js.map +3 -3
package/dist/ai/client/ui.js +96 -22
package/dist/ai/client/ui.js.map +3 -3
package/dist/ai/index.js +473 -82
package/dist/ai/index.js.map +7 -7
package/dist/ai/rag/ui.js +96 -22
package/dist/ai/rag/ui.js.map +3 -3
package/dist/ai-client/angular/ai/index.js +95 -21
package/dist/ai-client/react/ai/index.js +95 -21
package/dist/ai-client/vue/ai/index.js +95 -21
package/dist/angular/ai/index.js +96 -22
package/dist/angular/ai/index.js.map +3 -3
package/dist/angular/index.js +2 -2
package/dist/angular/index.js.map +1 -1
package/dist/angular/server.js +2 -2
package/dist/angular/server.js.map +1 -1
package/dist/build.js +2 -2
package/dist/build.js.map +1 -1
package/dist/index.js +2 -2
package/dist/index.js.map +1 -1
package/dist/react/ai/index.js +96 -22
package/dist/react/ai/index.js.map +3 -3
package/dist/src/vue/ai/useRAG.d.ts +4 -4
package/dist/src/vue/ai/useRAGChunkPreview.d.ts +2 -2
package/dist/src/vue/ai/useRAGSearch.d.ts +2 -2
package/dist/svelte/ai/index.js +96 -22
package/dist/svelte/ai/index.js.map +3 -3
package/dist/types/ai.d.ts +2 -2
package/dist/vue/ai/index.js +96 -22
package/dist/vue/ai/index.js.map +3 -3
package/package.json +7 -7

package/dist/ai/index.js CHANGED Viewed

@@ -4237,6 +4237,25 @@ var buildContextLabel2 = (metadata) => {
   if (!metadata) {
     return;
   }
+  const pdfTextKind = getContextString2(metadata.pdfTextKind);
+  const officeBlockKind = getContextString2(metadata.officeBlockKind);
+  const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
+  const sectionTitle = getContextString2(metadata.sectionTitle) ?? sectionPath.at(-1);
+  if (pdfTextKind === "table_like" && sectionTitle) {
+    return `PDF table block ${sectionTitle}`;
+  }
+  if (pdfTextKind === "paragraph" && sectionTitle) {
+    return `PDF text block ${sectionTitle}`;
+  }
+  if (officeBlockKind === "table" && sectionTitle) {
+    return `Office table block ${sectionTitle}`;
+  }
+  if (officeBlockKind === "list" && sectionTitle) {
+    return `Office list block ${sectionTitle}`;
+  }
+  if (officeBlockKind === "paragraph" && sectionTitle) {
+    return `Office paragraph block ${sectionTitle}`;
+  }
   const emailKind = getContextString2(metadata.emailKind);
   if (emailKind === "attachment") {
     return "Attachment evidence";
@@ -4273,8 +4292,6 @@ var buildContextLabel2 = (metadata) => {
   if (speaker) {
     return `Speaker ${speaker}`;
   }
-  const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
-  const sectionTitle = getContextString2(metadata.sectionTitle) ?? sectionPath.at(-1);
   if (sectionTitle) {
     return `Section ${sectionTitle}`;
   }
@@ -4284,11 +4301,21 @@ var buildLocatorLabel2 = (metadata, source, title) => {
   if (!metadata) {
     return;
   }
+  const pdfTextKind = getContextString2(metadata.pdfTextKind);
+  const officeBlockKind = getContextString2(metadata.officeBlockKind);
+  const pdfBlockNumber = getContextNumber2(metadata.pdfBlockNumber);
+  const officeBlockNumber = getContextNumber2(metadata.officeBlockNumber);
   const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
   const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
   if (page && region) {
     return `Page ${page} \xB7 Region ${region}`;
   }
+  if (page && pdfBlockNumber && pdfTextKind === "table_like") {
+    return `Page ${page} \xB7 Table Block ${pdfBlockNumber}`;
+  }
+  if (page && pdfBlockNumber) {
+    return `Page ${page} \xB7 Text Block ${pdfBlockNumber}`;
+  }
   if (page) {
     return `Page ${page}`;
   }
@@ -4317,6 +4344,15 @@ var buildLocatorLabel2 = (metadata, source, title) => {
   if (mediaStart) {
     return `Timestamp ${mediaStart}`;
   }
+  if (officeBlockNumber && officeBlockKind === "table") {
+    return `Office table block ${officeBlockNumber}`;
+  }
+  if (officeBlockNumber && officeBlockKind === "list") {
+    return `Office list block ${officeBlockNumber}`;
+  }
+  if (officeBlockNumber && officeBlockKind === "paragraph") {
+    return `Office paragraph block ${officeBlockNumber}`;
+  }
   const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
   if (sectionPath.length > 0) {
     return `Section ${sectionPath.join(" > ")}`;
@@ -4334,12 +4370,16 @@ var buildProvenanceLabel2 = (metadata) => {
   const mediaKind = getContextString2(metadata.mediaKind);
   const transcriptSource = getContextString2(metadata.transcriptSource);
   const pdfTextMode = getContextString2(metadata.pdfTextMode);
+  const pdfTextKind = getContextString2(metadata.pdfTextKind);
+  const officeBlockKind = getContextString2(metadata.officeBlockKind);
   const ocrEngine = getContextString2(metadata.ocrEngine);
   const extractorRegistryMatch = getContextString2(metadata.extractorRegistryMatch);
   const chunkingProfile = getContextString2(metadata.chunkingProfile);
   const ocrConfidence = getContextNumber2(metadata.ocrRegionConfidence) ?? getContextNumber2(metadata.ocrConfidence);
   const labels = [
     pdfTextMode ? `PDF ${pdfTextMode}` : "",
+    pdfTextKind === "table_like" ? "PDF table block" : pdfTextKind === "paragraph" ? "PDF text block" : "",
+    officeBlockKind ? `Office ${officeBlockKind}` : "",
     ocrEngine ? `OCR ${ocrEngine}` : "",
     extractorRegistryMatch ? `Extractor ${extractorRegistryMatch}` : "",
     chunkingProfile ? `Chunking ${chunkingProfile}` : "",
@@ -4375,7 +4415,7 @@ var buildRAGChunkStructure = (metadata) => {
     return;
   }
   const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.filter((value) => typeof value === "string" && value.trim().length > 0) : undefined;
-  const sectionKind = metadata.sectionKind === "markdown_heading" || metadata.sectionKind === "html_heading" || metadata.sectionKind === "office_heading" || metadata.sectionKind === "spreadsheet_rows" || metadata.sectionKind === "presentation_slide" ? metadata.sectionKind : undefined;
+  const sectionKind = metadata.sectionKind === "markdown_heading" || metadata.sectionKind === "html_heading" || metadata.sectionKind === "office_heading" || metadata.sectionKind === "office_block" || metadata.sectionKind === "pdf_block" || metadata.sectionKind === "spreadsheet_rows" || metadata.sectionKind === "presentation_slide" ? metadata.sectionKind : undefined;
   const section = {
     depth: getContextNumber2(metadata.sectionDepth),
     kind: sectionKind,
@@ -4695,7 +4735,7 @@ var buildRAGSourceSummaries = (sources) => {
   const citationReferenceMap = buildRAGCitationReferenceMap(citations);
   return sourceGroups.map((group) => {
     const groupCitations = citations.filter((citation) => group.chunks.some((chunk) => chunk.chunkId === citation.chunkId));
-    const leadChunk = group.chunks.slice().sort((left, right) => right.score - left.score)[0];
+    const leadChunk = getPreferredSourceLeadChunk(group.chunks);
     const excerpts = leadChunk ? buildRAGChunkExcerpts(group.chunks, leadChunk.chunkId) : undefined;
     const structure = leadChunk?.structure ?? buildRAGChunkStructure(leadChunk?.metadata);
     const excerptSelection = buildRAGExcerptSelection(excerpts, structure);
@@ -4723,13 +4763,45 @@ var getSectionPathFromSource = (source) => {
   const path = source.structure?.section?.path ?? (Array.isArray(source.metadata?.sectionPath) ? source.metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : []);
   return path.length > 0 ? path : undefined;
 };
+var isBlockAwareContextLabel = (value) => typeof value === "string" && (value.startsWith("PDF ") || value.startsWith("Office "));
+var getStructuredSectionScoreWeight = (metadata) => {
+  if (!metadata) {
+    return 1;
+  }
+  const pdfTextKind = getContextString2(metadata.pdfTextKind);
+  const officeBlockKind = getContextString2(metadata.officeBlockKind);
+  const sectionKind = getContextString2(metadata.sectionKind);
+  if (pdfTextKind === "table_like") {
+    return 1.28;
+  }
+  if (officeBlockKind === "table" || officeBlockKind === "list") {
+    return 1.24;
+  }
+  if (sectionKind === "pdf_block" || sectionKind === "office_block" || officeBlockKind === "paragraph" || pdfTextKind === "paragraph") {
+    return 1.12;
+  }
+  return 1;
+};
+var getStructuredSourceLeadScore = (source) => source.score * getStructuredSectionScoreWeight(source.metadata);
+var getPreferredSourceLeadChunk = (chunks) => chunks.slice().sort((left, right) => {
+  const leftWeightedScore = getStructuredSourceLeadScore(left);
+  const rightWeightedScore = getStructuredSourceLeadScore(right);
+  if (rightWeightedScore !== leftWeightedScore) {
+    return rightWeightedScore - leftWeightedScore;
+  }
+  if (right.score !== left.score) {
+    return right.score - left.score;
+  }
+  return left.chunkId.localeCompare(right.chunkId);
+})[0];
 var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
-  const totalScore = sources.reduce((sum, source) => sum + source.score, 0);
+  const totalScore = sources.reduce((sum, source) => sum + source.score * getStructuredSectionScoreWeight(source.metadata), 0);
   if (sources.length === 0 || totalScore <= 0) {
     return [];
   }
   const sections = new Map;
   for (const source of sources) {
+    const structuredScore = source.score * getStructuredSectionScoreWeight(source.metadata);
     const path = getSectionPathFromSource(source);
     if (!path) {
       continue;
@@ -4761,7 +4833,7 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
         sourceSet: new Set(source.source ? [source.source] : []),
         topChunkId: source.chunkId,
         topSource: source.source,
-        totalScore: source.score,
+        totalScore: structuredScore,
         transformedHits,
         variantHits,
         vectorHits
@@ -4769,7 +4841,7 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
       continue;
     }
     existing.count += 1;
-    existing.totalScore += source.score;
+    existing.totalScore += structuredScore;
     if (source.source) {
       existing.sourceSet.add(source.source);
     }
@@ -4797,6 +4869,8 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
     const parentTotal = siblingPool.reduce((sum, entry) => sum + entry.totalScore, 0);
     const scoreShare = section.totalScore / totalScore;
     const parentShare = parentTotal > 0 ? section.totalScore / parentTotal : undefined;
+    const topChunk = sources.find((source) => source.chunkId === section.topChunkId);
+    const topContextLabel = topChunk?.labels?.contextLabel ?? buildContextLabel2(topChunk?.metadata);
     const parentDistribution = parentTotal > 0 ? siblingPool.map((entry) => ({
       count: entry.count,
       isActive: entry.key === section.key,
@@ -4922,6 +4996,7 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
       reasons.push("concentrated_evidence");
     }
     const summaryParts = [
+      isBlockAwareContextLabel(topContextLabel) ? topContextLabel : "",
       `${section.count} hit${section.count === 1 ? "" : "s"}`,
       `${(scoreShare * 100).toFixed(0)}% score share`,
       `vector ${section.vectorHits} \xB7 lexical ${section.lexicalHits} \xB7 hybrid ${section.hybridHits}`,
@@ -5133,22 +5208,21 @@ var updateSourceGroup = (groups, source) => {
     groups.set(key, buildSourceGroup(source, key));
     return;
   }
-  if (source.score > existing.bestScore) {
-    existing.bestScore = source.score;
-    existing.label = buildSourceLabel2(source);
-    existing.labels = source.labels ?? buildRAGSourceLabels({
-      metadata: source.metadata,
-      source: source.source,
-      title: source.title
-    });
-    existing.structure = source.structure ?? buildRAGChunkStructure(source.metadata);
-    existing.source = source.source;
-    existing.title = source.title;
-  } else {
-    existing.bestScore = Math.max(existing.bestScore, source.score);
-  }
+  existing.bestScore = Math.max(existing.bestScore, source.score);
   existing.count += 1;
   existing.chunks.push(source);
+  const leadChunk = getPreferredSourceLeadChunk(existing.chunks);
+  if (leadChunk) {
+    existing.label = buildSourceLabel2(leadChunk);
+    existing.labels = leadChunk.labels ?? buildRAGSourceLabels({
+      metadata: leadChunk.metadata,
+      source: leadChunk.source,
+      title: leadChunk.title
+    });
+    existing.structure = leadChunk.structure ?? buildRAGChunkStructure(leadChunk.metadata);
+    existing.source = leadChunk.source;
+    existing.title = leadChunk.title;
+  }
 };
 var getLatestAssistantMessage = (messages) => {
   for (let index = messages.length - 1;index >= 0; index -= 1) {
@@ -8485,6 +8559,55 @@ var scoreLoosePhraseMatch2 = (query, text) => {
   }
   return 0;
 };
+var queryHasAnyToken = (queryTokens, candidates) => candidates.some((candidate) => queryTokens.includes(candidate));
+var scoreStructuredEvidenceMatch = (queryTokens, result) => {
+  const metadata = result.metadata ?? {};
+  const pdfTextKind = typeof metadata.pdfTextKind === "string" ? metadata.pdfTextKind : undefined;
+  const officeBlockKind = typeof metadata.officeBlockKind === "string" ? metadata.officeBlockKind : undefined;
+  const hasBlockMetadata = typeof metadata.pdfBlockNumber === "number" || typeof metadata.officeBlockNumber === "number";
+  let score = 0;
+  if (hasBlockMetadata) {
+    score += 0.12;
+  }
+  if (pdfTextKind === "table_like" && queryHasAnyToken(queryTokens, [
+    "table",
+    "row",
+    "rows",
+    "column",
+    "columns",
+    "spreadsheet",
+    "sheet",
+    "workbook"
+  ])) {
+    score += 0.65;
+  }
+  if (officeBlockKind === "table" && queryHasAnyToken(queryTokens, [
+    "table",
+    "row",
+    "rows",
+    "column",
+    "columns",
+    "matrix",
+    "grid"
+  ])) {
+    score += 0.55;
+  }
+  if (officeBlockKind === "list" && queryHasAnyToken(queryTokens, [
+    "list",
+    "checklist",
+    "bullet",
+    "bullets",
+    "step",
+    "steps",
+    "task",
+    "tasks",
+    "item",
+    "items"
+  ])) {
+    score += 0.55;
+  }
+  return score;
+};
 var scoreHeuristicMatch = ({
   query,
   queryTokens,
@@ -8501,7 +8624,8 @@ var scoreHeuristicMatch = ({
   const exactPhraseBoost = Math.max(normalizeText([result.title, result.source, result.chunkText, ...metadataValues].filter(Boolean).join(" ")).includes(queryTokens.join(" ")) ? 1 : 0, scoreLoosePhraseMatch2(query, [result.title, result.source, result.chunkText, ...metadataValues].filter(Boolean).join(" ")));
   const sourcePathBoost = typeof result.source === "string" && queryTokens.some((token) => result.source?.toLowerCase().includes(token)) ? 0.5 : 0;
   const metadataBoost = metadataValues.length > 0 ? queryTokens.filter((token) => metadataValues.some((value) => value.toLowerCase().includes(token))).length / queryTokens.length : 0;
-  return result.score + overlapBoost + exactPhraseBoost + sourcePathBoost + metadataBoost;
+  const structuredEvidenceBoost = scoreStructuredEvidenceMatch(queryTokens, result);
+  return result.score + overlapBoost + exactPhraseBoost + sourcePathBoost + metadataBoost + structuredEvidenceBoost;
 };
 var normalizeText = (value) => tokenize3(value).join(" ");
 var applyRAGReranking = async ({
@@ -8736,32 +8860,59 @@ var stripHtmlTags = (value) => {
 `).replace(/<li\b[^>]*>/gi, "- ").replace(/<[^>]+>/g, " ");
   return decodeHtmlEntities(withoutTags);
 };
+var stripHtmlNoiseBlocks = (value) => value.replace(/<!--[\s\S]*?-->/g, " ").replace(/<(script|style|template|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, " ").replace(/<([a-z0-9:_-]+)\b[^>]*\b(hidden|aria-hidden=(['"])true\3)[^>]*>[\s\S]*?<\/\1>/gi, " ").replace(/<(nav|footer|header|aside|form|dialog)\b[^>]*>[\s\S]*?<\/\1>/gi, " ").replace(/<([a-z0-9:_-]+)\b[^>]*\b(?:id|class)=(['"])[^'"]*(nav|menu|footer|header|sidebar|promo|banner|cookie|breadcrumb|share|social|subscribe|newsletter|modal)[^'"]*\2[^>]*>[\s\S]*?<\/\1>/gi, " ");
+var collectHtmlContentCandidates = (value) => {
+  const patterns = [
+    {
+      contentGroup: 1,
+      pattern: /<main\b[^>]*>([\s\S]*?)<\/main>/gi
+    },
+    {
+      contentGroup: 1,
+      pattern: /<article\b[^>]*>([\s\S]*?)<\/article>/gi
+    },
+    {
+      contentGroup: 3,
+      pattern: /<([a-z0-9:_-]+)\b[^>]*\brole=(['"])main\2[^>]*>([\s\S]*?)<\/\1>/gi
+    },
+    {
+      contentGroup: 4,
+      pattern: /<([a-z0-9:_-]+)\b[^>]*\b(?:id|class)=(['"])[^'"]*(content|article|main|post|body)[^'"]*\2[^>]*>([\s\S]*?)<\/\1>/gi
+    }
+  ];
+  const candidates = [];
+  for (const entry of patterns) {
+    for (const match of value.matchAll(entry.pattern)) {
+      const rawCandidate = match[entry.contentGroup];
+      const candidate = typeof rawCandidate === "string" ? rawCandidate : "";
+      if (candidate.trim()) {
+        candidates.push(candidate.trim());
+      }
+    }
+  }
+  return candidates;
+};
 var extractMainHtmlContent = (value) => {
   const trimmed = value.trim();
   if (!/<html\b|<body\b|<main\b|<article\b/i.test(trimmed)) {
     return value;
   }
-  const boilerplateStripped = trimmed.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, " ").replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, " ").replace(/<(nav|footer|header|aside|form)\b[^>]*>[\s\S]*?<\/\1>/gi, " ");
-  const mainMatch = boilerplateStripped.match(/<main\b[^>]*>([\s\S]*?)<\/main>/i);
-  if (mainMatch?.[1]) {
-    return mainMatch[1];
-  }
-  const articleMatches = [
-    ...boilerplateStripped.matchAll(/<article\b[^>]*>([\s\S]*?)<\/article>/gi)
-  ].map((match) => match[1]?.trim()).filter(Boolean);
-  if (articleMatches.length > 0) {
-    return articleMatches.join(`
-`);
-  }
-  const roleMainMatch = boilerplateStripped.match(/<([a-z0-9:_-]+)\b[^>]*\brole=(['"])main\2[^>]*>([\s\S]*?)<\/\1>/i);
-  if (roleMainMatch?.[3]) {
-    return roleMainMatch[3];
+  const stripped = stripHtmlNoiseBlocks(trimmed);
+  const candidates = collectHtmlContentCandidates(stripped);
+  if (candidates.length > 0) {
+    const bestCandidate = candidates.map((candidate) => ({
+      candidate,
+      score: stripHtmlTags(candidate).replace(/\s+/g, " ").trim().length
+    })).sort((left, right) => right.score - left.score)[0]?.candidate;
+    if (bestCandidate) {
+      return bestCandidate;
+    }
   }
-  const bodyMatch = boilerplateStripped.match(/<body\b[^>]*>([\s\S]*?)<\/body>/i);
+  const bodyMatch = stripped.match(/<body\b[^>]*>([\s\S]*?)<\/body>/i);
   if (bodyMatch?.[1]) {
     return bodyMatch[1];
   }
-  return boilerplateStripped;
+  return stripped;
 };
 var stripHtml = (value) => {
   const focused = extractMainHtmlContent(value);
@@ -8779,6 +8930,93 @@ var stripMarkdown = (value) => {
 `);
   return normalizeWhitespace(stripped);
 };
+var pdfNativeStructureUnits = (metadata) => {
+  const blocks = Array.isArray(metadata?.pdfTextBlocks) ? metadata.pdfTextBlocks : [];
+  const units = [];
+  for (const block of blocks) {
+    if (!block || typeof block !== "object") {
+      continue;
+    }
+    const text = typeof block.text === "string" ? normalizeWhitespace(block.text) : "";
+    if (!text) {
+      continue;
+    }
+    const pageNumber = typeof block.pageNumber === "number" && Number.isFinite(block.pageNumber) ? block.pageNumber : undefined;
+    const pdfBlockNumber = typeof block.blockNumber === "number" && Number.isFinite(block.blockNumber) ? block.blockNumber : undefined;
+    const pdfTextKind = block.textKind === "table_like" ? "table_like" : "paragraph";
+    const sectionTitle = pageNumber ? pdfTextKind === "table_like" ? `Page ${pageNumber} Table Block` : `Page ${pageNumber} Text Block` : pdfTextKind === "table_like" ? "Table Block" : "Text Block";
+    units.push({
+      pageNumber,
+      pdfBlockNumber,
+      pdfTextKind,
+      preferredChunkUnits: pdfTextKind === "table_like" ? text.split(`
+`).filter(Boolean) : undefined,
+      sectionDepth: 1,
+      sectionKind: "pdf_block",
+      sectionPath: [sectionTitle],
+      sectionTitle,
+      text
+    });
+  }
+  return units;
+};
+var officeNativeStructureUnits = (metadata) => {
+  const blocks = Array.isArray(metadata?.officeBlocks) ? metadata.officeBlocks : [];
+  const units = [];
+  const headingStack = [];
+  const decorateOfficeSectionText = (text, sectionTitle) => {
+    if (!sectionTitle || text.includes(sectionTitle)) {
+      return text;
+    }
+    return normalizeWhitespace(`${sectionTitle}
+${text}`);
+  };
+  for (const [index, block] of blocks.entries()) {
+    if (!block || typeof block !== "object") {
+      continue;
+    }
+    const text = typeof block.text === "string" ? normalizeWhitespace(block.text) : "";
+    if (!text) {
+      continue;
+    }
+    const officeBlockNumber = typeof block.blockNumber === "number" && Number.isFinite(block.blockNumber) ? block.blockNumber : undefined;
+    const officeBlockKind = block.blockKind === "title" || block.blockKind === "heading" || block.blockKind === "list" || block.blockKind === "table" ? block.blockKind : "paragraph";
+    const headingLevel = typeof block.headingLevel === "number" && Number.isFinite(block.headingLevel) ? block.headingLevel : undefined;
+    if (officeBlockKind === "title" || officeBlockKind === "heading") {
+      const level = officeBlockKind === "title" ? 1 : headingLevel ?? 1;
+      headingStack[level - 1] = text;
+      headingStack.length = level;
+      const nextBlock = blocks[index + 1];
+      const nextKind = nextBlock && typeof nextBlock === "object" ? nextBlock.blockKind : undefined;
+      if (nextKind === "title" || nextKind === "heading" || nextKind === "list" || nextKind === "table" || !nextBlock) {
+        units.push({
+          officeBlockKind,
+          officeBlockNumber,
+          sectionDepth: headingStack.length,
+          sectionKind: "office_heading",
+          sectionPath: [...headingStack],
+          sectionTitle: text,
+          text
+        });
+      }
+      continue;
+    }
+    const sectionPath = headingStack.length > 0 ? [...headingStack] : undefined;
+    const sectionTitle = sectionPath?.at(-1);
+    units.push({
+      officeBlockKind,
+      officeBlockNumber,
+      preferredChunkUnits: officeBlockKind === "table" ? text.split(`
+`).filter(Boolean) : undefined,
+      sectionDepth: sectionPath?.length,
+      sectionKind: officeBlockKind === "paragraph" ? "office_heading" : "office_block",
+      sectionPath,
+      sectionTitle,
+      text: officeBlockKind === "paragraph" ? decorateOfficeSectionText(text, sectionTitle) : text
+    });
+  }
+  return units;
+};
 var markdownStructureUnits = (value) => {
   const lines = value.replace(/\r\n?/g, `
 `).split(`
@@ -9122,6 +9360,7 @@ var appendPdfLineBreak = (parts) => {
   parts.push(`
 `);
 };
+var PDF_CHROME_LINE_MAX_LENGTH = 80;
 var PDF_TEXT_OPERATOR_PATTERN = /(\[((?:\\.|[^\]])*)\]\s*TJ)|(\(((?:\\.|[^\\)])*)\)\s*Tj)|([-+]?\d*\.?\d+\s+[-+]?\d*\.?\d+\s+\(((?:\\.|[^\\)])*)\)\s*")|(\(((?:\\.|[^\\)])*)\)\s*')|((?:[-+]?\d*\.?\d+\s+){2}(?:Td|TD))|(T\*)|((?:[-+]?\d*\.?\d+\s+){6}Tm)/g;
 var extractTextFromPDFTextObject = (value) => {
   const parts = [];
@@ -9150,19 +9389,84 @@ var extractTextFromPDFTextObject = (value) => {
   }
   return parts.join("");
 };
-var extractTextFromPDFBytes = (data) => {
-  const raw = Buffer.from(data).toString("latin1");
-  const textObjects = [...raw.matchAll(/BT([\s\S]*?)ET/g)].map((match) => extractTextFromPDFTextObject(match[1] ?? "")).filter(Boolean);
-  const combined = textObjects.length > 0 ? textObjects.join(`
-`) : [...raw.matchAll(/\(((?:\\.|[^\\)])*)\)\s*Tj/g)].map((match) => decodePdfLiteral(match[1] ?? "")).join(`
-`);
-  return normalizeWhitespace(combined);
+var buildPDFNativeTextBlock = (text, blockNumber, pageNumber) => {
+  const normalized = normalizeWhitespace(text);
+  if (!normalized) {
+    return;
+  }
+  const lineCount = normalized.split(`
+`).filter(Boolean).length;
+  const textKind = normalized.includes(" | ") ? "table_like" : "paragraph";
+  return {
+    blockNumber,
+    lineCount,
+    pageNumber,
+    text: normalized,
+    textKind
+  };
+};
+var isLikelyPDFPageLabel = (value) => /^page\s+\d+(?:\s+of\s+\d+)?$/i.test(value.trim());
+var suppressRepeatedPDFChrome = (blocks) => {
+  const linePages = new Map;
+  for (const block of blocks) {
+    for (const line of block.text.split(`
+`)) {
+      const normalized = normalizeWhitespace(line);
+      if (!normalized || normalized.length > PDF_CHROME_LINE_MAX_LENGTH) {
+        continue;
+      }
+      const pages = linePages.get(normalized) ?? new Set;
+      pages.add(block.pageNumber);
+      linePages.set(normalized, pages);
+    }
+  }
+  return blocks.map((block) => {
+    const keptLines = block.text.split(`
+`).map((line) => normalizeWhitespace(line)).filter((line) => {
+      if (!line) {
+        return false;
+      }
+      if (isLikelyPDFPageLabel(line)) {
+        return false;
+      }
+      const repeatedPages = linePages.get(line);
+      if (line.length <= PDF_CHROME_LINE_MAX_LENGTH && repeatedPages && repeatedPages.size > 1) {
+        return false;
+      }
+      return true;
+    });
+    const text = normalizeWhitespace(keptLines.join(`
+`));
+    if (!text) {
+      return;
+    }
+    return buildPDFNativeTextBlock(text, block.blockNumber, block.pageNumber);
+  }).filter((value) => Boolean(value));
 };
-var estimatePDFPageCount = (data) => {
+var extractNativePDFText = (data) => {
   const raw = Buffer.from(data).toString("latin1");
   const count = [...raw.matchAll(/\/Type\s*\/Page\b/g)].length;
-  return count > 0 ? count : 1;
+  const pageCount = count > 0 ? count : 1;
+  const pageMarkers = [...raw.matchAll(/\/Type\s*\/Page\b/g)].map((match) => match.index ?? raw.length);
+  const blocks = [...raw.matchAll(/BT([\s\S]*?)ET/g)].map((match, index) => {
+    const blockText = extractTextFromPDFTextObject(match[1] ?? "");
+    const objectEnd = (match.index ?? 0) + (match[0]?.length ?? 0);
+    const pageIndex = pageMarkers.findIndex((marker) => marker >= objectEnd);
+    const pageNumber = pageIndex >= 0 ? pageIndex + 1 : pageCount;
+    return buildPDFNativeTextBlock(blockText, index + 1, pageNumber);
+  }).filter((value) => Boolean(value));
+  const visibleBlocks = suppressRepeatedPDFChrome(blocks);
+  const fallbackText = [...raw.matchAll(/\(((?:\\.|[^\\)])*)\)\s*Tj/g)].map((match) => decodePdfLiteral(match[1] ?? "")).join(`
+`);
+  const text = visibleBlocks.length > 0 ? normalizeWhitespace(visibleBlocks.map((block) => block.text).join(`
+`)) : normalizeWhitespace(fallbackText);
+  return {
+    pageCount,
+    text,
+    textBlockCount: visibleBlocks.length,
+    textBlocks: visibleBlocks
+  };
 };
 var readUInt16LE = (data, offset) => data[offset] | data[offset + 1] << 8;
 var readUInt32LE = (data, offset) => (data[offset] | data[offset + 1] << 8 | data[offset + 2] << 16 | data[offset + 3] << 24) >>> 0;
@@ -9249,35 +9553,64 @@ var decodeGzipEntries = (data, input) => {
 var extractXmlText = (value) => normalizeWhitespace(decodeHtmlEntities(value.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ")));
 var extractOfficeParagraphText = (value) => normalizeWhitespace(decodeHtmlEntities(value.replace(/<w:tab\b[^>]*\/>/gi, "\t").replace(/<w:br\b[^>]*\/>/gi, `
 `).replace(/<[^>]+>/g, " ")));
-var officeDocumentParagraphs = (entries) => {
+var officeDocumentBlocks = (entries) => {
   const documentEntry = entries.find((entry) => entry.path === "word/document.xml");
   if (!documentEntry) {
     return [];
   }
   const xml = decodeUtf8(documentEntry.data);
-  const paragraphs = [...xml.matchAll(/<w:p\b[\s\S]*?<\/w:p>/g)];
-  return paragraphs.map((match) => {
-    const paragraphXml = match[0] ?? "";
-    const text = extractOfficeParagraphText(paragraphXml);
+  const bodyMatch = xml.match(/<w:body\b[^>]*>([\s\S]*?)<\/w:body>/i);
+  const body = bodyMatch?.[1] ?? xml;
+  const blocks = [];
+  const blockPattern = /<(w:p|w:tbl)\b[\s\S]*?<\/\1>/g;
+  for (const match of body.matchAll(blockPattern)) {
+    const blockXml = match[0] ?? "";
+    if (blockXml.startsWith("<w:tbl")) {
+      const rows = [...blockXml.matchAll(/<w:tr\b[\s\S]*?<\/w:tr>/g)].map((rowMatch, rowIndex) => {
+        const cells = [
+          ...(rowMatch[0] ?? "").matchAll(/<w:tc\b[\s\S]*?<\/w:tc>/g)
+        ].map((cellMatch) => extractOfficeParagraphText(cellMatch[0] ?? "")).filter(Boolean);
+        if (cells.length === 0) {
+          return "";
+        }
+        return `Row ${rowIndex + 1}. ${cells.map((cell, cellIndex) => `${String.fromCharCode(65 + cellIndex)}: ${cell}`).join(" | ")}`;
+      }).filter(Boolean);
+      const text2 = normalizeWhitespace(rows.join(`
+`));
+      if (!text2) {
+        continue;
+      }
+      blocks.push({
+        blockKind: "table",
+        blockNumber: blocks.length + 1,
+        text: text2
+      });
+      continue;
+    }
+    const text = extractOfficeParagraphText(blockXml);
     if (!text) {
-      return "";
+      continue;
     }
-    const styleMatch = paragraphXml.match(/<w:pStyle\b[^>]*w:val="([^"]+)"[^>]*\/?>/i);
+    const styleMatch = blockXml.match(/<w:pStyle\b[^>]*w:val="([^"]+)"[^>]*\/?>/i);
     const style = (styleMatch?.[1] ?? "").toLowerCase();
-    if (style === "title") {
-      return text;
-    }
     const headingMatch = style.match(/^heading([1-6])$/);
-    if (headingMatch) {
-      return text;
-    }
-    return text;
-  }).filter(Boolean);
+    const isListParagraph = /<w:numPr\b/i.test(blockXml) || style.includes("list") || style.includes("bullet");
+    const blockKind = style === "title" ? "title" : headingMatch ? "heading" : isListParagraph ? "list" : "paragraph";
+    const decoratedText = blockKind === "list" && !/^[-*]\s/.test(text) ? `- ${text}` : text;
+    blocks.push({
+      blockKind,
+      blockNumber: blocks.length + 1,
+      headingLevel: headingMatch ? Number.parseInt(headingMatch[1] ?? "1", 10) : undefined,
+      style: style || undefined,
+      text: decoratedText
+    });
+  }
+  return blocks;
 };
 var officeDocumentText = (entries) => {
-  const paragraphs = officeDocumentParagraphs(entries);
-  if (paragraphs.length > 0) {
-    return normalizeWhitespace(paragraphs.join(`
+  const blocks = officeDocumentBlocks(entries);
+  if (blocks.length > 0) {
+    return normalizeWhitespace(blocks.map((block) => block.text).join(`
 `));
   }
@@ -9288,11 +9621,7 @@ var officeDocumentText = (entries) => {
   return extractXmlText(decodeUtf8(documentEntry.data));
 };
 var officeDocumentSectionCount = (entries) => {
-  const documentEntry = entries.find((entry) => entry.path === "word/document.xml");
-  if (!documentEntry) {
-    return;
-  }
-  const count = [...decodeUtf8(documentEntry.data).matchAll(/<w:p\b/g)].length;
+  const count = officeDocumentBlocks(entries).length;
   return count > 0 ? count : undefined;
 };
 var spreadsheetSharedStrings = (entries) => entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
@@ -9822,8 +10151,10 @@ var createOfficeDocumentExtractor = () => ({
     let officeMetadata = {};
     let structuredDocuments = [];
     if (extension === ".docx" || extension === ".odt") {
+      const officeBlocks = officeDocumentBlocks(entries);
       text = officeDocumentText(entries);
       officeMetadata = {
+        officeBlocks,
         sectionCount: officeDocumentSectionCount(entries)
       };
     } else if (extension === ".xlsx" || extension === ".ods") {
@@ -10013,8 +10344,8 @@ var createPDFFileExtractor = () => ({
   name: "absolute_pdf",
   supports: pdfExtractorSupports,
   extract: (input) => {
-    const text = extractTextFromPDFBytes(input.data);
-    if (!text) {
+    const extracted = extractNativePDFText(input.data);
+    if (!extracted.text) {
       throw new Error("AbsoluteJS could not extract readable text from this PDF. Supply a custom extractor for scanned or image-only PDFs.");
     }
     return {
@@ -10024,10 +10355,12 @@ var createPDFFileExtractor = () => ({
       metadata: {
         ...input.metadata ?? {},
         fileKind: "pdf",
-        pageCount: estimatePDFPageCount(input.data)
+        pageCount: extracted.pageCount,
+        pdfTextBlockCount: extracted.textBlockCount,
+        pdfTextBlocks: extracted.textBlocks
       },
       source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
-      text,
+      text: extracted.text,
       title: input.title
     };
   }
@@ -10052,7 +10385,8 @@ var createRAGPDFOCRExtractor = (options) => ({
   name: `absolute_pdf_ocr:${options.provider.name}`,
   supports: pdfExtractorSupports,
   extract: async (input) => {
-    const nativeText = extractTextFromPDFBytes(input.data);
+    const extracted = extractNativePDFText(input.data);
+    const nativeText = extracted.text;
     const minLength = options.minExtractedTextLength ?? 80;
     const shouldUseNativeText = !options.alwaysOCR && nativeText.length >= minLength;
     if (shouldUseNativeText) {
@@ -10063,7 +10397,9 @@ var createRAGPDFOCRExtractor = (options) => ({
         metadata: {
           ...input.metadata ?? {},
           fileKind: "pdf",
-          pageCount: estimatePDFPageCount(input.data),
+          pageCount: extracted.pageCount,
+          pdfTextBlockCount: extracted.textBlockCount,
+          pdfTextBlocks: extracted.textBlocks,
           pdfTextMode: "native"
         },
         source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
@@ -10078,7 +10414,7 @@ var createRAGPDFOCRExtractor = (options) => ({
     const baseMetadata = {
       ...ocrMetadata(ocr),
       fileKind: "pdf",
-      pageCount: estimatePDFPageCount(input.data),
+      pageCount: extracted.pageCount,
       pdfTextMode: "ocr"
     };
     const summaryDocument = {
@@ -10251,6 +10587,18 @@ var sourceAwareUnits = (document, format, normalizedText) => {
     }
     case "text":
     default:
+      if (document.metadata?.fileKind === "office") {
+        const sections = officeNativeStructureUnits(document.metadata);
+        if (sections.length > 0) {
+          return sections;
+        }
+      }
+      if (document.metadata?.fileKind === "pdf") {
+        const sections = pdfNativeStructureUnits(document.metadata);
+        if (sections.length > 0) {
+          return sections;
+        }
+      }
       if (document.metadata?.sourceNativeKind === "spreadsheet_sheet") {
         return spreadsheetStructureUnits(normalizedText, document.metadata);
       }
@@ -10574,6 +10922,11 @@ var prepareRAGDocument = (document, defaultChunking, chunkingRegistry) => {
         ...sectionTitle ? { sectionTitle } : {},
         ...sectionPath && sectionPath.length > 0 ? { sectionPath } : {},
         ...typeof entry.sectionDepth === "number" ? { sectionDepth: entry.sectionDepth } : {},
+        ...typeof entry.pageNumber === "number" ? { pageNumber: entry.pageNumber } : {},
+        ...typeof entry.officeBlockNumber === "number" ? { officeBlockNumber: entry.officeBlockNumber } : {},
+        ...entry.officeBlockKind ? { officeBlockKind: entry.officeBlockKind } : {},
+        ...typeof entry.pdfBlockNumber === "number" ? { pdfBlockNumber: entry.pdfBlockNumber } : {},
+        ...entry.pdfTextKind ? { pdfTextKind: entry.pdfTextKind } : {},
         ...entry.sectionKind ? { sectionKind: entry.sectionKind } : {},
         ...sectionChunkId ? { sectionChunkId } : {},
         ...sectionChunkId && sectionChunkIndex >= 0 ? {
@@ -10962,9 +11315,25 @@ var annotateRetrievalChannels = (input) => {
     };
   });
 };
+var getStructuredSectionScoreWeight2 = (metadata) => {
+  const pdfTextKind = typeof metadata?.pdfTextKind === "string" ? metadata.pdfTextKind : undefined;
+  const officeBlockKind = typeof metadata?.officeBlockKind === "string" ? metadata.officeBlockKind : undefined;
+  const sectionKind = typeof metadata?.sectionKind === "string" ? metadata.sectionKind : undefined;
+  if (pdfTextKind === "table_like") {
+    return 1.28;
+  }
+  if (officeBlockKind === "table" || officeBlockKind === "list") {
+    return 1.24;
+  }
+  if (sectionKind === "pdf_block" || sectionKind === "office_block" || officeBlockKind === "paragraph" || pdfTextKind === "paragraph") {
+    return 1.12;
+  }
+  return 1;
+};
 var buildTraceSectionCounts = (results) => {
   const sections = new Map;
   for (const result of results) {
+    const weightedScore = result.score * getStructuredSectionScoreWeight2(result.metadata);
     const path = Array.isArray(result.metadata?.sectionPath) ? result.metadata.sectionPath.filter((value) => typeof value === "string" && value.trim().length > 0) : [];
     if (path.length === 0) {
       continue;
@@ -10991,6 +11360,7 @@ var buildTraceSectionCounts = (results) => {
 var buildTraceSectionScores = (results) => {
   const sections = new Map;
   for (const result of results) {
+    const weightedScore = result.score * getStructuredSectionScoreWeight2(result.metadata);
     const path = Array.isArray(result.metadata?.sectionPath) ? result.metadata.sectionPath.filter((value) => typeof value === "string" && value.trim().length > 0) : [];
     if (path.length === 0) {
       continue;
@@ -10998,13 +11368,13 @@ var buildTraceSectionScores = (results) => {
     const key = path.join(" > ");
     const existing = sections.get(key);
     if (existing) {
-      existing.totalScore += result.score;
+      existing.totalScore += weightedScore;
       continue;
     }
     sections.set(key, {
       key,
       label: path.at(-1) ?? key,
-      totalScore: result.score
+      totalScore: weightedScore
     });
   }
   return [...sections.values()].sort((left, right) => {
@@ -11443,11 +11813,32 @@ var renderSourceLabels = (input) => {
   ].filter((row) => row.length > 0);
   return rows.length > 0 ? `<ul class="rag-source-labels">${rows.join("")}</ul>` : "";
 };
+var formatStructureKindLabel = (kind) => {
+  switch (kind) {
+    case "markdown_heading":
+      return "Markdown heading";
+    case "html_heading":
+      return "HTML heading";
+    case "office_heading":
+      return "Office heading";
+    case "office_block":
+      return "Office block";
+    case "pdf_block":
+      return "PDF block";
+    case "spreadsheet_rows":
+      return "Spreadsheet rows";
+    case "presentation_slide":
+      return "Presentation slide";
+    default:
+      return;
+  }
+};
 var renderChunkStructure = (structure) => {
   if (!structure) {
     return "";
   }
   const rows = [
+    structure.section?.kind ? `<li><strong>Kind</strong> ${escapeHtml2(formatStructureKindLabel(structure.section.kind) ?? structure.section.kind)}</li>` : "",
     structure.section?.title ? `<li><strong>Section</strong> ${escapeHtml2(structure.section.title)}</li>` : "",
     structure.section?.path && structure.section.path.length > 1 ? `<li><strong>Section path</strong> ${escapeHtml2(structure.section.path.join(" > "))}</li>` : "",
     typeof structure.sequence?.sectionChunkIndex === "number" && typeof structure.sequence?.sectionChunkCount === "number" ? `<li><strong>Section chunk</strong> ${structure.sequence.sectionChunkIndex + 1} of ${structure.sequence.sectionChunkCount}</li>` : "",
@@ -23887,5 +24278,5 @@ export {
   aiChat
 };
-//# debugId=3A168E4E2E133AED64756E2164756E21
+//# debugId=23520EDE705830A964756E2164756E21
 //# sourceMappingURL=index.js.map