@absolutejs/absolute 0.19.0-beta.617 → 0.19.0-beta.619

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/ai/index.js CHANGED
@@ -2054,6 +2054,7 @@ var createRAGFileRetrievalLaneHandoffDecisionStore = (path) => ({
2054
2054
  });
2055
2055
  var createRAGFileRetrievalReleaseIncidentStore = (path) => ({
2056
2056
  listIncidents: async ({
2057
+ corpusGroupKey,
2057
2058
  groupKey,
2058
2059
  limit,
2059
2060
  severity,
@@ -2070,7 +2071,7 @@ var createRAGFileRetrievalReleaseIncidentStore = (path) => ({
2070
2071
  throw error;
2071
2072
  }
2072
2073
  }
2073
- const filtered = parsed.filter((entry) => (!groupKey || entry.groupKey === groupKey) && (!targetRolloutLabel || entry.targetRolloutLabel === targetRolloutLabel) && (!severity || entry.severity === severity) && (!status || entry.status === status));
2074
+ const filtered = parsed.filter((entry) => (!corpusGroupKey || entry.corpusGroupKey === corpusGroupKey) && (!groupKey || entry.groupKey === groupKey) && (!targetRolloutLabel || entry.targetRolloutLabel === targetRolloutLabel) && (!severity || entry.severity === severity) && (!status || entry.status === status));
2074
2075
  const sorted = normalizeRetrievalReleaseIncidentRecords(filtered);
2075
2076
  return typeof limit === "number" ? sorted.slice(0, limit) : sorted;
2076
2077
  },
@@ -3053,12 +3054,14 @@ var loadRAGRetrievalLaneHandoffDecisions = async ({
3053
3054
  })));
3054
3055
  var loadRAGRetrievalReleaseIncidents = async ({
3055
3056
  store,
3057
+ corpusGroupKey,
3056
3058
  groupKey,
3057
3059
  limit,
3058
3060
  targetRolloutLabel,
3059
3061
  status,
3060
3062
  severity
3061
3063
  }) => normalizeRetrievalReleaseIncidentRecords(await Promise.resolve(store.listIncidents({
3064
+ corpusGroupKey,
3062
3065
  groupKey,
3063
3066
  limit,
3064
3067
  severity,
@@ -4234,6 +4237,25 @@ var buildContextLabel2 = (metadata) => {
4234
4237
  if (!metadata) {
4235
4238
  return;
4236
4239
  }
4240
+ const pdfTextKind = getContextString2(metadata.pdfTextKind);
4241
+ const officeBlockKind = getContextString2(metadata.officeBlockKind);
4242
+ const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
4243
+ const sectionTitle = getContextString2(metadata.sectionTitle) ?? sectionPath.at(-1);
4244
+ if (pdfTextKind === "table_like" && sectionTitle) {
4245
+ return `PDF table block ${sectionTitle}`;
4246
+ }
4247
+ if (pdfTextKind === "paragraph" && sectionTitle) {
4248
+ return `PDF text block ${sectionTitle}`;
4249
+ }
4250
+ if (officeBlockKind === "table" && sectionTitle) {
4251
+ return `Office table block ${sectionTitle}`;
4252
+ }
4253
+ if (officeBlockKind === "list" && sectionTitle) {
4254
+ return `Office list block ${sectionTitle}`;
4255
+ }
4256
+ if (officeBlockKind === "paragraph" && sectionTitle) {
4257
+ return `Office paragraph block ${sectionTitle}`;
4258
+ }
4237
4259
  const emailKind = getContextString2(metadata.emailKind);
4238
4260
  if (emailKind === "attachment") {
4239
4261
  return "Attachment evidence";
@@ -4270,8 +4292,6 @@ var buildContextLabel2 = (metadata) => {
4270
4292
  if (speaker) {
4271
4293
  return `Speaker ${speaker}`;
4272
4294
  }
4273
- const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
4274
- const sectionTitle = getContextString2(metadata.sectionTitle) ?? sectionPath.at(-1);
4275
4295
  if (sectionTitle) {
4276
4296
  return `Section ${sectionTitle}`;
4277
4297
  }
@@ -4281,11 +4301,21 @@ var buildLocatorLabel2 = (metadata, source, title) => {
4281
4301
  if (!metadata) {
4282
4302
  return;
4283
4303
  }
4304
+ const pdfTextKind = getContextString2(metadata.pdfTextKind);
4305
+ const officeBlockKind = getContextString2(metadata.officeBlockKind);
4306
+ const pdfBlockNumber = getContextNumber2(metadata.pdfBlockNumber);
4307
+ const officeBlockNumber = getContextNumber2(metadata.officeBlockNumber);
4284
4308
  const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
4285
4309
  const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
4286
4310
  if (page && region) {
4287
4311
  return `Page ${page} \xB7 Region ${region}`;
4288
4312
  }
4313
+ if (page && pdfBlockNumber && pdfTextKind === "table_like") {
4314
+ return `Page ${page} \xB7 Table Block ${pdfBlockNumber}`;
4315
+ }
4316
+ if (page && pdfBlockNumber) {
4317
+ return `Page ${page} \xB7 Text Block ${pdfBlockNumber}`;
4318
+ }
4289
4319
  if (page) {
4290
4320
  return `Page ${page}`;
4291
4321
  }
@@ -4314,6 +4344,15 @@ var buildLocatorLabel2 = (metadata, source, title) => {
4314
4344
  if (mediaStart) {
4315
4345
  return `Timestamp ${mediaStart}`;
4316
4346
  }
4347
+ if (officeBlockNumber && officeBlockKind === "table") {
4348
+ return `Office table block ${officeBlockNumber}`;
4349
+ }
4350
+ if (officeBlockNumber && officeBlockKind === "list") {
4351
+ return `Office list block ${officeBlockNumber}`;
4352
+ }
4353
+ if (officeBlockNumber && officeBlockKind === "paragraph") {
4354
+ return `Office paragraph block ${officeBlockNumber}`;
4355
+ }
4317
4356
  const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
4318
4357
  if (sectionPath.length > 0) {
4319
4358
  return `Section ${sectionPath.join(" > ")}`;
@@ -4331,12 +4370,16 @@ var buildProvenanceLabel2 = (metadata) => {
4331
4370
  const mediaKind = getContextString2(metadata.mediaKind);
4332
4371
  const transcriptSource = getContextString2(metadata.transcriptSource);
4333
4372
  const pdfTextMode = getContextString2(metadata.pdfTextMode);
4373
+ const pdfTextKind = getContextString2(metadata.pdfTextKind);
4374
+ const officeBlockKind = getContextString2(metadata.officeBlockKind);
4334
4375
  const ocrEngine = getContextString2(metadata.ocrEngine);
4335
4376
  const extractorRegistryMatch = getContextString2(metadata.extractorRegistryMatch);
4336
4377
  const chunkingProfile = getContextString2(metadata.chunkingProfile);
4337
4378
  const ocrConfidence = getContextNumber2(metadata.ocrRegionConfidence) ?? getContextNumber2(metadata.ocrConfidence);
4338
4379
  const labels = [
4339
4380
  pdfTextMode ? `PDF ${pdfTextMode}` : "",
4381
+ pdfTextKind === "table_like" ? "PDF table block" : pdfTextKind === "paragraph" ? "PDF text block" : "",
4382
+ officeBlockKind ? `Office ${officeBlockKind}` : "",
4340
4383
  ocrEngine ? `OCR ${ocrEngine}` : "",
4341
4384
  extractorRegistryMatch ? `Extractor ${extractorRegistryMatch}` : "",
4342
4385
  chunkingProfile ? `Chunking ${chunkingProfile}` : "",
@@ -4372,7 +4415,7 @@ var buildRAGChunkStructure = (metadata) => {
4372
4415
  return;
4373
4416
  }
4374
4417
  const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.filter((value) => typeof value === "string" && value.trim().length > 0) : undefined;
4375
- const sectionKind = metadata.sectionKind === "markdown_heading" || metadata.sectionKind === "html_heading" || metadata.sectionKind === "office_heading" || metadata.sectionKind === "spreadsheet_rows" || metadata.sectionKind === "presentation_slide" ? metadata.sectionKind : undefined;
4418
+ const sectionKind = metadata.sectionKind === "markdown_heading" || metadata.sectionKind === "html_heading" || metadata.sectionKind === "office_heading" || metadata.sectionKind === "office_block" || metadata.sectionKind === "pdf_block" || metadata.sectionKind === "spreadsheet_rows" || metadata.sectionKind === "presentation_slide" ? metadata.sectionKind : undefined;
4376
4419
  const section = {
4377
4420
  depth: getContextNumber2(metadata.sectionDepth),
4378
4421
  kind: sectionKind,
@@ -4692,7 +4735,7 @@ var buildRAGSourceSummaries = (sources) => {
4692
4735
  const citationReferenceMap = buildRAGCitationReferenceMap(citations);
4693
4736
  return sourceGroups.map((group) => {
4694
4737
  const groupCitations = citations.filter((citation) => group.chunks.some((chunk) => chunk.chunkId === citation.chunkId));
4695
- const leadChunk = group.chunks.slice().sort((left, right) => right.score - left.score)[0];
4738
+ const leadChunk = getPreferredSourceLeadChunk(group.chunks);
4696
4739
  const excerpts = leadChunk ? buildRAGChunkExcerpts(group.chunks, leadChunk.chunkId) : undefined;
4697
4740
  const structure = leadChunk?.structure ?? buildRAGChunkStructure(leadChunk?.metadata);
4698
4741
  const excerptSelection = buildRAGExcerptSelection(excerpts, structure);
@@ -4720,13 +4763,45 @@ var getSectionPathFromSource = (source) => {
4720
4763
  const path = source.structure?.section?.path ?? (Array.isArray(source.metadata?.sectionPath) ? source.metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : []);
4721
4764
  return path.length > 0 ? path : undefined;
4722
4765
  };
4766
+ var isBlockAwareContextLabel = (value) => typeof value === "string" && (value.startsWith("PDF ") || value.startsWith("Office "));
4767
+ var getStructuredSectionScoreWeight = (metadata) => {
4768
+ if (!metadata) {
4769
+ return 1;
4770
+ }
4771
+ const pdfTextKind = getContextString2(metadata.pdfTextKind);
4772
+ const officeBlockKind = getContextString2(metadata.officeBlockKind);
4773
+ const sectionKind = getContextString2(metadata.sectionKind);
4774
+ if (pdfTextKind === "table_like") {
4775
+ return 1.28;
4776
+ }
4777
+ if (officeBlockKind === "table" || officeBlockKind === "list") {
4778
+ return 1.24;
4779
+ }
4780
+ if (sectionKind === "pdf_block" || sectionKind === "office_block" || officeBlockKind === "paragraph" || pdfTextKind === "paragraph") {
4781
+ return 1.12;
4782
+ }
4783
+ return 1;
4784
+ };
4785
+ var getStructuredSourceLeadScore = (source) => source.score * getStructuredSectionScoreWeight(source.metadata);
4786
+ var getPreferredSourceLeadChunk = (chunks) => chunks.slice().sort((left, right) => {
4787
+ const leftWeightedScore = getStructuredSourceLeadScore(left);
4788
+ const rightWeightedScore = getStructuredSourceLeadScore(right);
4789
+ if (rightWeightedScore !== leftWeightedScore) {
4790
+ return rightWeightedScore - leftWeightedScore;
4791
+ }
4792
+ if (right.score !== left.score) {
4793
+ return right.score - left.score;
4794
+ }
4795
+ return left.chunkId.localeCompare(right.chunkId);
4796
+ })[0];
4723
4797
  var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
4724
- const totalScore = sources.reduce((sum, source) => sum + source.score, 0);
4798
+ const totalScore = sources.reduce((sum, source) => sum + source.score * getStructuredSectionScoreWeight(source.metadata), 0);
4725
4799
  if (sources.length === 0 || totalScore <= 0) {
4726
4800
  return [];
4727
4801
  }
4728
4802
  const sections = new Map;
4729
4803
  for (const source of sources) {
4804
+ const structuredScore = source.score * getStructuredSectionScoreWeight(source.metadata);
4730
4805
  const path = getSectionPathFromSource(source);
4731
4806
  if (!path) {
4732
4807
  continue;
@@ -4758,7 +4833,7 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
4758
4833
  sourceSet: new Set(source.source ? [source.source] : []),
4759
4834
  topChunkId: source.chunkId,
4760
4835
  topSource: source.source,
4761
- totalScore: source.score,
4836
+ totalScore: structuredScore,
4762
4837
  transformedHits,
4763
4838
  variantHits,
4764
4839
  vectorHits
@@ -4766,7 +4841,7 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
4766
4841
  continue;
4767
4842
  }
4768
4843
  existing.count += 1;
4769
- existing.totalScore += source.score;
4844
+ existing.totalScore += structuredScore;
4770
4845
  if (source.source) {
4771
4846
  existing.sourceSet.add(source.source);
4772
4847
  }
@@ -4794,6 +4869,8 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
4794
4869
  const parentTotal = siblingPool.reduce((sum, entry) => sum + entry.totalScore, 0);
4795
4870
  const scoreShare = section.totalScore / totalScore;
4796
4871
  const parentShare = parentTotal > 0 ? section.totalScore / parentTotal : undefined;
4872
+ const topChunk = sources.find((source) => source.chunkId === section.topChunkId);
4873
+ const topContextLabel = topChunk?.labels?.contextLabel ?? buildContextLabel2(topChunk?.metadata);
4797
4874
  const parentDistribution = parentTotal > 0 ? siblingPool.map((entry) => ({
4798
4875
  count: entry.count,
4799
4876
  isActive: entry.key === section.key,
@@ -4919,6 +4996,7 @@ var buildRAGSectionRetrievalDiagnostics = (sources, trace) => {
4919
4996
  reasons.push("concentrated_evidence");
4920
4997
  }
4921
4998
  const summaryParts = [
4999
+ isBlockAwareContextLabel(topContextLabel) ? topContextLabel : "",
4922
5000
  `${section.count} hit${section.count === 1 ? "" : "s"}`,
4923
5001
  `${(scoreShare * 100).toFixed(0)}% score share`,
4924
5002
  `vector ${section.vectorHits} \xB7 lexical ${section.lexicalHits} \xB7 hybrid ${section.hybridHits}`,
@@ -5130,22 +5208,21 @@ var updateSourceGroup = (groups, source) => {
5130
5208
  groups.set(key, buildSourceGroup(source, key));
5131
5209
  return;
5132
5210
  }
5133
- if (source.score > existing.bestScore) {
5134
- existing.bestScore = source.score;
5135
- existing.label = buildSourceLabel2(source);
5136
- existing.labels = source.labels ?? buildRAGSourceLabels({
5137
- metadata: source.metadata,
5138
- source: source.source,
5139
- title: source.title
5140
- });
5141
- existing.structure = source.structure ?? buildRAGChunkStructure(source.metadata);
5142
- existing.source = source.source;
5143
- existing.title = source.title;
5144
- } else {
5145
- existing.bestScore = Math.max(existing.bestScore, source.score);
5146
- }
5211
+ existing.bestScore = Math.max(existing.bestScore, source.score);
5147
5212
  existing.count += 1;
5148
5213
  existing.chunks.push(source);
5214
+ const leadChunk = getPreferredSourceLeadChunk(existing.chunks);
5215
+ if (leadChunk) {
5216
+ existing.label = buildSourceLabel2(leadChunk);
5217
+ existing.labels = leadChunk.labels ?? buildRAGSourceLabels({
5218
+ metadata: leadChunk.metadata,
5219
+ source: leadChunk.source,
5220
+ title: leadChunk.title
5221
+ });
5222
+ existing.structure = leadChunk.structure ?? buildRAGChunkStructure(leadChunk.metadata);
5223
+ existing.source = leadChunk.source;
5224
+ existing.title = leadChunk.title;
5225
+ }
5149
5226
  };
5150
5227
  var getLatestAssistantMessage = (messages) => {
5151
5228
  for (let index = messages.length - 1;index >= 0; index -= 1) {
@@ -8482,6 +8559,55 @@ var scoreLoosePhraseMatch2 = (query, text) => {
8482
8559
  }
8483
8560
  return 0;
8484
8561
  };
8562
+ var queryHasAnyToken = (queryTokens, candidates) => candidates.some((candidate) => queryTokens.includes(candidate));
8563
+ var scoreStructuredEvidenceMatch = (queryTokens, result) => {
8564
+ const metadata = result.metadata ?? {};
8565
+ const pdfTextKind = typeof metadata.pdfTextKind === "string" ? metadata.pdfTextKind : undefined;
8566
+ const officeBlockKind = typeof metadata.officeBlockKind === "string" ? metadata.officeBlockKind : undefined;
8567
+ const hasBlockMetadata = typeof metadata.pdfBlockNumber === "number" || typeof metadata.officeBlockNumber === "number";
8568
+ let score = 0;
8569
+ if (hasBlockMetadata) {
8570
+ score += 0.12;
8571
+ }
8572
+ if (pdfTextKind === "table_like" && queryHasAnyToken(queryTokens, [
8573
+ "table",
8574
+ "row",
8575
+ "rows",
8576
+ "column",
8577
+ "columns",
8578
+ "spreadsheet",
8579
+ "sheet",
8580
+ "workbook"
8581
+ ])) {
8582
+ score += 0.65;
8583
+ }
8584
+ if (officeBlockKind === "table" && queryHasAnyToken(queryTokens, [
8585
+ "table",
8586
+ "row",
8587
+ "rows",
8588
+ "column",
8589
+ "columns",
8590
+ "matrix",
8591
+ "grid"
8592
+ ])) {
8593
+ score += 0.55;
8594
+ }
8595
+ if (officeBlockKind === "list" && queryHasAnyToken(queryTokens, [
8596
+ "list",
8597
+ "checklist",
8598
+ "bullet",
8599
+ "bullets",
8600
+ "step",
8601
+ "steps",
8602
+ "task",
8603
+ "tasks",
8604
+ "item",
8605
+ "items"
8606
+ ])) {
8607
+ score += 0.55;
8608
+ }
8609
+ return score;
8610
+ };
8485
8611
  var scoreHeuristicMatch = ({
8486
8612
  query,
8487
8613
  queryTokens,
@@ -8498,7 +8624,8 @@ var scoreHeuristicMatch = ({
8498
8624
  const exactPhraseBoost = Math.max(normalizeText([result.title, result.source, result.chunkText, ...metadataValues].filter(Boolean).join(" ")).includes(queryTokens.join(" ")) ? 1 : 0, scoreLoosePhraseMatch2(query, [result.title, result.source, result.chunkText, ...metadataValues].filter(Boolean).join(" ")));
8499
8625
  const sourcePathBoost = typeof result.source === "string" && queryTokens.some((token) => result.source?.toLowerCase().includes(token)) ? 0.5 : 0;
8500
8626
  const metadataBoost = metadataValues.length > 0 ? queryTokens.filter((token) => metadataValues.some((value) => value.toLowerCase().includes(token))).length / queryTokens.length : 0;
8501
- return result.score + overlapBoost + exactPhraseBoost + sourcePathBoost + metadataBoost;
8627
+ const structuredEvidenceBoost = scoreStructuredEvidenceMatch(queryTokens, result);
8628
+ return result.score + overlapBoost + exactPhraseBoost + sourcePathBoost + metadataBoost + structuredEvidenceBoost;
8502
8629
  };
8503
8630
  var normalizeText = (value) => tokenize3(value).join(" ");
8504
8631
  var applyRAGReranking = async ({
@@ -8733,32 +8860,59 @@ var stripHtmlTags = (value) => {
8733
8860
  `).replace(/<li\b[^>]*>/gi, "- ").replace(/<[^>]+>/g, " ");
8734
8861
  return decodeHtmlEntities(withoutTags);
8735
8862
  };
8863
+ var stripHtmlNoiseBlocks = (value) => value.replace(/<!--[\s\S]*?-->/g, " ").replace(/<(script|style|template|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, " ").replace(/<([a-z0-9:_-]+)\b[^>]*\b(hidden|aria-hidden=(['"])true\3)[^>]*>[\s\S]*?<\/\1>/gi, " ").replace(/<(nav|footer|header|aside|form|dialog)\b[^>]*>[\s\S]*?<\/\1>/gi, " ").replace(/<([a-z0-9:_-]+)\b[^>]*\b(?:id|class)=(['"])[^'"]*(nav|menu|footer|header|sidebar|promo|banner|cookie|breadcrumb|share|social|subscribe|newsletter|modal)[^'"]*\2[^>]*>[\s\S]*?<\/\1>/gi, " ");
8864
+ var collectHtmlContentCandidates = (value) => {
8865
+ const patterns = [
8866
+ {
8867
+ contentGroup: 1,
8868
+ pattern: /<main\b[^>]*>([\s\S]*?)<\/main>/gi
8869
+ },
8870
+ {
8871
+ contentGroup: 1,
8872
+ pattern: /<article\b[^>]*>([\s\S]*?)<\/article>/gi
8873
+ },
8874
+ {
8875
+ contentGroup: 3,
8876
+ pattern: /<([a-z0-9:_-]+)\b[^>]*\brole=(['"])main\2[^>]*>([\s\S]*?)<\/\1>/gi
8877
+ },
8878
+ {
8879
+ contentGroup: 4,
8880
+ pattern: /<([a-z0-9:_-]+)\b[^>]*\b(?:id|class)=(['"])[^'"]*(content|article|main|post|body)[^'"]*\2[^>]*>([\s\S]*?)<\/\1>/gi
8881
+ }
8882
+ ];
8883
+ const candidates = [];
8884
+ for (const entry of patterns) {
8885
+ for (const match of value.matchAll(entry.pattern)) {
8886
+ const rawCandidate = match[entry.contentGroup];
8887
+ const candidate = typeof rawCandidate === "string" ? rawCandidate : "";
8888
+ if (candidate.trim()) {
8889
+ candidates.push(candidate.trim());
8890
+ }
8891
+ }
8892
+ }
8893
+ return candidates;
8894
+ };
8736
8895
  var extractMainHtmlContent = (value) => {
8737
8896
  const trimmed = value.trim();
8738
8897
  if (!/<html\b|<body\b|<main\b|<article\b/i.test(trimmed)) {
8739
8898
  return value;
8740
8899
  }
8741
- const boilerplateStripped = trimmed.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, " ").replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, " ").replace(/<(nav|footer|header|aside|form)\b[^>]*>[\s\S]*?<\/\1>/gi, " ");
8742
- const mainMatch = boilerplateStripped.match(/<main\b[^>]*>([\s\S]*?)<\/main>/i);
8743
- if (mainMatch?.[1]) {
8744
- return mainMatch[1];
8745
- }
8746
- const articleMatches = [
8747
- ...boilerplateStripped.matchAll(/<article\b[^>]*>([\s\S]*?)<\/article>/gi)
8748
- ].map((match) => match[1]?.trim()).filter(Boolean);
8749
- if (articleMatches.length > 0) {
8750
- return articleMatches.join(`
8751
- `);
8752
- }
8753
- const roleMainMatch = boilerplateStripped.match(/<([a-z0-9:_-]+)\b[^>]*\brole=(['"])main\2[^>]*>([\s\S]*?)<\/\1>/i);
8754
- if (roleMainMatch?.[3]) {
8755
- return roleMainMatch[3];
8900
+ const stripped = stripHtmlNoiseBlocks(trimmed);
8901
+ const candidates = collectHtmlContentCandidates(stripped);
8902
+ if (candidates.length > 0) {
8903
+ const bestCandidate = candidates.map((candidate) => ({
8904
+ candidate,
8905
+ score: stripHtmlTags(candidate).replace(/\s+/g, " ").trim().length
8906
+ })).sort((left, right) => right.score - left.score)[0]?.candidate;
8907
+ if (bestCandidate) {
8908
+ return bestCandidate;
8909
+ }
8756
8910
  }
8757
- const bodyMatch = boilerplateStripped.match(/<body\b[^>]*>([\s\S]*?)<\/body>/i);
8911
+ const bodyMatch = stripped.match(/<body\b[^>]*>([\s\S]*?)<\/body>/i);
8758
8912
  if (bodyMatch?.[1]) {
8759
8913
  return bodyMatch[1];
8760
8914
  }
8761
- return boilerplateStripped;
8915
+ return stripped;
8762
8916
  };
8763
8917
  var stripHtml = (value) => {
8764
8918
  const focused = extractMainHtmlContent(value);
@@ -8776,6 +8930,93 @@ var stripMarkdown = (value) => {
8776
8930
  `);
8777
8931
  return normalizeWhitespace(stripped);
8778
8932
  };
8933
+ var pdfNativeStructureUnits = (metadata) => {
8934
+ const blocks = Array.isArray(metadata?.pdfTextBlocks) ? metadata.pdfTextBlocks : [];
8935
+ const units = [];
8936
+ for (const block of blocks) {
8937
+ if (!block || typeof block !== "object") {
8938
+ continue;
8939
+ }
8940
+ const text = typeof block.text === "string" ? normalizeWhitespace(block.text) : "";
8941
+ if (!text) {
8942
+ continue;
8943
+ }
8944
+ const pageNumber = typeof block.pageNumber === "number" && Number.isFinite(block.pageNumber) ? block.pageNumber : undefined;
8945
+ const pdfBlockNumber = typeof block.blockNumber === "number" && Number.isFinite(block.blockNumber) ? block.blockNumber : undefined;
8946
+ const pdfTextKind = block.textKind === "table_like" ? "table_like" : "paragraph";
8947
+ const sectionTitle = pageNumber ? pdfTextKind === "table_like" ? `Page ${pageNumber} Table Block` : `Page ${pageNumber} Text Block` : pdfTextKind === "table_like" ? "Table Block" : "Text Block";
8948
+ units.push({
8949
+ pageNumber,
8950
+ pdfBlockNumber,
8951
+ pdfTextKind,
8952
+ preferredChunkUnits: pdfTextKind === "table_like" ? text.split(`
8953
+ `).filter(Boolean) : undefined,
8954
+ sectionDepth: 1,
8955
+ sectionKind: "pdf_block",
8956
+ sectionPath: [sectionTitle],
8957
+ sectionTitle,
8958
+ text
8959
+ });
8960
+ }
8961
+ return units;
8962
+ };
8963
+ var officeNativeStructureUnits = (metadata) => {
8964
+ const blocks = Array.isArray(metadata?.officeBlocks) ? metadata.officeBlocks : [];
8965
+ const units = [];
8966
+ const headingStack = [];
8967
+ const decorateOfficeSectionText = (text, sectionTitle) => {
8968
+ if (!sectionTitle || text.includes(sectionTitle)) {
8969
+ return text;
8970
+ }
8971
+ return normalizeWhitespace(`${sectionTitle}
8972
+ ${text}`);
8973
+ };
8974
+ for (const [index, block] of blocks.entries()) {
8975
+ if (!block || typeof block !== "object") {
8976
+ continue;
8977
+ }
8978
+ const text = typeof block.text === "string" ? normalizeWhitespace(block.text) : "";
8979
+ if (!text) {
8980
+ continue;
8981
+ }
8982
+ const officeBlockNumber = typeof block.blockNumber === "number" && Number.isFinite(block.blockNumber) ? block.blockNumber : undefined;
8983
+ const officeBlockKind = block.blockKind === "title" || block.blockKind === "heading" || block.blockKind === "list" || block.blockKind === "table" ? block.blockKind : "paragraph";
8984
+ const headingLevel = typeof block.headingLevel === "number" && Number.isFinite(block.headingLevel) ? block.headingLevel : undefined;
8985
+ if (officeBlockKind === "title" || officeBlockKind === "heading") {
8986
+ const level = officeBlockKind === "title" ? 1 : headingLevel ?? 1;
8987
+ headingStack[level - 1] = text;
8988
+ headingStack.length = level;
8989
+ const nextBlock = blocks[index + 1];
8990
+ const nextKind = nextBlock && typeof nextBlock === "object" ? nextBlock.blockKind : undefined;
8991
+ if (nextKind === "title" || nextKind === "heading" || nextKind === "list" || nextKind === "table" || !nextBlock) {
8992
+ units.push({
8993
+ officeBlockKind,
8994
+ officeBlockNumber,
8995
+ sectionDepth: headingStack.length,
8996
+ sectionKind: "office_heading",
8997
+ sectionPath: [...headingStack],
8998
+ sectionTitle: text,
8999
+ text
9000
+ });
9001
+ }
9002
+ continue;
9003
+ }
9004
+ const sectionPath = headingStack.length > 0 ? [...headingStack] : undefined;
9005
+ const sectionTitle = sectionPath?.at(-1);
9006
+ units.push({
9007
+ officeBlockKind,
9008
+ officeBlockNumber,
9009
+ preferredChunkUnits: officeBlockKind === "table" ? text.split(`
9010
+ `).filter(Boolean) : undefined,
9011
+ sectionDepth: sectionPath?.length,
9012
+ sectionKind: officeBlockKind === "paragraph" ? "office_heading" : "office_block",
9013
+ sectionPath,
9014
+ sectionTitle,
9015
+ text: officeBlockKind === "paragraph" ? decorateOfficeSectionText(text, sectionTitle) : text
9016
+ });
9017
+ }
9018
+ return units;
9019
+ };
8779
9020
  var markdownStructureUnits = (value) => {
8780
9021
  const lines = value.replace(/\r\n?/g, `
8781
9022
  `).split(`
@@ -9119,6 +9360,7 @@ var appendPdfLineBreak = (parts) => {
9119
9360
  parts.push(`
9120
9361
  `);
9121
9362
  };
9363
+ var PDF_CHROME_LINE_MAX_LENGTH = 80;
9122
9364
  var PDF_TEXT_OPERATOR_PATTERN = /(\[((?:\\.|[^\]])*)\]\s*TJ)|(\(((?:\\.|[^\\)])*)\)\s*Tj)|([-+]?\d*\.?\d+\s+[-+]?\d*\.?\d+\s+\(((?:\\.|[^\\)])*)\)\s*")|(\(((?:\\.|[^\\)])*)\)\s*')|((?:[-+]?\d*\.?\d+\s+){2}(?:Td|TD))|(T\*)|((?:[-+]?\d*\.?\d+\s+){6}Tm)/g;
9123
9365
  var extractTextFromPDFTextObject = (value) => {
9124
9366
  const parts = [];
@@ -9147,19 +9389,84 @@ var extractTextFromPDFTextObject = (value) => {
9147
9389
  }
9148
9390
  return parts.join("");
9149
9391
  };
9150
- var extractTextFromPDFBytes = (data) => {
9151
- const raw = Buffer.from(data).toString("latin1");
9152
- const textObjects = [...raw.matchAll(/BT([\s\S]*?)ET/g)].map((match) => extractTextFromPDFTextObject(match[1] ?? "")).filter(Boolean);
9153
- const combined = textObjects.length > 0 ? textObjects.join(`
9154
-
9155
- `) : [...raw.matchAll(/\(((?:\\.|[^\\)])*)\)\s*Tj/g)].map((match) => decodePdfLiteral(match[1] ?? "")).join(`
9156
- `);
9157
- return normalizeWhitespace(combined);
9392
+ var buildPDFNativeTextBlock = (text, blockNumber, pageNumber) => {
9393
+ const normalized = normalizeWhitespace(text);
9394
+ if (!normalized) {
9395
+ return;
9396
+ }
9397
+ const lineCount = normalized.split(`
9398
+ `).filter(Boolean).length;
9399
+ const textKind = normalized.includes(" | ") ? "table_like" : "paragraph";
9400
+ return {
9401
+ blockNumber,
9402
+ lineCount,
9403
+ pageNumber,
9404
+ text: normalized,
9405
+ textKind
9406
+ };
9407
+ };
9408
+ var isLikelyPDFPageLabel = (value) => /^page\s+\d+(?:\s+of\s+\d+)?$/i.test(value.trim());
9409
+ var suppressRepeatedPDFChrome = (blocks) => {
9410
+ const linePages = new Map;
9411
+ for (const block of blocks) {
9412
+ for (const line of block.text.split(`
9413
+ `)) {
9414
+ const normalized = normalizeWhitespace(line);
9415
+ if (!normalized || normalized.length > PDF_CHROME_LINE_MAX_LENGTH) {
9416
+ continue;
9417
+ }
9418
+ const pages = linePages.get(normalized) ?? new Set;
9419
+ pages.add(block.pageNumber);
9420
+ linePages.set(normalized, pages);
9421
+ }
9422
+ }
9423
+ return blocks.map((block) => {
9424
+ const keptLines = block.text.split(`
9425
+ `).map((line) => normalizeWhitespace(line)).filter((line) => {
9426
+ if (!line) {
9427
+ return false;
9428
+ }
9429
+ if (isLikelyPDFPageLabel(line)) {
9430
+ return false;
9431
+ }
9432
+ const repeatedPages = linePages.get(line);
9433
+ if (line.length <= PDF_CHROME_LINE_MAX_LENGTH && repeatedPages && repeatedPages.size > 1) {
9434
+ return false;
9435
+ }
9436
+ return true;
9437
+ });
9438
+ const text = normalizeWhitespace(keptLines.join(`
9439
+ `));
9440
+ if (!text) {
9441
+ return;
9442
+ }
9443
+ return buildPDFNativeTextBlock(text, block.blockNumber, block.pageNumber);
9444
+ }).filter((value) => Boolean(value));
9158
9445
  };
9159
- var estimatePDFPageCount = (data) => {
9446
+ var extractNativePDFText = (data) => {
9160
9447
  const raw = Buffer.from(data).toString("latin1");
9161
9448
  const count = [...raw.matchAll(/\/Type\s*\/Page\b/g)].length;
9162
- return count > 0 ? count : 1;
9449
+ const pageCount = count > 0 ? count : 1;
9450
+ const pageMarkers = [...raw.matchAll(/\/Type\s*\/Page\b/g)].map((match) => match.index ?? raw.length);
9451
+ const blocks = [...raw.matchAll(/BT([\s\S]*?)ET/g)].map((match, index) => {
9452
+ const blockText = extractTextFromPDFTextObject(match[1] ?? "");
9453
+ const objectEnd = (match.index ?? 0) + (match[0]?.length ?? 0);
9454
+ const pageIndex = pageMarkers.findIndex((marker) => marker >= objectEnd);
9455
+ const pageNumber = pageIndex >= 0 ? pageIndex + 1 : pageCount;
9456
+ return buildPDFNativeTextBlock(blockText, index + 1, pageNumber);
9457
+ }).filter((value) => Boolean(value));
9458
+ const visibleBlocks = suppressRepeatedPDFChrome(blocks);
9459
+ const fallbackText = [...raw.matchAll(/\(((?:\\.|[^\\)])*)\)\s*Tj/g)].map((match) => decodePdfLiteral(match[1] ?? "")).join(`
9460
+ `);
9461
+ const text = visibleBlocks.length > 0 ? normalizeWhitespace(visibleBlocks.map((block) => block.text).join(`
9462
+
9463
+ `)) : normalizeWhitespace(fallbackText);
9464
+ return {
9465
+ pageCount,
9466
+ text,
9467
+ textBlockCount: visibleBlocks.length,
9468
+ textBlocks: visibleBlocks
9469
+ };
9163
9470
  };
9164
9471
  var readUInt16LE = (data, offset) => data[offset] | data[offset + 1] << 8;
9165
9472
  var readUInt32LE = (data, offset) => (data[offset] | data[offset + 1] << 8 | data[offset + 2] << 16 | data[offset + 3] << 24) >>> 0;
@@ -9246,35 +9553,64 @@ var decodeGzipEntries = (data, input) => {
9246
9553
  var extractXmlText = (value) => normalizeWhitespace(decodeHtmlEntities(value.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ")));
9247
9554
  var extractOfficeParagraphText = (value) => normalizeWhitespace(decodeHtmlEntities(value.replace(/<w:tab\b[^>]*\/>/gi, "\t").replace(/<w:br\b[^>]*\/>/gi, `
9248
9555
  `).replace(/<[^>]+>/g, " ")));
9249
- var officeDocumentParagraphs = (entries) => {
9556
+ var officeDocumentBlocks = (entries) => {
9250
9557
  const documentEntry = entries.find((entry) => entry.path === "word/document.xml");
9251
9558
  if (!documentEntry) {
9252
9559
  return [];
9253
9560
  }
9254
9561
  const xml = decodeUtf8(documentEntry.data);
9255
- const paragraphs = [...xml.matchAll(/<w:p\b[\s\S]*?<\/w:p>/g)];
9256
- return paragraphs.map((match) => {
9257
- const paragraphXml = match[0] ?? "";
9258
- const text = extractOfficeParagraphText(paragraphXml);
9562
+ const bodyMatch = xml.match(/<w:body\b[^>]*>([\s\S]*?)<\/w:body>/i);
9563
+ const body = bodyMatch?.[1] ?? xml;
9564
+ const blocks = [];
9565
+ const blockPattern = /<(w:p|w:tbl)\b[\s\S]*?<\/\1>/g;
9566
+ for (const match of body.matchAll(blockPattern)) {
9567
+ const blockXml = match[0] ?? "";
9568
+ if (blockXml.startsWith("<w:tbl")) {
9569
+ const rows = [...blockXml.matchAll(/<w:tr\b[\s\S]*?<\/w:tr>/g)].map((rowMatch, rowIndex) => {
9570
+ const cells = [
9571
+ ...(rowMatch[0] ?? "").matchAll(/<w:tc\b[\s\S]*?<\/w:tc>/g)
9572
+ ].map((cellMatch) => extractOfficeParagraphText(cellMatch[0] ?? "")).filter(Boolean);
9573
+ if (cells.length === 0) {
9574
+ return "";
9575
+ }
9576
+ return `Row ${rowIndex + 1}. ${cells.map((cell, cellIndex) => `${String.fromCharCode(65 + cellIndex)}: ${cell}`).join(" | ")}`;
9577
+ }).filter(Boolean);
9578
+ const text2 = normalizeWhitespace(rows.join(`
9579
+ `));
9580
+ if (!text2) {
9581
+ continue;
9582
+ }
9583
+ blocks.push({
9584
+ blockKind: "table",
9585
+ blockNumber: blocks.length + 1,
9586
+ text: text2
9587
+ });
9588
+ continue;
9589
+ }
9590
+ const text = extractOfficeParagraphText(blockXml);
9259
9591
  if (!text) {
9260
- return "";
9592
+ continue;
9261
9593
  }
9262
- const styleMatch = paragraphXml.match(/<w:pStyle\b[^>]*w:val="([^"]+)"[^>]*\/?>/i);
9594
+ const styleMatch = blockXml.match(/<w:pStyle\b[^>]*w:val="([^"]+)"[^>]*\/?>/i);
9263
9595
  const style = (styleMatch?.[1] ?? "").toLowerCase();
9264
- if (style === "title") {
9265
- return text;
9266
- }
9267
9596
  const headingMatch = style.match(/^heading([1-6])$/);
9268
- if (headingMatch) {
9269
- return text;
9270
- }
9271
- return text;
9272
- }).filter(Boolean);
9597
+ const isListParagraph = /<w:numPr\b/i.test(blockXml) || style.includes("list") || style.includes("bullet");
9598
+ const blockKind = style === "title" ? "title" : headingMatch ? "heading" : isListParagraph ? "list" : "paragraph";
9599
+ const decoratedText = blockKind === "list" && !/^[-*]\s/.test(text) ? `- ${text}` : text;
9600
+ blocks.push({
9601
+ blockKind,
9602
+ blockNumber: blocks.length + 1,
9603
+ headingLevel: headingMatch ? Number.parseInt(headingMatch[1] ?? "1", 10) : undefined,
9604
+ style: style || undefined,
9605
+ text: decoratedText
9606
+ });
9607
+ }
9608
+ return blocks;
9273
9609
  };
9274
9610
  var officeDocumentText = (entries) => {
9275
- const paragraphs = officeDocumentParagraphs(entries);
9276
- if (paragraphs.length > 0) {
9277
- return normalizeWhitespace(paragraphs.join(`
9611
+ const blocks = officeDocumentBlocks(entries);
9612
+ if (blocks.length > 0) {
9613
+ return normalizeWhitespace(blocks.map((block) => block.text).join(`
9278
9614
 
9279
9615
  `));
9280
9616
  }
@@ -9285,11 +9621,7 @@ var officeDocumentText = (entries) => {
9285
9621
  return extractXmlText(decodeUtf8(documentEntry.data));
9286
9622
  };
9287
9623
  var officeDocumentSectionCount = (entries) => {
9288
- const documentEntry = entries.find((entry) => entry.path === "word/document.xml");
9289
- if (!documentEntry) {
9290
- return;
9291
- }
9292
- const count = [...decodeUtf8(documentEntry.data).matchAll(/<w:p\b/g)].length;
9624
+ const count = officeDocumentBlocks(entries).length;
9293
9625
  return count > 0 ? count : undefined;
9294
9626
  };
9295
9627
  var spreadsheetSharedStrings = (entries) => entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
@@ -9819,8 +10151,10 @@ var createOfficeDocumentExtractor = () => ({
9819
10151
  let officeMetadata = {};
9820
10152
  let structuredDocuments = [];
9821
10153
  if (extension === ".docx" || extension === ".odt") {
10154
+ const officeBlocks = officeDocumentBlocks(entries);
9822
10155
  text = officeDocumentText(entries);
9823
10156
  officeMetadata = {
10157
+ officeBlocks,
9824
10158
  sectionCount: officeDocumentSectionCount(entries)
9825
10159
  };
9826
10160
  } else if (extension === ".xlsx" || extension === ".ods") {
@@ -10010,8 +10344,8 @@ var createPDFFileExtractor = () => ({
10010
10344
  name: "absolute_pdf",
10011
10345
  supports: pdfExtractorSupports,
10012
10346
  extract: (input) => {
10013
- const text = extractTextFromPDFBytes(input.data);
10014
- if (!text) {
10347
+ const extracted = extractNativePDFText(input.data);
10348
+ if (!extracted.text) {
10015
10349
  throw new Error("AbsoluteJS could not extract readable text from this PDF. Supply a custom extractor for scanned or image-only PDFs.");
10016
10350
  }
10017
10351
  return {
@@ -10021,10 +10355,12 @@ var createPDFFileExtractor = () => ({
10021
10355
  metadata: {
10022
10356
  ...input.metadata ?? {},
10023
10357
  fileKind: "pdf",
10024
- pageCount: estimatePDFPageCount(input.data)
10358
+ pageCount: extracted.pageCount,
10359
+ pdfTextBlockCount: extracted.textBlockCount,
10360
+ pdfTextBlocks: extracted.textBlocks
10025
10361
  },
10026
10362
  source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
10027
- text,
10363
+ text: extracted.text,
10028
10364
  title: input.title
10029
10365
  };
10030
10366
  }
@@ -10049,7 +10385,8 @@ var createRAGPDFOCRExtractor = (options) => ({
10049
10385
  name: `absolute_pdf_ocr:${options.provider.name}`,
10050
10386
  supports: pdfExtractorSupports,
10051
10387
  extract: async (input) => {
10052
- const nativeText = extractTextFromPDFBytes(input.data);
10388
+ const extracted = extractNativePDFText(input.data);
10389
+ const nativeText = extracted.text;
10053
10390
  const minLength = options.minExtractedTextLength ?? 80;
10054
10391
  const shouldUseNativeText = !options.alwaysOCR && nativeText.length >= minLength;
10055
10392
  if (shouldUseNativeText) {
@@ -10060,7 +10397,9 @@ var createRAGPDFOCRExtractor = (options) => ({
10060
10397
  metadata: {
10061
10398
  ...input.metadata ?? {},
10062
10399
  fileKind: "pdf",
10063
- pageCount: estimatePDFPageCount(input.data),
10400
+ pageCount: extracted.pageCount,
10401
+ pdfTextBlockCount: extracted.textBlockCount,
10402
+ pdfTextBlocks: extracted.textBlocks,
10064
10403
  pdfTextMode: "native"
10065
10404
  },
10066
10405
  source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
@@ -10075,7 +10414,7 @@ var createRAGPDFOCRExtractor = (options) => ({
10075
10414
  const baseMetadata = {
10076
10415
  ...ocrMetadata(ocr),
10077
10416
  fileKind: "pdf",
10078
- pageCount: estimatePDFPageCount(input.data),
10417
+ pageCount: extracted.pageCount,
10079
10418
  pdfTextMode: "ocr"
10080
10419
  };
10081
10420
  const summaryDocument = {
@@ -10248,6 +10587,18 @@ var sourceAwareUnits = (document, format, normalizedText) => {
10248
10587
  }
10249
10588
  case "text":
10250
10589
  default:
10590
+ if (document.metadata?.fileKind === "office") {
10591
+ const sections = officeNativeStructureUnits(document.metadata);
10592
+ if (sections.length > 0) {
10593
+ return sections;
10594
+ }
10595
+ }
10596
+ if (document.metadata?.fileKind === "pdf") {
10597
+ const sections = pdfNativeStructureUnits(document.metadata);
10598
+ if (sections.length > 0) {
10599
+ return sections;
10600
+ }
10601
+ }
10251
10602
  if (document.metadata?.sourceNativeKind === "spreadsheet_sheet") {
10252
10603
  return spreadsheetStructureUnits(normalizedText, document.metadata);
10253
10604
  }
@@ -10571,6 +10922,11 @@ var prepareRAGDocument = (document, defaultChunking, chunkingRegistry) => {
10571
10922
  ...sectionTitle ? { sectionTitle } : {},
10572
10923
  ...sectionPath && sectionPath.length > 0 ? { sectionPath } : {},
10573
10924
  ...typeof entry.sectionDepth === "number" ? { sectionDepth: entry.sectionDepth } : {},
10925
+ ...typeof entry.pageNumber === "number" ? { pageNumber: entry.pageNumber } : {},
10926
+ ...typeof entry.officeBlockNumber === "number" ? { officeBlockNumber: entry.officeBlockNumber } : {},
10927
+ ...entry.officeBlockKind ? { officeBlockKind: entry.officeBlockKind } : {},
10928
+ ...typeof entry.pdfBlockNumber === "number" ? { pdfBlockNumber: entry.pdfBlockNumber } : {},
10929
+ ...entry.pdfTextKind ? { pdfTextKind: entry.pdfTextKind } : {},
10574
10930
  ...entry.sectionKind ? { sectionKind: entry.sectionKind } : {},
10575
10931
  ...sectionChunkId ? { sectionChunkId } : {},
10576
10932
  ...sectionChunkId && sectionChunkIndex >= 0 ? {
@@ -10959,9 +11315,25 @@ var annotateRetrievalChannels = (input) => {
10959
11315
  };
10960
11316
  });
10961
11317
  };
11318
+ var getStructuredSectionScoreWeight2 = (metadata) => {
11319
+ const pdfTextKind = typeof metadata?.pdfTextKind === "string" ? metadata.pdfTextKind : undefined;
11320
+ const officeBlockKind = typeof metadata?.officeBlockKind === "string" ? metadata.officeBlockKind : undefined;
11321
+ const sectionKind = typeof metadata?.sectionKind === "string" ? metadata.sectionKind : undefined;
11322
+ if (pdfTextKind === "table_like") {
11323
+ return 1.28;
11324
+ }
11325
+ if (officeBlockKind === "table" || officeBlockKind === "list") {
11326
+ return 1.24;
11327
+ }
11328
+ if (sectionKind === "pdf_block" || sectionKind === "office_block" || officeBlockKind === "paragraph" || pdfTextKind === "paragraph") {
11329
+ return 1.12;
11330
+ }
11331
+ return 1;
11332
+ };
10962
11333
  var buildTraceSectionCounts = (results) => {
10963
11334
  const sections = new Map;
10964
11335
  for (const result of results) {
11336
+ const weightedScore = result.score * getStructuredSectionScoreWeight2(result.metadata);
10965
11337
  const path = Array.isArray(result.metadata?.sectionPath) ? result.metadata.sectionPath.filter((value) => typeof value === "string" && value.trim().length > 0) : [];
10966
11338
  if (path.length === 0) {
10967
11339
  continue;
@@ -10988,6 +11360,7 @@ var buildTraceSectionCounts = (results) => {
10988
11360
  var buildTraceSectionScores = (results) => {
10989
11361
  const sections = new Map;
10990
11362
  for (const result of results) {
11363
+ const weightedScore = result.score * getStructuredSectionScoreWeight2(result.metadata);
10991
11364
  const path = Array.isArray(result.metadata?.sectionPath) ? result.metadata.sectionPath.filter((value) => typeof value === "string" && value.trim().length > 0) : [];
10992
11365
  if (path.length === 0) {
10993
11366
  continue;
@@ -10995,13 +11368,13 @@ var buildTraceSectionScores = (results) => {
10995
11368
  const key = path.join(" > ");
10996
11369
  const existing = sections.get(key);
10997
11370
  if (existing) {
10998
- existing.totalScore += result.score;
11371
+ existing.totalScore += weightedScore;
10999
11372
  continue;
11000
11373
  }
11001
11374
  sections.set(key, {
11002
11375
  key,
11003
11376
  label: path.at(-1) ?? key,
11004
- totalScore: result.score
11377
+ totalScore: weightedScore
11005
11378
  });
11006
11379
  }
11007
11380
  return [...sections.values()].sort((left, right) => {
@@ -11440,11 +11813,32 @@ var renderSourceLabels = (input) => {
11440
11813
  ].filter((row) => row.length > 0);
11441
11814
  return rows.length > 0 ? `<ul class="rag-source-labels">${rows.join("")}</ul>` : "";
11442
11815
  };
11816
+ var formatStructureKindLabel = (kind) => {
11817
+ switch (kind) {
11818
+ case "markdown_heading":
11819
+ return "Markdown heading";
11820
+ case "html_heading":
11821
+ return "HTML heading";
11822
+ case "office_heading":
11823
+ return "Office heading";
11824
+ case "office_block":
11825
+ return "Office block";
11826
+ case "pdf_block":
11827
+ return "PDF block";
11828
+ case "spreadsheet_rows":
11829
+ return "Spreadsheet rows";
11830
+ case "presentation_slide":
11831
+ return "Presentation slide";
11832
+ default:
11833
+ return;
11834
+ }
11835
+ };
11443
11836
  var renderChunkStructure = (structure) => {
11444
11837
  if (!structure) {
11445
11838
  return "";
11446
11839
  }
11447
11840
  const rows = [
11841
+ structure.section?.kind ? `<li><strong>Kind</strong> ${escapeHtml2(formatStructureKindLabel(structure.section.kind) ?? structure.section.kind)}</li>` : "",
11448
11842
  structure.section?.title ? `<li><strong>Section</strong> ${escapeHtml2(structure.section.title)}</li>` : "",
11449
11843
  structure.section?.path && structure.section.path.length > 1 ? `<li><strong>Section path</strong> ${escapeHtml2(structure.section.path.join(" > "))}</li>` : "",
11450
11844
  typeof structure.sequence?.sectionChunkIndex === "number" && typeof structure.sequence?.sectionChunkCount === "number" ? `<li><strong>Section chunk</strong> ${structure.sequence.sectionChunkIndex + 1} of ${structure.sequence.sectionChunkCount}</li>` : "",
@@ -13688,6 +14082,24 @@ var ragChat = (config) => {
13688
14082
  limit: 100,
13689
14083
  store: config.retrievalReleaseIncidentStore
13690
14084
  });
14085
+ const baselineCorpusGroups = config.retrievalBaselineStore ? await loadRAGRetrievalBaselines({
14086
+ limit: 200,
14087
+ store: config.retrievalBaselineStore
14088
+ }) : [];
14089
+ const comparisonRunCorpusGroups = config.retrievalComparisonHistoryStore ? await loadRAGRetrievalComparisonHistory({
14090
+ limit: 200,
14091
+ store: config.retrievalComparisonHistoryStore
14092
+ }) : [];
14093
+ const releaseDecisionCorpusGroups = config.retrievalReleaseDecisionStore ? await loadRAGRetrievalReleaseDecisions({
14094
+ limit: 200,
14095
+ store: config.retrievalReleaseDecisionStore
14096
+ }) : [];
14097
+ const resolveIncidentCorpusGroupKey = (groupKey) => {
14098
+ if (!groupKey) {
14099
+ return;
14100
+ }
14101
+ return existing.find((entry) => entry.groupKey === groupKey && typeof entry.corpusGroupKey === "string")?.corpusGroupKey ?? comparisonRunCorpusGroups.find((entry) => entry.groupKey === groupKey && typeof entry.corpusGroupKey === "string")?.corpusGroupKey ?? baselineCorpusGroups.find((entry) => entry.groupKey === groupKey && typeof entry.corpusGroupKey === "string")?.corpusGroupKey ?? releaseDecisionCorpusGroups.find((entry) => entry.groupKey === groupKey && typeof entry.corpusGroupKey === "string")?.corpusGroupKey;
14102
+ };
13691
14103
  const nextByKey = new Map;
13692
14104
  for (const candidate of input.promotionCandidates) {
13693
14105
  if (!candidate.groupKey || !candidate.targetRolloutLabel) {
@@ -13701,6 +14113,7 @@ var ragChat = (config) => {
13701
14113
  nextByKey.set(key, {
13702
14114
  baselineRetrievalId: candidate.baselineRetrievalId,
13703
14115
  candidateRetrievalId: candidate.candidateRetrievalId,
14116
+ corpusGroupKey: resolveIncidentCorpusGroupKey(candidate.groupKey),
13704
14117
  groupKey: candidate.groupKey,
13705
14118
  id: key,
13706
14119
  kind,
@@ -13726,6 +14139,7 @@ var ragChat = (config) => {
13726
14139
  nextByKey.set(key, {
13727
14140
  baselineRetrievalId: handoff.targetBaselineRetrievalId,
13728
14141
  candidateRetrievalId: handoff.candidateRetrievalId,
14142
+ corpusGroupKey: handoff.corpusGroupKey,
13729
14143
  groupKey: handoff.groupKey,
13730
14144
  id: key,
13731
14145
  kind: "handoff_stale",
@@ -13738,7 +14152,7 @@ var ragChat = (config) => {
13738
14152
  });
13739
14153
  }
13740
14154
  for (const incident of nextByKey.values()) {
13741
- const matchingIncidents = existing.filter((entry) => entry.groupKey === incident.groupKey && entry.kind === incident.kind && (entry.targetRolloutLabel ?? undefined) === (incident.targetRolloutLabel ?? undefined)).sort((left, right) => right.triggeredAt - left.triggeredAt);
14155
+ const matchingIncidents = existing.filter((entry) => entry.corpusGroupKey === incident.corpusGroupKey && entry.groupKey === incident.groupKey && entry.kind === incident.kind && (entry.targetRolloutLabel ?? undefined) === (incident.targetRolloutLabel ?? undefined)).sort((left, right) => right.triggeredAt - left.triggeredAt);
13742
14156
  const openIncident = matchingIncidents.find((entry) => entry.status === "open");
13743
14157
  const latestMatchingIncident = matchingIncidents[0];
13744
14158
  if (!openIncident) {
@@ -15151,6 +15565,7 @@ var ragChat = (config) => {
15151
15565
  const acknowledged = getStringProperty(queryInput, "acknowledged");
15152
15566
  const targetRolloutLabel = getStringProperty(queryInput, "targetRolloutLabel");
15153
15567
  const incidents = await loadRAGRetrievalReleaseIncidents({
15568
+ corpusGroupKey: getStringProperty(queryInput, "corpusGroupKey"),
15154
15569
  groupKey: getStringProperty(queryInput, "groupKey"),
15155
15570
  limit: getIntegerLikeProperty(queryInput, "limit"),
15156
15571
  severity: severity === "warning" || severity === "critical" ? severity : undefined,
@@ -15801,6 +16216,7 @@ var ragChat = (config) => {
15801
16216
  });
15802
16217
  return {
15803
16218
  incidents: await loadRAGRetrievalReleaseIncidents({
16219
+ corpusGroupKey: incident.corpusGroupKey,
15804
16220
  groupKey: incident.groupKey,
15805
16221
  limit: 20,
15806
16222
  store: config.retrievalReleaseIncidentStore
@@ -15844,6 +16260,7 @@ var ragChat = (config) => {
15844
16260
  });
15845
16261
  return {
15846
16262
  incidents: await loadRAGRetrievalReleaseIncidents({
16263
+ corpusGroupKey: incident.corpusGroupKey,
15847
16264
  groupKey: incident.groupKey,
15848
16265
  limit: 20,
15849
16266
  store: config.retrievalReleaseIncidentStore
@@ -15887,6 +16304,7 @@ var ragChat = (config) => {
15887
16304
  });
15888
16305
  return {
15889
16306
  incidents: await loadRAGRetrievalReleaseIncidents({
16307
+ corpusGroupKey: incident.corpusGroupKey,
15890
16308
  groupKey: incident.groupKey,
15891
16309
  limit: 20,
15892
16310
  store: config.retrievalReleaseIncidentStore
@@ -23860,5 +24278,5 @@ export {
23860
24278
  aiChat
23861
24279
  };
23862
24280
 
23863
- //# debugId=98118EA892F30E7564756E2164756E21
24281
+ //# debugId=23520EDE705830A964756E2164756E21
23864
24282
  //# sourceMappingURL=index.js.map