@absolutejs/absolute 0.19.0-beta.619 → 0.19.0-beta.620

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/ai/index.js CHANGED
@@ -260,6 +260,12 @@ var formatMediaTimestamp = (value) => {
260
260
  const milliseconds = Math.floor(value % 1000);
261
261
  return `${String(minutes).padStart(2, "0")}:${String(seconds).padStart(2, "0")}.${String(milliseconds).padStart(3, "0")}`;
262
262
  };
263
+ var formatMediaDurationLabel = (value) => {
264
+ if (typeof value !== "number" || !Number.isFinite(value) || value < 0) {
265
+ return;
266
+ }
267
+ return formatMediaTimestamp(value);
268
+ };
263
269
  var buildLocatorLabel = (metadata, source, title) => {
264
270
  if (!metadata) {
265
271
  return;
@@ -322,6 +328,12 @@ var buildProvenanceLabel = (metadata) => {
322
328
  const sentAt = formatTimestampLabel(metadata.sentAt) ?? formatTimestampLabel(metadata.receivedAt);
323
329
  const speaker = getContextString(metadata.speaker);
324
330
  const mediaKind = getContextString(metadata.mediaKind);
331
+ const mediaSegmentCount = getContextNumber(metadata.mediaSegmentCount);
332
+ const mediaSegmentGroupSize = getContextNumber(metadata.mediaSegmentGroupSize);
333
+ const mediaSegmentGroupIndex = getContextNumber(metadata.mediaSegmentGroupIndex);
334
+ const mediaChannel = getContextString(metadata.mediaChannel);
335
+ const mediaSpeakerCount = getContextNumber(metadata.mediaSpeakerCount);
336
+ const mediaDurationLabel = formatMediaDurationLabel(metadata.mediaDurationMs);
325
337
  const transcriptSource = getContextString(metadata.transcriptSource);
326
338
  const pdfTextMode = getContextString(metadata.pdfTextMode);
327
339
  const ocrEngine = getContextString(metadata.ocrEngine);
@@ -331,6 +343,12 @@ var buildProvenanceLabel = (metadata) => {
331
343
  ocrEngine ? `OCR ${ocrEngine}` : "",
332
344
  typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
333
345
  mediaKind ? `Media ${mediaKind}` : "",
346
+ mediaSegmentCount ? `${mediaSegmentCount} segments` : "",
347
+ mediaSegmentGroupSize ? `${mediaSegmentGroupSize} grouped segments` : "",
348
+ mediaSegmentGroupIndex !== undefined ? `Segment group ${mediaSegmentGroupIndex + 1}` : "",
349
+ mediaChannel ? `Channel ${mediaChannel}` : "",
350
+ mediaSpeakerCount ? `${mediaSpeakerCount} speakers` : "",
351
+ mediaDurationLabel ? `Duration ${mediaDurationLabel}` : "",
334
352
  transcriptSource ? `Transcript ${transcriptSource}` : "",
335
353
  threadTopic ? `Thread ${threadTopic}` : "",
336
354
  speaker ? `Speaker ${speaker}` : "",
@@ -4233,6 +4251,34 @@ var getAttachmentName2 = (source, title) => {
4233
4251
  }
4234
4252
  return;
4235
4253
  };
4254
+ var getSpreadsheetHeaders = (metadata) => Array.isArray(metadata?.spreadsheetHeaders) ? metadata.spreadsheetHeaders.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
4255
+ var formatSpreadsheetRowRange = (rowStart, rowEnd) => {
4256
+ if (typeof rowStart !== "number" || !Number.isFinite(rowStart)) {
4257
+ return;
4258
+ }
4259
+ if (typeof rowEnd !== "number" && typeof rowStart === "number" && Number.isFinite(rowStart)) {
4260
+ return `Rows ${rowStart}`;
4261
+ }
4262
+ if (rowStart === rowEnd) {
4263
+ return `Rows ${rowStart}`;
4264
+ }
4265
+ return `Rows ${rowStart}-${rowEnd}`;
4266
+ };
4267
+ var formatSpreadsheetTableLabel = (tableIndex, tableCount) => {
4268
+ if (typeof tableIndex !== "number" || !Number.isFinite(tableIndex) || tableIndex < 1) {
4269
+ return;
4270
+ }
4271
+ if (typeof tableCount === "number" && Number.isFinite(tableCount) && tableCount >= tableIndex) {
4272
+ return `Table ${tableIndex} of ${tableCount}`;
4273
+ }
4274
+ return `Table ${tableIndex}`;
4275
+ };
4276
+ var formatMediaDurationLabel2 = (value) => {
4277
+ if (typeof value !== "number" || !Number.isFinite(value) || value < 0) {
4278
+ return;
4279
+ }
4280
+ return formatMediaTimestamp2(value);
4281
+ };
4236
4282
  var buildContextLabel2 = (metadata) => {
4237
4283
  if (!metadata) {
4238
4284
  return;
@@ -4258,29 +4304,61 @@ var buildContextLabel2 = (metadata) => {
4258
4304
  }
4259
4305
  const emailKind = getContextString2(metadata.emailKind);
4260
4306
  if (emailKind === "attachment") {
4261
- return "Attachment evidence";
4307
+ const attachmentName = getContextString2(metadata.attachmentName);
4308
+ const threadTopic2 = getContextString2(metadata.threadTopic);
4309
+ return attachmentName ? threadTopic2 ? `Attachment evidence ${attachmentName} in ${threadTopic2}` : `Attachment evidence ${attachmentName}` : "Attachment evidence";
4262
4310
  }
4263
4311
  if (emailKind === "message") {
4312
+ const threadTopic2 = getContextString2(metadata.threadTopic);
4264
4313
  const from = getContextString2(metadata.from);
4314
+ if (threadTopic2) {
4315
+ return from ? `Message in ${threadTopic2} from ${from}` : `Message in ${threadTopic2}`;
4316
+ }
4265
4317
  return from ? `Message from ${from}` : "Message evidence";
4266
4318
  }
4267
4319
  const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
4268
4320
  const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
4321
+ const hasOCRTrace = typeof metadata.ocrRegionConfidence === "number" || typeof metadata.ocrConfidence === "number" || getContextString2(metadata.pdfTextMode) === "ocr" || typeof metadata.ocrRegionCount === "number";
4269
4322
  if (page && region) {
4323
+ if (hasOCRTrace) {
4324
+ return `OCR page ${page} region ${region}`;
4325
+ }
4270
4326
  return `Page ${page} region ${region}`;
4271
4327
  }
4272
4328
  if (page) {
4329
+ if (hasOCRTrace) {
4330
+ return `OCR page ${page}`;
4331
+ }
4273
4332
  return `Page ${page}`;
4274
4333
  }
4275
4334
  const sheet = getContextString2(metadata.sheetName) ?? (Array.isArray(metadata.sheetNames) ? getContextString2(metadata.sheetNames[0]) : undefined);
4276
4335
  if (sheet) {
4336
+ const tableLabel = formatSpreadsheetTableLabel(getContextNumber2(metadata.spreadsheetTableIndex), getContextNumber2(metadata.spreadsheetTableCount));
4337
+ const rowRange = formatSpreadsheetRowRange(getContextNumber2(metadata.spreadsheetRowStart), getContextNumber2(metadata.spreadsheetRowEnd));
4338
+ const headers = getSpreadsheetHeaders(metadata);
4339
+ if (tableLabel && rowRange) {
4340
+ return `Sheet ${sheet} ${tableLabel} ${rowRange}`;
4341
+ }
4342
+ if (tableLabel) {
4343
+ return `Sheet ${sheet} ${tableLabel}`;
4344
+ }
4345
+ if (rowRange) {
4346
+ return `Sheet ${sheet} ${rowRange}`;
4347
+ }
4348
+ if (headers.length > 0) {
4349
+ return `Sheet ${sheet} by ${headers.slice(0, 2).join(", ")}`;
4350
+ }
4277
4351
  return `Sheet ${sheet}`;
4278
4352
  }
4279
4353
  const slide = getContextNumber2(metadata.slide) ?? getContextNumber2(metadata.slideNumber) ?? (typeof metadata.slideIndex === "number" ? metadata.slideIndex + 1 : undefined);
4354
+ const slideTitle = getContextString2(metadata.slideTitle);
4280
4355
  if (slide) {
4356
+ if (slideTitle) {
4357
+ return `Slide ${slide} ${slideTitle}`;
4358
+ }
4281
4359
  return `Slide ${slide}`;
4282
4360
  }
4283
- const archiveEntry = getContextString2(metadata.archiveEntryPath) ?? getContextString2(metadata.entryPath);
4361
+ const archiveEntry = getContextString2(metadata.archiveFullPath) ?? getContextString2(metadata.archivePath) ?? getContextString2(metadata.archiveEntryPath) ?? getContextString2(metadata.entryPath);
4284
4362
  if (archiveEntry) {
4285
4363
  return `Archive entry ${archiveEntry}`;
4286
4364
  }
@@ -4305,6 +4383,9 @@ var buildLocatorLabel2 = (metadata, source, title) => {
4305
4383
  const officeBlockKind = getContextString2(metadata.officeBlockKind);
4306
4384
  const pdfBlockNumber = getContextNumber2(metadata.pdfBlockNumber);
4307
4385
  const officeBlockNumber = getContextNumber2(metadata.officeBlockNumber);
4386
+ const spreadsheetRowStart = getContextNumber2(metadata.spreadsheetRowStart);
4387
+ const spreadsheetRowEnd = getContextNumber2(metadata.spreadsheetRowEnd);
4388
+ const slideTitle = getContextString2(metadata.slideTitle);
4308
4389
  const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
4309
4390
  const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
4310
4391
  if (page && region) {
@@ -4321,19 +4402,31 @@ var buildLocatorLabel2 = (metadata, source, title) => {
4321
4402
  }
4322
4403
  const sheet = getContextString2(metadata.sheetName) ?? (Array.isArray(metadata.sheetNames) ? getContextString2(metadata.sheetNames[0]) : undefined);
4323
4404
  if (sheet) {
4324
- return `Sheet ${sheet}`;
4405
+ const tableLabel = formatSpreadsheetTableLabel(getContextNumber2(metadata.spreadsheetTableIndex), getContextNumber2(metadata.spreadsheetTableCount));
4406
+ const rowRange = formatSpreadsheetRowRange(spreadsheetRowStart, spreadsheetRowEnd);
4407
+ if (tableLabel && rowRange) {
4408
+ return `Sheet ${sheet} \xB7 ${tableLabel} \xB7 ${rowRange}`;
4409
+ }
4410
+ if (tableLabel) {
4411
+ return `Sheet ${sheet} \xB7 ${tableLabel}`;
4412
+ }
4413
+ return rowRange ? `Sheet ${sheet} \xB7 ${rowRange}` : `Sheet ${sheet}`;
4325
4414
  }
4326
4415
  const slide = getContextNumber2(metadata.slide) ?? getContextNumber2(metadata.slideNumber) ?? (typeof metadata.slideIndex === "number" ? metadata.slideIndex + 1 : undefined);
4327
4416
  if (slide) {
4328
- return `Slide ${slide}`;
4417
+ return slideTitle ? `Slide ${slide} \xB7 ${slideTitle}` : `Slide ${slide}`;
4329
4418
  }
4330
- const archiveEntry = getContextString2(metadata.archiveEntryPath) ?? getContextString2(metadata.entryPath);
4419
+ const archiveEntry = getContextString2(metadata.archiveFullPath) ?? getContextString2(metadata.archivePath) ?? getContextString2(metadata.archiveEntryPath) ?? getContextString2(metadata.entryPath);
4331
4420
  if (archiveEntry) {
4332
4421
  return `Archive entry ${archiveEntry}`;
4333
4422
  }
4334
4423
  const emailKind = getContextString2(metadata.emailKind);
4335
4424
  if (emailKind === "attachment") {
4336
4425
  const attachmentName = getContextString2(metadata.attachmentName) ?? getAttachmentName2(source, title);
4426
+ const replyDepth = getContextNumber2(metadata.replyDepth);
4427
+ if (attachmentName && replyDepth && replyDepth > 0) {
4428
+ return `Attachment ${attachmentName} \xB7 Reply depth ${replyDepth}`;
4429
+ }
4337
4430
  return attachmentName ? `Attachment ${attachmentName}` : "Attachment";
4338
4431
  }
4339
4432
  const mediaStart = formatMediaTimestamp2(metadata.startMs);
@@ -4364,18 +4457,36 @@ var buildProvenanceLabel2 = (metadata) => {
4364
4457
  return;
4365
4458
  }
4366
4459
  const threadTopic = getContextString2(metadata.threadTopic);
4460
+ const replyDepth = getContextNumber2(metadata.replyDepth);
4461
+ const threadMessageCount = getContextNumber2(metadata.threadMessageCount);
4462
+ const threadRootMessageId = getContextString2(metadata.threadRootMessageId);
4367
4463
  const from = getContextString2(metadata.from);
4368
4464
  const sentAt = formatTimestampLabel2(metadata.sentAt) ?? formatTimestampLabel2(metadata.receivedAt);
4369
4465
  const speaker = getContextString2(metadata.speaker);
4370
4466
  const mediaKind = getContextString2(metadata.mediaKind);
4371
4467
  const transcriptSource = getContextString2(metadata.transcriptSource);
4468
+ const mediaSpeakerCount = getContextNumber2(metadata.mediaSpeakerCount);
4469
+ const mediaSegmentCount = getContextNumber2(metadata.mediaSegmentCount);
4470
+ const mediaSegmentGroupSize = getContextNumber2(metadata.mediaSegmentGroupSize);
4471
+ const mediaSegmentGroupIndex = getContextNumber2(metadata.mediaSegmentGroupIndex);
4472
+ const mediaChannel = getContextString2(metadata.mediaChannel);
4473
+ const mediaDurationLabel = formatMediaDurationLabel2(metadata.mediaDurationMs);
4474
+ const spreadsheetHeaders = getSpreadsheetHeaders(metadata);
4475
+ const slideNotesText = getContextString2(metadata.slideNotesText);
4372
4476
  const pdfTextMode = getContextString2(metadata.pdfTextMode);
4373
4477
  const pdfTextKind = getContextString2(metadata.pdfTextKind);
4374
4478
  const officeBlockKind = getContextString2(metadata.officeBlockKind);
4375
4479
  const ocrEngine = getContextString2(metadata.ocrEngine);
4376
4480
  const extractorRegistryMatch = getContextString2(metadata.extractorRegistryMatch);
4377
4481
  const chunkingProfile = getContextString2(metadata.chunkingProfile);
4482
+ const archiveDepth = getContextNumber2(metadata.archiveDepth);
4483
+ const archiveNestedDepth = getContextNumber2(metadata.archiveNestedDepth);
4484
+ const archiveContainerPath = getContextString2(metadata.archiveContainerPath);
4485
+ const archiveRootName = getContextString2(metadata.archiveRootName);
4486
+ const spreadsheetTableLabel = formatSpreadsheetTableLabel(getContextNumber2(metadata.spreadsheetTableIndex), getContextNumber2(metadata.spreadsheetTableCount));
4378
4487
  const ocrConfidence = getContextNumber2(metadata.ocrRegionConfidence) ?? getContextNumber2(metadata.ocrConfidence);
4488
+ const ocrAverageConfidence = getContextNumber2(metadata.ocrPageAverageConfidence) ?? getContextNumber2(metadata.ocrAverageConfidence);
4489
+ const ocrRegionCount = getContextNumber2(metadata.ocrRegionCount);
4379
4490
  const labels = [
4380
4491
  pdfTextMode ? `PDF ${pdfTextMode}` : "",
4381
4492
  pdfTextKind === "table_like" ? "PDF table block" : pdfTextKind === "paragraph" ? "PDF text block" : "",
@@ -4384,9 +4495,27 @@ var buildProvenanceLabel2 = (metadata) => {
4384
4495
  extractorRegistryMatch ? `Extractor ${extractorRegistryMatch}` : "",
4385
4496
  chunkingProfile ? `Chunking ${chunkingProfile}` : "",
4386
4497
  typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
4498
+ typeof ocrAverageConfidence === "number" && ocrAverageConfidence !== ocrConfidence ? `Average ${ocrAverageConfidence.toFixed(2)}` : "",
4499
+ typeof ocrRegionCount === "number" ? `${ocrRegionCount} regions` : "",
4500
+ spreadsheetHeaders.length > 0 ? `Spreadsheet ${spreadsheetHeaders.join(", ")}` : "",
4501
+ spreadsheetTableLabel ? `Spreadsheet ${spreadsheetTableLabel}` : "",
4387
4502
  mediaKind ? `Media ${mediaKind}` : "",
4503
+ mediaSegmentCount ? `${mediaSegmentCount} segments` : "",
4504
+ mediaSegmentGroupSize ? `${mediaSegmentGroupSize} grouped segments` : "",
4505
+ mediaSegmentGroupIndex !== undefined ? `Segment group ${mediaSegmentGroupIndex + 1}` : "",
4506
+ mediaChannel ? `Channel ${mediaChannel}` : "",
4507
+ mediaSpeakerCount ? `${mediaSpeakerCount} speakers` : "",
4508
+ mediaDurationLabel ? `Duration ${mediaDurationLabel}` : "",
4388
4509
  transcriptSource ? `Transcript ${transcriptSource}` : "",
4389
4510
  threadTopic ? `Thread ${threadTopic}` : "",
4511
+ threadRootMessageId ? `Thread root ${threadRootMessageId}` : "",
4512
+ threadMessageCount ? `${threadMessageCount} thread messages` : "",
4513
+ replyDepth ? `Reply depth ${replyDepth}` : "",
4514
+ slideNotesText ? "Speaker notes" : "",
4515
+ archiveDepth ? `Archive depth ${archiveDepth}` : "",
4516
+ archiveNestedDepth ? `Archive nested depth ${archiveNestedDepth}` : "",
4517
+ archiveContainerPath ? `Archive container ${archiveContainerPath}` : "",
4518
+ archiveRootName ? `Archive root ${archiveRootName}` : "",
4390
4519
  speaker ? `Speaker ${speaker}` : "",
4391
4520
  from ? `Sender ${from}` : "",
4392
4521
  sentAt ? `Sent ${sentAt}` : ""
@@ -4763,7 +4892,7 @@ var getSectionPathFromSource = (source) => {
4763
4892
  const path = source.structure?.section?.path ?? (Array.isArray(source.metadata?.sectionPath) ? source.metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : []);
4764
4893
  return path.length > 0 ? path : undefined;
4765
4894
  };
4766
- var isBlockAwareContextLabel = (value) => typeof value === "string" && (value.startsWith("PDF ") || value.startsWith("Office "));
4895
+ var isBlockAwareContextLabel = (value) => typeof value === "string" && (value.startsWith("PDF ") || value.startsWith("Office ") || value.startsWith("Slide "));
4767
4896
  var getStructuredSectionScoreWeight = (metadata) => {
4768
4897
  if (!metadata) {
4769
4898
  return 1;
@@ -4771,6 +4900,8 @@ var getStructuredSectionScoreWeight = (metadata) => {
4771
4900
  const pdfTextKind = getContextString2(metadata.pdfTextKind);
4772
4901
  const officeBlockKind = getContextString2(metadata.officeBlockKind);
4773
4902
  const sectionKind = getContextString2(metadata.sectionKind);
4903
+ const slideTitle = getContextString2(metadata.slideTitle);
4904
+ const slideNotesText = getContextString2(metadata.slideNotesText);
4774
4905
  if (pdfTextKind === "table_like") {
4775
4906
  return 1.28;
4776
4907
  }
@@ -4780,6 +4911,12 @@ var getStructuredSectionScoreWeight = (metadata) => {
4780
4911
  if (sectionKind === "pdf_block" || sectionKind === "office_block" || officeBlockKind === "paragraph" || pdfTextKind === "paragraph") {
4781
4912
  return 1.12;
4782
4913
  }
4914
+ if (sectionKind === "presentation_slide" && slideNotesText) {
4915
+ return 1.2;
4916
+ }
4917
+ if (sectionKind === "presentation_slide" && slideTitle) {
4918
+ return 1.14;
4919
+ }
4783
4920
  return 1;
4784
4921
  };
4785
4922
  var getStructuredSourceLeadScore = (source) => source.score * getStructuredSectionScoreWeight(source.metadata);
@@ -8560,10 +8697,32 @@ var scoreLoosePhraseMatch2 = (query, text) => {
8560
8697
  return 0;
8561
8698
  };
8562
8699
  var queryHasAnyToken = (queryTokens, candidates) => candidates.some((candidate) => queryTokens.includes(candidate));
8700
+ var metadataString = (value) => typeof value === "string" && value.trim().length > 0 ? value.trim().toLowerCase() : undefined;
8563
8701
  var scoreStructuredEvidenceMatch = (queryTokens, result) => {
8564
8702
  const metadata = result.metadata ?? {};
8565
8703
  const pdfTextKind = typeof metadata.pdfTextKind === "string" ? metadata.pdfTextKind : undefined;
8566
8704
  const officeBlockKind = typeof metadata.officeBlockKind === "string" ? metadata.officeBlockKind : undefined;
8705
+ const slideTitle = metadataString(metadata.slideTitle);
8706
+ const slideNotesText = metadataString(metadata.slideNotesText);
8707
+ const threadTopic = metadataString(metadata.threadTopic);
8708
+ const threadRootMessageId = metadataString(metadata.threadRootMessageId);
8709
+ const threadMessageCount = typeof metadata.threadMessageCount === "number" ? metadata.threadMessageCount : undefined;
8710
+ const attachmentName = metadataString(metadata.attachmentName);
8711
+ const archivePath = metadataString(metadata.archivePath);
8712
+ const archiveFullPath = metadataString(metadata.archiveFullPath);
8713
+ const archiveContainerPath = metadataString(metadata.archiveContainerPath);
8714
+ const archiveNestedDepth = typeof metadata.archiveNestedDepth === "number" ? metadata.archiveNestedDepth : undefined;
8715
+ const mediaSpeakerCount = typeof metadata.mediaSpeakerCount === "number" ? metadata.mediaSpeakerCount : undefined;
8716
+ const mediaSegmentCount = typeof metadata.mediaSegmentCount === "number" ? metadata.mediaSegmentCount : undefined;
8717
+ const mediaSegmentGroupSize = typeof metadata.mediaSegmentGroupSize === "number" ? metadata.mediaSegmentGroupSize : undefined;
8718
+ const mediaChannel = metadataString(metadata.mediaChannel);
8719
+ const speaker = metadataString(metadata.speaker);
8720
+ const ocrConfidence = typeof metadata.ocrRegionConfidence === "number" ? metadata.ocrRegionConfidence : typeof metadata.ocrPageAverageConfidence === "number" ? metadata.ocrPageAverageConfidence : typeof metadata.ocrAverageConfidence === "number" ? metadata.ocrAverageConfidence : typeof metadata.ocrConfidence === "number" ? metadata.ocrConfidence : undefined;
8721
+ const isOCREvidence = typeof ocrConfidence === "number" || metadataString(metadata.pdfTextMode) === "ocr";
8722
+ const spreadsheetHeaders = Array.isArray(metadata.spreadsheetHeaders) ? metadata.spreadsheetHeaders.map((value) => metadataString(value)).filter((value) => typeof value === "string") : [];
8723
+ const spreadsheetTableIndex = typeof metadata.spreadsheetTableIndex === "number" ? metadata.spreadsheetTableIndex : undefined;
8724
+ const spreadsheetTableCount = typeof metadata.spreadsheetTableCount === "number" ? metadata.spreadsheetTableCount : undefined;
8725
+ const hasSpreadsheetRows = typeof metadata.spreadsheetRowStart === "number" || typeof metadata.spreadsheetRowEnd === "number";
8567
8726
  const hasBlockMetadata = typeof metadata.pdfBlockNumber === "number" || typeof metadata.officeBlockNumber === "number";
8568
8727
  let score = 0;
8569
8728
  if (hasBlockMetadata) {
@@ -8606,6 +8765,162 @@ var scoreStructuredEvidenceMatch = (queryTokens, result) => {
8606
8765
  ])) {
8607
8766
  score += 0.55;
8608
8767
  }
8768
+ if (spreadsheetHeaders.length > 0 && (queryHasAnyToken(queryTokens, [
8769
+ "sheet",
8770
+ "spreadsheet",
8771
+ "workbook",
8772
+ "column",
8773
+ "columns",
8774
+ "row",
8775
+ "rows"
8776
+ ]) || queryTokens.some((token) => spreadsheetHeaders.some((header) => header.includes(token))))) {
8777
+ score += 0.45;
8778
+ }
8779
+ if (hasSpreadsheetRows && queryHasAnyToken(queryTokens, ["row", "rows", "sheet", "spreadsheet"])) {
8780
+ score += 0.18;
8781
+ }
8782
+ if (typeof spreadsheetTableIndex === "number" && queryHasAnyToken(queryTokens, [
8783
+ "table",
8784
+ "tables",
8785
+ "sheet",
8786
+ "spreadsheet"
8787
+ ])) {
8788
+ score += 0.16;
8789
+ if (typeof spreadsheetTableCount === "number" && spreadsheetTableCount > 1) {
8790
+ score += 0.08;
8791
+ }
8792
+ }
8793
+ if (slideTitle && (queryHasAnyToken(queryTokens, [
8794
+ "slide",
8795
+ "slides",
8796
+ "deck",
8797
+ "presentation"
8798
+ ]) || queryTokens.some((token) => slideTitle.includes(token)))) {
8799
+ score += 0.4;
8800
+ }
8801
+ if (slideNotesText && queryHasAnyToken(queryTokens, [
8802
+ "notes",
8803
+ "speaker",
8804
+ "speakers",
8805
+ "talking"
8806
+ ])) {
8807
+ score += 0.2;
8808
+ }
8809
+ if (speaker && queryHasAnyToken(queryTokens, ["speaker", "speakers", "said", "says"])) {
8810
+ score += 0.22;
8811
+ }
8812
+ if (typeof mediaSpeakerCount === "number" && mediaSpeakerCount > 1 && queryHasAnyToken(queryTokens, [
8813
+ "speaker",
8814
+ "speakers",
8815
+ "conversation",
8816
+ "dialogue"
8817
+ ])) {
8818
+ score += 0.12;
8819
+ }
8820
+ if (typeof mediaSegmentCount === "number" && mediaSegmentCount > 1 && queryHasAnyToken(queryTokens, [
8821
+ "timestamp",
8822
+ "segment",
8823
+ "segments",
8824
+ "audio",
8825
+ "video"
8826
+ ])) {
8827
+ score += 0.08;
8828
+ }
8829
+ if (typeof mediaSegmentGroupSize === "number" && mediaSegmentGroupSize > 1 && queryHasAnyToken(queryTokens, [
8830
+ "segment",
8831
+ "segments",
8832
+ "timestamp",
8833
+ "group",
8834
+ "audio",
8835
+ "video"
8836
+ ])) {
8837
+ score += 0.06;
8838
+ }
8839
+ if (mediaChannel && queryHasAnyToken(queryTokens, [
8840
+ "channel",
8841
+ "channels",
8842
+ "left",
8843
+ "right",
8844
+ "audio",
8845
+ "video"
8846
+ ])) {
8847
+ score += 0.12;
8848
+ }
8849
+ if (threadTopic && (queryHasAnyToken(queryTokens, [
8850
+ "email",
8851
+ "emails",
8852
+ "thread",
8853
+ "reply",
8854
+ "replies",
8855
+ "attachment"
8856
+ ]) || queryTokens.some((token) => threadTopic.includes(token)))) {
8857
+ score += 0.34;
8858
+ }
8859
+ if (threadRootMessageId && queryHasAnyToken(queryTokens, [
8860
+ "thread",
8861
+ "reply",
8862
+ "replies",
8863
+ "root",
8864
+ "email"
8865
+ ])) {
8866
+ score += 0.12;
8867
+ }
8868
+ if (typeof threadMessageCount === "number" && threadMessageCount > 1 && queryHasAnyToken(queryTokens, [
8869
+ "thread",
8870
+ "reply",
8871
+ "replies",
8872
+ "attachment"
8873
+ ])) {
8874
+ score += 0.08;
8875
+ }
8876
+ if (attachmentName && queryHasAnyToken(queryTokens, [
8877
+ "attachment",
8878
+ "attachments",
8879
+ "file",
8880
+ "files"
8881
+ ])) {
8882
+ score += 0.18;
8883
+ }
8884
+ if ((archiveFullPath || archivePath) && (queryHasAnyToken(queryTokens, [
8885
+ "archive",
8886
+ "archives",
8887
+ "entry",
8888
+ "entries",
8889
+ "bundle",
8890
+ "zip"
8891
+ ]) || queryTokens.some((token) => (archiveFullPath ?? archivePath ?? "").includes(token)))) {
8892
+ score += 0.34;
8893
+ }
8894
+ if (archiveContainerPath && queryHasAnyToken(queryTokens, [
8895
+ "nested",
8896
+ "inner",
8897
+ "container",
8898
+ "archive"
8899
+ ])) {
8900
+ score += 0.12;
8901
+ }
8902
+ if (typeof archiveNestedDepth === "number" && archiveNestedDepth > 1 && queryHasAnyToken(queryTokens, ["nested", "inner", "archive"])) {
8903
+ score += 0.08;
8904
+ }
8905
+ if (isOCREvidence && queryHasAnyToken(queryTokens, [
8906
+ "ocr",
8907
+ "scan",
8908
+ "scanned",
8909
+ "image",
8910
+ "photo",
8911
+ "region",
8912
+ "regions",
8913
+ "page",
8914
+ "pages"
8915
+ ])) {
8916
+ if (typeof ocrConfidence === "number" && ocrConfidence >= 0.9) {
8917
+ score += 0.12;
8918
+ } else if (typeof ocrConfidence === "number" && ocrConfidence >= 0.75) {
8919
+ score += 0.05;
8920
+ } else if (typeof ocrConfidence === "number" && ocrConfidence < 0.55) {
8921
+ score -= 0.05;
8922
+ }
8923
+ }
8609
8924
  return score;
8610
8925
  };
8611
8926
  var scoreHeuristicMatch = ({
@@ -8817,6 +9132,62 @@ var formatMediaTimestampForIngest = (value) => {
8817
9132
  const milliseconds = Math.floor(value % 1000);
8818
9133
  return `${String(minutes).padStart(2, "0")}:${String(seconds).padStart(2, "0")}.${String(milliseconds).padStart(3, "0")}`;
8819
9134
  };
9135
+ var normalizeMediaSpeaker = (value) => typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined;
9136
+ var normalizeMediaChannel = (value) => typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined;
9137
+ var buildMediaTimestampBoundary = (segments) => {
9138
+ let startMs;
9139
+ let endMs;
9140
+ for (const segment of segments) {
9141
+ if (typeof segment.startMs === "number" && Number.isFinite(segment.startMs)) {
9142
+ startMs = segment.startMs;
9143
+ break;
9144
+ }
9145
+ }
9146
+ for (let index = segments.length - 1;index >= 0; index--) {
9147
+ const segment = segments[index];
9148
+ if (!segment) {
9149
+ continue;
9150
+ }
9151
+ if (typeof segment.endMs === "number" && Number.isFinite(segment.endMs)) {
9152
+ endMs = segment.endMs;
9153
+ break;
9154
+ }
9155
+ }
9156
+ return { endMs, startMs };
9157
+ };
9158
+ var groupTranscriptSegments = (segments) => {
9159
+ const groups = [];
9160
+ for (const segment of segments) {
9161
+ if (!segment || typeof segment !== "object") {
9162
+ continue;
9163
+ }
9164
+ const text = normalizeWhitespace(segment.text ?? "");
9165
+ if (!text) {
9166
+ continue;
9167
+ }
9168
+ const speaker = normalizeMediaSpeaker(segment.speaker);
9169
+ const channel = normalizeMediaChannel(segment.channel);
9170
+ const lastGroup = groups.at(-1);
9171
+ if (!lastGroup || lastGroup.speaker !== speaker || lastGroup.channel !== channel) {
9172
+ groups.push({
9173
+ channel,
9174
+ endMs: segment.endMs,
9175
+ segments: [segment],
9176
+ speaker,
9177
+ startMs: segment.startMs
9178
+ });
9179
+ continue;
9180
+ }
9181
+ lastGroup.endMs = typeof segment.endMs === "number" && Number.isFinite(segment.endMs) ? segment.endMs : lastGroup.endMs;
9182
+ lastGroup.segments.push(segment);
9183
+ if (typeof segment.startMs === "number" && Number.isFinite(segment.startMs)) {
9184
+ if (typeof lastGroup.startMs !== "number" || !Number.isFinite(lastGroup.startMs) || segment.startMs < lastGroup.startMs) {
9185
+ lastGroup.startMs = segment.startMs;
9186
+ }
9187
+ }
9188
+ }
9189
+ return groups;
9190
+ };
8820
9191
  var decodeHtmlEntities = (value) => {
8821
9192
  let output = value;
8822
9193
  for (const [pattern, replacement] of HTML_ENTITY_REPLACEMENTS) {
@@ -9178,12 +9549,17 @@ var spreadsheetStructureUnits = (value, metadata) => {
9178
9549
  return [];
9179
9550
  }
9180
9551
  const sheetName = typeof metadata?.sheetName === "string" && metadata.sheetName || lines[0].replace(/^Sheet\s+/i, "");
9552
+ const spreadsheetHeaders = Array.isArray(metadata?.sheetHeaders) ? metadata.sheetHeaders.filter((value2) => typeof value2 === "string" && value2.trim().length > 0) : [];
9553
+ const repeatedHeaderRowNumbers = Array.isArray(metadata?.repeatedHeaderRowNumbers) ? metadata.repeatedHeaderRowNumbers.filter((value2) => typeof value2 === "number" && Number.isFinite(value2)) : [];
9554
+ const spreadsheetTableCount = typeof metadata?.sheetTableCount === "number" && Number.isFinite(metadata.sheetTableCount) ? metadata.sheetTableCount : Math.max(repeatedHeaderRowNumbers.length + 1, 1);
9181
9555
  const rowLines = lines.filter((line) => /^Row \d+\./.test(line));
9182
9556
  if (rowLines.length === 0) {
9183
9557
  return [
9184
9558
  {
9185
9559
  sectionDepth: 1,
9186
9560
  sectionKind: "spreadsheet_rows",
9561
+ ...spreadsheetHeaders.length > 0 ? { spreadsheetHeaders } : {},
9562
+ ...spreadsheetTableCount > 1 ? { spreadsheetTableCount, spreadsheetTableIndex: 1 } : {},
9187
9563
  sectionPath: [sheetName],
9188
9564
  sectionTitle: sheetName,
9189
9565
  text: normalizeWhitespace(lines.join(`
@@ -9191,35 +9567,69 @@ var spreadsheetStructureUnits = (value, metadata) => {
9191
9567
  }
9192
9568
  ];
9193
9569
  }
9194
- const groups = [];
9195
- let current = [];
9570
+ const tableSegments = [];
9571
+ let currentTableRows = [];
9572
+ let tableIndex = 1;
9196
9573
  for (const row of rowLines) {
9197
- const candidate = [...current, row].join(`
9198
- `);
9199
- if (current.length > 0 && candidate.length > DEFAULT_MAX_CHUNK_LENGTH) {
9200
- groups.push(current);
9201
- current = [row];
9574
+ const rowNumber = Number(row.match(/^Row (\d+)\./)?.[1] ?? NaN);
9575
+ if (currentTableRows.length > 0 && Number.isFinite(rowNumber) && repeatedHeaderRowNumbers.includes(rowNumber)) {
9576
+ tableSegments.push({ rows: currentTableRows, tableIndex });
9577
+ currentTableRows = [row];
9578
+ tableIndex += 1;
9202
9579
  continue;
9203
9580
  }
9204
- current.push(row);
9581
+ currentTableRows.push(row);
9582
+ }
9583
+ if (currentTableRows.length > 0) {
9584
+ tableSegments.push({ rows: currentTableRows, tableIndex });
9205
9585
  }
9206
- if (current.length > 0) {
9207
- groups.push(current);
9586
+ const groups = [];
9587
+ for (const segment of tableSegments) {
9588
+ let current = [];
9589
+ for (const row of segment.rows) {
9590
+ const candidate = [...current, row].join(`
9591
+ `);
9592
+ if (current.length > 0 && candidate.length > DEFAULT_MAX_CHUNK_LENGTH) {
9593
+ groups.push({ rows: current, tableIndex: segment.tableIndex });
9594
+ current = [row];
9595
+ continue;
9596
+ }
9597
+ current.push(row);
9598
+ }
9599
+ if (current.length > 0) {
9600
+ groups.push({ rows: current, tableIndex: segment.tableIndex });
9601
+ }
9208
9602
  }
9209
- return groups.map((rows) => ({
9210
- preferredChunkUnits: rows,
9211
- sectionDepth: 1,
9212
- sectionKind: "spreadsheet_rows",
9213
- sectionPath: [sheetName],
9214
- sectionTitle: sheetName,
9215
- text: normalizeWhitespace([`Sheet ${sheetName}`, ...rows].join(`
9603
+ return groups.map(({ rows, tableIndex: tableIndex2 }) => {
9604
+ const rowNumbers = rows.map((row) => Number(row.match(/^Row (\d+)\./)?.[1] ?? NaN)).filter((value2) => Number.isFinite(value2));
9605
+ return {
9606
+ preferredChunkUnits: rows,
9607
+ sectionDepth: 1,
9608
+ sectionKind: "spreadsheet_rows",
9609
+ sectionPath: [sheetName],
9610
+ sectionTitle: sheetName,
9611
+ ...spreadsheetHeaders.length > 0 ? { spreadsheetHeaders } : {},
9612
+ ...spreadsheetTableCount > 1 ? { spreadsheetTableCount, spreadsheetTableIndex: tableIndex2 } : {},
9613
+ ...rowNumbers.length > 0 ? {
9614
+ spreadsheetRowEnd: rowNumbers[rowNumbers.length - 1],
9615
+ spreadsheetRowStart: rowNumbers[0]
9616
+ } : {},
9617
+ text: normalizeWhitespace([`Sheet ${sheetName}`, ...rows].join(`
9216
9618
  `))
9217
- }));
9619
+ };
9620
+ });
9218
9621
  };
9219
9622
  var presentationStructureUnits = (value, metadata) => {
9220
9623
  const slideNumber = typeof metadata?.slideNumber === "number" ? metadata.slideNumber : typeof metadata?.slideIndex === "number" ? metadata.slideIndex + 1 : undefined;
9221
- const slideLabel = slideNumber ? `Slide ${slideNumber}` : "Slide";
9222
- const paragraphs = paragraphUnits(value);
9624
+ const slideTitle = typeof metadata?.slideTitle === "string" && metadata.slideTitle.trim() ? metadata.slideTitle.trim() : undefined;
9625
+ const slideBodyText = typeof metadata?.slideBodyText === "string" && metadata.slideBodyText.trim() ? metadata.slideBodyText.trim() : undefined;
9626
+ const slideNotesText = typeof metadata?.slideNotesText === "string" && metadata.slideNotesText.trim() ? metadata.slideNotesText.trim() : undefined;
9627
+ const slideLabel = slideTitle || (slideNumber ? `Slide ${slideNumber}` : "Slide");
9628
+ const paragraphs = [
9629
+ slideTitle,
9630
+ slideBodyText,
9631
+ slideNotesText ? `Speaker notes: ${slideNotesText}` : undefined
9632
+ ].filter((entry) => Boolean(entry)).flatMap((entry) => paragraphUnits(entry));
9223
9633
  return [
9224
9634
  {
9225
9635
  preferredChunkUnits: paragraphs,
@@ -9674,6 +10084,8 @@ var spreadsheetRowText = (row, headers) => {
9674
10084
  });
9675
10085
  return normalizeWhitespace(entries.join(" | "));
9676
10086
  };
10087
+ var normalizeSpreadsheetHeaderValue = (value) => normalizeWhitespace(value).toLowerCase();
10088
+ var isSpreadsheetHeaderRow = (row, headers) => row.length === headers.length && row.every((cell, index) => normalizeSpreadsheetHeaderValue(cell.value) === normalizeSpreadsheetHeaderValue(headers[index] ?? ""));
9677
10089
  var spreadsheetSheetTexts = (entries) => {
9678
10090
  const sharedStrings = spreadsheetSharedStrings(entries);
9679
10091
  const sheetNames = spreadsheetSheetNames(entries);
@@ -9684,14 +10096,28 @@ var spreadsheetSheetTexts = (entries) => {
9684
10096
  return null;
9685
10097
  }
9686
10098
  const headers = rows[0].map((cell) => cell.value);
9687
- const rowTexts = rows.map((row, rowIndex) => normalizeWhitespace(`Row ${rowIndex + 1}. ${spreadsheetRowText(row, rowIndex === 0 ? [] : headers)}`));
10099
+ const repeatedHeaderRowNumbers = [];
10100
+ let tableCount = 1;
10101
+ const rowTexts = rows.map((row, rowIndex) => {
10102
+ const rowNumber = rowIndex + 1;
10103
+ const isHeaderRow = rowIndex === 0 || isSpreadsheetHeaderRow(row, headers);
10104
+ if (rowIndex > 0 && isHeaderRow) {
10105
+ repeatedHeaderRowNumbers.push(rowNumber);
10106
+ tableCount += 1;
10107
+ }
10108
+ return normalizeWhitespace(`Row ${rowNumber}. ${spreadsheetRowText(row, isHeaderRow ? [] : headers)}`);
10109
+ });
9688
10110
  const text = normalizeWhitespace(rowTexts.join(`
9689
10111
  `));
9690
10112
  if (!text) {
9691
10113
  return null;
9692
10114
  }
9693
10115
  return {
10116
+ headers,
9694
10117
  name: sheetNames[index] ?? `Sheet ${index + 1}`,
10118
+ repeatedHeaderRowNumbers,
10119
+ rowCount: rowTexts.length,
10120
+ tableCount,
9695
10121
  text
9696
10122
  };
9697
10123
  }).filter((entry) => Boolean(entry));
@@ -9714,12 +10140,21 @@ var presentationNotesByIndex = (entries) => new Map(entries.filter((entry) => en
9714
10140
  var presentationSlides = (entries) => {
9715
10141
  const notesByIndex = presentationNotesByIndex(entries);
9716
10142
  return entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path)).map((entry, index) => {
9717
- const slideText = normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)));
10143
+ const textRuns = [
10144
+ ...decodeUtf8(entry.data).matchAll(/<a:t[^>]*>([\s\S]*?)<\/a:t>/gi)
10145
+ ].map((match) => normalizeWhitespace(decodeHtmlEntities(match[1] ?? ""))).filter(Boolean);
10146
+ const slideTitle = textRuns[0];
10147
+ const slideBodyText = normalizeWhitespace(textRuns.slice(1).join(`
10148
+ `));
10149
+ const slideText = normalizeWhitespace([slideTitle, slideBodyText].filter(Boolean).join(`
10150
+ `));
9718
10151
  const notesText = notesByIndex.get(index);
9719
10152
  const text = normalizeWhitespace([slideText, notesText ? `Speaker notes: ${notesText}` : ""].filter(Boolean).join(`
9720
10153
  `));
9721
10154
  return {
9722
10155
  index,
10156
+ slideBodyText,
10157
+ slideTitle,
9723
10158
  notesText,
9724
10159
  text
9725
10160
  };
@@ -9864,6 +10299,15 @@ var parseEmailHeaders = (raw) => {
9864
10299
  to: getHeader("To")
9865
10300
  };
9866
10301
  };
10302
+ var normalizeEmailThreadKey = (value) => {
10303
+ const normalized = normalizeWhitespace(value?.replace(/^(re|fw|fwd)\s*:\s*/gi, "")?.replace(/[<>]/g, "")?.toLowerCase() ?? "");
10304
+ return normalized || undefined;
10305
+ };
10306
+ var normalizeEmailMessageId = (value) => {
10307
+ const normalized = normalizeWhitespace(value ?? "");
10308
+ return normalized || undefined;
10309
+ };
10310
+ var parseEmailReferenceChain = (references) => (references?.match(/<[^>]+>/g) ?? []).map((entry) => normalizeWhitespace(entry)).filter(Boolean);
9867
10311
  var stripRTF = (value) => {
9868
10312
  const withoutBinary = value.replace(/\\bin\d+ [\s\S]*?(?=[\\}])/g, " ");
9869
10313
  const withoutControls = withoutBinary.replace(/\\par[d]?/g, `
@@ -9881,6 +10325,9 @@ var extractPrintableStrings = (data) => {
9881
10325
  };
9882
10326
  var ocrMetadata = (result) => {
9883
10327
  const regions = result.regions?.filter((region) => normalizeWhitespace(region.text ?? "").length > 0);
10328
+ const pageNumbers = [
10329
+ ...new Set((regions ?? []).map((region) => typeof region.page === "number" && region.page > 0 ? region.page : undefined).filter((value) => value !== undefined))
10330
+ ].sort((left, right) => left - right);
9884
10331
  const confidenceValues = [
9885
10332
  typeof result.confidence === "number" ? result.confidence : undefined,
9886
10333
  ...(regions ?? []).map((region) => typeof region.confidence === "number" ? region.confidence : undefined)
@@ -9889,6 +10336,8 @@ var ocrMetadata = (result) => {
9889
10336
  return {
9890
10337
  ...result.metadata ?? {},
9891
10338
  ocrConfidence: result.confidence,
10339
+ ocrPageCount: pageNumbers.length,
10340
+ ocrPageNumbers: pageNumbers,
9892
10341
  ocrRegionCount: regions?.length,
9893
10342
  ocrRegions: regions,
9894
10343
  ocrAverageConfidence: averageConfidence
@@ -9905,25 +10354,32 @@ var ocrPageDocuments = (result, input, baseMetadata) => {
9905
10354
  bucket.push({ ...region, text });
9906
10355
  grouped.set(region.page, bucket);
9907
10356
  }
9908
- return [...grouped.entries()].sort((left, right) => left[0] - right[0]).map(([pageNumber, regions]) => ({
9909
- chunking: input.chunking,
9910
- contentType: input.contentType,
9911
- format: "text",
9912
- metadata: {
9913
- ...input.metadata ?? {},
9914
- ...baseMetadata,
9915
- ocrRegionCount: regions.length,
9916
- ocrRegions: regions,
9917
- pageNumber,
9918
- pageIndex: pageNumber - 1,
9919
- sourceNativeKind: "pdf_page"
9920
- },
9921
- source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
9922
- text: normalizeWhitespace(`PDF page ${pageNumber} from ${input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME}.
10357
+ return [...grouped.entries()].sort((left, right) => left[0] - right[0]).map(([pageNumber, regions]) => {
10358
+ const confidenceValues = regions.map((region) => typeof region.confidence === "number" ? region.confidence : undefined).filter((value) => value !== undefined);
10359
+ const averageConfidence = confidenceValues.length > 0 ? confidenceValues.reduce((sum, value) => sum + value, 0) / confidenceValues.length : undefined;
10360
+ return {
10361
+ chunking: input.chunking,
10362
+ contentType: input.contentType,
10363
+ format: "text",
10364
+ metadata: {
10365
+ ...input.metadata ?? {},
10366
+ ...baseMetadata,
10367
+ ocrPageAverageConfidence: averageConfidence,
10368
+ ocrPageConfidence: averageConfidence,
10369
+ ocrRegionCount: regions.length,
10370
+ ocrRegionNumbers: regions.map((_region, index) => index + 1),
10371
+ ocrRegions: regions,
10372
+ pageNumber,
10373
+ pageIndex: pageNumber - 1,
10374
+ sourceNativeKind: "pdf_page"
10375
+ },
10376
+ source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
10377
+ text: normalizeWhitespace(`PDF page ${pageNumber} from ${input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME}.
9923
10378
  ${regions.map((region) => region.text).join(`
9924
10379
  `)}`),
9925
- title: input.title ? `${input.title} \xB7 Page ${pageNumber}` : `Page ${pageNumber}`
9926
- }));
10380
+ title: input.title ? `${input.title} \xB7 Page ${pageNumber}` : `Page ${pageNumber}`
10381
+ };
10382
+ });
9927
10383
  };
9928
10384
  var ocrRegionDocuments = (result, input, baseMetadata) => {
9929
10385
  const documents = [];
@@ -9941,6 +10397,8 @@ var ocrRegionDocuments = (result, input, baseMetadata) => {
9941
10397
  metadata: {
9942
10398
  ...input.metadata ?? {},
9943
10399
  ...baseMetadata,
10400
+ ocrPageCount: 1,
10401
+ ocrPageNumbers: [pageNumber],
9944
10402
  ocrRegionConfidence: region.confidence,
9945
10403
  ocrRegionHeight: region.height,
9946
10404
  ocrRegionWidth: region.width,
@@ -10051,15 +10509,34 @@ var createEmailExtractor = () => ({
10051
10509
  const { body } = splitEmailMessage(raw);
10052
10510
  const parsed = parseEmailMimeParts(body, headers.contentType);
10053
10511
  const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.eml`;
10512
+ const referenceChain = parseEmailReferenceChain(headers.references);
10513
+ const messageId = normalizeEmailMessageId(headers.messageId);
10514
+ const inReplyTo = normalizeEmailMessageId(headers.inReplyTo);
10515
+ const threadMessageIds = [
10516
+ ...new Set([
10517
+ ...referenceChain.map((entry) => normalizeEmailMessageId(entry)),
10518
+ messageId
10519
+ ].filter((value) => typeof value === "string"))
10520
+ ];
10521
+ const replyDepth = Math.max(referenceChain.length, headers.inReplyTo ? 1 : 0);
10522
+ const threadTopic = headers.threadTopic ?? headers.subject;
10523
+ const threadRootMessageId = normalizeEmailMessageId(referenceChain[0]) ?? inReplyTo ?? messageId;
10524
+ const threadKey = normalizeEmailThreadKey(threadTopic) ?? normalizeEmailThreadKey(messageId) ?? normalizeEmailThreadKey(headers.subject);
10054
10525
  const messageMetadata = {
10055
10526
  ...input.metadata ?? {},
10056
10527
  emailKind: "message",
10057
10528
  fileKind: "email",
10058
10529
  from: headers.from,
10059
- inReplyTo: headers.inReplyTo,
10060
- messageId: headers.messageId,
10530
+ inReplyTo,
10531
+ messageId,
10061
10532
  references: headers.references,
10062
- threadTopic: headers.subject,
10533
+ replyDepth,
10534
+ replyReferenceCount: referenceChain.length,
10535
+ threadMessageCount: threadMessageIds.length,
10536
+ threadMessageIds,
10537
+ threadKey,
10538
+ threadRootMessageId,
10539
+ threadTopic,
10063
10540
  to: headers.to,
10064
10541
  hasAttachments: parsed.attachments.length > 0
10065
10542
  };
@@ -10172,9 +10649,13 @@ var createOfficeDocumentExtractor = () => ({
10172
10649
  ...input.metadata ?? {},
10173
10650
  fileKind: "office",
10174
10651
  ...officeMetadata,
10652
+ repeatedHeaderRowNumbers: sheet.repeatedHeaderRowNumbers,
10653
+ sheetHeaders: sheet.headers,
10175
10654
  sourceNativeKind: "spreadsheet_sheet",
10176
10655
  sheetIndex: index,
10177
- sheetName: sheet.name
10656
+ sheetName: sheet.name,
10657
+ sheetRowCount: sheet.rowCount,
10658
+ sheetTableCount: sheet.tableCount
10178
10659
  },
10179
10660
  source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}${extension || ".office"}`,
10180
10661
  text: normalizeWhitespace(`Spreadsheet workbook ${workbookLabel}. ` + `Worksheet ${index + 1}. ` + `Workbook sheet named ${sheet.name}. ` + `Sheet ${sheet.name} from spreadsheet workbook ${workbookLabel}.` + `
@@ -10195,6 +10676,9 @@ ${sheet.text}`),
10195
10676
  ...input.metadata ?? {},
10196
10677
  fileKind: "office",
10197
10678
  ...officeMetadata,
10679
+ ...slide.slideBodyText ? { slideBodyText: slide.slideBodyText } : {},
10680
+ ...slide.notesText ? { slideNotesText: slide.notesText } : {},
10681
+ ...slide.slideTitle ? { slideTitle: slide.slideTitle } : {},
10198
10682
  sourceNativeKind: "presentation_slide",
10199
10683
  slideIndex: slide.index,
10200
10684
  slideNumber: slide.index + 1
@@ -10240,7 +10724,8 @@ var createRAGImageOCRExtractor = (provider) => ({
10240
10724
  metadata: {
10241
10725
  ...input.metadata ?? {},
10242
10726
  ...ocrMetadata(result),
10243
- fileKind: "image"
10727
+ fileKind: "image",
10728
+ sourceNativeKind: "image_ocr"
10244
10729
  },
10245
10730
  source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.image.txt`,
10246
10731
  text: result.text,
@@ -10253,15 +10738,34 @@ var createRAGMediaFileExtractor = (transcriber) => ({
10253
10738
  supports: mediaExtractorSupports,
10254
10739
  extract: async (input) => {
10255
10740
  const result = await transcriber.transcribe(input);
10741
+ const rawSegments = (result.segments ?? []).filter((segment) => {
10742
+ if (!segment || typeof segment !== "object") {
10743
+ return false;
10744
+ }
10745
+ return normalizeWhitespace(segment.text ?? "").length > 0;
10746
+ });
10747
+ const segmentGroups = groupTranscriptSegments(rawSegments);
10256
10748
  const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.media.txt`;
10749
+ const segmentCount = rawSegments.length;
10750
+ const mediaDurationMs = rawSegments.reduce((max, segment) => {
10751
+ const endMs = typeof segment.endMs === "number" ? segment.endMs : undefined;
10752
+ if (typeof endMs !== "number") {
10753
+ return max;
10754
+ }
10755
+ return typeof max === "number" ? Math.max(max, endMs) : endMs;
10756
+ }, undefined);
10757
+ const mediaSpeakers = [
10758
+ ...new Set(rawSegments.map((segment) => normalizeMediaSpeaker(segment.speaker)).filter((value) => typeof value === "string"))
10759
+ ];
10257
10760
  const segmentDocuments = [];
10258
- for (const [index, segment] of (result.segments ?? []).entries()) {
10259
- const text = normalizeWhitespace(segment.text ?? "");
10260
- if (!text) {
10761
+ for (const [index, segmentGroup] of segmentGroups.entries()) {
10762
+ const { endMs, startMs } = buildMediaTimestampBoundary(segmentGroup.segments);
10763
+ const groupText = normalizeWhitespace(segmentGroup.segments.map((segment) => normalizeWhitespace(segment.text ?? "")).filter((value) => value.length > 0).join(" "));
10764
+ if (!groupText) {
10261
10765
  continue;
10262
10766
  }
10263
- const startMs = typeof segment.startMs === "number" ? segment.startMs : undefined;
10264
- const endMs = typeof segment.endMs === "number" ? segment.endMs : undefined;
10767
+ const mediaSegmentStartMs = startMs;
10768
+ const mediaSegmentEndMs = endMs;
10265
10769
  const startLabel = formatMediaTimestampForIngest(startMs);
10266
10770
  const endLabel = formatMediaTimestampForIngest(endMs);
10267
10771
  const mediaKind = typeof result.metadata?.mediaKind === "string" ? result.metadata.mediaKind : "media";
@@ -10274,15 +10778,27 @@ var createRAGMediaFileExtractor = (transcriber) => ({
10274
10778
  ...result.metadata ?? {},
10275
10779
  fileKind: "media",
10276
10780
  sourceNativeKind: "media_segment",
10781
+ mediaDurationMs,
10277
10782
  mediaSegmentIndex: index,
10278
- mediaSegmentStartMs: startMs,
10279
- mediaSegmentEndMs: endMs,
10280
- mediaSegments: [segment],
10281
- speaker: typeof segment.speaker === "string" ? segment.speaker : undefined
10783
+ mediaSegmentStartMs,
10784
+ mediaSegmentEndMs,
10785
+ mediaSegmentCount: segmentCount,
10786
+ mediaSegmentGroupIndex: index,
10787
+ mediaSegmentGroupSize: segmentGroup.segments.length,
10788
+ mediaSegmentGroupSpeaker: segmentGroup.speaker,
10789
+ mediaChannel: segmentGroup.channel,
10790
+ mediaSegments: segmentGroup.segments,
10791
+ startMs: mediaSegmentStartMs,
10792
+ endMs: mediaSegmentEndMs,
10793
+ ...mediaSpeakers.length > 0 ? {
10794
+ mediaSpeakerCount: mediaSpeakers.length,
10795
+ mediaSpeakers
10796
+ } : {},
10797
+ speaker: segmentGroup.speaker
10282
10798
  },
10283
10799
  source,
10284
10800
  text: normalizeWhitespace(`${mediaKind} transcript segment${startLabel ? ` at timestamp ${startLabel}${endLabel ? ` to ${endLabel}` : ""}` : ""} from ${input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME}. ` + `${mediaKind} timestamp evidence${startLabel ? ` ${startLabel}${endLabel ? ` to ${endLabel}` : ""}` : ""}.` + `
10285
- ${text}`),
10801
+ ${groupText}`),
10286
10802
  title: input.title ? `${input.title} \xB7 ${mediaKind[0]?.toUpperCase() + mediaKind.slice(1)} segment ${index + 1}` : `${mediaKind[0]?.toUpperCase() + mediaKind.slice(1)} segment ${index + 1}`
10287
10803
  });
10288
10804
  }
@@ -10294,7 +10810,13 @@ ${text}`),
10294
10810
  ...input.metadata ?? {},
10295
10811
  ...result.metadata ?? {},
10296
10812
  fileKind: "media",
10297
- mediaSegments: result.segments
10813
+ mediaDurationMs,
10814
+ mediaSegmentCount: segmentCount,
10815
+ mediaSegments: rawSegments,
10816
+ ...mediaSpeakers.length > 0 ? {
10817
+ mediaSpeakerCount: mediaSpeakers.length,
10818
+ mediaSpeakers
10819
+ } : {}
10298
10820
  },
10299
10821
  source,
10300
10822
  text: result.text,
@@ -10319,6 +10841,13 @@ var createTextFileExtractor = () => ({
10319
10841
  })
10320
10842
  });
10321
10843
  var expandArchiveEntry = async (entry, archiveInput, extractors, registry) => {
10844
+ const parentArchiveLineage = Array.isArray(archiveInput.metadata?.archiveLineage) ? archiveInput.metadata.archiveLineage.filter((value) => typeof value === "string" && value.trim().length > 0) : [];
10845
+ const entryArchiveLineage = entry.path.split(/[\\/]/).map((segment) => normalizeWhitespace(segment)).filter(Boolean);
10846
+ const archiveLineage = [...parentArchiveLineage, ...entryArchiveLineage];
10847
+ const parentArchivePath = typeof archiveInput.metadata?.archivePath === "string" && archiveInput.metadata.archivePath.trim().length > 0 ? archiveInput.metadata.archivePath.trim() : undefined;
10848
+ const archiveFullPath = parentArchivePath ? `${parentArchivePath}!${entry.path}` : entry.path;
10849
+ const archiveRootName = (typeof archiveInput.metadata?.archiveRootName === "string" && archiveInput.metadata.archiveRootName.trim().length > 0 ? archiveInput.metadata.archiveRootName.trim() : undefined) ?? archiveInput.name ?? archiveInput.path?.split(/[/\\]/).pop() ?? archiveInput.source;
10850
+ const archiveRootSource = (typeof archiveInput.metadata?.archiveRootSource === "string" && archiveInput.metadata.archiveRootSource.trim().length > 0 ? archiveInput.metadata.archiveRootSource.trim() : undefined) ?? archiveInput.source ?? archiveInput.path ?? archiveInput.name;
10322
10851
  const documents = await extractRAGFileDocuments({
10323
10852
  chunking: archiveInput.chunking,
10324
10853
  contentType: entry.contentType,
@@ -10331,7 +10860,14 @@ var expandArchiveEntry = async (entry, archiveInput, extractors, registry) => {
10331
10860
  archiveEntryName: basename(entry.path),
10332
10861
  archiveParentName: archiveInput.name ?? archiveInput.path?.split(/[/\\]/).pop() ?? archiveInput.source,
10333
10862
  archiveParentSource: archiveInput.source ?? archiveInput.path ?? archiveInput.name,
10863
+ archiveContainerPath: parentArchivePath,
10864
+ archiveDepth: archiveLineage.length,
10865
+ archiveFullPath,
10866
+ archiveLineage,
10334
10867
  archivePath: entry.path,
10868
+ archiveRootName,
10869
+ archiveRootSource,
10870
+ archiveNestedDepth: parentArchiveLineage.length + 1,
10335
10871
  fileKind: "archive_entry"
10336
10872
  },
10337
10873
  name: basename(entry.path),
@@ -10748,10 +11284,27 @@ ${text}`);
10748
11284
  }
10749
11285
  return text;
10750
11286
  };
10751
- return merged.map((text) => ({
10752
- ...unit,
10753
- text: decorateSourceAwareChunkText(text)
10754
- }));
11287
+ const resolveSpreadsheetChunkRowRange = (text) => {
11288
+ if (unit.sectionKind !== "spreadsheet_rows") {
11289
+ return {};
11290
+ }
11291
+ const rowNumbers = [...text.matchAll(/^Row (\d+)\./gm)].map((match) => Number(match[1] ?? NaN)).filter((value) => Number.isFinite(value));
11292
+ if (rowNumbers.length === 0) {
11293
+ return {};
11294
+ }
11295
+ return {
11296
+ spreadsheetRowEnd: rowNumbers[rowNumbers.length - 1],
11297
+ spreadsheetRowStart: rowNumbers[0]
11298
+ };
11299
+ };
11300
+ return merged.map((text) => {
11301
+ const decoratedText = decorateSourceAwareChunkText(text);
11302
+ return {
11303
+ ...unit,
11304
+ ...resolveSpreadsheetChunkRowRange(decoratedText),
11305
+ text: decoratedText
11306
+ };
11307
+ });
10755
11308
  };
10756
11309
  var resolveChunkingUnits = (text, options) => {
10757
11310
  if (options.strategy === "fixed") {
@@ -10922,6 +11475,11 @@ var prepareRAGDocument = (document, defaultChunking, chunkingRegistry) => {
10922
11475
  ...sectionTitle ? { sectionTitle } : {},
10923
11476
  ...sectionPath && sectionPath.length > 0 ? { sectionPath } : {},
10924
11477
  ...typeof entry.sectionDepth === "number" ? { sectionDepth: entry.sectionDepth } : {},
11478
+ ...Array.isArray(entry.spreadsheetHeaders) && entry.spreadsheetHeaders.length > 0 ? { spreadsheetHeaders: entry.spreadsheetHeaders } : {},
11479
+ ...typeof entry.spreadsheetTableIndex === "number" ? { spreadsheetTableIndex: entry.spreadsheetTableIndex } : {},
11480
+ ...typeof entry.spreadsheetTableCount === "number" ? { spreadsheetTableCount: entry.spreadsheetTableCount } : {},
11481
+ ...typeof entry.spreadsheetRowStart === "number" ? { spreadsheetRowStart: entry.spreadsheetRowStart } : {},
11482
+ ...typeof entry.spreadsheetRowEnd === "number" ? { spreadsheetRowEnd: entry.spreadsheetRowEnd } : {},
10925
11483
  ...typeof entry.pageNumber === "number" ? { pageNumber: entry.pageNumber } : {},
10926
11484
  ...typeof entry.officeBlockNumber === "number" ? { officeBlockNumber: entry.officeBlockNumber } : {},
10927
11485
  ...entry.officeBlockKind ? { officeBlockKind: entry.officeBlockKind } : {},
@@ -24278,5 +24836,5 @@ export {
24278
24836
  aiChat
24279
24837
  };
24280
24838
 
24281
- //# debugId=23520EDE705830A964756E2164756E21
24839
+ //# debugId=AFAF0A5BC1AB4BC864756E2164756E21
24282
24840
  //# sourceMappingURL=index.js.map