@absolutejs/absolute 0.19.0-beta.619 → 0.19.0-beta.620
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/client/index.js +144 -7
- package/dist/ai/client/index.js.map +4 -4
- package/dist/ai/client/ui.js +144 -7
- package/dist/ai/client/ui.js.map +4 -4
- package/dist/ai/index.js +624 -66
- package/dist/ai/index.js.map +6 -6
- package/dist/ai/rag/quality.js +19 -1
- package/dist/ai/rag/quality.js.map +3 -3
- package/dist/ai/rag/ui.js +144 -7
- package/dist/ai/rag/ui.js.map +4 -4
- package/dist/ai-client/angular/ai/index.js +143 -6
- package/dist/ai-client/react/ai/index.js +143 -6
- package/dist/ai-client/vue/ai/index.js +143 -6
- package/dist/angular/ai/index.js +144 -7
- package/dist/angular/ai/index.js.map +4 -4
- package/dist/react/ai/index.js +144 -7
- package/dist/react/ai/index.js.map +4 -4
- package/dist/svelte/ai/index.js +144 -7
- package/dist/svelte/ai/index.js.map +4 -4
- package/dist/types/ai.d.ts +1 -0
- package/dist/vue/ai/index.js +144 -7
- package/dist/vue/ai/index.js.map +4 -4
- package/package.json +1 -1
package/dist/ai/index.js
CHANGED
|
@@ -260,6 +260,12 @@ var formatMediaTimestamp = (value) => {
|
|
|
260
260
|
const milliseconds = Math.floor(value % 1000);
|
|
261
261
|
return `${String(minutes).padStart(2, "0")}:${String(seconds).padStart(2, "0")}.${String(milliseconds).padStart(3, "0")}`;
|
|
262
262
|
};
|
|
263
|
+
/**
 * Turns a media duration into a display label.
 *
 * @param {unknown} value - Candidate duration (buildProvenanceLabel passes
 *   `metadata.mediaDurationMs`).
 * @returns {string | undefined} The result of `formatMediaTimestamp(value)`
 *   when `value` is a finite, non-negative number; otherwise `undefined`.
 */
var formatMediaDurationLabel = (value) => {
  const isUsableDuration = typeof value === "number" && Number.isFinite(value) && value >= 0;
  return isUsableDuration ? formatMediaTimestamp(value) : undefined;
};
|
|
263
269
|
var buildLocatorLabel = (metadata, source, title) => {
|
|
264
270
|
if (!metadata) {
|
|
265
271
|
return;
|
|
@@ -322,6 +328,12 @@ var buildProvenanceLabel = (metadata) => {
|
|
|
322
328
|
const sentAt = formatTimestampLabel(metadata.sentAt) ?? formatTimestampLabel(metadata.receivedAt);
|
|
323
329
|
const speaker = getContextString(metadata.speaker);
|
|
324
330
|
const mediaKind = getContextString(metadata.mediaKind);
|
|
331
|
+
const mediaSegmentCount = getContextNumber(metadata.mediaSegmentCount);
|
|
332
|
+
const mediaSegmentGroupSize = getContextNumber(metadata.mediaSegmentGroupSize);
|
|
333
|
+
const mediaSegmentGroupIndex = getContextNumber(metadata.mediaSegmentGroupIndex);
|
|
334
|
+
const mediaChannel = getContextString(metadata.mediaChannel);
|
|
335
|
+
const mediaSpeakerCount = getContextNumber(metadata.mediaSpeakerCount);
|
|
336
|
+
const mediaDurationLabel = formatMediaDurationLabel(metadata.mediaDurationMs);
|
|
325
337
|
const transcriptSource = getContextString(metadata.transcriptSource);
|
|
326
338
|
const pdfTextMode = getContextString(metadata.pdfTextMode);
|
|
327
339
|
const ocrEngine = getContextString(metadata.ocrEngine);
|
|
@@ -331,6 +343,12 @@ var buildProvenanceLabel = (metadata) => {
|
|
|
331
343
|
ocrEngine ? `OCR ${ocrEngine}` : "",
|
|
332
344
|
typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
|
|
333
345
|
mediaKind ? `Media ${mediaKind}` : "",
|
|
346
|
+
mediaSegmentCount ? `${mediaSegmentCount} segments` : "",
|
|
347
|
+
mediaSegmentGroupSize ? `${mediaSegmentGroupSize} grouped segments` : "",
|
|
348
|
+
mediaSegmentGroupIndex !== undefined ? `Segment group ${mediaSegmentGroupIndex + 1}` : "",
|
|
349
|
+
mediaChannel ? `Channel ${mediaChannel}` : "",
|
|
350
|
+
mediaSpeakerCount ? `${mediaSpeakerCount} speakers` : "",
|
|
351
|
+
mediaDurationLabel ? `Duration ${mediaDurationLabel}` : "",
|
|
334
352
|
transcriptSource ? `Transcript ${transcriptSource}` : "",
|
|
335
353
|
threadTopic ? `Thread ${threadTopic}` : "",
|
|
336
354
|
speaker ? `Speaker ${speaker}` : "",
|
|
@@ -4233,6 +4251,34 @@ var getAttachmentName2 = (source, title) => {
|
|
|
4233
4251
|
}
|
|
4234
4252
|
return;
|
|
4235
4253
|
};
|
|
4254
|
+
/**
 * Collects the usable header strings from `metadata.spreadsheetHeaders`.
 *
 * @param {object | null | undefined} metadata - Evidence metadata; may be absent.
 * @returns {string[]} Each entry passed through `getContextString2`, keeping
 *   only results that are strings. Empty when the field is not an array.
 */
var getSpreadsheetHeaders = (metadata) => {
  const rawHeaders = metadata?.spreadsheetHeaders;
  if (!Array.isArray(rawHeaders)) {
    return [];
  }
  const normalized = rawHeaders.map((entry) => getContextString2(entry));
  return normalized.filter((entry) => typeof entry === "string");
};
|
|
4255
|
+
/**
 * Builds the "Rows ..." label for a spreadsheet row span.
 *
 * @param {unknown} rowStart - First row of the span.
 * @param {unknown} rowEnd - Last row of the span (optional).
 * @returns {string | undefined}
 *   - `undefined` when `rowStart` is not a finite number;
 *   - `Rows <start>` when `rowEnd` is missing, unusable, or equal to `rowStart`;
 *   - `Rows <start>-<end>` otherwise.
 */
var formatSpreadsheetRowRange = (rowStart, rowEnd) => {
  if (typeof rowStart !== "number" || !Number.isFinite(rowStart)) {
    return;
  }
  // Apply the same finiteness check to rowEnd that rowStart gets, so a
  // NaN/Infinity end never leaks into the label (e.g. "Rows 3-NaN").
  const hasRowEnd = typeof rowEnd === "number" && Number.isFinite(rowEnd);
  if (!hasRowEnd || rowEnd === rowStart) {
    return `Rows ${rowStart}`;
  }
  return `Rows ${rowStart}-${rowEnd}`;
};
|
|
4267
|
+
/**
 * Builds the "Table N" / "Table N of M" label for a spreadsheet table.
 *
 * @param {unknown} tableIndex - Table position; must be a finite number >= 1.
 * @param {unknown} tableCount - Total tables; only shown when it is a finite
 *   number that is at least `tableIndex`.
 * @returns {string | undefined} The label, or `undefined` when `tableIndex`
 *   is not usable.
 */
var formatSpreadsheetTableLabel = (tableIndex, tableCount) => {
  const isFiniteNumber = (candidate) => typeof candidate === "number" && Number.isFinite(candidate);
  if (!isFiniteNumber(tableIndex) || tableIndex < 1) {
    return undefined;
  }
  const baseLabel = `Table ${tableIndex}`;
  const showCount = isFiniteNumber(tableCount) && tableCount >= tableIndex;
  return showCount ? `${baseLabel} of ${tableCount}` : baseLabel;
};
|
|
4276
|
+
/**
 * Produces a duration label via `formatMediaTimestamp2`.
 *
 * @param {unknown} value - Candidate duration (buildProvenanceLabel2 passes
 *   `metadata.mediaDurationMs`).
 * @returns {string | undefined} `undefined` for anything that is not a
 *   finite, non-negative number; otherwise `formatMediaTimestamp2(value)`.
 */
var formatMediaDurationLabel2 = (value) => {
  if (typeof value !== "number") {
    return undefined;
  }
  if (!Number.isFinite(value)) {
    return undefined;
  }
  if (value < 0) {
    return undefined;
  }
  return formatMediaTimestamp2(value);
};
|
|
4236
4282
|
var buildContextLabel2 = (metadata) => {
|
|
4237
4283
|
if (!metadata) {
|
|
4238
4284
|
return;
|
|
@@ -4258,29 +4304,61 @@ var buildContextLabel2 = (metadata) => {
|
|
|
4258
4304
|
}
|
|
4259
4305
|
const emailKind = getContextString2(metadata.emailKind);
|
|
4260
4306
|
if (emailKind === "attachment") {
|
|
4261
|
-
|
|
4307
|
+
const attachmentName = getContextString2(metadata.attachmentName);
|
|
4308
|
+
const threadTopic2 = getContextString2(metadata.threadTopic);
|
|
4309
|
+
return attachmentName ? threadTopic2 ? `Attachment evidence ${attachmentName} in ${threadTopic2}` : `Attachment evidence ${attachmentName}` : "Attachment evidence";
|
|
4262
4310
|
}
|
|
4263
4311
|
if (emailKind === "message") {
|
|
4312
|
+
const threadTopic2 = getContextString2(metadata.threadTopic);
|
|
4264
4313
|
const from = getContextString2(metadata.from);
|
|
4314
|
+
if (threadTopic2) {
|
|
4315
|
+
return from ? `Message in ${threadTopic2} from ${from}` : `Message in ${threadTopic2}`;
|
|
4316
|
+
}
|
|
4265
4317
|
return from ? `Message from ${from}` : "Message evidence";
|
|
4266
4318
|
}
|
|
4267
4319
|
const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
|
|
4268
4320
|
const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
|
|
4321
|
+
const hasOCRTrace = typeof metadata.ocrRegionConfidence === "number" || typeof metadata.ocrConfidence === "number" || getContextString2(metadata.pdfTextMode) === "ocr" || typeof metadata.ocrRegionCount === "number";
|
|
4269
4322
|
if (page && region) {
|
|
4323
|
+
if (hasOCRTrace) {
|
|
4324
|
+
return `OCR page ${page} region ${region}`;
|
|
4325
|
+
}
|
|
4270
4326
|
return `Page ${page} region ${region}`;
|
|
4271
4327
|
}
|
|
4272
4328
|
if (page) {
|
|
4329
|
+
if (hasOCRTrace) {
|
|
4330
|
+
return `OCR page ${page}`;
|
|
4331
|
+
}
|
|
4273
4332
|
return `Page ${page}`;
|
|
4274
4333
|
}
|
|
4275
4334
|
const sheet = getContextString2(metadata.sheetName) ?? (Array.isArray(metadata.sheetNames) ? getContextString2(metadata.sheetNames[0]) : undefined);
|
|
4276
4335
|
if (sheet) {
|
|
4336
|
+
const tableLabel = formatSpreadsheetTableLabel(getContextNumber2(metadata.spreadsheetTableIndex), getContextNumber2(metadata.spreadsheetTableCount));
|
|
4337
|
+
const rowRange = formatSpreadsheetRowRange(getContextNumber2(metadata.spreadsheetRowStart), getContextNumber2(metadata.spreadsheetRowEnd));
|
|
4338
|
+
const headers = getSpreadsheetHeaders(metadata);
|
|
4339
|
+
if (tableLabel && rowRange) {
|
|
4340
|
+
return `Sheet ${sheet} ${tableLabel} ${rowRange}`;
|
|
4341
|
+
}
|
|
4342
|
+
if (tableLabel) {
|
|
4343
|
+
return `Sheet ${sheet} ${tableLabel}`;
|
|
4344
|
+
}
|
|
4345
|
+
if (rowRange) {
|
|
4346
|
+
return `Sheet ${sheet} ${rowRange}`;
|
|
4347
|
+
}
|
|
4348
|
+
if (headers.length > 0) {
|
|
4349
|
+
return `Sheet ${sheet} by ${headers.slice(0, 2).join(", ")}`;
|
|
4350
|
+
}
|
|
4277
4351
|
return `Sheet ${sheet}`;
|
|
4278
4352
|
}
|
|
4279
4353
|
const slide = getContextNumber2(metadata.slide) ?? getContextNumber2(metadata.slideNumber) ?? (typeof metadata.slideIndex === "number" ? metadata.slideIndex + 1 : undefined);
|
|
4354
|
+
const slideTitle = getContextString2(metadata.slideTitle);
|
|
4280
4355
|
if (slide) {
|
|
4356
|
+
if (slideTitle) {
|
|
4357
|
+
return `Slide ${slide} ${slideTitle}`;
|
|
4358
|
+
}
|
|
4281
4359
|
return `Slide ${slide}`;
|
|
4282
4360
|
}
|
|
4283
|
-
const archiveEntry = getContextString2(metadata.archiveEntryPath) ?? getContextString2(metadata.entryPath);
|
|
4361
|
+
const archiveEntry = getContextString2(metadata.archiveFullPath) ?? getContextString2(metadata.archivePath) ?? getContextString2(metadata.archiveEntryPath) ?? getContextString2(metadata.entryPath);
|
|
4284
4362
|
if (archiveEntry) {
|
|
4285
4363
|
return `Archive entry ${archiveEntry}`;
|
|
4286
4364
|
}
|
|
@@ -4305,6 +4383,9 @@ var buildLocatorLabel2 = (metadata, source, title) => {
|
|
|
4305
4383
|
const officeBlockKind = getContextString2(metadata.officeBlockKind);
|
|
4306
4384
|
const pdfBlockNumber = getContextNumber2(metadata.pdfBlockNumber);
|
|
4307
4385
|
const officeBlockNumber = getContextNumber2(metadata.officeBlockNumber);
|
|
4386
|
+
const spreadsheetRowStart = getContextNumber2(metadata.spreadsheetRowStart);
|
|
4387
|
+
const spreadsheetRowEnd = getContextNumber2(metadata.spreadsheetRowEnd);
|
|
4388
|
+
const slideTitle = getContextString2(metadata.slideTitle);
|
|
4308
4389
|
const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
|
|
4309
4390
|
const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
|
|
4310
4391
|
if (page && region) {
|
|
@@ -4321,19 +4402,31 @@ var buildLocatorLabel2 = (metadata, source, title) => {
|
|
|
4321
4402
|
}
|
|
4322
4403
|
const sheet = getContextString2(metadata.sheetName) ?? (Array.isArray(metadata.sheetNames) ? getContextString2(metadata.sheetNames[0]) : undefined);
|
|
4323
4404
|
if (sheet) {
|
|
4324
|
-
|
|
4405
|
+
const tableLabel = formatSpreadsheetTableLabel(getContextNumber2(metadata.spreadsheetTableIndex), getContextNumber2(metadata.spreadsheetTableCount));
|
|
4406
|
+
const rowRange = formatSpreadsheetRowRange(spreadsheetRowStart, spreadsheetRowEnd);
|
|
4407
|
+
if (tableLabel && rowRange) {
|
|
4408
|
+
return `Sheet ${sheet} \xB7 ${tableLabel} \xB7 ${rowRange}`;
|
|
4409
|
+
}
|
|
4410
|
+
if (tableLabel) {
|
|
4411
|
+
return `Sheet ${sheet} \xB7 ${tableLabel}`;
|
|
4412
|
+
}
|
|
4413
|
+
return rowRange ? `Sheet ${sheet} \xB7 ${rowRange}` : `Sheet ${sheet}`;
|
|
4325
4414
|
}
|
|
4326
4415
|
const slide = getContextNumber2(metadata.slide) ?? getContextNumber2(metadata.slideNumber) ?? (typeof metadata.slideIndex === "number" ? metadata.slideIndex + 1 : undefined);
|
|
4327
4416
|
if (slide) {
|
|
4328
|
-
return `Slide ${slide}`;
|
|
4417
|
+
return slideTitle ? `Slide ${slide} \xB7 ${slideTitle}` : `Slide ${slide}`;
|
|
4329
4418
|
}
|
|
4330
|
-
const archiveEntry = getContextString2(metadata.archiveEntryPath) ?? getContextString2(metadata.entryPath);
|
|
4419
|
+
const archiveEntry = getContextString2(metadata.archiveFullPath) ?? getContextString2(metadata.archivePath) ?? getContextString2(metadata.archiveEntryPath) ?? getContextString2(metadata.entryPath);
|
|
4331
4420
|
if (archiveEntry) {
|
|
4332
4421
|
return `Archive entry ${archiveEntry}`;
|
|
4333
4422
|
}
|
|
4334
4423
|
const emailKind = getContextString2(metadata.emailKind);
|
|
4335
4424
|
if (emailKind === "attachment") {
|
|
4336
4425
|
const attachmentName = getContextString2(metadata.attachmentName) ?? getAttachmentName2(source, title);
|
|
4426
|
+
const replyDepth = getContextNumber2(metadata.replyDepth);
|
|
4427
|
+
if (attachmentName && replyDepth && replyDepth > 0) {
|
|
4428
|
+
return `Attachment ${attachmentName} \xB7 Reply depth ${replyDepth}`;
|
|
4429
|
+
}
|
|
4337
4430
|
return attachmentName ? `Attachment ${attachmentName}` : "Attachment";
|
|
4338
4431
|
}
|
|
4339
4432
|
const mediaStart = formatMediaTimestamp2(metadata.startMs);
|
|
@@ -4364,18 +4457,36 @@ var buildProvenanceLabel2 = (metadata) => {
|
|
|
4364
4457
|
return;
|
|
4365
4458
|
}
|
|
4366
4459
|
const threadTopic = getContextString2(metadata.threadTopic);
|
|
4460
|
+
const replyDepth = getContextNumber2(metadata.replyDepth);
|
|
4461
|
+
const threadMessageCount = getContextNumber2(metadata.threadMessageCount);
|
|
4462
|
+
const threadRootMessageId = getContextString2(metadata.threadRootMessageId);
|
|
4367
4463
|
const from = getContextString2(metadata.from);
|
|
4368
4464
|
const sentAt = formatTimestampLabel2(metadata.sentAt) ?? formatTimestampLabel2(metadata.receivedAt);
|
|
4369
4465
|
const speaker = getContextString2(metadata.speaker);
|
|
4370
4466
|
const mediaKind = getContextString2(metadata.mediaKind);
|
|
4371
4467
|
const transcriptSource = getContextString2(metadata.transcriptSource);
|
|
4468
|
+
const mediaSpeakerCount = getContextNumber2(metadata.mediaSpeakerCount);
|
|
4469
|
+
const mediaSegmentCount = getContextNumber2(metadata.mediaSegmentCount);
|
|
4470
|
+
const mediaSegmentGroupSize = getContextNumber2(metadata.mediaSegmentGroupSize);
|
|
4471
|
+
const mediaSegmentGroupIndex = getContextNumber2(metadata.mediaSegmentGroupIndex);
|
|
4472
|
+
const mediaChannel = getContextString2(metadata.mediaChannel);
|
|
4473
|
+
const mediaDurationLabel = formatMediaDurationLabel2(metadata.mediaDurationMs);
|
|
4474
|
+
const spreadsheetHeaders = getSpreadsheetHeaders(metadata);
|
|
4475
|
+
const slideNotesText = getContextString2(metadata.slideNotesText);
|
|
4372
4476
|
const pdfTextMode = getContextString2(metadata.pdfTextMode);
|
|
4373
4477
|
const pdfTextKind = getContextString2(metadata.pdfTextKind);
|
|
4374
4478
|
const officeBlockKind = getContextString2(metadata.officeBlockKind);
|
|
4375
4479
|
const ocrEngine = getContextString2(metadata.ocrEngine);
|
|
4376
4480
|
const extractorRegistryMatch = getContextString2(metadata.extractorRegistryMatch);
|
|
4377
4481
|
const chunkingProfile = getContextString2(metadata.chunkingProfile);
|
|
4482
|
+
const archiveDepth = getContextNumber2(metadata.archiveDepth);
|
|
4483
|
+
const archiveNestedDepth = getContextNumber2(metadata.archiveNestedDepth);
|
|
4484
|
+
const archiveContainerPath = getContextString2(metadata.archiveContainerPath);
|
|
4485
|
+
const archiveRootName = getContextString2(metadata.archiveRootName);
|
|
4486
|
+
const spreadsheetTableLabel = formatSpreadsheetTableLabel(getContextNumber2(metadata.spreadsheetTableIndex), getContextNumber2(metadata.spreadsheetTableCount));
|
|
4378
4487
|
const ocrConfidence = getContextNumber2(metadata.ocrRegionConfidence) ?? getContextNumber2(metadata.ocrConfidence);
|
|
4488
|
+
const ocrAverageConfidence = getContextNumber2(metadata.ocrPageAverageConfidence) ?? getContextNumber2(metadata.ocrAverageConfidence);
|
|
4489
|
+
const ocrRegionCount = getContextNumber2(metadata.ocrRegionCount);
|
|
4379
4490
|
const labels = [
|
|
4380
4491
|
pdfTextMode ? `PDF ${pdfTextMode}` : "",
|
|
4381
4492
|
pdfTextKind === "table_like" ? "PDF table block" : pdfTextKind === "paragraph" ? "PDF text block" : "",
|
|
@@ -4384,9 +4495,27 @@ var buildProvenanceLabel2 = (metadata) => {
|
|
|
4384
4495
|
extractorRegistryMatch ? `Extractor ${extractorRegistryMatch}` : "",
|
|
4385
4496
|
chunkingProfile ? `Chunking ${chunkingProfile}` : "",
|
|
4386
4497
|
typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
|
|
4498
|
+
typeof ocrAverageConfidence === "number" && ocrAverageConfidence !== ocrConfidence ? `Average ${ocrAverageConfidence.toFixed(2)}` : "",
|
|
4499
|
+
typeof ocrRegionCount === "number" ? `${ocrRegionCount} regions` : "",
|
|
4500
|
+
spreadsheetHeaders.length > 0 ? `Spreadsheet ${spreadsheetHeaders.join(", ")}` : "",
|
|
4501
|
+
spreadsheetTableLabel ? `Spreadsheet ${spreadsheetTableLabel}` : "",
|
|
4387
4502
|
mediaKind ? `Media ${mediaKind}` : "",
|
|
4503
|
+
mediaSegmentCount ? `${mediaSegmentCount} segments` : "",
|
|
4504
|
+
mediaSegmentGroupSize ? `${mediaSegmentGroupSize} grouped segments` : "",
|
|
4505
|
+
mediaSegmentGroupIndex !== undefined ? `Segment group ${mediaSegmentGroupIndex + 1}` : "",
|
|
4506
|
+
mediaChannel ? `Channel ${mediaChannel}` : "",
|
|
4507
|
+
mediaSpeakerCount ? `${mediaSpeakerCount} speakers` : "",
|
|
4508
|
+
mediaDurationLabel ? `Duration ${mediaDurationLabel}` : "",
|
|
4388
4509
|
transcriptSource ? `Transcript ${transcriptSource}` : "",
|
|
4389
4510
|
threadTopic ? `Thread ${threadTopic}` : "",
|
|
4511
|
+
threadRootMessageId ? `Thread root ${threadRootMessageId}` : "",
|
|
4512
|
+
threadMessageCount ? `${threadMessageCount} thread messages` : "",
|
|
4513
|
+
replyDepth ? `Reply depth ${replyDepth}` : "",
|
|
4514
|
+
slideNotesText ? "Speaker notes" : "",
|
|
4515
|
+
archiveDepth ? `Archive depth ${archiveDepth}` : "",
|
|
4516
|
+
archiveNestedDepth ? `Archive nested depth ${archiveNestedDepth}` : "",
|
|
4517
|
+
archiveContainerPath ? `Archive container ${archiveContainerPath}` : "",
|
|
4518
|
+
archiveRootName ? `Archive root ${archiveRootName}` : "",
|
|
4390
4519
|
speaker ? `Speaker ${speaker}` : "",
|
|
4391
4520
|
from ? `Sender ${from}` : "",
|
|
4392
4521
|
sentAt ? `Sent ${sentAt}` : ""
|
|
@@ -4763,7 +4892,7 @@ var getSectionPathFromSource = (source) => {
|
|
|
4763
4892
|
const path = source.structure?.section?.path ?? (Array.isArray(source.metadata?.sectionPath) ? source.metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : []);
|
|
4764
4893
|
return path.length > 0 ? path : undefined;
|
|
4765
4894
|
};
|
|
4766
|
-
var isBlockAwareContextLabel = (value) => typeof value === "string" && (value.startsWith("PDF ") || value.startsWith("Office "));
|
|
4895
|
+
/**
 * Tells whether a context label starts with one of the block-aware prefixes
 * ("PDF ", "Office ", "Slide ").
 *
 * @param {unknown} value - Candidate label.
 * @returns {boolean} `false` for non-strings; otherwise whether a prefix matches.
 */
var isBlockAwareContextLabel = (value) => {
  if (typeof value !== "string") {
    return false;
  }
  const blockAwarePrefixes = ["PDF ", "Office ", "Slide "];
  return blockAwarePrefixes.some((prefix) => value.startsWith(prefix));
};
|
|
4767
4896
|
var getStructuredSectionScoreWeight = (metadata) => {
|
|
4768
4897
|
if (!metadata) {
|
|
4769
4898
|
return 1;
|
|
@@ -4771,6 +4900,8 @@ var getStructuredSectionScoreWeight = (metadata) => {
|
|
|
4771
4900
|
const pdfTextKind = getContextString2(metadata.pdfTextKind);
|
|
4772
4901
|
const officeBlockKind = getContextString2(metadata.officeBlockKind);
|
|
4773
4902
|
const sectionKind = getContextString2(metadata.sectionKind);
|
|
4903
|
+
const slideTitle = getContextString2(metadata.slideTitle);
|
|
4904
|
+
const slideNotesText = getContextString2(metadata.slideNotesText);
|
|
4774
4905
|
if (pdfTextKind === "table_like") {
|
|
4775
4906
|
return 1.28;
|
|
4776
4907
|
}
|
|
@@ -4780,6 +4911,12 @@ var getStructuredSectionScoreWeight = (metadata) => {
|
|
|
4780
4911
|
if (sectionKind === "pdf_block" || sectionKind === "office_block" || officeBlockKind === "paragraph" || pdfTextKind === "paragraph") {
|
|
4781
4912
|
return 1.12;
|
|
4782
4913
|
}
|
|
4914
|
+
if (sectionKind === "presentation_slide" && slideNotesText) {
|
|
4915
|
+
return 1.2;
|
|
4916
|
+
}
|
|
4917
|
+
if (sectionKind === "presentation_slide" && slideTitle) {
|
|
4918
|
+
return 1.14;
|
|
4919
|
+
}
|
|
4783
4920
|
return 1;
|
|
4784
4921
|
};
|
|
4785
4922
|
/**
 * Scales a source's score by the weight `getStructuredSectionScoreWeight`
 * derives from its metadata.
 *
 * @param {{ score: number, metadata?: object }} source - Retrieved source.
 * @returns {number} `source.score` multiplied by the section weight.
 */
var getStructuredSourceLeadScore = (source) => {
  const baseScore = source.score;
  const sectionWeight = getStructuredSectionScoreWeight(source.metadata);
  return baseScore * sectionWeight;
};
|
|
@@ -8560,10 +8697,32 @@ var scoreLoosePhraseMatch2 = (query, text) => {
|
|
|
8560
8697
|
return 0;
|
|
8561
8698
|
};
|
|
8562
8699
|
/**
 * Checks whether at least one candidate token appears in the query tokens.
 *
 * @param {string[]} queryTokens - Tokens taken from the query.
 * @param {string[]} candidates - Tokens to look for.
 * @returns {boolean} `true` as soon as any candidate is found in `queryTokens`.
 */
var queryHasAnyToken = (queryTokens, candidates) => {
  const appearsInQuery = (candidate) => queryTokens.includes(candidate);
  return candidates.some(appearsInQuery);
};
|
|
8700
|
+
/**
 * Normalizes a metadata value for case-insensitive matching.
 *
 * @param {unknown} value - Raw metadata value.
 * @returns {string | undefined} The trimmed, lower-cased string, or
 *   `undefined` when `value` is not a string or is blank after trimming.
 */
var metadataString = (value) => {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed.toLowerCase() : undefined;
};
|
|
8563
8701
|
var scoreStructuredEvidenceMatch = (queryTokens, result) => {
|
|
8564
8702
|
const metadata = result.metadata ?? {};
|
|
8565
8703
|
const pdfTextKind = typeof metadata.pdfTextKind === "string" ? metadata.pdfTextKind : undefined;
|
|
8566
8704
|
const officeBlockKind = typeof metadata.officeBlockKind === "string" ? metadata.officeBlockKind : undefined;
|
|
8705
|
+
const slideTitle = metadataString(metadata.slideTitle);
|
|
8706
|
+
const slideNotesText = metadataString(metadata.slideNotesText);
|
|
8707
|
+
const threadTopic = metadataString(metadata.threadTopic);
|
|
8708
|
+
const threadRootMessageId = metadataString(metadata.threadRootMessageId);
|
|
8709
|
+
const threadMessageCount = typeof metadata.threadMessageCount === "number" ? metadata.threadMessageCount : undefined;
|
|
8710
|
+
const attachmentName = metadataString(metadata.attachmentName);
|
|
8711
|
+
const archivePath = metadataString(metadata.archivePath);
|
|
8712
|
+
const archiveFullPath = metadataString(metadata.archiveFullPath);
|
|
8713
|
+
const archiveContainerPath = metadataString(metadata.archiveContainerPath);
|
|
8714
|
+
const archiveNestedDepth = typeof metadata.archiveNestedDepth === "number" ? metadata.archiveNestedDepth : undefined;
|
|
8715
|
+
const mediaSpeakerCount = typeof metadata.mediaSpeakerCount === "number" ? metadata.mediaSpeakerCount : undefined;
|
|
8716
|
+
const mediaSegmentCount = typeof metadata.mediaSegmentCount === "number" ? metadata.mediaSegmentCount : undefined;
|
|
8717
|
+
const mediaSegmentGroupSize = typeof metadata.mediaSegmentGroupSize === "number" ? metadata.mediaSegmentGroupSize : undefined;
|
|
8718
|
+
const mediaChannel = metadataString(metadata.mediaChannel);
|
|
8719
|
+
const speaker = metadataString(metadata.speaker);
|
|
8720
|
+
const ocrConfidence = typeof metadata.ocrRegionConfidence === "number" ? metadata.ocrRegionConfidence : typeof metadata.ocrPageAverageConfidence === "number" ? metadata.ocrPageAverageConfidence : typeof metadata.ocrAverageConfidence === "number" ? metadata.ocrAverageConfidence : typeof metadata.ocrConfidence === "number" ? metadata.ocrConfidence : undefined;
|
|
8721
|
+
const isOCREvidence = typeof ocrConfidence === "number" || metadataString(metadata.pdfTextMode) === "ocr";
|
|
8722
|
+
const spreadsheetHeaders = Array.isArray(metadata.spreadsheetHeaders) ? metadata.spreadsheetHeaders.map((value) => metadataString(value)).filter((value) => typeof value === "string") : [];
|
|
8723
|
+
const spreadsheetTableIndex = typeof metadata.spreadsheetTableIndex === "number" ? metadata.spreadsheetTableIndex : undefined;
|
|
8724
|
+
const spreadsheetTableCount = typeof metadata.spreadsheetTableCount === "number" ? metadata.spreadsheetTableCount : undefined;
|
|
8725
|
+
const hasSpreadsheetRows = typeof metadata.spreadsheetRowStart === "number" || typeof metadata.spreadsheetRowEnd === "number";
|
|
8567
8726
|
const hasBlockMetadata = typeof metadata.pdfBlockNumber === "number" || typeof metadata.officeBlockNumber === "number";
|
|
8568
8727
|
let score = 0;
|
|
8569
8728
|
if (hasBlockMetadata) {
|
|
@@ -8606,6 +8765,162 @@ var scoreStructuredEvidenceMatch = (queryTokens, result) => {
|
|
|
8606
8765
|
])) {
|
|
8607
8766
|
score += 0.55;
|
|
8608
8767
|
}
|
|
8768
|
+
if (spreadsheetHeaders.length > 0 && (queryHasAnyToken(queryTokens, [
|
|
8769
|
+
"sheet",
|
|
8770
|
+
"spreadsheet",
|
|
8771
|
+
"workbook",
|
|
8772
|
+
"column",
|
|
8773
|
+
"columns",
|
|
8774
|
+
"row",
|
|
8775
|
+
"rows"
|
|
8776
|
+
]) || queryTokens.some((token) => spreadsheetHeaders.some((header) => header.includes(token))))) {
|
|
8777
|
+
score += 0.45;
|
|
8778
|
+
}
|
|
8779
|
+
if (hasSpreadsheetRows && queryHasAnyToken(queryTokens, ["row", "rows", "sheet", "spreadsheet"])) {
|
|
8780
|
+
score += 0.18;
|
|
8781
|
+
}
|
|
8782
|
+
if (typeof spreadsheetTableIndex === "number" && queryHasAnyToken(queryTokens, [
|
|
8783
|
+
"table",
|
|
8784
|
+
"tables",
|
|
8785
|
+
"sheet",
|
|
8786
|
+
"spreadsheet"
|
|
8787
|
+
])) {
|
|
8788
|
+
score += 0.16;
|
|
8789
|
+
if (typeof spreadsheetTableCount === "number" && spreadsheetTableCount > 1) {
|
|
8790
|
+
score += 0.08;
|
|
8791
|
+
}
|
|
8792
|
+
}
|
|
8793
|
+
if (slideTitle && (queryHasAnyToken(queryTokens, [
|
|
8794
|
+
"slide",
|
|
8795
|
+
"slides",
|
|
8796
|
+
"deck",
|
|
8797
|
+
"presentation"
|
|
8798
|
+
]) || queryTokens.some((token) => slideTitle.includes(token)))) {
|
|
8799
|
+
score += 0.4;
|
|
8800
|
+
}
|
|
8801
|
+
if (slideNotesText && queryHasAnyToken(queryTokens, [
|
|
8802
|
+
"notes",
|
|
8803
|
+
"speaker",
|
|
8804
|
+
"speakers",
|
|
8805
|
+
"talking"
|
|
8806
|
+
])) {
|
|
8807
|
+
score += 0.2;
|
|
8808
|
+
}
|
|
8809
|
+
if (speaker && queryHasAnyToken(queryTokens, ["speaker", "speakers", "said", "says"])) {
|
|
8810
|
+
score += 0.22;
|
|
8811
|
+
}
|
|
8812
|
+
if (typeof mediaSpeakerCount === "number" && mediaSpeakerCount > 1 && queryHasAnyToken(queryTokens, [
|
|
8813
|
+
"speaker",
|
|
8814
|
+
"speakers",
|
|
8815
|
+
"conversation",
|
|
8816
|
+
"dialogue"
|
|
8817
|
+
])) {
|
|
8818
|
+
score += 0.12;
|
|
8819
|
+
}
|
|
8820
|
+
if (typeof mediaSegmentCount === "number" && mediaSegmentCount > 1 && queryHasAnyToken(queryTokens, [
|
|
8821
|
+
"timestamp",
|
|
8822
|
+
"segment",
|
|
8823
|
+
"segments",
|
|
8824
|
+
"audio",
|
|
8825
|
+
"video"
|
|
8826
|
+
])) {
|
|
8827
|
+
score += 0.08;
|
|
8828
|
+
}
|
|
8829
|
+
if (typeof mediaSegmentGroupSize === "number" && mediaSegmentGroupSize > 1 && queryHasAnyToken(queryTokens, [
|
|
8830
|
+
"segment",
|
|
8831
|
+
"segments",
|
|
8832
|
+
"timestamp",
|
|
8833
|
+
"group",
|
|
8834
|
+
"audio",
|
|
8835
|
+
"video"
|
|
8836
|
+
])) {
|
|
8837
|
+
score += 0.06;
|
|
8838
|
+
}
|
|
8839
|
+
if (mediaChannel && queryHasAnyToken(queryTokens, [
|
|
8840
|
+
"channel",
|
|
8841
|
+
"channels",
|
|
8842
|
+
"left",
|
|
8843
|
+
"right",
|
|
8844
|
+
"audio",
|
|
8845
|
+
"video"
|
|
8846
|
+
])) {
|
|
8847
|
+
score += 0.12;
|
|
8848
|
+
}
|
|
8849
|
+
if (threadTopic && (queryHasAnyToken(queryTokens, [
|
|
8850
|
+
"email",
|
|
8851
|
+
"emails",
|
|
8852
|
+
"thread",
|
|
8853
|
+
"reply",
|
|
8854
|
+
"replies",
|
|
8855
|
+
"attachment"
|
|
8856
|
+
]) || queryTokens.some((token) => threadTopic.includes(token)))) {
|
|
8857
|
+
score += 0.34;
|
|
8858
|
+
}
|
|
8859
|
+
if (threadRootMessageId && queryHasAnyToken(queryTokens, [
|
|
8860
|
+
"thread",
|
|
8861
|
+
"reply",
|
|
8862
|
+
"replies",
|
|
8863
|
+
"root",
|
|
8864
|
+
"email"
|
|
8865
|
+
])) {
|
|
8866
|
+
score += 0.12;
|
|
8867
|
+
}
|
|
8868
|
+
if (typeof threadMessageCount === "number" && threadMessageCount > 1 && queryHasAnyToken(queryTokens, [
|
|
8869
|
+
"thread",
|
|
8870
|
+
"reply",
|
|
8871
|
+
"replies",
|
|
8872
|
+
"attachment"
|
|
8873
|
+
])) {
|
|
8874
|
+
score += 0.08;
|
|
8875
|
+
}
|
|
8876
|
+
if (attachmentName && queryHasAnyToken(queryTokens, [
|
|
8877
|
+
"attachment",
|
|
8878
|
+
"attachments",
|
|
8879
|
+
"file",
|
|
8880
|
+
"files"
|
|
8881
|
+
])) {
|
|
8882
|
+
score += 0.18;
|
|
8883
|
+
}
|
|
8884
|
+
if ((archiveFullPath || archivePath) && (queryHasAnyToken(queryTokens, [
|
|
8885
|
+
"archive",
|
|
8886
|
+
"archives",
|
|
8887
|
+
"entry",
|
|
8888
|
+
"entries",
|
|
8889
|
+
"bundle",
|
|
8890
|
+
"zip"
|
|
8891
|
+
]) || queryTokens.some((token) => (archiveFullPath ?? archivePath ?? "").includes(token)))) {
|
|
8892
|
+
score += 0.34;
|
|
8893
|
+
}
|
|
8894
|
+
if (archiveContainerPath && queryHasAnyToken(queryTokens, [
|
|
8895
|
+
"nested",
|
|
8896
|
+
"inner",
|
|
8897
|
+
"container",
|
|
8898
|
+
"archive"
|
|
8899
|
+
])) {
|
|
8900
|
+
score += 0.12;
|
|
8901
|
+
}
|
|
8902
|
+
if (typeof archiveNestedDepth === "number" && archiveNestedDepth > 1 && queryHasAnyToken(queryTokens, ["nested", "inner", "archive"])) {
|
|
8903
|
+
score += 0.08;
|
|
8904
|
+
}
|
|
8905
|
+
if (isOCREvidence && queryHasAnyToken(queryTokens, [
|
|
8906
|
+
"ocr",
|
|
8907
|
+
"scan",
|
|
8908
|
+
"scanned",
|
|
8909
|
+
"image",
|
|
8910
|
+
"photo",
|
|
8911
|
+
"region",
|
|
8912
|
+
"regions",
|
|
8913
|
+
"page",
|
|
8914
|
+
"pages"
|
|
8915
|
+
])) {
|
|
8916
|
+
if (typeof ocrConfidence === "number" && ocrConfidence >= 0.9) {
|
|
8917
|
+
score += 0.12;
|
|
8918
|
+
} else if (typeof ocrConfidence === "number" && ocrConfidence >= 0.75) {
|
|
8919
|
+
score += 0.05;
|
|
8920
|
+
} else if (typeof ocrConfidence === "number" && ocrConfidence < 0.55) {
|
|
8921
|
+
score -= 0.05;
|
|
8922
|
+
}
|
|
8923
|
+
}
|
|
8609
8924
|
return score;
|
|
8610
8925
|
};
|
|
8611
8926
|
var scoreHeuristicMatch = ({
|
|
@@ -8817,6 +9132,62 @@ var formatMediaTimestampForIngest = (value) => {
|
|
|
8817
9132
|
const milliseconds = Math.floor(value % 1000);
|
|
8818
9133
|
return `${String(minutes).padStart(2, "0")}:${String(seconds).padStart(2, "0")}.${String(milliseconds).padStart(3, "0")}`;
|
|
8819
9134
|
};
|
|
9135
|
+
/**
 * Normalizes a transcript segment's speaker value.
 *
 * @param {unknown} value - Raw speaker value.
 * @returns {string | undefined} The trimmed string (case preserved), or
 *   `undefined` when `value` is not a string or is blank.
 */
var normalizeMediaSpeaker = (value) => {
  if (typeof value !== "string") {
    return undefined;
  }
  const speakerName = value.trim();
  return speakerName.length === 0 ? undefined : speakerName;
};
|
|
9136
|
+
/**
 * Normalizes a transcript segment's channel value.
 *
 * @param {unknown} value - Raw channel value.
 * @returns {string | undefined} The trimmed string, or `undefined` when
 *   `value` is not a string or contains only whitespace.
 */
var normalizeMediaChannel = (value) => {
  const trimmed = typeof value === "string" ? value.trim() : "";
  return trimmed.length > 0 ? trimmed : undefined;
};
|
|
9137
|
+
/**
 * Finds the outer timestamp boundary of a list of segments.
 *
 * `startMs` is the first finite `startMs` found scanning forward; `endMs` is
 * the first finite `endMs` found scanning backward. Null/undefined entries
 * are skipped in both directions (previously only the backward scan did so,
 * and a nullish entry ahead of the first timed segment threw a TypeError).
 *
 * @param {Array<{ startMs?: unknown, endMs?: unknown } | null | undefined>} segments
 * @returns {{ startMs: number | undefined, endMs: number | undefined }}
 *   Either field is `undefined` when no segment carries a finite value for it.
 */
var buildMediaTimestampBoundary = (segments) => {
  const isFiniteMs = (candidate) => typeof candidate === "number" && Number.isFinite(candidate);
  let startMs;
  let endMs;
  for (const segment of segments) {
    if (isFiniteMs(segment?.startMs)) {
      startMs = segment.startMs;
      break;
    }
  }
  for (let index = segments.length - 1; index >= 0; index--) {
    const segment = segments[index];
    if (isFiniteMs(segment?.endMs)) {
      endMs = segment.endMs;
      break;
    }
  }
  return { endMs, startMs };
};
|
|
9158
|
+
/**
 * Groups consecutive transcript segments that share the same normalized
 * speaker and channel.
 *
 * Entries that are not objects, or whose text is empty after
 * `normalizeWhitespace`, are skipped. A new group starts whenever the
 * normalized speaker or channel differs from the previous group's.
 *
 * While merging, a group's `startMs` only moves earlier and its `endMs` only
 * moves later. Previously `endMs` was overwritten by whatever finite value
 * the latest segment carried, so an overlapping or out-of-order segment
 * could shrink the group's end even though `startMs` was already min-guarded.
 *
 * @param {Array<object | null | undefined>} segments - Transcript segments;
 *   `text`, `speaker`, `channel`, `startMs` and `endMs` are read from each.
 * @returns {Array<{ channel: string | undefined, endMs: unknown, segments: object[], speaker: string | undefined, startMs: unknown }>}
 *   Groups in encounter order. A group's initial `startMs`/`endMs` are copied
 *   from its first segment as-is.
 */
var groupTranscriptSegments = (segments) => {
  const isFiniteMs = (candidate) => typeof candidate === "number" && Number.isFinite(candidate);
  const groups = [];
  for (const segment of segments) {
    if (!segment || typeof segment !== "object") {
      continue;
    }
    const text = normalizeWhitespace(segment.text ?? "");
    if (!text) {
      continue;
    }
    const speaker = normalizeMediaSpeaker(segment.speaker);
    const channel = normalizeMediaChannel(segment.channel);
    const lastGroup = groups.at(-1);
    if (!lastGroup || lastGroup.speaker !== speaker || lastGroup.channel !== channel) {
      groups.push({
        channel,
        endMs: segment.endMs,
        segments: [segment],
        speaker,
        startMs: segment.startMs
      });
      continue;
    }
    lastGroup.segments.push(segment);
    // Widen the group's end only; never let a shorter segment pull it back.
    if (isFiniteMs(segment.endMs) && (!isFiniteMs(lastGroup.endMs) || segment.endMs > lastGroup.endMs)) {
      lastGroup.endMs = segment.endMs;
    }
    // Widen the group's start only, mirroring the end handling above.
    if (isFiniteMs(segment.startMs) && (!isFiniteMs(lastGroup.startMs) || segment.startMs < lastGroup.startMs)) {
      lastGroup.startMs = segment.startMs;
    }
  }
  return groups;
};
|
|
8820
9191
|
var decodeHtmlEntities = (value) => {
|
|
8821
9192
|
let output = value;
|
|
8822
9193
|
for (const [pattern, replacement] of HTML_ENTITY_REPLACEMENTS) {
|
|
@@ -9178,12 +9549,17 @@ var spreadsheetStructureUnits = (value, metadata) => {
|
|
|
9178
9549
|
return [];
|
|
9179
9550
|
}
|
|
9180
9551
|
const sheetName = typeof metadata?.sheetName === "string" && metadata.sheetName || lines[0].replace(/^Sheet\s+/i, "");
|
|
9552
|
+
const spreadsheetHeaders = Array.isArray(metadata?.sheetHeaders) ? metadata.sheetHeaders.filter((value2) => typeof value2 === "string" && value2.trim().length > 0) : [];
|
|
9553
|
+
const repeatedHeaderRowNumbers = Array.isArray(metadata?.repeatedHeaderRowNumbers) ? metadata.repeatedHeaderRowNumbers.filter((value2) => typeof value2 === "number" && Number.isFinite(value2)) : [];
|
|
9554
|
+
const spreadsheetTableCount = typeof metadata?.sheetTableCount === "number" && Number.isFinite(metadata.sheetTableCount) ? metadata.sheetTableCount : Math.max(repeatedHeaderRowNumbers.length + 1, 1);
|
|
9181
9555
|
const rowLines = lines.filter((line) => /^Row \d+\./.test(line));
|
|
9182
9556
|
if (rowLines.length === 0) {
|
|
9183
9557
|
return [
|
|
9184
9558
|
{
|
|
9185
9559
|
sectionDepth: 1,
|
|
9186
9560
|
sectionKind: "spreadsheet_rows",
|
|
9561
|
+
...spreadsheetHeaders.length > 0 ? { spreadsheetHeaders } : {},
|
|
9562
|
+
...spreadsheetTableCount > 1 ? { spreadsheetTableCount, spreadsheetTableIndex: 1 } : {},
|
|
9187
9563
|
sectionPath: [sheetName],
|
|
9188
9564
|
sectionTitle: sheetName,
|
|
9189
9565
|
text: normalizeWhitespace(lines.join(`
|
|
@@ -9191,35 +9567,69 @@ var spreadsheetStructureUnits = (value, metadata) => {
|
|
|
9191
9567
|
}
|
|
9192
9568
|
];
|
|
9193
9569
|
}
|
|
9194
|
-
const
|
|
9195
|
-
let
|
|
9570
|
+
const tableSegments = [];
|
|
9571
|
+
let currentTableRows = [];
|
|
9572
|
+
let tableIndex = 1;
|
|
9196
9573
|
for (const row of rowLines) {
|
|
9197
|
-
const
|
|
9198
|
-
|
|
9199
|
-
|
|
9200
|
-
|
|
9201
|
-
|
|
9574
|
+
const rowNumber = Number(row.match(/^Row (\d+)\./)?.[1] ?? NaN);
|
|
9575
|
+
if (currentTableRows.length > 0 && Number.isFinite(rowNumber) && repeatedHeaderRowNumbers.includes(rowNumber)) {
|
|
9576
|
+
tableSegments.push({ rows: currentTableRows, tableIndex });
|
|
9577
|
+
currentTableRows = [row];
|
|
9578
|
+
tableIndex += 1;
|
|
9202
9579
|
continue;
|
|
9203
9580
|
}
|
|
9204
|
-
|
|
9581
|
+
currentTableRows.push(row);
|
|
9582
|
+
}
|
|
9583
|
+
if (currentTableRows.length > 0) {
|
|
9584
|
+
tableSegments.push({ rows: currentTableRows, tableIndex });
|
|
9205
9585
|
}
|
|
9206
|
-
|
|
9207
|
-
|
|
9586
|
+
const groups = [];
|
|
9587
|
+
for (const segment of tableSegments) {
|
|
9588
|
+
let current = [];
|
|
9589
|
+
for (const row of segment.rows) {
|
|
9590
|
+
const candidate = [...current, row].join(`
|
|
9591
|
+
`);
|
|
9592
|
+
if (current.length > 0 && candidate.length > DEFAULT_MAX_CHUNK_LENGTH) {
|
|
9593
|
+
groups.push({ rows: current, tableIndex: segment.tableIndex });
|
|
9594
|
+
current = [row];
|
|
9595
|
+
continue;
|
|
9596
|
+
}
|
|
9597
|
+
current.push(row);
|
|
9598
|
+
}
|
|
9599
|
+
if (current.length > 0) {
|
|
9600
|
+
groups.push({ rows: current, tableIndex: segment.tableIndex });
|
|
9601
|
+
}
|
|
9208
9602
|
}
|
|
9209
|
-
return groups.map((rows) =>
|
|
9210
|
-
|
|
9211
|
-
|
|
9212
|
-
|
|
9213
|
-
|
|
9214
|
-
|
|
9215
|
-
|
|
9603
|
+
return groups.map(({ rows, tableIndex: tableIndex2 }) => {
|
|
9604
|
+
const rowNumbers = rows.map((row) => Number(row.match(/^Row (\d+)\./)?.[1] ?? NaN)).filter((value2) => Number.isFinite(value2));
|
|
9605
|
+
return {
|
|
9606
|
+
preferredChunkUnits: rows,
|
|
9607
|
+
sectionDepth: 1,
|
|
9608
|
+
sectionKind: "spreadsheet_rows",
|
|
9609
|
+
sectionPath: [sheetName],
|
|
9610
|
+
sectionTitle: sheetName,
|
|
9611
|
+
...spreadsheetHeaders.length > 0 ? { spreadsheetHeaders } : {},
|
|
9612
|
+
...spreadsheetTableCount > 1 ? { spreadsheetTableCount, spreadsheetTableIndex: tableIndex2 } : {},
|
|
9613
|
+
...rowNumbers.length > 0 ? {
|
|
9614
|
+
spreadsheetRowEnd: rowNumbers[rowNumbers.length - 1],
|
|
9615
|
+
spreadsheetRowStart: rowNumbers[0]
|
|
9616
|
+
} : {},
|
|
9617
|
+
text: normalizeWhitespace([`Sheet ${sheetName}`, ...rows].join(`
|
|
9216
9618
|
`))
|
|
9217
|
-
|
|
9619
|
+
};
|
|
9620
|
+
});
|
|
9218
9621
|
};
|
|
9219
9622
|
var presentationStructureUnits = (value, metadata) => {
|
|
9220
9623
|
const slideNumber = typeof metadata?.slideNumber === "number" ? metadata.slideNumber : typeof metadata?.slideIndex === "number" ? metadata.slideIndex + 1 : undefined;
|
|
9221
|
-
const
|
|
9222
|
-
const
|
|
9624
|
+
const slideTitle = typeof metadata?.slideTitle === "string" && metadata.slideTitle.trim() ? metadata.slideTitle.trim() : undefined;
|
|
9625
|
+
const slideBodyText = typeof metadata?.slideBodyText === "string" && metadata.slideBodyText.trim() ? metadata.slideBodyText.trim() : undefined;
|
|
9626
|
+
const slideNotesText = typeof metadata?.slideNotesText === "string" && metadata.slideNotesText.trim() ? metadata.slideNotesText.trim() : undefined;
|
|
9627
|
+
const slideLabel = slideTitle || (slideNumber ? `Slide ${slideNumber}` : "Slide");
|
|
9628
|
+
const paragraphs = [
|
|
9629
|
+
slideTitle,
|
|
9630
|
+
slideBodyText,
|
|
9631
|
+
slideNotesText ? `Speaker notes: ${slideNotesText}` : undefined
|
|
9632
|
+
].filter((entry) => Boolean(entry)).flatMap((entry) => paragraphUnits(entry));
|
|
9223
9633
|
return [
|
|
9224
9634
|
{
|
|
9225
9635
|
preferredChunkUnits: paragraphs,
|
|
@@ -9674,6 +10084,8 @@ var spreadsheetRowText = (row, headers) => {
|
|
|
9674
10084
|
});
|
|
9675
10085
|
return normalizeWhitespace(entries.join(" | "));
|
|
9676
10086
|
};
|
|
10087
|
+
/**
 * Produce the comparison form of a spreadsheet header cell: the value is
 * passed through normalizeWhitespace and the result is lowercased.
 */
var normalizeSpreadsheetHeaderValue = (value) => {
  const cleaned = normalizeWhitespace(value);
  return cleaned.toLowerCase();
};
|
|
10088
|
+
/**
 * Report whether a row repeats the sheet's header row.
 *
 * The row must have exactly as many cells as there are headers, and each
 * cell's value must equal the header at the same position once both sides
 * have gone through normalizeSpreadsheetHeaderValue.
 *
 * @param row Array of cell objects (reads cell.value).
 * @param headers Array of header values.
 * @returns true when every cell matches its header, false otherwise.
 */
var isSpreadsheetHeaderRow = (row, headers) => {
  if (row.length !== headers.length) {
    return false;
  }
  const differsFromHeader = (cell, position) => {
    const actual = normalizeSpreadsheetHeaderValue(cell.value);
    const expected = normalizeSpreadsheetHeaderValue(headers[position] ?? "");
    return actual !== expected;
  };
  return !row.some(differsFromHeader);
};
|
|
9677
10089
|
var spreadsheetSheetTexts = (entries) => {
|
|
9678
10090
|
const sharedStrings = spreadsheetSharedStrings(entries);
|
|
9679
10091
|
const sheetNames = spreadsheetSheetNames(entries);
|
|
@@ -9684,14 +10096,28 @@ var spreadsheetSheetTexts = (entries) => {
|
|
|
9684
10096
|
return null;
|
|
9685
10097
|
}
|
|
9686
10098
|
const headers = rows[0].map((cell) => cell.value);
|
|
9687
|
-
const
|
|
10099
|
+
const repeatedHeaderRowNumbers = [];
|
|
10100
|
+
let tableCount = 1;
|
|
10101
|
+
const rowTexts = rows.map((row, rowIndex) => {
|
|
10102
|
+
const rowNumber = rowIndex + 1;
|
|
10103
|
+
const isHeaderRow = rowIndex === 0 || isSpreadsheetHeaderRow(row, headers);
|
|
10104
|
+
if (rowIndex > 0 && isHeaderRow) {
|
|
10105
|
+
repeatedHeaderRowNumbers.push(rowNumber);
|
|
10106
|
+
tableCount += 1;
|
|
10107
|
+
}
|
|
10108
|
+
return normalizeWhitespace(`Row ${rowNumber}. ${spreadsheetRowText(row, isHeaderRow ? [] : headers)}`);
|
|
10109
|
+
});
|
|
9688
10110
|
const text = normalizeWhitespace(rowTexts.join(`
|
|
9689
10111
|
`));
|
|
9690
10112
|
if (!text) {
|
|
9691
10113
|
return null;
|
|
9692
10114
|
}
|
|
9693
10115
|
return {
|
|
10116
|
+
headers,
|
|
9694
10117
|
name: sheetNames[index] ?? `Sheet ${index + 1}`,
|
|
10118
|
+
repeatedHeaderRowNumbers,
|
|
10119
|
+
rowCount: rowTexts.length,
|
|
10120
|
+
tableCount,
|
|
9695
10121
|
text
|
|
9696
10122
|
};
|
|
9697
10123
|
}).filter((entry) => Boolean(entry));
|
|
@@ -9714,12 +10140,21 @@ var presentationNotesByIndex = (entries) => new Map(entries.filter((entry) => en
|
|
|
9714
10140
|
var presentationSlides = (entries) => {
|
|
9715
10141
|
const notesByIndex = presentationNotesByIndex(entries);
|
|
9716
10142
|
return entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path)).map((entry, index) => {
|
|
9717
|
-
const
|
|
10143
|
+
const textRuns = [
|
|
10144
|
+
...decodeUtf8(entry.data).matchAll(/<a:t[^>]*>([\s\S]*?)<\/a:t>/gi)
|
|
10145
|
+
].map((match) => normalizeWhitespace(decodeHtmlEntities(match[1] ?? ""))).filter(Boolean);
|
|
10146
|
+
const slideTitle = textRuns[0];
|
|
10147
|
+
const slideBodyText = normalizeWhitespace(textRuns.slice(1).join(`
|
|
10148
|
+
`));
|
|
10149
|
+
const slideText = normalizeWhitespace([slideTitle, slideBodyText].filter(Boolean).join(`
|
|
10150
|
+
`));
|
|
9718
10151
|
const notesText = notesByIndex.get(index);
|
|
9719
10152
|
const text = normalizeWhitespace([slideText, notesText ? `Speaker notes: ${notesText}` : ""].filter(Boolean).join(`
|
|
9720
10153
|
`));
|
|
9721
10154
|
return {
|
|
9722
10155
|
index,
|
|
10156
|
+
slideBodyText,
|
|
10157
|
+
slideTitle,
|
|
9723
10158
|
notesText,
|
|
9724
10159
|
text
|
|
9725
10160
|
};
|
|
@@ -9864,6 +10299,15 @@ var parseEmailHeaders = (raw) => {
|
|
|
9864
10299
|
to: getHeader("To")
|
|
9865
10300
|
};
|
|
9866
10301
|
};
|
|
10302
|
+
/**
 * Derive a case-insensitive thread key from a subject, thread topic, or
 * message id.
 *
 * Every leading reply/forward marker ("Re:", "Fw:", "Fwd:") is removed, not
 * just the first, so "Re: Fwd: Re: Budget" and "Re: Budget" map to the same
 * key. Angle brackets are dropped, the text is lowercased, and the result is
 * passed through normalizeWhitespace.
 *
 * @param value Subject/topic/message-id text; non-string values are treated
 *   as empty.
 * @returns The normalized key, or undefined when nothing remains.
 */
var normalizeEmailThreadKey = (value) => {
  const raw = typeof value === "string" ? value : "";
  // The `+` quantifier consumes stacked prefixes; the leading `\s*` lets a
  // subject that starts with whitespace still have its prefixes stripped.
  const withoutPrefixes = raw.replace(/^\s*(?:(?:re|fwd?)\s*:\s*)+/i, "");
  const normalized = normalizeWhitespace(withoutPrefixes.replace(/[<>]/g, "").toLowerCase());
  return normalized || undefined;
};
|
|
10306
|
+
/**
 * Clean up a message-id header value by passing it through
 * normalizeWhitespace (null/undefined are treated as an empty string).
 *
 * @returns The cleaned value, or undefined when the result is empty.
 */
var normalizeEmailMessageId = (value) => {
  const cleaned = normalizeWhitespace(value ?? "");
  if (!cleaned) {
    return undefined;
  }
  return cleaned;
};
|
|
10310
|
+
/**
 * Extract the angle-bracketed tokens ("<...>") from a References header, in
 * the order they appear. Each token keeps its brackets and is passed through
 * normalizeWhitespace; tokens that come back empty are dropped.
 *
 * @param references Raw References header text, or null/undefined.
 * @returns Array of bracketed tokens (empty when there are none).
 */
var parseEmailReferenceChain = (references) => {
  const bracketed = references?.match(/<[^>]+>/g) ?? [];
  const chain = [];
  for (const token of bracketed) {
    const cleaned = normalizeWhitespace(token);
    if (cleaned) {
      chain.push(cleaned);
    }
  }
  return chain;
};
|
|
9867
10311
|
var stripRTF = (value) => {
|
|
9868
10312
|
const withoutBinary = value.replace(/\\bin\d+ [\s\S]*?(?=[\\}])/g, " ");
|
|
9869
10313
|
const withoutControls = withoutBinary.replace(/\\par[d]?/g, `
|
|
@@ -9881,6 +10325,9 @@ var extractPrintableStrings = (data) => {
|
|
|
9881
10325
|
};
|
|
9882
10326
|
var ocrMetadata = (result) => {
|
|
9883
10327
|
const regions = result.regions?.filter((region) => normalizeWhitespace(region.text ?? "").length > 0);
|
|
10328
|
+
const pageNumbers = [
|
|
10329
|
+
...new Set((regions ?? []).map((region) => typeof region.page === "number" && region.page > 0 ? region.page : undefined).filter((value) => value !== undefined))
|
|
10330
|
+
].sort((left, right) => left - right);
|
|
9884
10331
|
const confidenceValues = [
|
|
9885
10332
|
typeof result.confidence === "number" ? result.confidence : undefined,
|
|
9886
10333
|
...(regions ?? []).map((region) => typeof region.confidence === "number" ? region.confidence : undefined)
|
|
@@ -9889,6 +10336,8 @@ var ocrMetadata = (result) => {
|
|
|
9889
10336
|
return {
|
|
9890
10337
|
...result.metadata ?? {},
|
|
9891
10338
|
ocrConfidence: result.confidence,
|
|
10339
|
+
ocrPageCount: pageNumbers.length,
|
|
10340
|
+
ocrPageNumbers: pageNumbers,
|
|
9892
10341
|
ocrRegionCount: regions?.length,
|
|
9893
10342
|
ocrRegions: regions,
|
|
9894
10343
|
ocrAverageConfidence: averageConfidence
|
|
@@ -9905,25 +10354,32 @@ var ocrPageDocuments = (result, input, baseMetadata) => {
|
|
|
9905
10354
|
bucket.push({ ...region, text });
|
|
9906
10355
|
grouped.set(region.page, bucket);
|
|
9907
10356
|
}
|
|
9908
|
-
return [...grouped.entries()].sort((left, right) => left[0] - right[0]).map(([pageNumber, regions]) =>
|
|
9909
|
-
|
|
9910
|
-
|
|
9911
|
-
|
|
9912
|
-
|
|
9913
|
-
|
|
9914
|
-
|
|
9915
|
-
|
|
9916
|
-
|
|
9917
|
-
|
|
9918
|
-
|
|
9919
|
-
|
|
9920
|
-
|
|
9921
|
-
|
|
9922
|
-
|
|
10357
|
+
return [...grouped.entries()].sort((left, right) => left[0] - right[0]).map(([pageNumber, regions]) => {
|
|
10358
|
+
const confidenceValues = regions.map((region) => typeof region.confidence === "number" ? region.confidence : undefined).filter((value) => value !== undefined);
|
|
10359
|
+
const averageConfidence = confidenceValues.length > 0 ? confidenceValues.reduce((sum, value) => sum + value, 0) / confidenceValues.length : undefined;
|
|
10360
|
+
return {
|
|
10361
|
+
chunking: input.chunking,
|
|
10362
|
+
contentType: input.contentType,
|
|
10363
|
+
format: "text",
|
|
10364
|
+
metadata: {
|
|
10365
|
+
...input.metadata ?? {},
|
|
10366
|
+
...baseMetadata,
|
|
10367
|
+
ocrPageAverageConfidence: averageConfidence,
|
|
10368
|
+
ocrPageConfidence: averageConfidence,
|
|
10369
|
+
ocrRegionCount: regions.length,
|
|
10370
|
+
ocrRegionNumbers: regions.map((_region, index) => index + 1),
|
|
10371
|
+
ocrRegions: regions,
|
|
10372
|
+
pageNumber,
|
|
10373
|
+
pageIndex: pageNumber - 1,
|
|
10374
|
+
sourceNativeKind: "pdf_page"
|
|
10375
|
+
},
|
|
10376
|
+
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
|
|
10377
|
+
text: normalizeWhitespace(`PDF page ${pageNumber} from ${input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME}.
|
|
9923
10378
|
${regions.map((region) => region.text).join(`
|
|
9924
10379
|
`)}`),
|
|
9925
|
-
|
|
9926
|
-
|
|
10380
|
+
title: input.title ? `${input.title} \xB7 Page ${pageNumber}` : `Page ${pageNumber}`
|
|
10381
|
+
};
|
|
10382
|
+
});
|
|
9927
10383
|
};
|
|
9928
10384
|
var ocrRegionDocuments = (result, input, baseMetadata) => {
|
|
9929
10385
|
const documents = [];
|
|
@@ -9941,6 +10397,8 @@ var ocrRegionDocuments = (result, input, baseMetadata) => {
|
|
|
9941
10397
|
metadata: {
|
|
9942
10398
|
...input.metadata ?? {},
|
|
9943
10399
|
...baseMetadata,
|
|
10400
|
+
ocrPageCount: 1,
|
|
10401
|
+
ocrPageNumbers: [pageNumber],
|
|
9944
10402
|
ocrRegionConfidence: region.confidence,
|
|
9945
10403
|
ocrRegionHeight: region.height,
|
|
9946
10404
|
ocrRegionWidth: region.width,
|
|
@@ -10051,15 +10509,34 @@ var createEmailExtractor = () => ({
|
|
|
10051
10509
|
const { body } = splitEmailMessage(raw);
|
|
10052
10510
|
const parsed = parseEmailMimeParts(body, headers.contentType);
|
|
10053
10511
|
const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.eml`;
|
|
10512
|
+
const referenceChain = parseEmailReferenceChain(headers.references);
|
|
10513
|
+
const messageId = normalizeEmailMessageId(headers.messageId);
|
|
10514
|
+
const inReplyTo = normalizeEmailMessageId(headers.inReplyTo);
|
|
10515
|
+
const threadMessageIds = [
|
|
10516
|
+
...new Set([
|
|
10517
|
+
...referenceChain.map((entry) => normalizeEmailMessageId(entry)),
|
|
10518
|
+
messageId
|
|
10519
|
+
].filter((value) => typeof value === "string"))
|
|
10520
|
+
];
|
|
10521
|
+
const replyDepth = Math.max(referenceChain.length, headers.inReplyTo ? 1 : 0);
|
|
10522
|
+
const threadTopic = headers.threadTopic ?? headers.subject;
|
|
10523
|
+
const threadRootMessageId = normalizeEmailMessageId(referenceChain[0]) ?? inReplyTo ?? messageId;
|
|
10524
|
+
const threadKey = normalizeEmailThreadKey(threadTopic) ?? normalizeEmailThreadKey(messageId) ?? normalizeEmailThreadKey(headers.subject);
|
|
10054
10525
|
const messageMetadata = {
|
|
10055
10526
|
...input.metadata ?? {},
|
|
10056
10527
|
emailKind: "message",
|
|
10057
10528
|
fileKind: "email",
|
|
10058
10529
|
from: headers.from,
|
|
10059
|
-
inReplyTo
|
|
10060
|
-
messageId
|
|
10530
|
+
inReplyTo,
|
|
10531
|
+
messageId,
|
|
10061
10532
|
references: headers.references,
|
|
10062
|
-
|
|
10533
|
+
replyDepth,
|
|
10534
|
+
replyReferenceCount: referenceChain.length,
|
|
10535
|
+
threadMessageCount: threadMessageIds.length,
|
|
10536
|
+
threadMessageIds,
|
|
10537
|
+
threadKey,
|
|
10538
|
+
threadRootMessageId,
|
|
10539
|
+
threadTopic,
|
|
10063
10540
|
to: headers.to,
|
|
10064
10541
|
hasAttachments: parsed.attachments.length > 0
|
|
10065
10542
|
};
|
|
@@ -10172,9 +10649,13 @@ var createOfficeDocumentExtractor = () => ({
|
|
|
10172
10649
|
...input.metadata ?? {},
|
|
10173
10650
|
fileKind: "office",
|
|
10174
10651
|
...officeMetadata,
|
|
10652
|
+
repeatedHeaderRowNumbers: sheet.repeatedHeaderRowNumbers,
|
|
10653
|
+
sheetHeaders: sheet.headers,
|
|
10175
10654
|
sourceNativeKind: "spreadsheet_sheet",
|
|
10176
10655
|
sheetIndex: index,
|
|
10177
|
-
sheetName: sheet.name
|
|
10656
|
+
sheetName: sheet.name,
|
|
10657
|
+
sheetRowCount: sheet.rowCount,
|
|
10658
|
+
sheetTableCount: sheet.tableCount
|
|
10178
10659
|
},
|
|
10179
10660
|
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}${extension || ".office"}`,
|
|
10180
10661
|
text: normalizeWhitespace(`Spreadsheet workbook ${workbookLabel}. ` + `Worksheet ${index + 1}. ` + `Workbook sheet named ${sheet.name}. ` + `Sheet ${sheet.name} from spreadsheet workbook ${workbookLabel}.` + `
|
|
@@ -10195,6 +10676,9 @@ ${sheet.text}`),
|
|
|
10195
10676
|
...input.metadata ?? {},
|
|
10196
10677
|
fileKind: "office",
|
|
10197
10678
|
...officeMetadata,
|
|
10679
|
+
...slide.slideBodyText ? { slideBodyText: slide.slideBodyText } : {},
|
|
10680
|
+
...slide.notesText ? { slideNotesText: slide.notesText } : {},
|
|
10681
|
+
...slide.slideTitle ? { slideTitle: slide.slideTitle } : {},
|
|
10198
10682
|
sourceNativeKind: "presentation_slide",
|
|
10199
10683
|
slideIndex: slide.index,
|
|
10200
10684
|
slideNumber: slide.index + 1
|
|
@@ -10240,7 +10724,8 @@ var createRAGImageOCRExtractor = (provider) => ({
|
|
|
10240
10724
|
metadata: {
|
|
10241
10725
|
...input.metadata ?? {},
|
|
10242
10726
|
...ocrMetadata(result),
|
|
10243
|
-
fileKind: "image"
|
|
10727
|
+
fileKind: "image",
|
|
10728
|
+
sourceNativeKind: "image_ocr"
|
|
10244
10729
|
},
|
|
10245
10730
|
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.image.txt`,
|
|
10246
10731
|
text: result.text,
|
|
@@ -10253,15 +10738,34 @@ var createRAGMediaFileExtractor = (transcriber) => ({
|
|
|
10253
10738
|
supports: mediaExtractorSupports,
|
|
10254
10739
|
extract: async (input) => {
|
|
10255
10740
|
const result = await transcriber.transcribe(input);
|
|
10741
|
+
const rawSegments = (result.segments ?? []).filter((segment) => {
|
|
10742
|
+
if (!segment || typeof segment !== "object") {
|
|
10743
|
+
return false;
|
|
10744
|
+
}
|
|
10745
|
+
return normalizeWhitespace(segment.text ?? "").length > 0;
|
|
10746
|
+
});
|
|
10747
|
+
const segmentGroups = groupTranscriptSegments(rawSegments);
|
|
10256
10748
|
const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.media.txt`;
|
|
10749
|
+
const segmentCount = rawSegments.length;
|
|
10750
|
+
const mediaDurationMs = rawSegments.reduce((max, segment) => {
|
|
10751
|
+
const endMs = typeof segment.endMs === "number" ? segment.endMs : undefined;
|
|
10752
|
+
if (typeof endMs !== "number") {
|
|
10753
|
+
return max;
|
|
10754
|
+
}
|
|
10755
|
+
return typeof max === "number" ? Math.max(max, endMs) : endMs;
|
|
10756
|
+
}, undefined);
|
|
10757
|
+
const mediaSpeakers = [
|
|
10758
|
+
...new Set(rawSegments.map((segment) => normalizeMediaSpeaker(segment.speaker)).filter((value) => typeof value === "string"))
|
|
10759
|
+
];
|
|
10257
10760
|
const segmentDocuments = [];
|
|
10258
|
-
for (const [index,
|
|
10259
|
-
const
|
|
10260
|
-
|
|
10761
|
+
for (const [index, segmentGroup] of segmentGroups.entries()) {
|
|
10762
|
+
const { endMs, startMs } = buildMediaTimestampBoundary(segmentGroup.segments);
|
|
10763
|
+
const groupText = normalizeWhitespace(segmentGroup.segments.map((segment) => normalizeWhitespace(segment.text ?? "")).filter((value) => value.length > 0).join(" "));
|
|
10764
|
+
if (!groupText) {
|
|
10261
10765
|
continue;
|
|
10262
10766
|
}
|
|
10263
|
-
const
|
|
10264
|
-
const
|
|
10767
|
+
const mediaSegmentStartMs = startMs;
|
|
10768
|
+
const mediaSegmentEndMs = endMs;
|
|
10265
10769
|
const startLabel = formatMediaTimestampForIngest(startMs);
|
|
10266
10770
|
const endLabel = formatMediaTimestampForIngest(endMs);
|
|
10267
10771
|
const mediaKind = typeof result.metadata?.mediaKind === "string" ? result.metadata.mediaKind : "media";
|
|
@@ -10274,15 +10778,27 @@ var createRAGMediaFileExtractor = (transcriber) => ({
|
|
|
10274
10778
|
...result.metadata ?? {},
|
|
10275
10779
|
fileKind: "media",
|
|
10276
10780
|
sourceNativeKind: "media_segment",
|
|
10781
|
+
mediaDurationMs,
|
|
10277
10782
|
mediaSegmentIndex: index,
|
|
10278
|
-
mediaSegmentStartMs
|
|
10279
|
-
mediaSegmentEndMs
|
|
10280
|
-
|
|
10281
|
-
|
|
10783
|
+
mediaSegmentStartMs,
|
|
10784
|
+
mediaSegmentEndMs,
|
|
10785
|
+
mediaSegmentCount: segmentCount,
|
|
10786
|
+
mediaSegmentGroupIndex: index,
|
|
10787
|
+
mediaSegmentGroupSize: segmentGroup.segments.length,
|
|
10788
|
+
mediaSegmentGroupSpeaker: segmentGroup.speaker,
|
|
10789
|
+
mediaChannel: segmentGroup.channel,
|
|
10790
|
+
mediaSegments: segmentGroup.segments,
|
|
10791
|
+
startMs: mediaSegmentStartMs,
|
|
10792
|
+
endMs: mediaSegmentEndMs,
|
|
10793
|
+
...mediaSpeakers.length > 0 ? {
|
|
10794
|
+
mediaSpeakerCount: mediaSpeakers.length,
|
|
10795
|
+
mediaSpeakers
|
|
10796
|
+
} : {},
|
|
10797
|
+
speaker: segmentGroup.speaker
|
|
10282
10798
|
},
|
|
10283
10799
|
source,
|
|
10284
10800
|
text: normalizeWhitespace(`${mediaKind} transcript segment${startLabel ? ` at timestamp ${startLabel}${endLabel ? ` to ${endLabel}` : ""}` : ""} from ${input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME}. ` + `${mediaKind} timestamp evidence${startLabel ? ` ${startLabel}${endLabel ? ` to ${endLabel}` : ""}` : ""}.` + `
|
|
10285
|
-
${
|
|
10801
|
+
${groupText}`),
|
|
10286
10802
|
title: input.title ? `${input.title} \xB7 ${mediaKind[0]?.toUpperCase() + mediaKind.slice(1)} segment ${index + 1}` : `${mediaKind[0]?.toUpperCase() + mediaKind.slice(1)} segment ${index + 1}`
|
|
10287
10803
|
});
|
|
10288
10804
|
}
|
|
@@ -10294,7 +10810,13 @@ ${text}`),
|
|
|
10294
10810
|
...input.metadata ?? {},
|
|
10295
10811
|
...result.metadata ?? {},
|
|
10296
10812
|
fileKind: "media",
|
|
10297
|
-
|
|
10813
|
+
mediaDurationMs,
|
|
10814
|
+
mediaSegmentCount: segmentCount,
|
|
10815
|
+
mediaSegments: rawSegments,
|
|
10816
|
+
...mediaSpeakers.length > 0 ? {
|
|
10817
|
+
mediaSpeakerCount: mediaSpeakers.length,
|
|
10818
|
+
mediaSpeakers
|
|
10819
|
+
} : {}
|
|
10298
10820
|
},
|
|
10299
10821
|
source,
|
|
10300
10822
|
text: result.text,
|
|
@@ -10319,6 +10841,13 @@ var createTextFileExtractor = () => ({
|
|
|
10319
10841
|
})
|
|
10320
10842
|
});
|
|
10321
10843
|
var expandArchiveEntry = async (entry, archiveInput, extractors, registry) => {
|
|
10844
|
+
const parentArchiveLineage = Array.isArray(archiveInput.metadata?.archiveLineage) ? archiveInput.metadata.archiveLineage.filter((value) => typeof value === "string" && value.trim().length > 0) : [];
|
|
10845
|
+
const entryArchiveLineage = entry.path.split(/[\\/]/).map((segment) => normalizeWhitespace(segment)).filter(Boolean);
|
|
10846
|
+
const archiveLineage = [...parentArchiveLineage, ...entryArchiveLineage];
|
|
10847
|
+
const parentArchivePath = typeof archiveInput.metadata?.archivePath === "string" && archiveInput.metadata.archivePath.trim().length > 0 ? archiveInput.metadata.archivePath.trim() : undefined;
|
|
10848
|
+
const archiveFullPath = parentArchivePath ? `${parentArchivePath}!${entry.path}` : entry.path;
|
|
10849
|
+
const archiveRootName = (typeof archiveInput.metadata?.archiveRootName === "string" && archiveInput.metadata.archiveRootName.trim().length > 0 ? archiveInput.metadata.archiveRootName.trim() : undefined) ?? archiveInput.name ?? archiveInput.path?.split(/[/\\]/).pop() ?? archiveInput.source;
|
|
10850
|
+
const archiveRootSource = (typeof archiveInput.metadata?.archiveRootSource === "string" && archiveInput.metadata.archiveRootSource.trim().length > 0 ? archiveInput.metadata.archiveRootSource.trim() : undefined) ?? archiveInput.source ?? archiveInput.path ?? archiveInput.name;
|
|
10322
10851
|
const documents = await extractRAGFileDocuments({
|
|
10323
10852
|
chunking: archiveInput.chunking,
|
|
10324
10853
|
contentType: entry.contentType,
|
|
@@ -10331,7 +10860,14 @@ var expandArchiveEntry = async (entry, archiveInput, extractors, registry) => {
|
|
|
10331
10860
|
archiveEntryName: basename(entry.path),
|
|
10332
10861
|
archiveParentName: archiveInput.name ?? archiveInput.path?.split(/[/\\]/).pop() ?? archiveInput.source,
|
|
10333
10862
|
archiveParentSource: archiveInput.source ?? archiveInput.path ?? archiveInput.name,
|
|
10863
|
+
archiveContainerPath: parentArchivePath,
|
|
10864
|
+
archiveDepth: archiveLineage.length,
|
|
10865
|
+
archiveFullPath,
|
|
10866
|
+
archiveLineage,
|
|
10334
10867
|
archivePath: entry.path,
|
|
10868
|
+
archiveRootName,
|
|
10869
|
+
archiveRootSource,
|
|
10870
|
+
archiveNestedDepth: parentArchiveLineage.length + 1,
|
|
10335
10871
|
fileKind: "archive_entry"
|
|
10336
10872
|
},
|
|
10337
10873
|
name: basename(entry.path),
|
|
@@ -10748,10 +11284,27 @@ ${text}`);
|
|
|
10748
11284
|
}
|
|
10749
11285
|
return text;
|
|
10750
11286
|
};
|
|
10751
|
-
|
|
10752
|
-
|
|
10753
|
-
|
|
10754
|
-
|
|
11287
|
+
const resolveSpreadsheetChunkRowRange = (text) => {
|
|
11288
|
+
if (unit.sectionKind !== "spreadsheet_rows") {
|
|
11289
|
+
return {};
|
|
11290
|
+
}
|
|
11291
|
+
const rowNumbers = [...text.matchAll(/^Row (\d+)\./gm)].map((match) => Number(match[1] ?? NaN)).filter((value) => Number.isFinite(value));
|
|
11292
|
+
if (rowNumbers.length === 0) {
|
|
11293
|
+
return {};
|
|
11294
|
+
}
|
|
11295
|
+
return {
|
|
11296
|
+
spreadsheetRowEnd: rowNumbers[rowNumbers.length - 1],
|
|
11297
|
+
spreadsheetRowStart: rowNumbers[0]
|
|
11298
|
+
};
|
|
11299
|
+
};
|
|
11300
|
+
return merged.map((text) => {
|
|
11301
|
+
const decoratedText = decorateSourceAwareChunkText(text);
|
|
11302
|
+
return {
|
|
11303
|
+
...unit,
|
|
11304
|
+
...resolveSpreadsheetChunkRowRange(decoratedText),
|
|
11305
|
+
text: decoratedText
|
|
11306
|
+
};
|
|
11307
|
+
});
|
|
10755
11308
|
};
|
|
10756
11309
|
var resolveChunkingUnits = (text, options) => {
|
|
10757
11310
|
if (options.strategy === "fixed") {
|
|
@@ -10922,6 +11475,11 @@ var prepareRAGDocument = (document, defaultChunking, chunkingRegistry) => {
|
|
|
10922
11475
|
...sectionTitle ? { sectionTitle } : {},
|
|
10923
11476
|
...sectionPath && sectionPath.length > 0 ? { sectionPath } : {},
|
|
10924
11477
|
...typeof entry.sectionDepth === "number" ? { sectionDepth: entry.sectionDepth } : {},
|
|
11478
|
+
...Array.isArray(entry.spreadsheetHeaders) && entry.spreadsheetHeaders.length > 0 ? { spreadsheetHeaders: entry.spreadsheetHeaders } : {},
|
|
11479
|
+
...typeof entry.spreadsheetTableIndex === "number" ? { spreadsheetTableIndex: entry.spreadsheetTableIndex } : {},
|
|
11480
|
+
...typeof entry.spreadsheetTableCount === "number" ? { spreadsheetTableCount: entry.spreadsheetTableCount } : {},
|
|
11481
|
+
...typeof entry.spreadsheetRowStart === "number" ? { spreadsheetRowStart: entry.spreadsheetRowStart } : {},
|
|
11482
|
+
...typeof entry.spreadsheetRowEnd === "number" ? { spreadsheetRowEnd: entry.spreadsheetRowEnd } : {},
|
|
10925
11483
|
...typeof entry.pageNumber === "number" ? { pageNumber: entry.pageNumber } : {},
|
|
10926
11484
|
...typeof entry.officeBlockNumber === "number" ? { officeBlockNumber: entry.officeBlockNumber } : {},
|
|
10927
11485
|
...entry.officeBlockKind ? { officeBlockKind: entry.officeBlockKind } : {},
|
|
@@ -24278,5 +24836,5 @@ export {
|
|
|
24278
24836
|
aiChat
|
|
24279
24837
|
};
|
|
24280
24838
|
|
|
24281
|
-
//# debugId=
|
|
24839
|
+
//# debugId=AFAF0A5BC1AB4BC864756E2164756E21
|
|
24282
24840
|
//# sourceMappingURL=index.js.map
|