@absolutejs/absolute 0.19.0-beta.603 → 0.19.0-beta.605
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/client/index.js +244 -10
- package/dist/ai/client/index.js.map +4 -4
- package/dist/ai/client/ui.js +248 -10
- package/dist/ai/client/ui.js.map +4 -4
- package/dist/ai/index.js +1003 -110
- package/dist/ai/index.js.map +8 -8
- package/dist/ai/rag/quality.js +27 -6
- package/dist/ai/rag/quality.js.map +3 -3
- package/dist/ai/rag/ui.js +248 -10
- package/dist/ai/rag/ui.js.map +4 -4
- package/dist/ai-client/angular/ai/index.js +243 -9
- package/dist/ai-client/react/ai/index.js +258 -10
- package/dist/ai-client/vue/ai/index.js +347 -101
- package/dist/angular/ai/index.js +244 -10
- package/dist/angular/ai/index.js.map +4 -4
- package/dist/react/ai/index.js +259 -11
- package/dist/react/ai/index.js.map +6 -6
- package/dist/src/ai/client/ui.d.ts +1 -1
- package/dist/src/ai/rag/index.d.ts +1 -1
- package/dist/src/ai/rag/presentation.d.ts +12 -1
- package/dist/src/ai/rag/ui.d.ts +1 -1
- package/dist/src/react/ai/useRAG.d.ts +5 -0
- package/dist/src/react/ai/useRAGChunkPreview.d.ts +4 -0
- package/dist/src/react/ai/useRAGSources.d.ts +1 -0
- package/dist/src/svelte/ai/createRAG.d.ts +5 -0
- package/dist/src/svelte/ai/createRAGChunkPreview.d.ts +4 -0
- package/dist/src/svelte/ai/createRAGSources.d.ts +1 -0
- package/dist/src/vue/ai/useRAG.d.ts +125 -0
- package/dist/src/vue/ai/useRAGChunkPreview.d.ts +54 -0
- package/dist/src/vue/ai/useRAGDocuments.d.ts +20 -0
- package/dist/src/vue/ai/useRAGIndexAdmin.d.ts +10 -0
- package/dist/src/vue/ai/useRAGSearch.d.ts +40 -0
- package/dist/src/vue/ai/useRAGSources.d.ts +1 -0
- package/dist/svelte/ai/index.js +305 -57
- package/dist/svelte/ai/index.js.map +6 -6
- package/dist/types/ai.d.ts +102 -1
- package/dist/vue/ai/index.js +311 -63
- package/dist/vue/ai/index.js.map +6 -6
- package/package.json +1 -1
package/dist/ai/index.js
CHANGED
|
@@ -216,6 +216,10 @@ var buildContextLabel = (metadata) => {
|
|
|
216
216
|
return from ? `Message from ${from}` : "Message evidence";
|
|
217
217
|
}
|
|
218
218
|
const page = getContextNumber(metadata.page) ?? getContextNumber(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
|
|
219
|
+
const region = getContextNumber(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
|
|
220
|
+
if (page && region) {
|
|
221
|
+
return `Page ${page} region ${region}`;
|
|
222
|
+
}
|
|
219
223
|
if (page) {
|
|
220
224
|
return `Page ${page}`;
|
|
221
225
|
}
|
|
@@ -239,6 +243,11 @@ var buildContextLabel = (metadata) => {
|
|
|
239
243
|
if (speaker) {
|
|
240
244
|
return `Speaker ${speaker}`;
|
|
241
245
|
}
|
|
246
|
+
const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString(value)).filter((value) => typeof value === "string") : [];
|
|
247
|
+
const sectionTitle = getContextString(metadata.sectionTitle) ?? sectionPath.at(-1);
|
|
248
|
+
if (sectionTitle) {
|
|
249
|
+
return `Section ${sectionTitle}`;
|
|
250
|
+
}
|
|
242
251
|
return;
|
|
243
252
|
};
|
|
244
253
|
var formatMediaTimestamp = (value) => {
|
|
@@ -256,6 +265,10 @@ var buildLocatorLabel = (metadata, source, title) => {
|
|
|
256
265
|
return;
|
|
257
266
|
}
|
|
258
267
|
const page = getContextNumber(metadata.page) ?? getContextNumber(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
|
|
268
|
+
const region = getContextNumber(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
|
|
269
|
+
if (page && region) {
|
|
270
|
+
return `Page ${page} \xB7 Region ${region}`;
|
|
271
|
+
}
|
|
259
272
|
if (page) {
|
|
260
273
|
return `Page ${page}`;
|
|
261
274
|
}
|
|
@@ -284,6 +297,10 @@ var buildLocatorLabel = (metadata, source, title) => {
|
|
|
284
297
|
if (mediaStart) {
|
|
285
298
|
return `Timestamp ${mediaStart}`;
|
|
286
299
|
}
|
|
300
|
+
const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString(value)).filter((value) => typeof value === "string") : [];
|
|
301
|
+
if (sectionPath.length > 0) {
|
|
302
|
+
return `Section ${sectionPath.join(" > ")}`;
|
|
303
|
+
}
|
|
287
304
|
return;
|
|
288
305
|
};
|
|
289
306
|
var formatTimestampLabel = (value) => {
|
|
@@ -308,9 +325,11 @@ var buildProvenanceLabel = (metadata) => {
|
|
|
308
325
|
const transcriptSource = getContextString(metadata.transcriptSource);
|
|
309
326
|
const pdfTextMode = getContextString(metadata.pdfTextMode);
|
|
310
327
|
const ocrEngine = getContextString(metadata.ocrEngine);
|
|
328
|
+
const ocrConfidence = getContextNumber(metadata.ocrRegionConfidence) ?? getContextNumber(metadata.ocrConfidence);
|
|
311
329
|
const labels = [
|
|
312
330
|
pdfTextMode ? `PDF ${pdfTextMode}` : "",
|
|
313
331
|
ocrEngine ? `OCR ${ocrEngine}` : "",
|
|
332
|
+
typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
|
|
314
333
|
mediaKind ? `Media ${mediaKind}` : "",
|
|
315
334
|
transcriptSource ? `Transcript ${transcriptSource}` : "",
|
|
316
335
|
threadTopic ? `Thread ${threadTopic}` : "",
|
|
@@ -331,8 +350,10 @@ var buildExcerpt = (text, maxLength = 160) => {
|
|
|
331
350
|
/**
 * Builds the one-line evidence label for a grounding reference.
 *
 * Takes the reference's label, locator label and context label (in that
 * order), drops entries that are missing or empty, keeps only the first
 * occurrence of each distinct value, and joins what is left with " · ".
 *
 * @param reference Object exposing `label`, `locatorLabel` and `contextLabel`.
 * @returns The joined label, or an empty string when nothing is present.
 */
var buildGroundingReferenceEvidenceLabel = (reference) => {
  const kept = [];
  for (const candidate of [reference.label, reference.locatorLabel, reference.contextLabel]) {
    const isPresent = Boolean(candidate && candidate.length > 0);
    if (isPresent && !kept.some((entry) => entry === candidate)) {
      kept.push(candidate);
    }
  }
  return kept.join(" \xB7 ");
};
|
|
332
351
|
/**
 * Builds the one-line evidence summary for a grounding reference.
 *
 * The summary starts with the reference's origin (its `source`, else its
 * `title`, else its `chunkId`), followed by the locator, context and
 * provenance labels. Missing or empty entries are skipped, each distinct
 * value appears only once (first occurrence wins), and the parts are joined
 * with " · ".
 *
 * @param reference Grounding reference to summarize.
 * @returns The joined summary, or an empty string when nothing is present.
 */
var buildGroundingReferenceEvidenceSummary = (reference) => {
  const origin = reference.source ?? reference.title ?? reference.chunkId;
  const candidates = [
    origin,
    reference.locatorLabel,
    reference.contextLabel,
    reference.provenanceLabel
  ];
  const summaryParts = candidates.reduce((kept, candidate) => {
    const isPresent = Boolean(candidate && candidate.length > 0);
    if (isPresent && !kept.some((entry) => entry === candidate)) {
      kept.push(candidate);
    }
    return kept;
  }, []);
  return summaryParts.join(" \xB7 ");
};
|
|
336
357
|
var buildGroundedAnswerCitationDetail = (reference) => ({
|
|
337
358
|
contextLabel: reference.contextLabel,
|
|
338
359
|
evidenceLabel: buildGroundingReferenceEvidenceLabel(reference),
|
|
@@ -356,12 +377,12 @@ var buildRAGCitations = (sources) => {
|
|
|
356
377
|
continue;
|
|
357
378
|
unique.set(key, {
|
|
358
379
|
chunkId: source.chunkId,
|
|
359
|
-
contextLabel: buildContextLabel(source.metadata),
|
|
380
|
+
contextLabel: source.labels?.contextLabel ?? buildContextLabel(source.metadata),
|
|
360
381
|
key,
|
|
361
382
|
label: buildSourceLabel(source),
|
|
362
|
-
locatorLabel: buildLocatorLabel(source.metadata, source.source, source.title),
|
|
383
|
+
locatorLabel: source.labels?.locatorLabel ?? buildLocatorLabel(source.metadata, source.source, source.title),
|
|
363
384
|
metadata: source.metadata,
|
|
364
|
-
provenanceLabel: buildProvenanceLabel(source.metadata),
|
|
385
|
+
provenanceLabel: source.labels?.provenanceLabel ?? buildProvenanceLabel(source.metadata),
|
|
365
386
|
score: source.score,
|
|
366
387
|
source: source.source,
|
|
367
388
|
text: source.text,
|
|
@@ -431,7 +452,7 @@ var buildRAGGroundingReferences = (sources) => {
|
|
|
431
452
|
const citationReferenceMap = buildRAGCitationReferenceMap(citations);
|
|
432
453
|
return citations.map((citation) => ({
|
|
433
454
|
chunkId: citation.chunkId,
|
|
434
|
-
contextLabel: buildContextLabel(citation.metadata),
|
|
455
|
+
contextLabel: citation.contextLabel ?? buildContextLabel(citation.metadata),
|
|
435
456
|
excerpt: buildExcerpt(citation.text),
|
|
436
457
|
label: citation.label,
|
|
437
458
|
locatorLabel: citation.locatorLabel ?? buildLocatorLabel(citation.metadata, citation.source, citation.title),
|
|
@@ -3977,6 +3998,10 @@ var buildContextLabel2 = (metadata) => {
|
|
|
3977
3998
|
return from ? `Message from ${from}` : "Message evidence";
|
|
3978
3999
|
}
|
|
3979
4000
|
const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
|
|
4001
|
+
const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
|
|
4002
|
+
if (page && region) {
|
|
4003
|
+
return `Page ${page} region ${region}`;
|
|
4004
|
+
}
|
|
3980
4005
|
if (page) {
|
|
3981
4006
|
return `Page ${page}`;
|
|
3982
4007
|
}
|
|
@@ -4000,6 +4025,11 @@ var buildContextLabel2 = (metadata) => {
|
|
|
4000
4025
|
if (speaker) {
|
|
4001
4026
|
return `Speaker ${speaker}`;
|
|
4002
4027
|
}
|
|
4028
|
+
const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
|
|
4029
|
+
const sectionTitle = getContextString2(metadata.sectionTitle) ?? sectionPath.at(-1);
|
|
4030
|
+
if (sectionTitle) {
|
|
4031
|
+
return `Section ${sectionTitle}`;
|
|
4032
|
+
}
|
|
4003
4033
|
return;
|
|
4004
4034
|
};
|
|
4005
4035
|
var buildLocatorLabel2 = (metadata, source, title) => {
|
|
@@ -4007,6 +4037,10 @@ var buildLocatorLabel2 = (metadata, source, title) => {
|
|
|
4007
4037
|
return;
|
|
4008
4038
|
}
|
|
4009
4039
|
const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
|
|
4040
|
+
const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
|
|
4041
|
+
if (page && region) {
|
|
4042
|
+
return `Page ${page} \xB7 Region ${region}`;
|
|
4043
|
+
}
|
|
4010
4044
|
if (page) {
|
|
4011
4045
|
return `Page ${page}`;
|
|
4012
4046
|
}
|
|
@@ -4035,6 +4069,10 @@ var buildLocatorLabel2 = (metadata, source, title) => {
|
|
|
4035
4069
|
if (mediaStart) {
|
|
4036
4070
|
return `Timestamp ${mediaStart}`;
|
|
4037
4071
|
}
|
|
4072
|
+
const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
|
|
4073
|
+
if (sectionPath.length > 0) {
|
|
4074
|
+
return `Section ${sectionPath.join(" > ")}`;
|
|
4075
|
+
}
|
|
4038
4076
|
return;
|
|
4039
4077
|
};
|
|
4040
4078
|
var buildProvenanceLabel2 = (metadata) => {
|
|
@@ -4049,9 +4087,11 @@ var buildProvenanceLabel2 = (metadata) => {
|
|
|
4049
4087
|
const transcriptSource = getContextString2(metadata.transcriptSource);
|
|
4050
4088
|
const pdfTextMode = getContextString2(metadata.pdfTextMode);
|
|
4051
4089
|
const ocrEngine = getContextString2(metadata.ocrEngine);
|
|
4090
|
+
const ocrConfidence = getContextNumber2(metadata.ocrRegionConfidence) ?? getContextNumber2(metadata.ocrConfidence);
|
|
4052
4091
|
const labels = [
|
|
4053
4092
|
pdfTextMode ? `PDF ${pdfTextMode}` : "",
|
|
4054
4093
|
ocrEngine ? `OCR ${ocrEngine}` : "",
|
|
4094
|
+
typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
|
|
4055
4095
|
mediaKind ? `Media ${mediaKind}` : "",
|
|
4056
4096
|
transcriptSource ? `Transcript ${transcriptSource}` : "",
|
|
4057
4097
|
threadTopic ? `Thread ${threadTopic}` : "",
|
|
@@ -4061,6 +4101,50 @@ var buildProvenanceLabel2 = (metadata) => {
|
|
|
4061
4101
|
].filter((value) => value.length > 0);
|
|
4062
4102
|
return labels.length > 0 ? labels.join(" \xB7 ") : undefined;
|
|
4063
4103
|
};
|
|
4104
|
+
/**
 * Derives the three display labels for a RAG source from its metadata.
 *
 * Delegates to buildContextLabel2, buildLocatorLabel2 and
 * buildProvenanceLabel2, in that order.
 *
 * @param options.metadata Source metadata handed to the label builders.
 * @param options.source   Source identifier, forwarded to the locator builder.
 * @param options.title    Source title, forwarded to the locator builder.
 * @returns `{ contextLabel, locatorLabel, provenanceLabel }`, or `undefined`
 *          when none of the three labels could be produced.
 */
var buildRAGSourceLabels = ({ metadata, source, title }) => {
  const labels = {
    contextLabel: buildContextLabel2(metadata),
    locatorLabel: buildLocatorLabel2(metadata, source, title),
    provenanceLabel: buildProvenanceLabel2(metadata)
  };
  const hasAnyLabel = Boolean(labels.contextLabel || labels.locatorLabel || labels.provenanceLabel);
  return hasAnyLabel ? labels : undefined;
};
|
|
4121
|
+
/**
 * Extracts section and sequence information for a chunk from its metadata.
 *
 * Section fields: `sectionDepth`, `sectionKind` (only "markdown_heading" or
 * "html_heading" are accepted), `sectionPath` (non-blank string entries only)
 * and `sectionTitle`. Sequence fields: `nextChunkId`, `previousChunkId`,
 * `sectionChunkCount`, `sectionChunkId` and `sectionChunkIndex`.
 *
 * @param metadata Chunk metadata; may be missing.
 * @returns `{ section, sequence }` where either part is `undefined` when none
 *          of its fields are set, or `undefined` when both parts are empty or
 *          no metadata was given.
 */
var buildRAGChunkStructure = (metadata) => {
  if (!metadata) {
    return;
  }
  let path;
  if (Array.isArray(metadata.sectionPath)) {
    const cleanedPath = metadata.sectionPath.filter((entry) => typeof entry === "string" && entry.trim().length > 0);
    path = cleanedPath.length > 0 ? cleanedPath : undefined;
  }
  const rawKind = metadata.sectionKind;
  const kind = rawKind === "markdown_heading" || rawKind === "html_heading" ? rawKind : undefined;
  const section = {
    depth: getContextNumber2(metadata.sectionDepth),
    kind,
    path,
    title: getContextString2(metadata.sectionTitle)
  };
  const sequence = {
    nextChunkId: getContextString2(metadata.nextChunkId),
    previousChunkId: getContextString2(metadata.previousChunkId),
    sectionChunkCount: getContextNumber2(metadata.sectionChunkCount),
    sectionChunkId: getContextString2(metadata.sectionChunkId),
    sectionChunkIndex: getContextNumber2(metadata.sectionChunkIndex)
  };
  // `path` is either undefined or a non-empty array, so truthiness is enough.
  const hasSection = Boolean(section.title || section.path || typeof section.depth === "number" || section.kind);
  const hasSequence = Boolean(
    sequence.nextChunkId ||
    sequence.previousChunkId ||
    typeof sequence.sectionChunkCount === "number" ||
    sequence.sectionChunkId ||
    typeof sequence.sectionChunkIndex === "number"
  );
  if (!hasSection && !hasSequence) {
    return;
  }
  return {
    section: hasSection ? section : undefined,
    sequence: hasSequence ? sequence : undefined
  };
};
|
|
4064
4148
|
var buildExcerpt2 = (text, maxLength = 160) => {
|
|
4065
4149
|
const normalized = text.replaceAll(/\s+/g, " ").trim();
|
|
4066
4150
|
if (normalized.length <= maxLength) {
|
|
@@ -4068,6 +4152,136 @@ var buildExcerpt2 = (text, maxLength = 160) => {
|
|
|
4068
4152
|
}
|
|
4069
4153
|
return `${normalized.slice(0, Math.max(0, maxLength - 1)).trimEnd()}\u2026`;
|
|
4070
4154
|
};
|
|
4155
|
+
/**
 * Builds a navigation graph (nodes, edges, sections) from a list of chunks.
 *
 * For every chunk a node is emitted. Labels come from `chunk.labels` or, when
 * absent, from buildRAGSourceLabels; structure comes from `chunk.structure`
 * or, when absent, from buildRAGChunkStructure.
 *
 * - Edges: a "previous" edge (previousChunkId -> chunk) and a "next" edge
 *   (chunk -> nextChunkId) are added when the sequence names a neighbour;
 *   identical edges are emitted once.
 * - Sections: chunks sharing a `sectionChunkId` are grouped. Descriptive
 *   fields come from the first chunk seen for that section; `chunkCount` is
 *   the largest `sectionChunkCount` reported (1 if the first chunk reports
 *   none). `chunkIds` are ordered by `sectionChunkIndex`, then by id.
 * - Nodes are ordered by `sectionChunkIndex` (missing last), then by score
 *   descending (missing last), then by label.
 * - Sections are ordered by title, falling back to id.
 *
 * @param chunks Iterable of chunk-like objects (`chunkId`, optional `labels`,
 *               `metadata`, `score`, `source`, `structure`, `title`).
 * @returns `{ edges, nodes, sections }`.
 */
var buildRAGChunkGraph = (chunks) => {
  const nodes = [];
  const edges = [];
  const edgeKeys = new Set();
  const sections = new Map();
  // Section index of the FIRST node seen per chunk id. The section comparator
  // used to rescan `nodes` twice per comparison; this lookup gives the same
  // answer in constant time.
  const sectionIndexByChunkId = new Map();

  const addEdge = (relation, fromChunkId, toChunkId) => {
    const key = `${relation}:${fromChunkId}:${toChunkId}`;
    if (edgeKeys.has(key)) {
      return;
    }
    edgeKeys.add(key);
    edges.push({ fromChunkId, relation, toChunkId });
  };

  for (const chunk of chunks) {
    const labels = chunk.labels ?? buildRAGSourceLabels({
      metadata: chunk.metadata,
      source: chunk.source,
      title: chunk.title
    });
    const structure = chunk.structure ?? buildRAGChunkStructure(chunk.metadata);
    const sequence = structure?.sequence;
    nodes.push({
      chunkId: chunk.chunkId,
      contextLabel: labels?.contextLabel,
      label: chunk.source ?? chunk.title ?? chunk.chunkId,
      locatorLabel: labels?.locatorLabel,
      provenanceLabel: labels?.provenanceLabel,
      score: chunk.score,
      source: chunk.source,
      structure,
      title: chunk.title
    });
    if (!sectionIndexByChunkId.has(chunk.chunkId)) {
      sectionIndexByChunkId.set(chunk.chunkId, sequence?.sectionChunkIndex ?? Number.MAX_SAFE_INTEGER);
    }

    if (sequence?.previousChunkId) {
      addEdge("previous", sequence.previousChunkId, chunk.chunkId);
    }
    if (sequence?.nextChunkId) {
      addEdge("next", chunk.chunkId, sequence.nextChunkId);
    }

    const sectionId = sequence?.sectionChunkId;
    if (!sectionId) {
      continue;
    }
    const existing = sections.get(sectionId);
    if (!existing) {
      sections.set(sectionId, {
        chunkCount: sequence.sectionChunkCount ?? 1,
        chunkIds: [chunk.chunkId],
        depth: structure.section?.depth,
        id: sectionId,
        kind: structure.section?.kind,
        path: structure.section?.path,
        title: structure.section?.title
      });
      continue;
    }
    if (!existing.chunkIds.includes(chunk.chunkId)) {
      existing.chunkIds.push(chunk.chunkId);
    }
    existing.chunkCount = Math.max(existing.chunkCount, sequence.sectionChunkCount ?? existing.chunkCount);
  }

  const sectionIndexOf = (chunkId) => sectionIndexByChunkId.get(chunkId) ?? Number.MAX_SAFE_INTEGER;
  for (const section of sections.values()) {
    section.chunkIds.sort((left, right) => {
      const leftIndex = sectionIndexOf(left);
      const rightIndex = sectionIndexOf(right);
      if (leftIndex !== rightIndex) {
        return leftIndex - rightIndex;
      }
      return left.localeCompare(right);
    });
  }

  nodes.sort((left, right) => {
    const leftIndex = left.structure?.sequence?.sectionChunkIndex ?? Number.MAX_SAFE_INTEGER;
    const rightIndex = right.structure?.sequence?.sectionChunkIndex ?? Number.MAX_SAFE_INTEGER;
    if (leftIndex !== rightIndex) {
      return leftIndex - rightIndex;
    }
    const leftScore = left.score ?? Number.NEGATIVE_INFINITY;
    const rightScore = right.score ?? Number.NEGATIVE_INFINITY;
    if (leftScore !== rightScore) {
      return rightScore - leftScore;
    }
    return left.label.localeCompare(right.label);
  });

  return {
    edges,
    nodes,
    sections: [...sections.values()].sort((left, right) => (left.title ?? left.id).localeCompare(right.title ?? right.id))
  };
};
|
|
4254
|
+
/**
 * Builds the chunk graph for a chunk preview.
 *
 * Each preview chunk is passed to buildRAGChunkGraph with its own
 * `chunkId`, `labels`, `metadata` and `structure`; `source` and `title` fall
 * back to the preview document's values when the chunk does not carry them.
 *
 * @param preview Object with `chunks` and a `document` (`source`, `title`).
 * @returns The graph produced by buildRAGChunkGraph.
 */
var buildRAGChunkPreviewGraph = (preview) => {
  const graphChunks = preview.chunks.map(({ chunkId, labels, metadata, source, structure, title }) => ({
    chunkId,
    labels,
    metadata,
    source: source ?? preview.document.source,
    structure,
    title: title ?? preview.document.title
  }));
  return buildRAGChunkGraph(graphChunks);
};
|
|
4262
|
+
/**
 * Resolves navigation state for a chunk preview.
 *
 * Builds the preview's chunk graph and then derives the navigation for
 * `activeChunkId` from it.
 *
 * @param preview       Chunk preview handed to buildRAGChunkPreviewGraph.
 * @param activeChunkId Id of the chunk to treat as active; may be omitted.
 * @returns The result of buildRAGChunkGraphNavigation for that graph.
 */
var buildRAGChunkPreviewNavigation = (preview, activeChunkId) => {
  const graph = buildRAGChunkPreviewGraph(preview);
  return buildRAGChunkGraphNavigation(graph, activeChunkId);
};
|
|
4263
|
+
/**
 * Resolves the navigation state around one chunk of a chunk graph.
 *
 * The active node is the node whose id equals `activeChunkId`; when no id is
 * given or no node matches, the first node of the graph is used. Previous and
 * next nodes are looked up by the ids named in the active node's sequence.
 * The section is the graph section whose id equals the active node's
 * `sectionChunkId`; `sectionNodes` lists that section's chunks that exist as
 * nodes, or just the active node when it has no section.
 *
 * @param graph         `{ nodes, sections }` as produced by buildRAGChunkGraph.
 * @param activeChunkId Requested active chunk id; may be omitted.
 * @returns `{ activeChunkId, activeNode, nextNode, previousNode, section,
 *          sectionNodes }`; for an empty graph only the requested
 *          `activeChunkId` and an empty `sectionNodes` are returned.
 */
var buildRAGChunkGraphNavigation = (graph, activeChunkId) => {
  const { nodes } = graph;
  if (nodes.length === 0) {
    return {
      activeChunkId,
      sectionNodes: []
    };
  }
  const findNode = (chunkId) => nodes.find((node) => node.chunkId === chunkId);

  const requestedNode = activeChunkId ? findNode(activeChunkId) : undefined;
  const activeNode = requestedNode ?? nodes[0];
  const sequence = activeNode?.structure?.sequence;

  let previousNode;
  if (sequence?.previousChunkId) {
    previousNode = findNode(sequence.previousChunkId);
  }
  let nextNode;
  if (sequence?.nextChunkId) {
    nextNode = findNode(sequence.nextChunkId);
  }
  let section;
  if (sequence?.sectionChunkId) {
    section = graph.sections.find((entry) => entry.id === sequence.sectionChunkId);
  }

  let sectionNodes = [];
  if (section) {
    sectionNodes = section.chunkIds.map((chunkId) => findNode(chunkId)).filter((node) => Boolean(node));
  } else if (activeNode) {
    sectionNodes = [activeNode];
  }

  return {
    activeChunkId: activeNode?.chunkId,
    activeNode,
    nextNode,
    previousNode,
    section,
    sectionNodes
  };
};
|
|
4071
4285
|
var buildRAGRetrievedState = (messages) => {
|
|
4072
4286
|
const message = getLatestRetrievedMessage(messages);
|
|
4073
4287
|
if (!message) {
|
|
@@ -4102,13 +4316,14 @@ var buildRAGSourceSummaries = (sources) => {
|
|
|
4102
4316
|
citationNumbers: groupCitations.map((citation) => citationReferenceMap[citation.chunkId] ?? 0),
|
|
4103
4317
|
citations: groupCitations,
|
|
4104
4318
|
chunkIds: group.chunks.map((chunk) => chunk.chunkId),
|
|
4105
|
-
contextLabel: buildContextLabel2(leadChunk?.metadata),
|
|
4319
|
+
contextLabel: leadChunk?.labels?.contextLabel ?? buildContextLabel2(leadChunk?.metadata),
|
|
4106
4320
|
count: group.count,
|
|
4107
4321
|
excerpt: buildExcerpt2(leadChunk?.text ?? ""),
|
|
4108
4322
|
key: group.key,
|
|
4109
4323
|
label: group.label,
|
|
4110
|
-
locatorLabel: buildLocatorLabel2(leadChunk?.metadata, leadChunk?.source, leadChunk?.title),
|
|
4111
|
-
provenanceLabel: buildProvenanceLabel2(leadChunk?.metadata),
|
|
4324
|
+
locatorLabel: leadChunk?.labels?.locatorLabel ?? buildLocatorLabel2(leadChunk?.metadata, leadChunk?.source, leadChunk?.title),
|
|
4325
|
+
provenanceLabel: leadChunk?.labels?.provenanceLabel ?? buildProvenanceLabel2(leadChunk?.metadata),
|
|
4326
|
+
structure: leadChunk?.structure ?? buildRAGChunkStructure(leadChunk?.metadata),
|
|
4112
4327
|
source: group.source,
|
|
4113
4328
|
title: group.title
|
|
4114
4329
|
};
|
|
@@ -4232,6 +4447,12 @@ var buildSourceGroup = (source, key) => ({
|
|
|
4232
4447
|
count: 1,
|
|
4233
4448
|
key,
|
|
4234
4449
|
label: buildSourceLabel2(source),
|
|
4450
|
+
labels: source.labels ?? buildRAGSourceLabels({
|
|
4451
|
+
metadata: source.metadata,
|
|
4452
|
+
source: source.source,
|
|
4453
|
+
title: source.title
|
|
4454
|
+
}),
|
|
4455
|
+
structure: source.structure ?? buildRAGChunkStructure(source.metadata),
|
|
4235
4456
|
source: source.source,
|
|
4236
4457
|
title: source.title
|
|
4237
4458
|
});
|
|
@@ -4242,7 +4463,20 @@ var updateSourceGroup = (groups, source) => {
|
|
|
4242
4463
|
groups.set(key, buildSourceGroup(source, key));
|
|
4243
4464
|
return;
|
|
4244
4465
|
}
|
|
4245
|
-
|
|
4466
|
+
if (source.score > existing.bestScore) {
|
|
4467
|
+
existing.bestScore = source.score;
|
|
4468
|
+
existing.label = buildSourceLabel2(source);
|
|
4469
|
+
existing.labels = source.labels ?? buildRAGSourceLabels({
|
|
4470
|
+
metadata: source.metadata,
|
|
4471
|
+
source: source.source,
|
|
4472
|
+
title: source.title
|
|
4473
|
+
});
|
|
4474
|
+
existing.structure = source.structure ?? buildRAGChunkStructure(source.metadata);
|
|
4475
|
+
existing.source = source.source;
|
|
4476
|
+
existing.title = source.title;
|
|
4477
|
+
} else {
|
|
4478
|
+
existing.bestScore = Math.max(existing.bestScore, source.score);
|
|
4479
|
+
}
|
|
4246
4480
|
existing.count += 1;
|
|
4247
4481
|
existing.chunks.push(source);
|
|
4248
4482
|
};
|
|
@@ -7787,11 +8021,71 @@ var decodeHtmlEntities = (value) => {
|
|
|
7787
8021
|
output = output.replace(/&#(\d+);/g, (_, code) => String.fromCodePoint(Number(code)));
|
|
7788
8022
|
return output.replace(/&#x([0-9a-f]+);/gi, (_, code) => String.fromCodePoint(parseInt(code, 16)));
|
|
7789
8023
|
};
|
|
7790
|
-
var
|
|
7791
|
-
const
|
|
8024
|
+
/**
 * Produces a short, readable form of a link target for inline display.
 *
 * The href is trimmed and entity-decoded first. Fragment links ("#...") and
 * targets without a leading "scheme:" are returned as decoded. Targets with a
 * scheme are parsed with `URL` and reduced to hostname plus path (the path is
 * omitted when it is just "/"); if parsing fails the decoded text is returned
 * unchanged.
 *
 * @param href Raw `href` attribute value.
 * @returns The display text, or `undefined` when the decoded href is empty.
 */
var formatHtmlLinkContext = (href) => {
  const target = decodeHtmlEntities(href.trim());
  if (!target) {
    return;
  }
  const hasScheme = /^[a-z]+:/i.test(target);
  if (target.startsWith("#") || !hasScheme) {
    return target;
  }
  try {
    const { hostname, pathname } = new URL(target);
    return pathname === "/" ? `${hostname}` : `${hostname}${pathname}`;
  } catch {
    // Not a parseable URL: fall back to the decoded text as-is.
    return target;
  }
};
|
|
8043
|
+
/**
 * Converts an HTML fragment to text by removing markup.
 *
 * Steps, in order: drop `<script>` and `<style>` elements; rewrite anchors
 * with an `href` to "label (context)" (see below); turn `<br>` and the
 * closing tags of common block elements into newlines; turn `<li>` openers
 * into "- "; replace every remaining tag with a space; finally decode HTML
 * entities.
 *
 * Anchor rewriting: the label is the anchor's own content, stripped
 * recursively and whitespace-normalized; the context comes from
 * formatHtmlLinkContext. An anchor without a label becomes its context (or a
 * space); an anchor whose context is missing or equal to the label becomes
 * just the label.
 *
 * @param value HTML text.
 * @returns Text with tags removed and entities decoded (whitespace is not
 *          normalized here).
 */
var stripHtmlTags = (value) => {
  const renderAnchor = (_match, _quote, href, inner) => {
    const label = normalizeWhitespace(stripHtmlTags(inner));
    const context = formatHtmlLinkContext(href);
    if (!label) {
      return context ?? " ";
    }
    const contextAddsNothing = !context || context === label;
    return contextAddsNothing ? label : `${label} (${context})`;
  };

  let text = value.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, " ");
  text = text.replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, " ");
  text = text.replace(/<a\b[^>]*href=(['"])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi, renderAnchor);
  text = text.replace(/<br\s*\/?>/gi, "\n");
  text = text.replace(/<\/(p|div|section|article|li|ul|ol|h[1-6]|table|tr)>/gi, "\n");
  text = text.replace(/<li\b[^>]*>/gi, "- ");
  text = text.replace(/<[^>]+>/g, " ");
  return decodeHtmlEntities(text);
};
|
|
8059
|
+
/**
 * Returns the inner HTML of the element opened just before `contentStart`,
 * honouring nested elements of the same tag name.
 *
 * @param html         Full HTML text being scanned.
 * @param tagName      Name of the element whose end is being searched for.
 * @param contentStart Index of the first character after the opening tag.
 * @returns The inner HTML up to the matching closing tag, or `undefined`
 *          when the tags are not balanced.
 */
var findBalancedInnerHtml = (html, tagName, contentStart) => {
  const escapedTag = tagName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const tagPattern = new RegExp(`<(/?)${escapedTag}\\b[^>]*>`, "gi");
  tagPattern.lastIndex = contentStart;
  let depth = 1;
  for (let tag = tagPattern.exec(html); tag !== null; tag = tagPattern.exec(html)) {
    if (tag[1] === "/") {
      depth -= 1;
      if (depth === 0) {
        return html.slice(contentStart, tag.index);
      }
    } else if (!tag[0].endsWith("/>")) {
      // Self-closing tags do not open a new nesting level.
      depth += 1;
    }
  }
  return;
};
/**
 * Narrows an HTML document down to its main content.
 *
 * Input that contains none of `<html>`, `<body>`, `<main>` or `<article>` is
 * treated as a fragment and returned untouched. Otherwise `<script>`,
 * `<style>`, `<nav>`, `<footer>`, `<header>`, `<aside>` and `<form>` elements
 * are blanked out and the first non-empty candidate below is returned:
 *
 * 1. the content of the first `<main>` element;
 * 2. the trimmed contents of all `<article>` elements, joined by newlines;
 * 3. the content of the first element carrying `role="main"`;
 * 4. the content of `<body>`;
 * 5. the whole stripped document.
 *
 * For step 3 the end of the element is found by counting nested tags of the
 * same name, so `<div role="main">` containing further `<div>`s is returned
 * completely instead of being cut at the first `</div>`. If the tags are
 * unbalanced the previous first-closing-tag match is used.
 *
 * @param value HTML text.
 * @returns The HTML of the main content region.
 */
var extractMainHtmlContent = (value) => {
  const trimmed = value.trim();
  if (!/<html\b|<body\b|<main\b|<article\b/i.test(trimmed)) {
    return value;
  }
  const boilerplateStripped = trimmed
    .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, " ")
    .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, " ")
    .replace(/<(nav|footer|header|aside|form)\b[^>]*>[\s\S]*?<\/\1>/gi, " ");
  const mainMatch = boilerplateStripped.match(/<main\b[^>]*>([\s\S]*?)<\/main>/i);
  if (mainMatch?.[1]) {
    return mainMatch[1];
  }
  const articleMatches = [
    ...boilerplateStripped.matchAll(/<article\b[^>]*>([\s\S]*?)<\/article>/gi)
  ].map((match) => match[1]?.trim()).filter(Boolean);
  if (articleMatches.length > 0) {
    return articleMatches.join("\n");
  }
  const roleMainOpen = /<([a-z0-9:_-]+)\b[^>]*\brole=(['"])main\2[^>]*>/i.exec(boilerplateStripped);
  if (roleMainOpen) {
    const contentStart = roleMainOpen.index + roleMainOpen[0].length;
    const balancedContent = findBalancedInnerHtml(boilerplateStripped, roleMainOpen[1], contentStart);
    if (balancedContent) {
      return balancedContent;
    }
  }
  // Unbalanced markup: keep the earlier best-effort match.
  const roleMainMatch = boilerplateStripped.match(/<([a-z0-9:_-]+)\b[^>]*\brole=(['"])main\2[^>]*>([\s\S]*?)<\/\1>/i);
  if (roleMainMatch?.[3]) {
    return roleMainMatch[3];
  }
  const bodyMatch = boilerplateStripped.match(/<body\b[^>]*>([\s\S]*?)<\/body>/i);
  if (bodyMatch?.[1]) {
    return bodyMatch[1];
  }
  return boilerplateStripped;
};
|
|
8086
|
+
/**
 * Converts an HTML document or fragment to normalized plain text.
 *
 * Narrows the input to its main content, removes the markup, then normalizes
 * whitespace.
 *
 * @param value HTML text.
 * @returns Plain text of the main content.
 */
var stripHtml = (value) => normalizeWhitespace(stripHtmlTags(extractMainHtmlContent(value)));
|
|
7796
8090
|
var stripMarkdown = (value) => {
|
|
7797
8091
|
const withoutCodeBlocks = value.replace(/```[\s\S]*?```/g, (block) => {
|
|
@@ -7811,31 +8105,93 @@ var markdownStructureUnits = (value) => {
|
|
|
7811
8105
|
`);
|
|
7812
8106
|
const sections = [];
|
|
7813
8107
|
let current = [];
|
|
8108
|
+
let currentPath = [];
|
|
8109
|
+
const headingStack = [];
|
|
7814
8110
|
const flushCurrentSection = () => {
|
|
7815
8111
|
if (current.length === 0) {
|
|
7816
8112
|
return;
|
|
7817
8113
|
}
|
|
7818
|
-
sections.push(
|
|
7819
|
-
|
|
8114
|
+
sections.push({
|
|
8115
|
+
lines: current,
|
|
8116
|
+
sectionPath: [...currentPath]
|
|
8117
|
+
});
|
|
7820
8118
|
current = [];
|
|
7821
8119
|
};
|
|
7822
8120
|
for (const line of lines) {
|
|
7823
|
-
const
|
|
7824
|
-
if (
|
|
7825
|
-
|
|
8121
|
+
const headingMatch = line.match(/^\s*(#{1,6})\s+(.+)$/);
|
|
8122
|
+
if (headingMatch) {
|
|
8123
|
+
if (current.length > 0) {
|
|
8124
|
+
flushCurrentSection();
|
|
8125
|
+
}
|
|
8126
|
+
const depth = headingMatch[1]?.length ?? 1;
|
|
8127
|
+
const headingText = normalizeWhitespace(headingMatch[2] ?? "");
|
|
8128
|
+
if (headingText) {
|
|
8129
|
+
headingStack[depth - 1] = headingText;
|
|
8130
|
+
headingStack.length = depth;
|
|
8131
|
+
currentPath = [...headingStack];
|
|
8132
|
+
}
|
|
8133
|
+
}
|
|
7826
8134
|
current.push(line);
|
|
7827
8135
|
}
|
|
7828
8136
|
flushCurrentSection();
|
|
7829
|
-
return sections.map((
|
|
8137
|
+
return sections.map(({ lines: sectionLines, sectionPath }) => ({
|
|
8138
|
+
sectionDepth: sectionPath.length > 0 ? sectionPath.length : undefined,
|
|
8139
|
+
sectionKind: sectionPath.length > 0 ? "markdown_heading" : undefined,
|
|
8140
|
+
sectionPath: sectionPath.length > 0 ? sectionPath : undefined,
|
|
8141
|
+
sectionTitle: sectionPath.at(-1),
|
|
8142
|
+
text: normalizeWhitespace(stripMarkdown(sectionLines.join(`
|
|
8143
|
+
`)))
|
|
8144
|
+
})).filter((section) => Boolean(section.text));
|
|
8145
|
+
};
|
|
8146
|
+
/**
 * Combines a heading trail and a section body into one text block.
 *
 * Every heading is whitespace-normalized; empty headings and an empty body
 * are left out. The remaining lines are joined with newlines and the result
 * is whitespace-normalized once more.
 *
 * @param headings Heading texts, outermost first.
 * @param content  Section body text.
 * @returns The combined, normalized text.
 */
var joinHtmlHeadingSection = (headings, content) => {
  const lines = [];
  for (const heading of headings) {
    const normalizedHeading = normalizeWhitespace(heading);
    if (normalizedHeading) {
      lines.push(normalizedHeading);
    }
  }
  if (content) {
    lines.push(content);
  }
  return normalizeWhitespace(lines.join("\n"));
};
|
|
7831
8152
|
/**
 * Splits HTML into heading-delimited sections.
 *
 * The input is first narrowed to its main content. Every `<h1>`-`<h6>`
 * element closes the section collected so far and updates the active heading
 * trail; the text between headings becomes the section body. Each emitted
 * unit carries the heading trail (`sectionPath`), its last entry
 * (`sectionTitle`), the trail length (`sectionDepth`), the kind
 * "html_heading" (all four are `undefined` for content before the first
 * heading) and `text`, which is the trail followed by the body.
 *
 * Sections with no body text are not emitted. When nothing is emitted at all
 * (for example headings only), the whole stripped content is returned as a
 * single unit, or an empty list if that is empty too.
 *
 * @param value HTML text.
 * @returns Array of `{ sectionDepth, sectionKind, sectionPath, sectionTitle,
 *          text }` units, or of `{ text }` for the fallback.
 */
var htmlStructureUnits = (value) => {
  const focused = extractMainHtmlContent(value);
  const headingPattern = /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi;
  const sections = [];
  const headingStack = [];
  let currentContentStart = 0;
  let activeHeadings = [];
  const flushSection = (end) => {
    const content = normalizeWhitespace(stripHtmlTags(focused.slice(currentContentStart, end)));
    if (!content) {
      return;
    }
    const section = joinHtmlHeadingSection(activeHeadings, content);
    if (section) {
      const hasHeadings = activeHeadings.length > 0;
      sections.push({
        sectionDepth: hasHeadings ? activeHeadings.length : undefined,
        sectionKind: hasHeadings ? "html_heading" : undefined,
        sectionPath: hasHeadings ? [...activeHeadings] : undefined,
        sectionTitle: activeHeadings.at(-1),
        text: section
      });
    }
  };
  for (const match of focused.matchAll(headingPattern)) {
    const start = match.index ?? currentContentStart;
    flushSection(start);
    const level = Number.parseInt(match[1] ?? "1", 10);
    const headingText = normalizeWhitespace(stripHtmlTags(match[2] ?? ""));
    if (headingText) {
      headingStack[level - 1] = headingText;
      headingStack.length = level;
      // A skipped level (e.g. <h1> followed directly by <h3>) leaves a hole in
      // the stack. Keep only real headings so the trail never contains
      // `undefined`, which would otherwise reach normalizeWhitespace and end
      // up in `sectionPath`.
      activeHeadings = headingStack.filter((entry) => typeof entry === "string");
    }
    currentContentStart = start + match[0].length;
  }
  flushSection(focused.length);
  if (sections.length > 0) {
    return sections;
  }
  return [{ text: normalizeWhitespace(stripHtmlTags(focused)) }].filter((section) => Boolean(section.text));
};
|
|
7840
8196
|
var inferFormat = (document) => {
|
|
7841
8197
|
if (document.format) {
|
|
@@ -7927,10 +8283,77 @@ var isLikelyTextData = (data) => {
|
|
|
7927
8283
|
};
|
|
7928
8284
|
/**
 * Decodes the escape sequences found inside a PDF literal string (the text
 * between the parentheses).
 *
 * All escapes are resolved in a single left-to-right pass. Resolving them with
 * successive `.replace` calls is wrong because the output of one pass is
 * re-read by the next: an escaped backslash followed by the letter `n` would
 * first collapse to backslash + `n` and then be turned into a newline.
 *
 * @param {string} value - Raw literal body, escapes still present.
 * @returns {string} The decoded text. Unrecognised escapes are left untouched.
 */
var decodePdfLiteral = (value) => value.replace(/\\([0-7]{1,3}|[\s\S])/g, (sequence, escaped) => {
  switch (escaped) {
    case "n":
      return "\n";
    case "r":
      return "\r";
    case "t":
      return "\t";
    case "b":
      return "\b";
    case "f":
      return "\f";
    case "\\":
    case "(":
    case ")":
      return escaped;
    default:
      // Octal character code: keep only the low byte, a literal string holds bytes.
      return /^[0-7]/.test(escaped) ? String.fromCharCode(Number.parseInt(escaped, 8) & 255) : sequence;
  }
});
|
|
8286
|
+
// Smallest numeric adjustment inside a TJ array that extractPdfArrayText treats
// as a column gap (rendered as " | ") rather than ordinary spacing.
var PDF_TABLE_GAP_THRESHOLD = 120;
|
|
8287
|
+
/**
 * Turns the body of a TJ array (string literals interleaved with numeric
 * adjustments) into text, inserting " | " where an adjustment of at least
 * PDF_TABLE_GAP_THRESHOLD precedes a string that does not start with whitespace.
 *
 * NOTE(review): only adjustments >= the threshold count as gaps; confirm the
 * intended sign convention against the PDF producer(s) this targets.
 *
 * @param {string} value - Text between the square brackets of a TJ operand.
 * @returns {string} Whitespace-normalised text with " | " column separators.
 */
var extractPdfArrayText = (value) => {
  const COLUMN_SEPARATOR = " | ";
  const pieces = [];
  let sawWideGap = false;
  for (const [, literal, adjustment] of value.matchAll(/\(((?:\\.|[^\\)])*)\)|([-+]?\d*\.?\d+)/g)) {
    if (literal === undefined) {
      // Numeric token: remember a wide gap until the next string arrives.
      const amount = Number(adjustment);
      if (Number.isFinite(amount) && amount >= PDF_TABLE_GAP_THRESHOLD) {
        sawWideGap = true;
      }
      continue;
    }
    const text = decodePdfLiteral(literal);
    const needsSeparator = sawWideGap && Boolean(text) && !/^\s/.test(text) && pieces.at(-1) !== COLUMN_SEPARATOR;
    if (needsSeparator) {
      pieces.push(COLUMN_SEPARATOR);
    }
    pieces.push(text);
    sawWideGap = false;
  }
  const joined = normalizeWhitespace(pieces.join(""));
  return joined.replace(/\s+\|\s+/g, COLUMN_SEPARATOR).trim();
};
|
|
8308
|
+
/**
 * Appends a text fragment to the accumulator, ignoring empty/missing values.
 *
 * @param {string[]} parts - Accumulator, mutated in place.
 * @param {string} [value] - Fragment to append.
 */
var appendPdfText = (parts, value) => {
  if (value) {
    parts.push(value);
  }
};
|
|
8314
|
+
/**
 * Appends a single newline to the accumulator unless it is empty (or its last
 * fragment is empty) or the last fragment already ends with a newline.
 *
 * @param {string[]} parts - Accumulator, mutated in place.
 */
var appendPdfLineBreak = (parts) => {
  const previous = parts.at(-1);
  const needsBreak = Boolean(previous) && !previous.endsWith("\n");
  if (needsBreak) {
    parts.push("\n");
  }
};
|
|
8323
|
+
// Matches the text operators read by extractTextFromPDFTextObject. Capture groups:
//   1/2   `[ ... ] TJ`            (2 = array body)
//   3/4   `( ... ) Tj`            (4 = literal body)
//   5/6   `num num ( ... ) "`     (6 = literal body)
//   7/8   `( ... ) '`             (8 = literal body)
//   9     `num num Td` / `num num TD`
//   10    `T*`
//   11    `num num num num num num Tm`
var PDF_TEXT_OPERATOR_PATTERN = /(\[((?:\\.|[^\]])*)\]\s*TJ)|(\(((?:\\.|[^\\)])*)\)\s*Tj)|([-+]?\d*\.?\d+\s+[-+]?\d*\.?\d+\s+\(((?:\\.|[^\\)])*)\)\s*")|(\(((?:\\.|[^\\)])*)\)\s*')|((?:[-+]?\d*\.?\d+\s+){2}(?:Td|TD))|(T\*)|((?:[-+]?\d*\.?\d+\s+){6}Tm)/g;
|
|
8324
|
+
/**
 * Walks the text operators matched by PDF_TEXT_OPERATOR_PATTERN inside one
 * text object and concatenates what they show.
 *
 * - TJ arrays go through extractPdfArrayText, Tj literals through decodePdfLiteral.
 * - The `'` and `"` operators emit a line break before their text.
 * - Td/TD, T* and Tm only emit a line break.
 *
 * @param {string} value - Content of a single text object.
 * @returns {string} Extracted text, fragments joined without a separator.
 */
var extractTextFromPDFTextObject = (value) => {
  const pieces = [];
  for (const operator of value.matchAll(PDF_TEXT_OPERATOR_PATTERN)) {
    const arrayBody = operator[2];
    const shownLiteral = operator[4];
    // Groups 6 (`"`) and 8 (`'`) are handled identically.
    const nextLineLiteral = operator[6] ?? operator[8];
    if (arrayBody !== undefined) {
      appendPdfText(pieces, extractPdfArrayText(arrayBody));
    } else if (shownLiteral !== undefined) {
      appendPdfText(pieces, decodePdfLiteral(shownLiteral));
    } else if (nextLineLiteral !== undefined) {
      appendPdfLineBreak(pieces);
      appendPdfText(pieces, decodePdfLiteral(nextLineLiteral));
    } else if ([9, 10, 11].some((group) => operator[group] !== undefined)) {
      appendPdfLineBreak(pieces);
    }
  }
  return pieces.join("");
};
|
|
7930
8351
|
var extractTextFromPDFBytes = (data) => {
|
|
7931
8352
|
const raw = Buffer.from(data).toString("latin1");
|
|
7932
|
-
const
|
|
7933
|
-
const combined =
|
|
8353
|
+
const textObjects = [...raw.matchAll(/BT([\s\S]*?)ET/g)].map((match) => extractTextFromPDFTextObject(match[1] ?? "")).filter(Boolean);
|
|
8354
|
+
const combined = textObjects.length > 0 ? textObjects.join(`
|
|
8355
|
+
|
|
8356
|
+
`) : [...raw.matchAll(/\(((?:\\.|[^\\)])*)\)\s*Tj/g)].map((match) => decodePdfLiteral(match[1] ?? "")).join(`
|
|
7934
8357
|
`);
|
|
7935
8358
|
return normalizeWhitespace(combined);
|
|
7936
8359
|
};
|
|
@@ -8022,7 +8445,40 @@ var decodeGzipEntries = (data, input) => {
|
|
|
8022
8445
|
];
|
|
8023
8446
|
};
|
|
8024
8447
|
/**
 * Reduces an XML fragment to plain text: every tag becomes a space, whitespace
 * runs collapse to one space, then entities are decoded and the result is
 * passed through normalizeWhitespace.
 *
 * @param {string} value - XML markup.
 * @returns {string} Plain text.
 */
var extractXmlText = (value) => {
  const withoutTags = value.replace(/<[^>]+>/g, " ");
  const collapsed = withoutTags.replace(/\s+/g, " ");
  return normalizeWhitespace(decodeHtmlEntities(collapsed));
};
|
|
8448
|
+
/**
 * Converts the XML of one WordprocessingML paragraph to text. Self-closing
 * `<w:tab/>` elements become a tab, self-closing `<w:br/>` elements become a
 * newline, and every remaining tag becomes a space before entity decoding and
 * whitespace normalisation.
 *
 * @param {string} value - Paragraph XML.
 * @returns {string} Paragraph text.
 */
var extractOfficeParagraphText = (value) => {
  const withTabs = value.replace(/<w:tab\b[^>]*\/>/gi, "\t");
  const withBreaks = withTabs.replace(/<w:br\b[^>]*\/>/gi, "\n");
  const withoutTags = withBreaks.replace(/<[^>]+>/g, " ");
  return normalizeWhitespace(decodeHtmlEntities(withoutTags));
};
|
|
8450
|
+
/**
 * Extracts the non-empty paragraph texts of a DOCX archive, in document order.
 *
 * The paragraph style used to be inspected (title / heading1-6), but every
 * branch returned the same text, so that dead code has been removed.
 *
 * @param {{ path: string, data: unknown }[]} entries - Unpacked archive entries.
 * @returns {string[]} One string per non-empty `<w:p>` element of
 *   `word/document.xml`; an empty array when that entry is missing.
 */
var officeDocumentParagraphs = (entries) => {
  const documentEntry = entries.find((entry) => entry.path === "word/document.xml");
  if (!documentEntry) {
    return [];
  }
  const xml = decodeUtf8(documentEntry.data);
  return [...xml.matchAll(/<w:p\b[\s\S]*?<\/w:p>/g)]
    .map((match) => extractOfficeParagraphText(match[0] ?? ""))
    .filter(Boolean);
};
|
|
8025
8475
|
var officeDocumentText = (entries) => {
|
|
8476
|
+
const paragraphs = officeDocumentParagraphs(entries);
|
|
8477
|
+
if (paragraphs.length > 0) {
|
|
8478
|
+
return normalizeWhitespace(paragraphs.join(`
|
|
8479
|
+
|
|
8480
|
+
`));
|
|
8481
|
+
}
|
|
8026
8482
|
const documentEntry = entries.find((entry) => entry.path === "word/document.xml");
|
|
8027
8483
|
if (!documentEntry) {
|
|
8028
8484
|
return "";
|
|
@@ -8037,31 +8493,68 @@ var officeDocumentSectionCount = (entries) => {
|
|
|
8037
8493
|
const count = [...decodeUtf8(documentEntry.data).matchAll(/<w:p\b/g)].length;
|
|
8038
8494
|
return count > 0 ? count : undefined;
|
|
8039
8495
|
};
|
|
8040
|
-
var
|
|
8041
|
-
|
|
8042
|
-
|
|
8043
|
-
|
|
8044
|
-
const
|
|
8045
|
-
|
|
8046
|
-
|
|
8496
|
+
/**
 * Builds the shared-string table of a workbook from `xl/sharedStrings.xml`.
 *
 * Cells reference shared strings by the index of their `<si>` item, and one
 * item may hold several `<t>` runs (rich text). The table is therefore built
 * per `<si>`, concatenating its runs; collecting every `<t>` on its own would
 * shift all indexes after the first rich-text item. Phonetic runs (`<rPh>`)
 * are dropped because they are not part of the displayed value.
 *
 * @param {{ path: string, data: unknown }[]} entries - Unpacked archive entries.
 * @returns {string[]} Shared strings, indexable by the value of `t="s"` cells.
 */
var spreadsheetSharedStrings = (entries) => entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => {
  const xml = decodeUtf8(entry.data);
  const items = [...xml.matchAll(/<si\b[^>]*?(?:\/>|>([\s\S]*?)<\/si>)/g)];
  if (items.length === 0) {
    // No <si> wrappers at all: fall back to the flat list of <t> elements.
    return [...xml.matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)].map((match) => decodeHtmlEntities(match[1] ?? ""));
  }
  return items.map((item) => {
    const visible = (item[1] ?? "").replace(/<rPh\b[\s\S]*?<\/rPh>/g, "");
    return [...visible.matchAll(/<t\b[^>]*?(?:\/>|>([\s\S]*?)<\/t>)/g)]
      .map((run) => decodeHtmlEntities(run[1] ?? ""))
      .join("");
  });
});
|
|
8499
|
+
/**
 * Returns the upper-cased first run of letters in a cell reference
 * (for example "b12" gives "B"), or "" when there is none.
 *
 * @param {string} [reference] - Cell reference such as "A1".
 * @returns {string} Column letters, or an empty string.
 */
var spreadsheetColumnLabel = (reference) => {
  const letters = reference?.match(/([A-Z]+)/i)?.[1];
  if (letters === undefined) {
    return "";
  }
  return letters.toUpperCase();
};
|
|
8503
|
+
/**
 * Resolves the display text of one worksheet cell.
 *
 * Order of precedence: inline string (`<is>…<t>`), then `<v>`. For cells typed
 * `t="s"` the `<v>` content is an index into `sharedStrings`.
 *
 * An index that points at an existing but empty shared string now yields ""
 * instead of leaking the numeric index as if it were the cell text. An index
 * with no entry still falls back to the raw value.
 *
 * @param {string} cellXml - Markup of a single `<c>` element.
 * @param {string[]} sharedStrings - Shared-string table of the workbook.
 * @returns {string} Cell text, "" when the cell has no value.
 */
var spreadsheetResolveCellValue = (cellXml, sharedStrings) => {
  const inlineMatch = cellXml.match(/<is\b[^>]*>[\s\S]*?<t[^>]*>([\s\S]*?)<\/t>[\s\S]*?<\/is>/i);
  if (inlineMatch?.[1]) {
    return normalizeWhitespace(decodeHtmlEntities(inlineMatch[1]));
  }
  const valueMatch = cellXml.match(/<v>([\s\S]*?)<\/v>/i);
  if (!valueMatch?.[1]) {
    return "";
  }
  const rawValue = decodeHtmlEntities(valueMatch[1]);
  const typeMatch = cellXml.match(/\bt="([^"]+)"/i);
  if (typeMatch?.[1] === "s") {
    const index = Number(rawValue);
    const shared = Number.isInteger(index) ? sharedStrings[index] : undefined;
    // Test for presence, not truthiness: "" is a legitimate shared string.
    return typeof shared === "string" ? shared : rawValue;
  }
  return normalizeWhitespace(rawValue);
};
|
|
8520
|
+
/**
 * Parses worksheet XML into rows of non-empty cells.
 *
 * Self-closing elements are matched explicitly. Without that, an empty styled
 * cell such as `<c r="A1" s="2"/>` is read as an opening tag and swallows the
 * following cell, so that cell's value is reported under the wrong reference
 * and column. The same applies to `<row .../>`.
 *
 * @param {string} worksheetXml - Content of one `xl/worksheets/*.xml` entry.
 * @param {string[]} sharedStrings - Shared-string table of the workbook.
 * @returns {{ column: string, reference: (string|undefined), value: string }[][]}
 *   Rows in document order; empty cells and empty rows are omitted.
 */
var spreadsheetWorksheetRows = (worksheetXml, sharedStrings) => [...worksheetXml.matchAll(/<row\b[^>]*?(?:\/>|>([\s\S]*?)<\/row>)/gi)].map((rowMatch) => {
  const rowXml = rowMatch[1] ?? "";
  return [...rowXml.matchAll(/<c\b([^>]*?)(?:\/>|>([\s\S]*?)<\/c>)/gi)].map((cellMatch) => {
    const attributes = cellMatch[1] ?? "";
    const cellBody = cellMatch[2] ?? "";
    const reference = attributes.match(/\br="([^"]+)"/i)?.[1];
    // Rebuild a normalised <c> element so the resolver sees attributes and body together.
    const value = spreadsheetResolveCellValue(`<c${attributes}>${cellBody}</c>`, sharedStrings);
    return {
      column: spreadsheetColumnLabel(reference),
      reference,
      value
    };
  }).filter((cell) => cell.value);
}).filter((row) => row.length > 0);
|
|
8536
|
+
var spreadsheetRowText = (row, headers) => {
|
|
8537
|
+
const entries = row.map((cell, index) => {
|
|
8538
|
+
const header = headers[index];
|
|
8539
|
+
if (header) {
|
|
8540
|
+
return `${header}: ${cell.value}`;
|
|
8541
|
+
}
|
|
8542
|
+
return cell.column ? `${cell.column}: ${cell.value}` : cell.value;
|
|
8047
8543
|
});
|
|
8048
|
-
return normalizeWhitespace(
|
|
8049
|
-
`));
|
|
8544
|
+
return normalizeWhitespace(entries.join(" | "));
|
|
8050
8545
|
};
|
|
8051
8546
|
var spreadsheetSheetTexts = (entries) => {
|
|
8052
|
-
const sharedStrings = entries
|
|
8053
|
-
...decodeUtf8(entry.data).matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)
|
|
8054
|
-
].map((match) => decodeHtmlEntities(match[1] ?? "")));
|
|
8547
|
+
const sharedStrings = spreadsheetSharedStrings(entries);
|
|
8055
8548
|
const sheetNames = spreadsheetSheetNames(entries);
|
|
8056
8549
|
const sheetEntries = entries.filter((entry) => entry.path.startsWith("xl/worksheets/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path));
|
|
8057
8550
|
return sheetEntries.map((entry, index) => {
|
|
8058
|
-
const
|
|
8059
|
-
|
|
8060
|
-
|
|
8061
|
-
|
|
8062
|
-
|
|
8063
|
-
});
|
|
8064
|
-
const text = normalizeWhitespace(
|
|
8551
|
+
const rows = spreadsheetWorksheetRows(decodeUtf8(entry.data), sharedStrings);
|
|
8552
|
+
if (rows.length === 0) {
|
|
8553
|
+
return null;
|
|
8554
|
+
}
|
|
8555
|
+
const headers = rows[0].map((cell) => cell.value);
|
|
8556
|
+
const rowTexts = rows.map((row, rowIndex) => normalizeWhitespace(`Row ${rowIndex + 1}. ${spreadsheetRowText(row, rowIndex === 0 ? [] : headers)}`));
|
|
8557
|
+
const text = normalizeWhitespace(rowTexts.join(`
|
|
8065
8558
|
`));
|
|
8066
8559
|
if (!text) {
|
|
8067
8560
|
return null;
|
|
@@ -8072,19 +8565,38 @@ var spreadsheetSheetTexts = (entries) => {
|
|
|
8072
8565
|
};
|
|
8073
8566
|
}).filter((entry) => Boolean(entry));
|
|
8074
8567
|
};
|
|
8568
|
+
/**
 * Renders a whole workbook as text: one "Sheet <name>" heading followed by the
 * sheet text, sheets separated by a blank line.
 *
 * @param {{ path: string, data: unknown }[]} entries - Unpacked archive entries.
 * @returns {string} Normalised workbook text.
 */
var spreadsheetText = (entries) => {
  const blocks = [];
  for (const sheet of spreadsheetSheetTexts(entries)) {
    blocks.push(`Sheet ${sheet.name}\n${sheet.text}`);
  }
  return normalizeWhitespace(blocks.join("\n\n"));
};
|
|
8075
8572
|
/**
 * Lists the `name` attribute of every `<sheet>` element found in
 * `xl/workbook.xml`, in document order.
 *
 * @param {{ path: string, data: unknown }[]} entries - Unpacked archive entries.
 * @returns {string[]} Sheet names as written in the XML (not entity-decoded).
 */
var spreadsheetSheetNames = (entries) => {
  const names = [];
  for (const entry of entries) {
    if (entry.path !== "xl/workbook.xml") {
      continue;
    }
    for (const [, name] of decodeUtf8(entry.data).matchAll(/<sheet[^>]*name="([^"]+)"/g)) {
      if (name) {
        names.push(name);
      }
    }
  }
  return names;
};
|
|
8078
|
-
var
|
|
8079
|
-
const
|
|
8080
|
-
|
|
8081
|
-
|
|
8575
|
+
/**
 * Maps zero-based indexes to speaker-notes text. The index is the number in
 * the `notesSlide<N>.xml` file name minus one; entries without a usable number
 * or without text are left out.
 *
 * @param {{ path: string, data: unknown }[]} entries - Unpacked archive entries.
 * @returns {Map<number, string>} Notes text keyed by zero-based index.
 */
var presentationNotesByIndex = (entries) => {
  const noteEntries = entries
    .filter((entry) => entry.path.startsWith("ppt/notesSlides/") && entry.path.endsWith(".xml"))
    .sort((left, right) => left.path.localeCompare(right.path));
  const notes = new Map();
  for (const entry of noteEntries) {
    const fileNumber = entry.path.match(/notesSlide(\d+)\.xml$/i)?.[1] ?? "0";
    const index = Number(fileNumber) - 1;
    const text = normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)));
    if (index >= 0 && Boolean(text)) {
      notes.set(index, text);
    }
  }
  return notes;
};
|
|
8583
|
+
/**
 * Extracts the text of every slide in a PPTX archive, appending speaker notes
 * where presentationNotesByIndex has an entry for the slide's position.
 *
 * Slide files are ordered with a numeric-aware comparison. A plain
 * lexicographic sort places `slide10.xml` before `slide2.xml`, which scrambles
 * slide order for decks of ten or more slides and attaches notes to the wrong
 * slide.
 *
 * @param {{ path: string, data: unknown }[]} entries - Unpacked archive entries.
 * @returns {{ index: number, notesText: (string|undefined), text: string }[]}
 *   Slides with non-empty text; `index` is the zero-based position in slide order.
 */
var presentationSlides = (entries) => {
  const notesByIndex = presentationNotesByIndex(entries);
  return entries
    .filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml"))
    .sort((left, right) => left.path.localeCompare(right.path, undefined, { numeric: true }))
    .map((entry, index) => {
      const slideText = normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)));
      const notesText = notesByIndex.get(index);
      const text = normalizeWhitespace([slideText, notesText ? `Speaker notes: ${notesText}` : ""].filter(Boolean).join("\n"));
      return {
        index,
        notesText,
        text
      };
    })
    .filter((slide) => Boolean(slide.text));
};
|
|
8084
|
-
var
|
|
8085
|
-
|
|
8086
|
-
|
|
8087
|
-
})).filter((slide) => Boolean(slide.text));
|
|
8597
|
+
/**
 * Renders a whole presentation as text, slides separated by a blank line.
 *
 * @param {{ path: string, data: unknown }[]} entries - Unpacked archive entries.
 * @returns {string} Normalised presentation text.
 */
var presentationText = (entries) => {
  const slideTexts = [];
  for (const { text } of presentationSlides(entries)) {
    slideTexts.push(text);
  }
  return normalizeWhitespace(slideTexts.join("\n\n"));
};
|
|
8088
8600
|
/**
 * Counts the archive entries located under `ppt/slides/` whose path ends in `.xml`.
 *
 * @param {{ path: string }[]} entries - Unpacked archive entries.
 * @returns {number} Number of slide files.
 */
var presentationSlideCount = (entries) => {
  let count = 0;
  for (const { path } of entries) {
    if (path.startsWith("ppt/slides/") && path.endsWith(".xml")) {
      count += 1;
    }
  }
  return count;
};
|
|
8089
8601
|
var epubText = (entries) => {
|
|
8090
8602
|
const htmlEntries = entries.filter((entry) => /\.(xhtml|html|htm)$/i.test(entry.path));
|
|
@@ -8092,17 +8604,113 @@ var epubText = (entries) => {
|
|
|
8092
8604
|
|
|
8093
8605
|
`));
|
|
8094
8606
|
};
|
|
8095
|
-
var
|
|
8607
|
+
var splitEmailMessage = (raw) => {
|
|
8096
8608
|
const normalized = raw.replace(/\r\n?/g, `
|
|
8097
8609
|
`);
|
|
8098
|
-
const
|
|
8099
|
-
|
|
8100
|
-
`);
|
|
8101
|
-
const body = bodyParts.join(`
|
|
8610
|
+
const separator = normalized.indexOf(`
|
|
8102
8611
|
|
|
8103
8612
|
`);
|
|
8613
|
+
if (separator < 0) {
|
|
8614
|
+
return {
|
|
8615
|
+
body: "",
|
|
8616
|
+
headerBlock: normalized
|
|
8617
|
+
};
|
|
8618
|
+
}
|
|
8619
|
+
return {
|
|
8620
|
+
body: normalized.slice(separator + 2),
|
|
8621
|
+
headerBlock: normalized.slice(0, separator)
|
|
8622
|
+
};
|
|
8623
|
+
};
|
|
8624
|
+
/**
 * Parses a block of header lines into a map keyed by lower-cased header name.
 *
 * Continuation lines (starting with a space or tab) are folded into the
 * previous line first. Lines without a colon are ignored. When a header name
 * repeats, the last occurrence wins.
 *
 * @param {string} headerBlock - Header lines separated by "\n".
 * @returns {Map<string, string>} Trimmed header values by lower-cased name.
 */
var parseHeaderBlock = (headerBlock) => {
  const headers = new Map();
  const lines = headerBlock.replace(/\n[ \t]+/g, " ").split("\n");
  lines.forEach((line) => {
    const colon = line.indexOf(":");
    if (colon >= 0) {
      const name = line.slice(0, colon).trim().toLowerCase();
      const fieldValue = line.slice(colon + 1).trim();
      headers.set(name, fieldValue);
    }
  });
  return headers;
};
|
|
8637
|
+
/**
 * Decodes quoted-printable text.
 *
 * Soft line breaks (`=` at end of line) are removed, then each run of `=XX`
 * escapes is decoded as a byte sequence. The run is interpreted as UTF-8 so
 * that multi-byte characters (`=C3=A9` for "é") come out as one character
 * instead of one character per byte. Runs that are not valid UTF-8 fall back
 * to one character per byte.
 *
 * @param {string} value - Quoted-printable encoded text.
 * @returns {string} Decoded text.
 */
var decodeQuotedPrintable = (value) => value.replace(/=\r?\n/g, "").replace(/(?:=[0-9A-F]{2})+/gi, (run) => {
  const bytes = Uint8Array.from(run.slice(1).split("="), (hex) => Number.parseInt(hex, 16));
  try {
    // fatal: reject invalid UTF-8 so the fallback can run; ignoreBOM: keep U+FEFF as data.
    return new TextDecoder("utf-8", { fatal: true, ignoreBOM: true }).decode(bytes);
  } catch {
    return Array.from(bytes, (byte) => String.fromCharCode(byte)).join("");
  }
});
|
|
8638
|
+
/**
 * Decodes the body of one MIME part into bytes according to its
 * Content-Transfer-Encoding (matched case-insensitively).
 *
 * - "base64": the trimmed body with all whitespace removed is base64-decoded.
 * - "quoted-printable": the body is run through decodeQuotedPrintable and
 *   encoded as UTF-8.
 * - anything else: the body is encoded as UTF-8 unchanged.
 *
 * @param {string} body - Part body as text.
 * @param {string} [encoding] - Content-Transfer-Encoding header value.
 * @returns {Uint8Array} Decoded bytes.
 */
var decodeEmailPartBody = (body, encoding) => {
  const trimmed = body.trim();
  const toBytes = (text, textEncoding) => new Uint8Array(Buffer.from(text, textEncoding));
  switch (encoding?.toLowerCase()) {
    case "base64":
      return toBytes(trimmed.replace(/\s+/g, ""), "base64");
    case "quoted-printable":
      return toBytes(decodeQuotedPrintable(body), "utf8");
    default:
      return toBytes(body, "utf8");
  }
};
|
|
8649
|
+
/**
 * Reads the `boundary` parameter (optionally double-quoted) from a
 * Content-Type header value.
 *
 * @param {string} [contentType] - Content-Type header value.
 * @returns {string|undefined} The boundary, or undefined when absent.
 */
var parseMimeBoundary = (contentType) => {
  const found = contentType?.match(/boundary="?([^";]+)"?/i) ?? null;
  if (found === null) {
    return undefined;
  }
  return found[1];
};
|
|
8653
|
+
/**
 * Splits an email body into its text body, HTML body and attachments.
 *
 * Without a boundary in `contentType` the body is treated as a single part:
 * an `<html>…</html>` span becomes `bodyHtml`, otherwise the whole body is
 * `bodyText`.
 *
 * With a boundary, each part is classified as follows:
 * - a nested `multipart/*` part is parsed recursively and merged into the
 *   result. Such parts used to be dropped, which lost the body of ordinary
 *   multipart/mixed → multipart/alternative messages;
 * - a part with a file name (Content-Disposition `filename`, or Content-Type
 *   `name`) becomes an attachment;
 * - `text/html` and `text/plain` parts fill `bodyHtml` / `bodyText`, and the
 *   last one of each kind wins.
 *
 * @param {string} body - Message body with "\n" line endings.
 * @param {string} [contentType] - Content-Type header of the enclosing entity.
 * @returns {{ attachments: { contentType: (string|undefined), data: Uint8Array, fileName: string }[],
 *   bodyHtml: (string|undefined), bodyText: (string|undefined) }}
 */
var parseEmailMimeParts = (body, contentType) => {
  const boundary = parseMimeBoundary(contentType);
  if (!boundary) {
    const htmlMatch = body.match(/<html[\s\S]*<\/html>/i);
    return {
      attachments: [],
      bodyHtml: htmlMatch?.[0],
      bodyText: htmlMatch ? undefined : body
    };
  }
  const attachments = [];
  let bodyText;
  let bodyHtml;
  for (const rawPart of body.split(`--${boundary}`)) {
    const trimmed = rawPart.trim();
    // Skip the preamble gap and the closing "--boundary--" delimiter.
    if (!trimmed || trimmed === "--") {
      continue;
    }
    const { body: partBody, headerBlock } = splitEmailMessage(trimmed);
    const headers = parseHeaderBlock(headerBlock);
    const partContentType = headers.get("content-type");
    const normalizedType = partContentType?.toLowerCase() ?? "";
    if (normalizedType.startsWith("multipart/") && parseMimeBoundary(partContentType)) {
      const nested = parseEmailMimeParts(partBody, partContentType);
      attachments.push(...nested.attachments);
      bodyHtml = nested.bodyHtml ?? bodyHtml;
      bodyText = nested.bodyText ?? bodyText;
      continue;
    }
    const disposition = headers.get("content-disposition");
    const transferEncoding = headers.get("content-transfer-encoding");
    const filename = disposition?.match(/filename="?([^";]+)"?/i)?.[1] ?? partContentType?.match(/name="?([^";]+)"?/i)?.[1];
    if (filename) {
      attachments.push({
        contentType: partContentType,
        data: decodeEmailPartBody(partBody, transferEncoding),
        fileName: filename
      });
      continue;
    }
    const decoded = Buffer.from(decodeEmailPartBody(partBody, transferEncoding)).toString("utf8");
    if (normalizedType.includes("text/html")) {
      bodyHtml = decoded;
      continue;
    }
    if (normalizedType.includes("text/plain")) {
      bodyText = decoded;
    }
  }
  return {
    attachments,
    bodyHtml,
    bodyText
  };
};
|
|
8701
|
+
var extractEmailText = (raw) => {
|
|
8702
|
+
const { body, headerBlock } = splitEmailMessage(raw);
|
|
8703
|
+
const headers = parseHeaderBlock(headerBlock);
|
|
8704
|
+
const parsed = parseEmailMimeParts(body, headers.get("content-type"));
|
|
8705
|
+
if (parsed.bodyHtml) {
|
|
8706
|
+
return stripHtml(parsed.bodyHtml);
|
|
8707
|
+
}
|
|
8708
|
+
if (parsed.bodyText) {
|
|
8709
|
+
return normalizeWhitespace(parsed.bodyText);
|
|
8710
|
+
}
|
|
8104
8711
|
if (!body) {
|
|
8105
|
-
return normalizeWhitespace(
|
|
8712
|
+
return normalizeWhitespace(raw.replace(/\r\n?/g, `
|
|
8713
|
+
`));
|
|
8106
8714
|
}
|
|
8107
8715
|
const htmlMatch = body.match(/<html[\s\S]*<\/html>/i);
|
|
8108
8716
|
if (htmlMatch) {
|
|
@@ -8111,17 +8719,15 @@ var extractEmailText = (raw) => {
|
|
|
8111
8719
|
return normalizeWhitespace(body);
|
|
8112
8720
|
};
|
|
8113
8721
|
var parseEmailHeaders = (raw) => {
|
|
8114
|
-
const
|
|
8115
|
-
|
|
8116
|
-
const
|
|
8117
|
-
|
|
8118
|
-
`);
|
|
8119
|
-
const getHeader = (name) => {
|
|
8120
|
-
const match = headerBlock.match(new RegExp(`^${name}:\\s*(.+)$`, "im"));
|
|
8121
|
-
return match?.[1]?.trim();
|
|
8122
|
-
};
|
|
8722
|
+
const { headerBlock } = splitEmailMessage(raw);
|
|
8723
|
+
const headers = parseHeaderBlock(headerBlock);
|
|
8724
|
+
const getHeader = (name) => headers.get(name.toLowerCase());
|
|
8123
8725
|
return {
|
|
8726
|
+
contentType: getHeader("Content-Type"),
|
|
8124
8727
|
from: getHeader("From"),
|
|
8728
|
+
inReplyTo: getHeader("In-Reply-To"),
|
|
8729
|
+
messageId: getHeader("Message-ID"),
|
|
8730
|
+
references: getHeader("References"),
|
|
8125
8731
|
subject: getHeader("Subject"),
|
|
8126
8732
|
threadTopic: getHeader("Thread-Topic") ?? getHeader("Subject"),
|
|
8127
8733
|
to: getHeader("To")
|
|
@@ -8142,6 +8748,87 @@ var extractPrintableStrings = (data) => {
|
|
|
8142
8748
|
return unique.join(`
|
|
8143
8749
|
`);
|
|
8144
8750
|
};
|
|
8751
|
+
/**
 * Builds the metadata attached to documents produced from an OCR result.
 *
 * Regions whose normalised text is empty are discarded. The average confidence
 * is taken over the numeric overall confidence plus the numeric confidences of
 * the kept regions; it is undefined when none of them is a number.
 *
 * @param {{ confidence?: number, metadata?: object, regions?: object[] }} result - OCR result.
 * @returns {object} `result.metadata` extended with `ocrConfidence`,
 *   `ocrRegionCount`, `ocrRegions` and `ocrAverageConfidence`.
 */
var ocrMetadata = (result) => {
  const regions = result.regions?.filter((region) => normalizeWhitespace(region.text ?? "").length > 0);
  const samples = [result.confidence, ...(regions ?? []).map((region) => region.confidence)]
    .filter((confidence) => typeof confidence === "number");
  let averageConfidence;
  if (samples.length > 0) {
    let total = 0;
    for (const sample of samples) {
      total += sample;
    }
    averageConfidence = total / samples.length;
  }
  return {
    ...result.metadata ?? {},
    ocrConfidence: result.confidence,
    ocrRegionCount: regions?.length,
    ocrRegions: regions,
    ocrAverageConfidence: averageConfidence
  };
};
|
|
8766
|
+
/**
 * Produces one document per PDF page from OCR regions.
 *
 * Regions are kept only when they have non-empty normalised text and a numeric
 * `page` of at least 1. They are grouped by page and emitted in ascending
 * page order.
 *
 * @param {{ regions?: object[] }} result - OCR result.
 * @param {object} input - Extractor input (chunking, contentType, metadata,
 *   source/path/name, title).
 * @param {object} baseMetadata - Metadata shared by every document of this PDF.
 * @returns {object[]} Page documents with `sourceNativeKind: "pdf_page"`.
 */
var ocrPageDocuments = (result, input, baseMetadata) => {
  const regionsByPage = new Map();
  (result.regions ?? []).forEach((region) => {
    const text = normalizeWhitespace(region.text ?? "");
    const hasValidPage = typeof region.page === "number" && region.page >= 1;
    if (!text || !hasValidPage) {
      return;
    }
    const bucket = regionsByPage.get(region.page) ?? [];
    bucket.push({ ...region, text });
    regionsByPage.set(region.page, bucket);
  });
  const pages = [...regionsByPage.entries()].sort((left, right) => left[0] - right[0]);
  return pages.map(([pageNumber, regions]) => {
    const origin = input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME;
    const pageBody = regions.map((region) => region.text).join("\n");
    return {
      chunking: input.chunking,
      contentType: input.contentType,
      format: "text",
      metadata: {
        ...input.metadata ?? {},
        ...baseMetadata,
        ocrRegionCount: regions.length,
        ocrRegions: regions,
        pageNumber,
        pageIndex: pageNumber - 1,
        sourceNativeKind: "pdf_page"
      },
      source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
      text: normalizeWhitespace(`PDF page ${pageNumber} from ${origin}.\n${pageBody}`),
      title: input.title ? `${input.title} \xB7 Page ${pageNumber}` : `Page ${pageNumber}`
    };
  });
};
|
|
8797
|
+
/**
 * Produces one document per OCR region.
 *
 * Regions are kept only when they have non-empty normalised text and a numeric
 * `page` of at least 1. `regionIndex` is the position of the region in
 * `result.regions` (skipped regions still count) and `regionNumber` is that
 * position plus one.
 *
 * @param {{ regions?: object[] }} result - OCR result.
 * @param {object} input - Extractor input (chunking, contentType, metadata,
 *   source/path/name, title).
 * @param {object} baseMetadata - Metadata shared by every document of this PDF.
 * @returns {object[]} Region documents with `sourceNativeKind: "pdf_region"`.
 */
var ocrRegionDocuments = (result, input, baseMetadata) => {
  const allRegions = result.regions ?? [];
  const documents = [];
  for (let index = 0; index < allRegions.length; index += 1) {
    const region = allRegions[index];
    const text = normalizeWhitespace(region.text ?? "");
    const hasValidPage = typeof region.page === "number" && region.page >= 1;
    if (text && hasValidPage) {
      const pageNumber = region.page;
      const regionNumber = index + 1;
      const origin = input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME;
      const label = `Page ${pageNumber} Region ${regionNumber}`;
      documents.push({
        chunking: input.chunking,
        contentType: input.contentType,
        format: "text",
        metadata: {
          ...input.metadata ?? {},
          ...baseMetadata,
          ocrRegionConfidence: region.confidence,
          ocrRegionHeight: region.height,
          ocrRegionWidth: region.width,
          ocrRegionX: region.x,
          ocrRegionY: region.y,
          pageNumber,
          pageIndex: pageNumber - 1,
          regionIndex: index,
          regionNumber,
          sourceNativeKind: "pdf_region"
        },
        source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
        text: normalizeWhitespace(`PDF page ${pageNumber} region ${regionNumber} from ${origin}.\n${text}`),
        title: input.title ? `${input.title} \xB7 ${label}` : label
      });
    }
  }
  return documents;
};
|
|
8145
8832
|
var textExtractorSupports = (input) => {
|
|
8146
8833
|
if (input.format) {
|
|
8147
8834
|
return true;
|
|
@@ -8227,24 +8914,52 @@ var createBuiltinArchiveExpander = () => ({
|
|
|
8227
8914
|
var createEmailExtractor = () => ({
|
|
8228
8915
|
name: "absolute_email",
|
|
8229
8916
|
supports: emailExtractorSupports,
|
|
8230
|
-
extract: (input) => {
|
|
8917
|
+
extract: async (input) => {
|
|
8231
8918
|
const raw = decodeUtf8(input.data);
|
|
8232
8919
|
const headers = parseEmailHeaders(raw);
|
|
8233
|
-
|
|
8920
|
+
const { body } = splitEmailMessage(raw);
|
|
8921
|
+
const parsed = parseEmailMimeParts(body, headers.contentType);
|
|
8922
|
+
const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.eml`;
|
|
8923
|
+
const messageMetadata = {
|
|
8924
|
+
...input.metadata ?? {},
|
|
8925
|
+
emailKind: "message",
|
|
8926
|
+
fileKind: "email",
|
|
8927
|
+
from: headers.from,
|
|
8928
|
+
inReplyTo: headers.inReplyTo,
|
|
8929
|
+
messageId: headers.messageId,
|
|
8930
|
+
references: headers.references,
|
|
8931
|
+
threadTopic: headers.subject,
|
|
8932
|
+
to: headers.to,
|
|
8933
|
+
hasAttachments: parsed.attachments.length > 0
|
|
8934
|
+
};
|
|
8935
|
+
const attachmentDocuments = await Promise.all(parsed.attachments.map(async (attachment, index) => {
|
|
8936
|
+
const documents = await extractRAGFileDocuments({
|
|
8937
|
+
chunking: input.chunking,
|
|
8938
|
+
contentType: attachment.contentType,
|
|
8939
|
+
data: attachment.data,
|
|
8940
|
+
format: inferFormatFromContentType(attachment.contentType ?? null) ?? inferFormatFromName(attachment.fileName),
|
|
8941
|
+
metadata: {
|
|
8942
|
+
...messageMetadata,
|
|
8943
|
+
attachmentIndex: index,
|
|
8944
|
+
attachmentName: attachment.fileName,
|
|
8945
|
+
emailKind: "attachment"
|
|
8946
|
+
},
|
|
8947
|
+
name: attachment.fileName,
|
|
8948
|
+
source: `${source}#attachments/${attachment.fileName}`,
|
|
8949
|
+
title: headers.subject ? `${headers.subject} \xB7 ${attachment.fileName}` : attachment.fileName
|
|
8950
|
+
});
|
|
8951
|
+
return documents;
|
|
8952
|
+
}));
|
|
8953
|
+
const messageDocument = {
|
|
8234
8954
|
chunking: input.chunking,
|
|
8235
8955
|
contentType: input.contentType,
|
|
8236
8956
|
format: "text",
|
|
8237
|
-
metadata:
|
|
8238
|
-
|
|
8239
|
-
fileKind: "email",
|
|
8240
|
-
from: headers.from,
|
|
8241
|
-
threadTopic: headers.subject,
|
|
8242
|
-
to: headers.to
|
|
8243
|
-
},
|
|
8244
|
-
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.eml`,
|
|
8957
|
+
metadata: messageMetadata,
|
|
8958
|
+
source,
|
|
8245
8959
|
text: extractEmailText(raw),
|
|
8246
8960
|
title: input.title ?? headers.subject
|
|
8247
8961
|
};
|
|
8962
|
+
return [messageDocument, ...attachmentDocuments.flat()];
|
|
8248
8963
|
}
|
|
8249
8964
|
});
|
|
8250
8965
|
var createEPUBExtractor = () => ({
|
|
@@ -8388,7 +9103,7 @@ var createRAGImageOCRExtractor = (provider) => ({
|
|
|
8388
9103
|
format: "text",
|
|
8389
9104
|
metadata: {
|
|
8390
9105
|
...input.metadata ?? {},
|
|
8391
|
-
...result
|
|
9106
|
+
...ocrMetadata(result),
|
|
8392
9107
|
fileKind: "image"
|
|
8393
9108
|
},
|
|
8394
9109
|
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.image.txt`,
|
|
@@ -8476,6 +9191,9 @@ var expandArchiveEntry = async (entry, archiveInput, extractors) => {
|
|
|
8476
9191
|
metadata: {
|
|
8477
9192
|
...archiveInput.metadata ?? {},
|
|
8478
9193
|
...entry.metadata ?? {},
|
|
9194
|
+
archiveEntryName: basename(entry.path),
|
|
9195
|
+
archiveParentName: archiveInput.name ?? archiveInput.path?.split(/[/\\]/).pop() ?? archiveInput.source,
|
|
9196
|
+
archiveParentSource: archiveInput.source ?? archiveInput.path ?? archiveInput.name,
|
|
8479
9197
|
archivePath: entry.path,
|
|
8480
9198
|
fileKind: "archive_entry"
|
|
8481
9199
|
},
|
|
@@ -8551,21 +9269,27 @@ var createRAGPDFOCRExtractor = (options) => ({
|
|
|
8551
9269
|
...input,
|
|
8552
9270
|
contentType: input.contentType ?? "application/pdf"
|
|
8553
9271
|
});
|
|
8554
|
-
|
|
9272
|
+
const baseMetadata = {
|
|
9273
|
+
...ocrMetadata(ocr),
|
|
9274
|
+
fileKind: "pdf",
|
|
9275
|
+
pageCount: estimatePDFPageCount(input.data),
|
|
9276
|
+
pdfTextMode: "ocr"
|
|
9277
|
+
};
|
|
9278
|
+
const summaryDocument = {
|
|
8555
9279
|
chunking: input.chunking,
|
|
8556
9280
|
contentType: input.contentType ?? "application/pdf",
|
|
8557
9281
|
format: "text",
|
|
8558
9282
|
metadata: {
|
|
8559
9283
|
...input.metadata ?? {},
|
|
8560
|
-
...
|
|
8561
|
-
fileKind: "pdf",
|
|
8562
|
-
pageCount: estimatePDFPageCount(input.data),
|
|
8563
|
-
pdfTextMode: "ocr"
|
|
9284
|
+
...baseMetadata
|
|
8564
9285
|
},
|
|
8565
9286
|
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
|
|
8566
9287
|
text: ocr.text,
|
|
8567
9288
|
title: ocr.title ?? input.title
|
|
8568
9289
|
};
|
|
9290
|
+
const pageDocuments = ocrPageDocuments(ocr, input, baseMetadata);
|
|
9291
|
+
const regionDocuments = ocrRegionDocuments(ocr, input, baseMetadata);
|
|
9292
|
+
return [summaryDocument, ...pageDocuments, ...regionDocuments];
|
|
8569
9293
|
}
|
|
8570
9294
|
});
|
|
8571
9295
|
var DEFAULT_FILE_EXTRACTORS = [
|
|
@@ -8632,7 +9356,7 @@ var fixedUnits = (text, maxChunkLength) => {
|
|
|
8632
9356
|
return units;
|
|
8633
9357
|
};
|
|
8634
9358
|
var sourceAwareUnits = (document, format, normalizedText) => {
|
|
8635
|
-
const resolveStructuredUnits = (sections) => sections.length > 0 ? sections : paragraphUnits(normalizedText);
|
|
9359
|
+
const resolveStructuredUnits = (sections) => sections.length > 0 ? sections : paragraphUnits(normalizedText).map((text) => ({ text }));
|
|
8636
9360
|
switch (format) {
|
|
8637
9361
|
case "markdown": {
|
|
8638
9362
|
const sections = markdownStructureUnits(document.text);
|
|
@@ -8644,7 +9368,7 @@ var sourceAwareUnits = (document, format, normalizedText) => {
|
|
|
8644
9368
|
}
|
|
8645
9369
|
case "text":
|
|
8646
9370
|
default:
|
|
8647
|
-
return paragraphUnits(normalizedText);
|
|
9371
|
+
return paragraphUnits(normalizedText).map((text) => ({ text }));
|
|
8648
9372
|
}
|
|
8649
9373
|
};
|
|
8650
9374
|
var overlapTail = (value, overlap) => {
|
|
@@ -8708,10 +9432,13 @@ var chunkFromUnits = (units, maxChunkLength, chunkOverlap, minChunkLength) => {
|
|
|
8708
9432
|
return merged;
|
|
8709
9433
|
};
|
|
8710
9434
|
var chunkSourceAwareUnit = (unit, options) => {
|
|
8711
|
-
if (unit.length <= options.maxChunkLength) {
|
|
9435
|
+
if (unit.text.length <= options.maxChunkLength) {
|
|
8712
9436
|
return [unit];
|
|
8713
9437
|
}
|
|
8714
|
-
return chunkFromUnits(paragraphUnits(unit), options.maxChunkLength, options.chunkOverlap, options.minChunkLength)
|
|
9438
|
+
return chunkFromUnits(paragraphUnits(unit.text), options.maxChunkLength, options.chunkOverlap, options.minChunkLength).map((text) => ({
|
|
9439
|
+
...unit,
|
|
9440
|
+
text
|
|
9441
|
+
}));
|
|
8715
9442
|
};
|
|
8716
9443
|
var resolveChunkingUnits = (text, options) => {
|
|
8717
9444
|
if (options.strategy === "fixed") {
|
|
@@ -8734,15 +9461,15 @@ var resolveChunkingOptions = (document, defaults) => {
|
|
|
8734
9461
|
strategy
|
|
8735
9462
|
};
|
|
8736
9463
|
};
|
|
8737
|
-
var
|
|
9464
|
+
var createChunkEntries = (document, format, text, options) => {
|
|
8738
9465
|
if (text.length <= options.maxChunkLength && options.strategy !== "source_aware") {
|
|
8739
|
-
return [text];
|
|
9466
|
+
return [{ text }];
|
|
8740
9467
|
}
|
|
8741
9468
|
if (options.strategy === "source_aware") {
|
|
8742
9469
|
return sourceAwareUnits(document, format, text).flatMap((unit) => chunkSourceAwareUnit(unit, options));
|
|
8743
9470
|
}
|
|
8744
9471
|
const units = resolveChunkingUnits(text, options);
|
|
8745
|
-
return chunkFromUnits(units, options.maxChunkLength, options.chunkOverlap, options.minChunkLength);
|
|
9472
|
+
return chunkFromUnits(units, options.maxChunkLength, options.chunkOverlap, options.minChunkLength).map((entry) => ({ text: entry }));
|
|
8746
9473
|
};
|
|
8747
9474
|
var prepareRAGDocument = (document, defaultChunking) => {
|
|
8748
9475
|
const format = inferFormat(document);
|
|
@@ -8764,18 +9491,46 @@ var prepareRAGDocument = (document, defaultChunking) => {
|
|
|
8764
9491
|
source,
|
|
8765
9492
|
title
|
|
8766
9493
|
};
|
|
8767
|
-
const
|
|
8768
|
-
const chunks =
|
|
8769
|
-
|
|
8770
|
-
|
|
8771
|
-
|
|
8772
|
-
|
|
8773
|
-
|
|
8774
|
-
|
|
8775
|
-
|
|
8776
|
-
|
|
8777
|
-
|
|
8778
|
-
|
|
9494
|
+
const chunkEntries = createChunkEntries(document, format, normalizedText, chunking);
|
|
9495
|
+
const chunks = chunkEntries.map((entry, index) => {
|
|
9496
|
+
const sectionPath = Array.isArray(entry.sectionPath) ? entry.sectionPath.filter((value) => typeof value === "string" && value.length > 0) : undefined;
|
|
9497
|
+
const sectionTitle = typeof entry.sectionTitle === "string" && entry.sectionTitle.length > 0 ? entry.sectionTitle : sectionPath?.at(-1);
|
|
9498
|
+
const chunkTitle = sectionTitle && sectionTitle !== title ? `${title} \xB7 ${sectionTitle}` : title;
|
|
9499
|
+
const sectionChunkId = sectionPath && sectionPath.length > 0 ? `${documentId}:section:${slugify(sectionPath.join(" "))}` : undefined;
|
|
9500
|
+
const sectionSiblingIndexes = sectionChunkId === undefined ? [index] : chunkEntries.reduce((indexes, candidate, candidateIndex) => {
|
|
9501
|
+
const candidatePath = Array.isArray(candidate.sectionPath) ? candidate.sectionPath.filter((value) => typeof value === "string" && value.length > 0) : undefined;
|
|
9502
|
+
const candidateSectionId = candidatePath && candidatePath.length > 0 ? `${documentId}:section:${slugify(candidatePath.join(" "))}` : undefined;
|
|
9503
|
+
if (candidateSectionId === sectionChunkId) {
|
|
9504
|
+
indexes.push(candidateIndex);
|
|
9505
|
+
}
|
|
9506
|
+
return indexes;
|
|
9507
|
+
}, []);
|
|
9508
|
+
const sectionChunkIndex = sectionSiblingIndexes.indexOf(index);
|
|
9509
|
+
const previousChunkId = index > 0 ? `${documentId}:${String(index).padStart(RAG_CHUNK_ID_PAD_LENGTH, "0")}` : undefined;
|
|
9510
|
+
const nextChunkId = index + 1 < chunkEntries.length ? `${documentId}:${String(index + 2).padStart(RAG_CHUNK_ID_PAD_LENGTH, "0")}` : undefined;
|
|
9511
|
+
return {
|
|
9512
|
+
chunkId: `${documentId}:${String(index + 1).padStart(RAG_CHUNK_ID_PAD_LENGTH, "0")}`,
|
|
9513
|
+
metadata: {
|
|
9514
|
+
...metadata,
|
|
9515
|
+
chunkCount: chunkEntries.length,
|
|
9516
|
+
chunkIndex: index,
|
|
9517
|
+
...sectionTitle ? { sectionTitle } : {},
|
|
9518
|
+
...sectionPath && sectionPath.length > 0 ? { sectionPath } : {},
|
|
9519
|
+
...typeof entry.sectionDepth === "number" ? { sectionDepth: entry.sectionDepth } : {},
|
|
9520
|
+
...entry.sectionKind ? { sectionKind: entry.sectionKind } : {},
|
|
9521
|
+
...sectionChunkId ? { sectionChunkId } : {},
|
|
9522
|
+
...sectionChunkId && sectionChunkIndex >= 0 ? {
|
|
9523
|
+
sectionChunkCount: sectionSiblingIndexes.length,
|
|
9524
|
+
sectionChunkIndex
|
|
9525
|
+
} : {},
|
|
9526
|
+
...previousChunkId ? { previousChunkId } : {},
|
|
9527
|
+
...nextChunkId ? { nextChunkId } : {}
|
|
9528
|
+
},
|
|
9529
|
+
source,
|
|
9530
|
+
text: entry.text,
|
|
9531
|
+
title: chunkTitle
|
|
9532
|
+
};
|
|
9533
|
+
});
|
|
8779
9534
|
return {
|
|
8780
9535
|
chunks,
|
|
8781
9536
|
documentId,
|
|
@@ -9421,6 +10176,30 @@ var searchDocuments = async (collection, input) => collection.search(input);
|
|
|
9421
10176
|
// src/ai/rag/htmxWorkflowRenderers.ts
|
|
9422
10177
|
init_constants();
|
|
9423
10178
|
/**
 * Escapes the HTML-significant characters `&`, `<`, `>` and `"` so that
 * `text` can be interpolated into element content or a double-quoted
 * attribute value.
 *
 * The ampersand is replaced first so that the entities produced by the later
 * replacements are not escaped a second time.
 *
 * @param text String to escape.
 * @returns The escaped string.
 */
var escapeHtml2 = (text) => text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
|
|
10179
|
+
/**
 * Renders the context / location / provenance labels of a source as an HTML
 * list.
 *
 * @param input Optional labels object with `contextLabel`, `locatorLabel`
 *   and `provenanceLabel` properties.
 * @returns A `<ul class="rag-source-labels">` fragment, or an empty string
 *   when `input` is falsy or none of the labels is set.
 */
var renderSourceLabels = (input) => {
  if (!input) {
    return "";
  }
  const labelled = [
    ["Context", input.contextLabel],
    ["Location", input.locatorLabel],
    ["Provenance", input.provenanceLabel]
  ];
  const items = [];
  for (const [heading, value] of labelled) {
    // Falsy labels (missing or empty) are skipped entirely.
    if (value) {
      items.push(`<li><strong>${heading}</strong> ${escapeHtml2(value)}</li>`);
    }
  }
  if (items.length === 0) {
    return "";
  }
  return `<ul class="rag-source-labels">${items.join("")}</ul>`;
};
|
|
10190
|
+
/**
 * Renders a chunk's structural information (section title and path, position
 * within its section, neighbouring chunk ids) as an HTML list.
 *
 * @param structure Optional object with `section` (`title`, `path`) and
 *   `sequence` (`sectionChunkIndex`, `sectionChunkCount`, `previousChunkId`,
 *   `nextChunkId`) properties.
 * @returns A `<ul class="rag-chunk-structure">` fragment, or an empty string
 *   when there is nothing to show.
 */
var renderChunkStructure = (structure) => {
  if (!structure) {
    return "";
  }
  const { section, sequence } = structure;
  const items = [];
  if (section?.title) {
    items.push(`<li><strong>Section</strong> ${escapeHtml2(section.title)}</li>`);
  }
  // A path is only listed when it has more than one segment.
  if (section?.path && section.path.length > 1) {
    items.push(`<li><strong>Section path</strong> ${escapeHtml2(section.path.join(" > "))}</li>`);
  }
  if (typeof sequence?.sectionChunkIndex === "number" && typeof sequence?.sectionChunkCount === "number") {
    // The index is displayed with 1 added to it.
    items.push(`<li><strong>Section chunk</strong> ${sequence.sectionChunkIndex + 1} of ${sequence.sectionChunkCount}</li>`);
  }
  if (sequence?.previousChunkId) {
    items.push(`<li><strong>Previous</strong> ${escapeHtml2(sequence.previousChunkId)}</li>`);
  }
  if (sequence?.nextChunkId) {
    items.push(`<li><strong>Next</strong> ${escapeHtml2(sequence.nextChunkId)}</li>`);
  }
  if (items.length === 0) {
    return "";
  }
  return `<ul class="rag-chunk-structure">${items.join("")}</ul>`;
};
|
|
9424
10203
|
var renderEmptyState = (kind) => {
|
|
9425
10204
|
switch (kind) {
|
|
9426
10205
|
case "documents":
|
|
@@ -9460,17 +10239,41 @@ var defaultStatus = ({
|
|
|
9460
10239
|
}
|
|
9461
10240
|
return `<dl class="rag-status">` + `<div><dt>Backend</dt><dd>${escapeHtml2(status.backend)}</dd></div>` + `<div><dt>Vector mode</dt><dd>${escapeHtml2(status.vectorMode)}</dd></div>` + `<div><dt>Embedding dimensions</dt><dd>${status.dimensions ?? "n/a"}</dd></div>` + `<div><dt>Vector acceleration</dt><dd>${status.native?.active ? "active" : "inactive"}</dd></div>` + `<div><dt>Documents</dt><dd>${documents?.total ?? "n/a"}</dd></div>` + `<div><dt>Total chunks</dt><dd>${documents?.chunkCount ?? "n/a"}</dd></div>` + `<div><dt>Seed docs</dt><dd>${documents?.byKind.seed ?? 0}</dd></div>` + `<div><dt>Custom docs</dt><dd>${documents?.byKind.custom ?? 0}</dd></div>` + `</dl>${renderCapabilityList(capabilities)}`;
|
|
9462
10241
|
};
|
|
9463
|
-
var defaultSearchResultItem = (source, index) => '<article class="rag-search-result">' + `<h3>${escapeHtml2(source.title ?? source.chunkId ?? `Result ${index + 1}`)}</h3>` + `<p class="rag-search-source">${escapeHtml2(source.source ?? "unknown source")}</p>` + `<p class="rag-search-score">score ${source.score.toFixed(RAG_SEARCH_SCORE_DECIMAL_PLACES)}</p>` + `<p class="rag-search-text">${escapeHtml2(source.text)}</p>` + "</article>";
|
|
10242
|
+
/**
 * Default HTML renderer for a single search result.
 *
 * @param source Search result with `title`, `chunkId`, `source`, `labels`,
 *   `structure`, `score` and `text` properties.
 * @param index Position of the result in the list; used for the fallback
 *   heading "Result N" when neither a title nor a chunk id is present.
 * @returns An `<article class="rag-search-result">` fragment.
 */
var defaultSearchResultItem = (source, index) => {
  const heading = source.title ?? source.chunkId ?? `Result ${index + 1}`;
  const origin = source.source ?? "unknown source";
  return [
    '<article class="rag-search-result">',
    `<h3>${escapeHtml2(heading)}</h3>`,
    `<p class="rag-search-source">${escapeHtml2(origin)}</p>`,
    renderSourceLabels(source.labels),
    renderChunkStructure(source.structure),
    `<p class="rag-search-score">score ${source.score.toFixed(RAG_SEARCH_SCORE_DECIMAL_PLACES)}</p>`,
    `<p class="rag-search-text">${escapeHtml2(source.text)}</p>`,
    "</article>"
  ].join("");
};
|
|
9464
10243
|
/**
 * Default HTML renderer for a list of search results.
 *
 * @param query The search query, echoed (escaped) in the summary line.
 * @param results Array of search results.
 * @param trace Optional trace with `mode` and `resultCounts`
 *   (`final`, `vector`, `lexical`); adds a second summary line when present.
 * @returns A `<section class="rag-search-results">` fragment, or the
 *   "searchResults" empty state when `results` is empty.
 */
var defaultSearchResults = ({
  query,
  results,
  trace
}) => {
  if (results.length === 0) {
    return renderEmptyState("searchResults");
  }
  const summary = `<p class="rag-search-summary">${results.length} results for ${escapeHtml2(query)}</p>`;
  const traceSummary = trace ? `<p class="rag-search-summary">mode=${escapeHtml2(trace.mode)} \xB7 final=${trace.resultCounts.final} \xB7 vector=${trace.resultCounts.vector} \xB7 lexical=${trace.resultCounts.lexical}</p>` : "";
  const items = results.map((result, index) => defaultSearchResultItem(result, index)).join("");
  return `<section class="rag-search-results">${summary}${traceSummary}${items}</section>`;
};
|
|
9469
|
-
var defaultDocumentItem = (document, index) => '<article class="rag-document">' + `<h3>${escapeHtml2(document.title || `Document ${index + 1}`)}</h3>` + `<p class="rag-document-id">${escapeHtml2(document.id)}</p>` + `<p class="rag-document-source">${escapeHtml2(document.source)}</p>` + `<p class="rag-document-meta">${escapeHtml2(document.format ?? "text")} \xB7 ${escapeHtml2(document.chunkStrategy ?? "paragraphs")} \xB7 ${document.chunkCount ?? 0} chunks</p>` + "</article>";
|
|
10248
|
+
/**
 * Default HTML renderer for a single indexed document.
 *
 * @param document Document with `title`, `id`, `source`, `labels`, `format`,
 *   `chunkStrategy` and `chunkCount` properties.
 * @param index Position of the document in the list; used for the fallback
 *   heading "Document N" when the title is falsy.
 * @returns An `<article class="rag-document">` fragment.
 */
var defaultDocumentItem = (document, index) => {
  const heading = document.title || `Document ${index + 1}`;
  const format = document.format ?? "text";
  const strategy = document.chunkStrategy ?? "paragraphs";
  const chunkTotal = document.chunkCount ?? 0;
  return [
    '<article class="rag-document">',
    `<h3>${escapeHtml2(heading)}</h3>`,
    `<p class="rag-document-id">${escapeHtml2(document.id)}</p>`,
    `<p class="rag-document-source">${escapeHtml2(document.source)}</p>`,
    renderSourceLabels(document.labels),
    `<p class="rag-document-meta">${escapeHtml2(format)} \xB7 ${escapeHtml2(strategy)} \xB7 ${chunkTotal} chunks</p>`,
    "</article>"
  ].join("");
};
|
|
9470
10249
|
/**
 * Default HTML renderer for the document list.
 *
 * @param documents Array of documents to render.
 * @returns A `<section class="rag-documents">` fragment, or the "documents"
 *   empty state when the array is empty.
 */
var defaultDocuments = ({
  documents
}) => {
  if (documents.length === 0) {
    return renderEmptyState("documents");
  }
  const items = documents.map((document, index) => defaultDocumentItem(document, index));
  return `<section class="rag-documents">${items.join("")}</section>`;
};
|
|
9473
|
-
var defaultChunkPreview = (input) =>
|
|
10252
|
+
/**
 * Default HTML renderer for a document's chunk preview.
 *
 * Chunks are grouped by `metadata.sourceNativeKind` (defaulting to
 * "document_chunk") and, for the known source-native kinds, by their locator
 * label. Groups appear in the order in which their first chunk is seen.
 *
 * @param input Preview with `document` (`title`, `source`, `labels`),
 *   `normalizedText` and `chunks`.
 * @returns A `<section class="rag-chunk-preview">` fragment.
 */
var defaultChunkPreview = (input) => {
  // Group heading used when a chunk of the given kind has no locator label.
  const fallbackTitles = new Map([
    ["pdf_page", "PDF pages"],
    ["pdf_region", "PDF regions"],
    ["spreadsheet_sheet", "Spreadsheet sheets"],
    ["presentation_slide", "Presentation slides"],
    ["attachment", "Attachments"],
    ["archive_entry", "Archive entries"]
  ]);
  // Map keeps insertion order, so groups stay in first-seen order.
  const groupsByKey = new Map();
  input.chunks.forEach((chunk) => {
    const metadata = chunk.metadata ?? {};
    const kind = typeof metadata.sourceNativeKind === "string" ? metadata.sourceNativeKind : "document_chunk";
    const locator = chunk.labels?.locatorLabel ?? "";
    const fallbackTitle = fallbackTitles.get(kind);
    const title = fallbackTitle === undefined ? "Chunks" : locator || fallbackTitle;
    const key = kind === "document_chunk" ? "document_chunk" : `${kind}:${title}`;
    const group = groupsByKey.get(key);
    if (group) {
      group.chunks.push(chunk);
      return;
    }
    groupsByKey.set(key, { chunks: [chunk], key, title });
  });
  const renderChunk = (chunk) => {
    // NOTE(review): chunkIndex is printed as stored; if it is zero-based this
    // reads "chunk 0 of N" for the first chunk — confirm that is intended.
    const position = typeof chunk.metadata?.chunkIndex === "number" ? chunk.metadata.chunkIndex : 0;
    const total = typeof chunk.metadata?.chunkCount === "number" ? chunk.metadata.chunkCount : input.chunks.length;
    return [
      '<article class="rag-chunk">',
      `<h5>${escapeHtml2(chunk.chunkId)}</h5>`,
      `<p class="rag-chunk-meta">chunk ${position} of ${total}</p>`,
      renderSourceLabels(chunk.labels),
      renderChunkStructure(chunk.structure),
      `<pre>${escapeHtml2(chunk.text)}</pre>`,
      "</article>"
    ].join("");
  };
  const groupHtml = [...groupsByKey.values()].map((group) => {
    const chunkHtml = group.chunks.map((chunk) => renderChunk(chunk)).join("");
    return `<section class="rag-chunk-group"><h4>${escapeHtml2(group.title)}</h4>${chunkHtml}</section>`;
  }).join("");
  return [
    `<section class="rag-chunk-preview">`,
    `<h3>${escapeHtml2(input.document.title)}</h3>`,
    `<p class="rag-chunk-preview-source">${escapeHtml2(input.document.source)}</p>`,
    renderSourceLabels(input.document.labels),
    `<article class="rag-chunk-normalized">`,
    `<h4>Normalized text</h4>`,
    `<pre>${escapeHtml2(input.normalizedText)}</pre>`,
    `</article>${groupHtml}</section>`
  ].join("");
};
|
|
9474
10277
|
var defaultMutationResult = (input) => {
|
|
9475
10278
|
if (!input.ok) {
|
|
9476
10279
|
return `<div class="rag-mutation error">${escapeHtml2(input.error ?? "Request failed")}</div>`;
|
|
@@ -9533,6 +10336,10 @@ var buildRAGContextLocatorLabel = (metadata, source, title) => {
|
|
|
9533
10336
|
return;
|
|
9534
10337
|
}
|
|
9535
10338
|
const page = getContextNumber3(metadata.page) ?? getContextNumber3(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
|
|
10339
|
+
const region = getContextNumber3(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
|
|
10340
|
+
if (page && region) {
|
|
10341
|
+
return `Page ${page} \xB7 Region ${region}`;
|
|
10342
|
+
}
|
|
9536
10343
|
if (page) {
|
|
9537
10344
|
return `Page ${page}`;
|
|
9538
10345
|
}
|
|
@@ -9574,9 +10381,11 @@ var buildRAGContextProvenanceLabel = (metadata) => {
|
|
|
9574
10381
|
const threadTopic = getContextString3(metadata.threadTopic);
|
|
9575
10382
|
const from = getContextString3(metadata.from);
|
|
9576
10383
|
const speaker = getContextString3(metadata.speaker);
|
|
10384
|
+
const ocrConfidence = getContextNumber3(metadata.ocrRegionConfidence) ?? getContextNumber3(metadata.ocrConfidence);
|
|
9577
10385
|
const labels = [
|
|
9578
10386
|
pdfTextMode ? `PDF ${pdfTextMode}` : "",
|
|
9579
10387
|
ocrEngine ? `OCR ${ocrEngine}` : "",
|
|
10388
|
+
typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
|
|
9580
10389
|
mediaKind ? `Media ${mediaKind}` : "",
|
|
9581
10390
|
transcriptSource ? `Transcript ${transcriptSource}` : "",
|
|
9582
10391
|
threadTopic ? `Thread ${threadTopic}` : "",
|
|
@@ -9886,9 +10695,15 @@ var isRAGDocumentUrlArray = (value) => Array.isArray(value) && value.every((entr
|
|
|
9886
10695
|
/**
 * Checks whether `value` is an array whose entries all pass
 * `isRAGDocumentChunk`. An empty array passes.
 *
 * @param value Value to test.
 * @returns `true` when `value` is an array of document chunks.
 */
var isRAGDocumentChunkArray = (value) => {
  if (!Array.isArray(value)) {
    return false;
  }
  return value.every((entry) => isRAGDocumentChunk(entry));
};
|
|
9887
10696
|
/**
 * Maps raw search results to source objects, adding display labels and chunk
 * structure derived from each result's metadata.
 *
 * @param results Array of results with `chunkId`, `chunkText`, `metadata`,
 *   `score`, `source` and `title` properties.
 * @returns Array of sources with `chunkId`, `labels`, `metadata`, `score`
 *   (passed through `normalizeScore`), `source`, `structure`, `text` and
 *   `title`.
 */
var buildSources2 = (results) => results.map(({ chunkId, chunkText, metadata, score, source, title }) => {
  const labels = buildRAGSourceLabels({
    metadata,
    source,
    title
  });
  const normalizedScore = normalizeScore(score);
  const structure = buildRAGChunkStructure(metadata);
  // Property order is kept stable so serialized output does not change.
  return {
    chunkId,
    labels,
    metadata,
    score: normalizedScore,
    source,
    structure,
    text: chunkText,
    title
  };
});
|
|
@@ -13616,6 +14431,11 @@ var ragChat = (config) => {
|
|
|
13616
14431
|
let documentsWithoutChunkPreview = 0;
|
|
13617
14432
|
let inspectedDocuments = 0;
|
|
13618
14433
|
let inspectedChunks = 0;
|
|
14434
|
+
let documentsWithSourceLabels = 0;
|
|
14435
|
+
let chunksWithSourceLabels = 0;
|
|
14436
|
+
const sourceNativeKinds = new Map;
|
|
14437
|
+
const sampleDocuments = [];
|
|
14438
|
+
const sampleChunks = [];
|
|
13619
14439
|
let oldestDocumentAgeMs;
|
|
13620
14440
|
let newestDocumentAgeMs;
|
|
13621
14441
|
const staleDocuments = [];
|
|
@@ -13656,6 +14476,27 @@ var ragChat = (config) => {
|
|
|
13656
14476
|
if ((document.chunkCount ?? 0) === 0) {
|
|
13657
14477
|
emptyDocuments += 1;
|
|
13658
14478
|
}
|
|
14479
|
+
const documentLabels = buildRAGSourceLabels({
|
|
14480
|
+
metadata: document.metadata,
|
|
14481
|
+
source: document.source,
|
|
14482
|
+
title: document.title
|
|
14483
|
+
});
|
|
14484
|
+
if (documentLabels) {
|
|
14485
|
+
documentsWithSourceLabels += 1;
|
|
14486
|
+
}
|
|
14487
|
+
const documentSourceNativeKind = typeof document.metadata?.sourceNativeKind === "string" ? document.metadata.sourceNativeKind : undefined;
|
|
14488
|
+
if (documentSourceNativeKind) {
|
|
14489
|
+
sourceNativeKinds.set(documentSourceNativeKind, (sourceNativeKinds.get(documentSourceNativeKind) ?? 0) + 1);
|
|
14490
|
+
}
|
|
14491
|
+
if (sampleDocuments.length < 5 && (documentLabels || documentSourceNativeKind)) {
|
|
14492
|
+
sampleDocuments.push({
|
|
14493
|
+
id: document.id,
|
|
14494
|
+
labels: documentLabels,
|
|
14495
|
+
source: document.source,
|
|
14496
|
+
sourceNativeKind: documentSourceNativeKind,
|
|
14497
|
+
title: document.title
|
|
14498
|
+
});
|
|
14499
|
+
}
|
|
13659
14500
|
if (indexManager?.getDocumentChunks) {
|
|
13660
14501
|
const preview = await indexManager.getDocumentChunks(document.id);
|
|
13661
14502
|
if (!preview) {
|
|
@@ -13665,6 +14506,27 @@ var ragChat = (config) => {
|
|
|
13665
14506
|
inspectedDocuments += 1;
|
|
13666
14507
|
for (const chunk of preview.chunks) {
|
|
13667
14508
|
inspectedChunks += 1;
|
|
14509
|
+
const chunkLabels = buildRAGSourceLabels({
|
|
14510
|
+
metadata: chunk.metadata,
|
|
14511
|
+
source: chunk.source ?? preview.document.source,
|
|
14512
|
+
title: chunk.title ?? preview.document.title
|
|
14513
|
+
});
|
|
14514
|
+
if (chunkLabels) {
|
|
14515
|
+
chunksWithSourceLabels += 1;
|
|
14516
|
+
}
|
|
14517
|
+
const chunkSourceNativeKind = typeof chunk.metadata?.sourceNativeKind === "string" ? chunk.metadata.sourceNativeKind : undefined;
|
|
14518
|
+
if (chunkSourceNativeKind) {
|
|
14519
|
+
sourceNativeKinds.set(chunkSourceNativeKind, (sourceNativeKinds.get(chunkSourceNativeKind) ?? 0) + 1);
|
|
14520
|
+
}
|
|
14521
|
+
if (sampleChunks.length < 8 && (chunkLabels || chunkSourceNativeKind)) {
|
|
14522
|
+
sampleChunks.push({
|
|
14523
|
+
chunkId: chunk.chunkId,
|
|
14524
|
+
documentId: document.id,
|
|
14525
|
+
labels: chunkLabels,
|
|
14526
|
+
source: chunk.source ?? preview.document.source,
|
|
14527
|
+
sourceNativeKind: chunkSourceNativeKind
|
|
14528
|
+
});
|
|
14529
|
+
}
|
|
13668
14530
|
const normalized = chunk.text.trim();
|
|
13669
14531
|
if (!normalized) {
|
|
13670
14532
|
emptyChunks += 1;
|
|
@@ -13721,6 +14583,13 @@ var ragChat = (config) => {
|
|
|
13721
14583
|
failuresByInputKind: Object.fromEntries(failuresByInputKind.entries()),
|
|
13722
14584
|
inspectedChunks,
|
|
13723
14585
|
inspectedDocuments,
|
|
14586
|
+
inspection: {
|
|
14587
|
+
chunksWithSourceLabels,
|
|
14588
|
+
documentsWithSourceLabels,
|
|
14589
|
+
sampleChunks,
|
|
14590
|
+
sampleDocuments,
|
|
14591
|
+
sourceNativeKinds: Object.fromEntries(sourceNativeKinds.entries())
|
|
14592
|
+
},
|
|
13724
14593
|
lowSignalChunks,
|
|
13725
14594
|
newestDocumentAgeMs,
|
|
13726
14595
|
oldestDocumentAgeMs,
|
|
@@ -14901,7 +15770,14 @@ var ragChat = (config) => {
|
|
|
14901
15770
|
}
|
|
14902
15771
|
const documents = await indexManager.listDocuments({ kind });
|
|
14903
15772
|
return {
|
|
14904
|
-
documents
|
|
15773
|
+
documents: documents.map((document) => ({
|
|
15774
|
+
...document,
|
|
15775
|
+
labels: buildRAGSourceLabels({
|
|
15776
|
+
metadata: document.metadata,
|
|
15777
|
+
source: document.source,
|
|
15778
|
+
title: document.title
|
|
15779
|
+
})
|
|
15780
|
+
})),
|
|
14905
15781
|
ok: true
|
|
14906
15782
|
};
|
|
14907
15783
|
};
|
|
@@ -14961,7 +15837,24 @@ var ragChat = (config) => {
|
|
|
14961
15837
|
}
|
|
14962
15838
|
return {
|
|
14963
15839
|
ok: true,
|
|
14964
|
-
...preview
|
|
15840
|
+
...preview,
|
|
15841
|
+
document: {
|
|
15842
|
+
...preview.document,
|
|
15843
|
+
labels: buildRAGSourceLabels({
|
|
15844
|
+
metadata: preview.document.metadata,
|
|
15845
|
+
source: preview.document.source,
|
|
15846
|
+
title: preview.document.title
|
|
15847
|
+
})
|
|
15848
|
+
},
|
|
15849
|
+
chunks: preview.chunks.map((chunk) => ({
|
|
15850
|
+
...chunk,
|
|
15851
|
+
labels: buildRAGSourceLabels({
|
|
15852
|
+
metadata: chunk.metadata,
|
|
15853
|
+
source: chunk.source ?? preview.document.source,
|
|
15854
|
+
title: chunk.title ?? preview.document.title
|
|
15855
|
+
}),
|
|
15856
|
+
structure: buildRAGChunkStructure(chunk.metadata)
|
|
15857
|
+
}))
|
|
14965
15858
|
};
|
|
14966
15859
|
};
|
|
14967
15860
|
const handleDeleteDocument = async (id) => {
|
|
@@ -20557,5 +21450,5 @@ export {
|
|
|
20557
21450
|
aiChat
|
|
20558
21451
|
};
|
|
20559
21452
|
|
|
20560
|
-
//# debugId=
|
|
21453
|
+
//# debugId=DE5EC1314BD5A9F664756E2164756E21
|
|
20561
21454
|
//# sourceMappingURL=index.js.map
|