@absolutejs/absolute 0.19.0-beta.603 → 0.19.0-beta.605

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/dist/ai/client/index.js +244 -10
  2. package/dist/ai/client/index.js.map +4 -4
  3. package/dist/ai/client/ui.js +248 -10
  4. package/dist/ai/client/ui.js.map +4 -4
  5. package/dist/ai/index.js +1003 -110
  6. package/dist/ai/index.js.map +8 -8
  7. package/dist/ai/rag/quality.js +27 -6
  8. package/dist/ai/rag/quality.js.map +3 -3
  9. package/dist/ai/rag/ui.js +248 -10
  10. package/dist/ai/rag/ui.js.map +4 -4
  11. package/dist/ai-client/angular/ai/index.js +243 -9
  12. package/dist/ai-client/react/ai/index.js +258 -10
  13. package/dist/ai-client/vue/ai/index.js +347 -101
  14. package/dist/angular/ai/index.js +244 -10
  15. package/dist/angular/ai/index.js.map +4 -4
  16. package/dist/react/ai/index.js +259 -11
  17. package/dist/react/ai/index.js.map +6 -6
  18. package/dist/src/ai/client/ui.d.ts +1 -1
  19. package/dist/src/ai/rag/index.d.ts +1 -1
  20. package/dist/src/ai/rag/presentation.d.ts +12 -1
  21. package/dist/src/ai/rag/ui.d.ts +1 -1
  22. package/dist/src/react/ai/useRAG.d.ts +5 -0
  23. package/dist/src/react/ai/useRAGChunkPreview.d.ts +4 -0
  24. package/dist/src/react/ai/useRAGSources.d.ts +1 -0
  25. package/dist/src/svelte/ai/createRAG.d.ts +5 -0
  26. package/dist/src/svelte/ai/createRAGChunkPreview.d.ts +4 -0
  27. package/dist/src/svelte/ai/createRAGSources.d.ts +1 -0
  28. package/dist/src/vue/ai/useRAG.d.ts +125 -0
  29. package/dist/src/vue/ai/useRAGChunkPreview.d.ts +54 -0
  30. package/dist/src/vue/ai/useRAGDocuments.d.ts +20 -0
  31. package/dist/src/vue/ai/useRAGIndexAdmin.d.ts +10 -0
  32. package/dist/src/vue/ai/useRAGSearch.d.ts +40 -0
  33. package/dist/src/vue/ai/useRAGSources.d.ts +1 -0
  34. package/dist/svelte/ai/index.js +305 -57
  35. package/dist/svelte/ai/index.js.map +6 -6
  36. package/dist/types/ai.d.ts +102 -1
  37. package/dist/vue/ai/index.js +311 -63
  38. package/dist/vue/ai/index.js.map +6 -6
  39. package/package.json +1 -1
package/dist/ai/index.js CHANGED
@@ -216,6 +216,10 @@ var buildContextLabel = (metadata) => {
216
216
  return from ? `Message from ${from}` : "Message evidence";
217
217
  }
218
218
  const page = getContextNumber(metadata.page) ?? getContextNumber(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
219
+ const region = getContextNumber(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
220
+ if (page && region) {
221
+ return `Page ${page} region ${region}`;
222
+ }
219
223
  if (page) {
220
224
  return `Page ${page}`;
221
225
  }
@@ -239,6 +243,11 @@ var buildContextLabel = (metadata) => {
239
243
  if (speaker) {
240
244
  return `Speaker ${speaker}`;
241
245
  }
246
+ const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString(value)).filter((value) => typeof value === "string") : [];
247
+ const sectionTitle = getContextString(metadata.sectionTitle) ?? sectionPath.at(-1);
248
+ if (sectionTitle) {
249
+ return `Section ${sectionTitle}`;
250
+ }
242
251
  return;
243
252
  };
244
253
  var formatMediaTimestamp = (value) => {
@@ -256,6 +265,10 @@ var buildLocatorLabel = (metadata, source, title) => {
256
265
  return;
257
266
  }
258
267
  const page = getContextNumber(metadata.page) ?? getContextNumber(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
268
+ const region = getContextNumber(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
269
+ if (page && region) {
270
+ return `Page ${page} \xB7 Region ${region}`;
271
+ }
259
272
  if (page) {
260
273
  return `Page ${page}`;
261
274
  }
@@ -284,6 +297,10 @@ var buildLocatorLabel = (metadata, source, title) => {
284
297
  if (mediaStart) {
285
298
  return `Timestamp ${mediaStart}`;
286
299
  }
300
+ const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString(value)).filter((value) => typeof value === "string") : [];
301
+ if (sectionPath.length > 0) {
302
+ return `Section ${sectionPath.join(" > ")}`;
303
+ }
287
304
  return;
288
305
  };
289
306
  var formatTimestampLabel = (value) => {
@@ -308,9 +325,11 @@ var buildProvenanceLabel = (metadata) => {
308
325
  const transcriptSource = getContextString(metadata.transcriptSource);
309
326
  const pdfTextMode = getContextString(metadata.pdfTextMode);
310
327
  const ocrEngine = getContextString(metadata.ocrEngine);
328
+ const ocrConfidence = getContextNumber(metadata.ocrRegionConfidence) ?? getContextNumber(metadata.ocrConfidence);
311
329
  const labels = [
312
330
  pdfTextMode ? `PDF ${pdfTextMode}` : "",
313
331
  ocrEngine ? `OCR ${ocrEngine}` : "",
332
+ typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
314
333
  mediaKind ? `Media ${mediaKind}` : "",
315
334
  transcriptSource ? `Transcript ${transcriptSource}` : "",
316
335
  threadTopic ? `Thread ${threadTopic}` : "",
@@ -331,8 +350,10 @@ var buildExcerpt = (text, maxLength = 160) => {
331
350
  var buildGroundingReferenceEvidenceLabel = (reference) => [reference.label, reference.locatorLabel, reference.contextLabel].filter((value) => Boolean(value && value.length > 0)).filter((value, index, values) => values.findIndex((entry) => entry === value) === index).join(" \xB7 ");
332
351
  var buildGroundingReferenceEvidenceSummary = (reference) => [
333
352
  reference.source ?? reference.title ?? reference.chunkId,
353
+ reference.locatorLabel,
354
+ reference.contextLabel,
334
355
  reference.provenanceLabel
335
- ].filter((value) => Boolean(value && value.length > 0)).join(" \xB7 ");
356
+ ].filter((value) => Boolean(value && value.length > 0)).filter((value, index, values) => values.findIndex((entry) => entry === value) === index).join(" \xB7 ");
336
357
  var buildGroundedAnswerCitationDetail = (reference) => ({
337
358
  contextLabel: reference.contextLabel,
338
359
  evidenceLabel: buildGroundingReferenceEvidenceLabel(reference),
@@ -356,12 +377,12 @@ var buildRAGCitations = (sources) => {
356
377
  continue;
357
378
  unique.set(key, {
358
379
  chunkId: source.chunkId,
359
- contextLabel: buildContextLabel(source.metadata),
380
+ contextLabel: source.labels?.contextLabel ?? buildContextLabel(source.metadata),
360
381
  key,
361
382
  label: buildSourceLabel(source),
362
- locatorLabel: buildLocatorLabel(source.metadata, source.source, source.title),
383
+ locatorLabel: source.labels?.locatorLabel ?? buildLocatorLabel(source.metadata, source.source, source.title),
363
384
  metadata: source.metadata,
364
- provenanceLabel: buildProvenanceLabel(source.metadata),
385
+ provenanceLabel: source.labels?.provenanceLabel ?? buildProvenanceLabel(source.metadata),
365
386
  score: source.score,
366
387
  source: source.source,
367
388
  text: source.text,
@@ -431,7 +452,7 @@ var buildRAGGroundingReferences = (sources) => {
431
452
  const citationReferenceMap = buildRAGCitationReferenceMap(citations);
432
453
  return citations.map((citation) => ({
433
454
  chunkId: citation.chunkId,
434
- contextLabel: buildContextLabel(citation.metadata),
455
+ contextLabel: citation.contextLabel ?? buildContextLabel(citation.metadata),
435
456
  excerpt: buildExcerpt(citation.text),
436
457
  label: citation.label,
437
458
  locatorLabel: citation.locatorLabel ?? buildLocatorLabel(citation.metadata, citation.source, citation.title),
@@ -3977,6 +3998,10 @@ var buildContextLabel2 = (metadata) => {
3977
3998
  return from ? `Message from ${from}` : "Message evidence";
3978
3999
  }
3979
4000
  const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
4001
+ const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
4002
+ if (page && region) {
4003
+ return `Page ${page} region ${region}`;
4004
+ }
3980
4005
  if (page) {
3981
4006
  return `Page ${page}`;
3982
4007
  }
@@ -4000,6 +4025,11 @@ var buildContextLabel2 = (metadata) => {
4000
4025
  if (speaker) {
4001
4026
  return `Speaker ${speaker}`;
4002
4027
  }
4028
+ const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
4029
+ const sectionTitle = getContextString2(metadata.sectionTitle) ?? sectionPath.at(-1);
4030
+ if (sectionTitle) {
4031
+ return `Section ${sectionTitle}`;
4032
+ }
4003
4033
  return;
4004
4034
  };
4005
4035
  var buildLocatorLabel2 = (metadata, source, title) => {
@@ -4007,6 +4037,10 @@ var buildLocatorLabel2 = (metadata, source, title) => {
4007
4037
  return;
4008
4038
  }
4009
4039
  const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
4040
+ const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
4041
+ if (page && region) {
4042
+ return `Page ${page} \xB7 Region ${region}`;
4043
+ }
4010
4044
  if (page) {
4011
4045
  return `Page ${page}`;
4012
4046
  }
@@ -4035,6 +4069,10 @@ var buildLocatorLabel2 = (metadata, source, title) => {
4035
4069
  if (mediaStart) {
4036
4070
  return `Timestamp ${mediaStart}`;
4037
4071
  }
4072
+ const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.map((value) => getContextString2(value)).filter((value) => typeof value === "string") : [];
4073
+ if (sectionPath.length > 0) {
4074
+ return `Section ${sectionPath.join(" > ")}`;
4075
+ }
4038
4076
  return;
4039
4077
  };
4040
4078
  var buildProvenanceLabel2 = (metadata) => {
@@ -4049,9 +4087,11 @@ var buildProvenanceLabel2 = (metadata) => {
4049
4087
  const transcriptSource = getContextString2(metadata.transcriptSource);
4050
4088
  const pdfTextMode = getContextString2(metadata.pdfTextMode);
4051
4089
  const ocrEngine = getContextString2(metadata.ocrEngine);
4090
+ const ocrConfidence = getContextNumber2(metadata.ocrRegionConfidence) ?? getContextNumber2(metadata.ocrConfidence);
4052
4091
  const labels = [
4053
4092
  pdfTextMode ? `PDF ${pdfTextMode}` : "",
4054
4093
  ocrEngine ? `OCR ${ocrEngine}` : "",
4094
+ typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
4055
4095
  mediaKind ? `Media ${mediaKind}` : "",
4056
4096
  transcriptSource ? `Transcript ${transcriptSource}` : "",
4057
4097
  threadTopic ? `Thread ${threadTopic}` : "",
@@ -4061,6 +4101,50 @@ var buildProvenanceLabel2 = (metadata) => {
4061
4101
  ].filter((value) => value.length > 0);
4062
4102
  return labels.length > 0 ? labels.join(" \xB7 ") : undefined;
4063
4103
  };
4104
+ var buildRAGSourceLabels = ({
4105
+ metadata,
4106
+ source,
4107
+ title
4108
+ }) => {
4109
+ const contextLabel = buildContextLabel2(metadata);
4110
+ const locatorLabel = buildLocatorLabel2(metadata, source, title);
4111
+ const provenanceLabel = buildProvenanceLabel2(metadata);
4112
+ if (!contextLabel && !locatorLabel && !provenanceLabel) {
4113
+ return;
4114
+ }
4115
+ return {
4116
+ contextLabel,
4117
+ locatorLabel,
4118
+ provenanceLabel
4119
+ };
4120
+ };
4121
+ var buildRAGChunkStructure = (metadata) => {
4122
+ if (!metadata) {
4123
+ return;
4124
+ }
4125
+ const sectionPath = Array.isArray(metadata.sectionPath) ? metadata.sectionPath.filter((value) => typeof value === "string" && value.trim().length > 0) : undefined;
4126
+ const sectionKind = metadata.sectionKind === "markdown_heading" || metadata.sectionKind === "html_heading" ? metadata.sectionKind : undefined;
4127
+ const section = {
4128
+ depth: getContextNumber2(metadata.sectionDepth),
4129
+ kind: sectionKind,
4130
+ path: sectionPath && sectionPath.length > 0 ? sectionPath : undefined,
4131
+ title: getContextString2(metadata.sectionTitle)
4132
+ };
4133
+ const sequence = {
4134
+ nextChunkId: getContextString2(metadata.nextChunkId),
4135
+ previousChunkId: getContextString2(metadata.previousChunkId),
4136
+ sectionChunkCount: getContextNumber2(metadata.sectionChunkCount),
4137
+ sectionChunkId: getContextString2(metadata.sectionChunkId),
4138
+ sectionChunkIndex: getContextNumber2(metadata.sectionChunkIndex)
4139
+ };
4140
+ if (!section.title && (!section.path || section.path.length === 0) && typeof section.depth !== "number" && !section.kind && !sequence.nextChunkId && !sequence.previousChunkId && typeof sequence.sectionChunkCount !== "number" && !sequence.sectionChunkId && typeof sequence.sectionChunkIndex !== "number") {
4141
+ return;
4142
+ }
4143
+ return {
4144
+ section: section.title || section.path && section.path.length > 0 || typeof section.depth === "number" || section.kind ? section : undefined,
4145
+ sequence: sequence.nextChunkId || sequence.previousChunkId || typeof sequence.sectionChunkCount === "number" || sequence.sectionChunkId || typeof sequence.sectionChunkIndex === "number" ? sequence : undefined
4146
+ };
4147
+ };
4064
4148
  var buildExcerpt2 = (text, maxLength = 160) => {
4065
4149
  const normalized = text.replaceAll(/\s+/g, " ").trim();
4066
4150
  if (normalized.length <= maxLength) {
@@ -4068,6 +4152,136 @@ var buildExcerpt2 = (text, maxLength = 160) => {
4068
4152
  }
4069
4153
  return `${normalized.slice(0, Math.max(0, maxLength - 1)).trimEnd()}\u2026`;
4070
4154
  };
4155
+ var buildRAGChunkGraph = (chunks) => {
4156
+ const nodes = [];
4157
+ const edges = [];
4158
+ const edgeKeys = new Set;
4159
+ const sections = new Map;
4160
+ for (const chunk of chunks) {
4161
+ const labels = chunk.labels ?? buildRAGSourceLabels({
4162
+ metadata: chunk.metadata,
4163
+ source: chunk.source,
4164
+ title: chunk.title
4165
+ });
4166
+ const structure = chunk.structure ?? buildRAGChunkStructure(chunk.metadata);
4167
+ nodes.push({
4168
+ chunkId: chunk.chunkId,
4169
+ contextLabel: labels?.contextLabel,
4170
+ label: chunk.source ?? chunk.title ?? chunk.chunkId,
4171
+ locatorLabel: labels?.locatorLabel,
4172
+ provenanceLabel: labels?.provenanceLabel,
4173
+ score: chunk.score,
4174
+ source: chunk.source,
4175
+ structure,
4176
+ title: chunk.title
4177
+ });
4178
+ const previousChunkId = structure?.sequence?.previousChunkId;
4179
+ if (previousChunkId) {
4180
+ const key = `previous:${previousChunkId}:${chunk.chunkId}`;
4181
+ if (!edgeKeys.has(key)) {
4182
+ edgeKeys.add(key);
4183
+ edges.push({
4184
+ fromChunkId: previousChunkId,
4185
+ relation: "previous",
4186
+ toChunkId: chunk.chunkId
4187
+ });
4188
+ }
4189
+ }
4190
+ const nextChunkId = structure?.sequence?.nextChunkId;
4191
+ if (nextChunkId) {
4192
+ const key = `next:${chunk.chunkId}:${nextChunkId}`;
4193
+ if (!edgeKeys.has(key)) {
4194
+ edgeKeys.add(key);
4195
+ edges.push({
4196
+ fromChunkId: chunk.chunkId,
4197
+ relation: "next",
4198
+ toChunkId: nextChunkId
4199
+ });
4200
+ }
4201
+ }
4202
+ const sectionId = structure?.sequence?.sectionChunkId;
4203
+ if (sectionId) {
4204
+ const existing = sections.get(sectionId);
4205
+ if (!existing) {
4206
+ sections.set(sectionId, {
4207
+ chunkCount: structure.sequence?.sectionChunkCount ?? 1,
4208
+ chunkIds: [chunk.chunkId],
4209
+ depth: structure.section?.depth,
4210
+ id: sectionId,
4211
+ kind: structure.section?.kind,
4212
+ path: structure.section?.path,
4213
+ title: structure.section?.title
4214
+ });
4215
+ continue;
4216
+ }
4217
+ if (!existing.chunkIds.includes(chunk.chunkId)) {
4218
+ existing.chunkIds.push(chunk.chunkId);
4219
+ }
4220
+ existing.chunkCount = Math.max(existing.chunkCount, structure.sequence?.sectionChunkCount ?? existing.chunkCount);
4221
+ }
4222
+ }
4223
+ for (const section of sections.values()) {
4224
+ section.chunkIds.sort((left, right) => {
4225
+ const leftNode = nodes.find((node) => node.chunkId === left);
4226
+ const rightNode = nodes.find((node) => node.chunkId === right);
4227
+ const leftIndex = leftNode?.structure?.sequence?.sectionChunkIndex ?? Number.MAX_SAFE_INTEGER;
4228
+ const rightIndex = rightNode?.structure?.sequence?.sectionChunkIndex ?? Number.MAX_SAFE_INTEGER;
4229
+ if (leftIndex !== rightIndex) {
4230
+ return leftIndex - rightIndex;
4231
+ }
4232
+ return left.localeCompare(right);
4233
+ });
4234
+ }
4235
+ nodes.sort((left, right) => {
4236
+ const leftSection = left.structure?.sequence?.sectionChunkIndex ?? Number.MAX_SAFE_INTEGER;
4237
+ const rightSection = right.structure?.sequence?.sectionChunkIndex ?? Number.MAX_SAFE_INTEGER;
4238
+ if (leftSection !== rightSection) {
4239
+ return leftSection - rightSection;
4240
+ }
4241
+ const leftScore = left.score ?? Number.NEGATIVE_INFINITY;
4242
+ const rightScore = right.score ?? Number.NEGATIVE_INFINITY;
4243
+ if (leftScore !== rightScore) {
4244
+ return rightScore - leftScore;
4245
+ }
4246
+ return left.label.localeCompare(right.label);
4247
+ });
4248
+ return {
4249
+ edges,
4250
+ nodes,
4251
+ sections: [...sections.values()].sort((left, right) => (left.title ?? left.id).localeCompare(right.title ?? right.id))
4252
+ };
4253
+ };
4254
+ var buildRAGChunkPreviewGraph = (preview) => buildRAGChunkGraph(preview.chunks.map((chunk) => ({
4255
+ chunkId: chunk.chunkId,
4256
+ labels: chunk.labels,
4257
+ metadata: chunk.metadata,
4258
+ source: chunk.source ?? preview.document.source,
4259
+ structure: chunk.structure,
4260
+ title: chunk.title ?? preview.document.title
4261
+ })));
4262
+ var buildRAGChunkPreviewNavigation = (preview, activeChunkId) => buildRAGChunkGraphNavigation(buildRAGChunkPreviewGraph(preview), activeChunkId);
4263
+ var buildRAGChunkGraphNavigation = (graph, activeChunkId) => {
4264
+ if (graph.nodes.length === 0) {
4265
+ return {
4266
+ activeChunkId,
4267
+ sectionNodes: []
4268
+ };
4269
+ }
4270
+ const activeNode = (activeChunkId ? graph.nodes.find((node) => node.chunkId === activeChunkId) : undefined) ?? graph.nodes[0];
4271
+ const resolvedActiveChunkId = activeNode?.chunkId;
4272
+ const previousNode = activeNode?.structure?.sequence?.previousChunkId ? graph.nodes.find((node) => node.chunkId === activeNode.structure?.sequence?.previousChunkId) : undefined;
4273
+ const nextNode = activeNode?.structure?.sequence?.nextChunkId ? graph.nodes.find((node) => node.chunkId === activeNode.structure?.sequence?.nextChunkId) : undefined;
4274
+ const section = activeNode?.structure?.sequence?.sectionChunkId ? graph.sections.find((entry) => entry.id === activeNode.structure?.sequence?.sectionChunkId) : undefined;
4275
+ const sectionNodes = section ? section.chunkIds.map((chunkId) => graph.nodes.find((node) => node.chunkId === chunkId)).filter((node) => Boolean(node)) : activeNode ? [activeNode] : [];
4276
+ return {
4277
+ activeChunkId: resolvedActiveChunkId,
4278
+ activeNode,
4279
+ nextNode,
4280
+ previousNode,
4281
+ section,
4282
+ sectionNodes
4283
+ };
4284
+ };
4071
4285
  var buildRAGRetrievedState = (messages) => {
4072
4286
  const message = getLatestRetrievedMessage(messages);
4073
4287
  if (!message) {
@@ -4102,13 +4316,14 @@ var buildRAGSourceSummaries = (sources) => {
4102
4316
  citationNumbers: groupCitations.map((citation) => citationReferenceMap[citation.chunkId] ?? 0),
4103
4317
  citations: groupCitations,
4104
4318
  chunkIds: group.chunks.map((chunk) => chunk.chunkId),
4105
- contextLabel: buildContextLabel2(leadChunk?.metadata),
4319
+ contextLabel: leadChunk?.labels?.contextLabel ?? buildContextLabel2(leadChunk?.metadata),
4106
4320
  count: group.count,
4107
4321
  excerpt: buildExcerpt2(leadChunk?.text ?? ""),
4108
4322
  key: group.key,
4109
4323
  label: group.label,
4110
- locatorLabel: buildLocatorLabel2(leadChunk?.metadata, leadChunk?.source, leadChunk?.title),
4111
- provenanceLabel: buildProvenanceLabel2(leadChunk?.metadata),
4324
+ locatorLabel: leadChunk?.labels?.locatorLabel ?? buildLocatorLabel2(leadChunk?.metadata, leadChunk?.source, leadChunk?.title),
4325
+ provenanceLabel: leadChunk?.labels?.provenanceLabel ?? buildProvenanceLabel2(leadChunk?.metadata),
4326
+ structure: leadChunk?.structure ?? buildRAGChunkStructure(leadChunk?.metadata),
4112
4327
  source: group.source,
4113
4328
  title: group.title
4114
4329
  };
@@ -4232,6 +4447,12 @@ var buildSourceGroup = (source, key) => ({
4232
4447
  count: 1,
4233
4448
  key,
4234
4449
  label: buildSourceLabel2(source),
4450
+ labels: source.labels ?? buildRAGSourceLabels({
4451
+ metadata: source.metadata,
4452
+ source: source.source,
4453
+ title: source.title
4454
+ }),
4455
+ structure: source.structure ?? buildRAGChunkStructure(source.metadata),
4235
4456
  source: source.source,
4236
4457
  title: source.title
4237
4458
  });
@@ -4242,7 +4463,20 @@ var updateSourceGroup = (groups, source) => {
4242
4463
  groups.set(key, buildSourceGroup(source, key));
4243
4464
  return;
4244
4465
  }
4245
- existing.bestScore = Math.max(existing.bestScore, source.score);
4466
+ if (source.score > existing.bestScore) {
4467
+ existing.bestScore = source.score;
4468
+ existing.label = buildSourceLabel2(source);
4469
+ existing.labels = source.labels ?? buildRAGSourceLabels({
4470
+ metadata: source.metadata,
4471
+ source: source.source,
4472
+ title: source.title
4473
+ });
4474
+ existing.structure = source.structure ?? buildRAGChunkStructure(source.metadata);
4475
+ existing.source = source.source;
4476
+ existing.title = source.title;
4477
+ } else {
4478
+ existing.bestScore = Math.max(existing.bestScore, source.score);
4479
+ }
4246
4480
  existing.count += 1;
4247
4481
  existing.chunks.push(source);
4248
4482
  };
@@ -7787,11 +8021,71 @@ var decodeHtmlEntities = (value) => {
7787
8021
  output = output.replace(/&#(\d+);/g, (_, code) => String.fromCodePoint(Number(code)));
7788
8022
  return output.replace(/&#x([0-9a-f]+);/gi, (_, code) => String.fromCodePoint(parseInt(code, 16)));
7789
8023
  };
7790
- var stripHtml = (value) => {
7791
- const withoutTags = value.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, " ").replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, " ").replace(/<br\s*\/?>/gi, `
8024
+ var formatHtmlLinkContext = (href) => {
8025
+ const decoded = decodeHtmlEntities(href.trim());
8026
+ if (!decoded) {
8027
+ return;
8028
+ }
8029
+ if (decoded.startsWith("#")) {
8030
+ return decoded;
8031
+ }
8032
+ if (/^[a-z]+:/i.test(decoded)) {
8033
+ try {
8034
+ const url = new URL(decoded);
8035
+ const path = url.pathname === "/" ? "" : url.pathname;
8036
+ return `${url.hostname}${path}`;
8037
+ } catch {
8038
+ return decoded;
8039
+ }
8040
+ }
8041
+ return decoded;
8042
+ };
8043
+ var stripHtmlTags = (value) => {
8044
+ const withoutTags = value.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, " ").replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, " ").replace(/<a\b[^>]*href=(['"])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi, (_match, _quote, href, inner) => {
8045
+ const label = normalizeWhitespace(stripHtmlTags(inner));
8046
+ const context = formatHtmlLinkContext(href);
8047
+ if (!label) {
8048
+ return context ?? " ";
8049
+ }
8050
+ if (!context || context === label) {
8051
+ return label;
8052
+ }
8053
+ return `${label} (${context})`;
8054
+ }).replace(/<br\s*\/?>/gi, `
7792
8055
  `).replace(/<\/(p|div|section|article|li|ul|ol|h[1-6]|table|tr)>/gi, `
7793
8056
  `).replace(/<li\b[^>]*>/gi, "- ").replace(/<[^>]+>/g, " ");
7794
- return normalizeWhitespace(decodeHtmlEntities(withoutTags));
8057
+ return decodeHtmlEntities(withoutTags);
8058
+ };
8059
+ var extractMainHtmlContent = (value) => {
8060
+ const trimmed = value.trim();
8061
+ if (!/<html\b|<body\b|<main\b|<article\b/i.test(trimmed)) {
8062
+ return value;
8063
+ }
8064
+ const boilerplateStripped = trimmed.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, " ").replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, " ").replace(/<(nav|footer|header|aside|form)\b[^>]*>[\s\S]*?<\/\1>/gi, " ");
8065
+ const mainMatch = boilerplateStripped.match(/<main\b[^>]*>([\s\S]*?)<\/main>/i);
8066
+ if (mainMatch?.[1]) {
8067
+ return mainMatch[1];
8068
+ }
8069
+ const articleMatches = [
8070
+ ...boilerplateStripped.matchAll(/<article\b[^>]*>([\s\S]*?)<\/article>/gi)
8071
+ ].map((match) => match[1]?.trim()).filter(Boolean);
8072
+ if (articleMatches.length > 0) {
8073
+ return articleMatches.join(`
8074
+ `);
8075
+ }
8076
+ const roleMainMatch = boilerplateStripped.match(/<([a-z0-9:_-]+)\b[^>]*\brole=(['"])main\2[^>]*>([\s\S]*?)<\/\1>/i);
8077
+ if (roleMainMatch?.[3]) {
8078
+ return roleMainMatch[3];
8079
+ }
8080
+ const bodyMatch = boilerplateStripped.match(/<body\b[^>]*>([\s\S]*?)<\/body>/i);
8081
+ if (bodyMatch?.[1]) {
8082
+ return bodyMatch[1];
8083
+ }
8084
+ return boilerplateStripped;
8085
+ };
8086
+ var stripHtml = (value) => {
8087
+ const focused = extractMainHtmlContent(value);
8088
+ return normalizeWhitespace(stripHtmlTags(focused));
7795
8089
  };
7796
8090
  var stripMarkdown = (value) => {
7797
8091
  const withoutCodeBlocks = value.replace(/```[\s\S]*?```/g, (block) => {
@@ -7811,31 +8105,93 @@ var markdownStructureUnits = (value) => {
7811
8105
  `);
7812
8106
  const sections = [];
7813
8107
  let current = [];
8108
+ let currentPath = [];
8109
+ const headingStack = [];
7814
8110
  const flushCurrentSection = () => {
7815
8111
  if (current.length === 0) {
7816
8112
  return;
7817
8113
  }
7818
- sections.push(current.join(`
7819
- `));
8114
+ sections.push({
8115
+ lines: current,
8116
+ sectionPath: [...currentPath]
8117
+ });
7820
8118
  current = [];
7821
8119
  };
7822
8120
  for (const line of lines) {
7823
- const startsNewSection = /^\s*#{1,6}\s+/.test(line) && current.length > 0;
7824
- if (startsNewSection)
7825
- flushCurrentSection();
8121
+ const headingMatch = line.match(/^\s*(#{1,6})\s+(.+)$/);
8122
+ if (headingMatch) {
8123
+ if (current.length > 0) {
8124
+ flushCurrentSection();
8125
+ }
8126
+ const depth = headingMatch[1]?.length ?? 1;
8127
+ const headingText = normalizeWhitespace(headingMatch[2] ?? "");
8128
+ if (headingText) {
8129
+ headingStack[depth - 1] = headingText;
8130
+ headingStack.length = depth;
8131
+ currentPath = [...headingStack];
8132
+ }
8133
+ }
7826
8134
  current.push(line);
7827
8135
  }
7828
8136
  flushCurrentSection();
7829
- return sections.map((section) => stripMarkdown(section)).map((section) => normalizeWhitespace(section)).filter(Boolean);
8137
+ return sections.map(({ lines: sectionLines, sectionPath }) => ({
8138
+ sectionDepth: sectionPath.length > 0 ? sectionPath.length : undefined,
8139
+ sectionKind: sectionPath.length > 0 ? "markdown_heading" : undefined,
8140
+ sectionPath: sectionPath.length > 0 ? sectionPath : undefined,
8141
+ sectionTitle: sectionPath.at(-1),
8142
+ text: normalizeWhitespace(stripMarkdown(sectionLines.join(`
8143
+ `)))
8144
+ })).filter((section) => Boolean(section.text));
8145
+ };
8146
+ var joinHtmlHeadingSection = (headings, content) => {
8147
+ const normalizedHeadings = headings.map((heading) => normalizeWhitespace(heading));
8148
+ const combined = [...normalizedHeadings, content].filter(Boolean).join(`
8149
+ `);
8150
+ return normalizeWhitespace(combined);
7830
8151
  };
7831
8152
  var htmlStructureUnits = (value) => {
7832
- const marked = value.replace(/<(section|article|main|aside|nav|h[1-6])\b[^>]*>/gi, `
7833
-
7834
- __ABS_SECTION_BREAK__ `).replace(/<\/(section|article|main|aside|nav|h[1-6])>/gi, `
7835
-
7836
- `);
7837
- const normalized = stripHtml(marked);
7838
- return normalized.split(/__ABS_SECTION_BREAK__/).map((section) => normalizeWhitespace(section)).filter(Boolean);
8153
+ const focused = extractMainHtmlContent(value);
8154
+ const headingPattern = /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi;
8155
+ const sections = [];
8156
+ const headingStack = [];
8157
+ let cursor = 0;
8158
+ let currentContentStart = 0;
8159
+ let activeHeadings = [];
8160
+ const flushSection = (end) => {
8161
+ const content = normalizeWhitespace(stripHtmlTags(focused.slice(currentContentStart, end)));
8162
+ if (!content) {
8163
+ return;
8164
+ }
8165
+ const section = joinHtmlHeadingSection(activeHeadings, content);
8166
+ if (section) {
8167
+ sections.push({
8168
+ sectionDepth: activeHeadings.length > 0 ? activeHeadings.length : undefined,
8169
+ sectionKind: activeHeadings.length > 0 ? "html_heading" : undefined,
8170
+ sectionPath: activeHeadings.length > 0 ? [...activeHeadings] : undefined,
8171
+ sectionTitle: activeHeadings.at(-1),
8172
+ text: section
8173
+ });
8174
+ }
8175
+ };
8176
+ for (const match of focused.matchAll(headingPattern)) {
8177
+ const fullMatch = match[0];
8178
+ const start = match.index ?? cursor;
8179
+ flushSection(start);
8180
+ const level = Number.parseInt(match[1] ?? "1", 10);
8181
+ const headingText = normalizeWhitespace(stripHtmlTags(match[2] ?? ""));
8182
+ if (headingText) {
8183
+ headingStack[level - 1] = headingText;
8184
+ headingStack.length = level;
8185
+ activeHeadings = [...headingStack];
8186
+ }
8187
+ cursor = start + fullMatch.length;
8188
+ currentContentStart = cursor;
8189
+ }
8190
+ flushSection(focused.length);
8191
+ if (sections.length > 0) {
8192
+ return sections;
8193
+ }
8194
+ return [{ text: normalizeWhitespace(stripHtmlTags(focused)) }].filter((section) => Boolean(section.text));
7839
8195
  };
7840
8196
  var inferFormat = (document) => {
7841
8197
  if (document.format) {
@@ -7927,10 +8283,77 @@ var isLikelyTextData = (data) => {
7927
8283
  };
7928
8284
  var decodePdfLiteral = (value) => value.replace(/\\([\\()])/g, "$1").replace(/\\n/g, `
7929
8285
  `).replace(/\\r/g, "\r").replace(/\\t/g, "\t").replace(/\\b/g, "\b").replace(/\\f/g, "\f").replace(/\\([0-7]{1,3})/g, (_match, octal) => String.fromCharCode(parseInt(octal, 8)));
8286
+ var PDF_TABLE_GAP_THRESHOLD = 120;
8287
+ var extractPdfArrayText = (value) => {
8288
+ const parts = [];
8289
+ const tokenPattern = /\(((?:\\.|[^\\)])*)\)|([-+]?\d*\.?\d+)/g;
8290
+ let pendingColumnGap = false;
8291
+ for (const match of value.matchAll(tokenPattern)) {
8292
+ if (match[1] !== undefined) {
8293
+ const decoded = decodePdfLiteral(match[1]);
8294
+ if (pendingColumnGap && decoded && !/^\s/.test(decoded) && parts.at(-1) !== " | ") {
8295
+ parts.push(" | ");
8296
+ }
8297
+ parts.push(decoded);
8298
+ pendingColumnGap = false;
8299
+ continue;
8300
+ }
8301
+ const gap = Number(match[2]);
8302
+ if (Number.isFinite(gap) && gap >= PDF_TABLE_GAP_THRESHOLD) {
8303
+ pendingColumnGap = true;
8304
+ }
8305
+ }
8306
+ return normalizeWhitespace(parts.join("")).replace(/\s+\|\s+/g, " | ").trim();
8307
+ };
8308
+ var appendPdfText = (parts, value) => {
8309
+ if (!value) {
8310
+ return;
8311
+ }
8312
+ parts.push(value);
8313
+ };
8314
+ var appendPdfLineBreak = (parts) => {
8315
+ const last = parts.at(-1);
8316
+ if (!last || last.endsWith(`
8317
+ `)) {
8318
+ return;
8319
+ }
8320
+ parts.push(`
8321
+ `);
8322
+ };
8323
+ var PDF_TEXT_OPERATOR_PATTERN = /(\[((?:\\.|[^\]])*)\]\s*TJ)|(\(((?:\\.|[^\\)])*)\)\s*Tj)|([-+]?\d*\.?\d+\s+[-+]?\d*\.?\d+\s+\(((?:\\.|[^\\)])*)\)\s*")|(\(((?:\\.|[^\\)])*)\)\s*')|((?:[-+]?\d*\.?\d+\s+){2}(?:Td|TD))|(T\*)|((?:[-+]?\d*\.?\d+\s+){6}Tm)/g;
8324
+ var extractTextFromPDFTextObject = (value) => {
8325
+ const parts = [];
8326
+ for (const match of value.matchAll(PDF_TEXT_OPERATOR_PATTERN)) {
8327
+ if (match[2] !== undefined) {
8328
+ appendPdfText(parts, extractPdfArrayText(match[2]));
8329
+ continue;
8330
+ }
8331
+ if (match[4] !== undefined) {
8332
+ appendPdfText(parts, decodePdfLiteral(match[4]));
8333
+ continue;
8334
+ }
8335
+ if (match[6] !== undefined) {
8336
+ appendPdfLineBreak(parts);
8337
+ appendPdfText(parts, decodePdfLiteral(match[6]));
8338
+ continue;
8339
+ }
8340
+ if (match[8] !== undefined) {
8341
+ appendPdfLineBreak(parts);
8342
+ appendPdfText(parts, decodePdfLiteral(match[8]));
8343
+ continue;
8344
+ }
8345
+ if (match[9] !== undefined || match[10] !== undefined || match[11] !== undefined) {
8346
+ appendPdfLineBreak(parts);
8347
+ }
8348
+ }
8349
+ return parts.join("");
8350
+ };
7930
8351
  var extractTextFromPDFBytes = (data) => {
7931
8352
  const raw = Buffer.from(data).toString("latin1");
7932
- const matches = [...raw.matchAll(/\(((?:\\.|[^\\)])*)\)\s*Tj/g)];
7933
- const combined = matches.map((match) => decodePdfLiteral(match[1] ?? "")).join(`
8353
+ const textObjects = [...raw.matchAll(/BT([\s\S]*?)ET/g)].map((match) => extractTextFromPDFTextObject(match[1] ?? "")).filter(Boolean);
8354
+ const combined = textObjects.length > 0 ? textObjects.join(`
8355
+
8356
+ `) : [...raw.matchAll(/\(((?:\\.|[^\\)])*)\)\s*Tj/g)].map((match) => decodePdfLiteral(match[1] ?? "")).join(`
7934
8357
  `);
7935
8358
  return normalizeWhitespace(combined);
7936
8359
  };
@@ -8022,7 +8445,40 @@ var decodeGzipEntries = (data, input) => {
8022
8445
  ];
8023
8446
  };
8024
8447
/**
 * Reduces an XML string to its text: every tag becomes a space, whitespace
 * runs collapse to one space, entities are decoded and the result is
 * normalized.
 *
 * @param {string} value - XML markup.
 * @returns {string} Extracted text.
 */
var extractXmlText = (value) => {
  const withoutTags = value.replace(/<[^>]+>/g, " ");
  const collapsed = withoutTags.replace(/\s+/g, " ");
  return normalizeWhitespace(decodeHtmlEntities(collapsed));
};
8448
/**
 * Extracts text from one WordprocessingML paragraph: self-closing `<w:tab/>`
 * becomes a tab, self-closing `<w:br/>` becomes a newline, every other tag
 * becomes a space; entities are then decoded and whitespace normalized.
 *
 * @param {string} value - XML of a single paragraph.
 * @returns {string} Paragraph text.
 */
var extractOfficeParagraphText = (value) => {
  const withTabs = value.replace(/<w:tab\b[^>]*\/>/gi, "\t");
  const withBreaks = withTabs.replace(/<w:br\b[^>]*\/>/gi, "\n");
  const withoutTags = withBreaks.replace(/<[^>]+>/g, " ");
  return normalizeWhitespace(decodeHtmlEntities(withoutTags));
};
8450
/**
 * Returns the non-empty paragraph texts of `word/document.xml`.
 *
 * The previous implementation also looked up each paragraph's `w:pStyle`
 * and branched on "title" / "headingN", but every branch returned the same
 * text, so that per-paragraph work was dead and has been removed. Output is
 * unchanged.
 *
 * @param {{path: string, data: *}[]} entries - Archive entries of the document.
 * @returns {string[]} One string per paragraph that has text; empty when the
 *   document entry is missing.
 */
var officeDocumentParagraphs = (entries) => {
  const documentEntry = entries.find((entry) => entry.path === "word/document.xml");
  if (!documentEntry) {
    return [];
  }
  const xml = decodeUtf8(documentEntry.data);
  return [...xml.matchAll(/<w:p\b[\s\S]*?<\/w:p>/g)]
    .map((match) => extractOfficeParagraphText(match[0] ?? ""))
    .filter(Boolean);
};
8025
8475
  var officeDocumentText = (entries) => {
8476
+ const paragraphs = officeDocumentParagraphs(entries);
8477
+ if (paragraphs.length > 0) {
8478
+ return normalizeWhitespace(paragraphs.join(`
8479
+
8480
+ `));
8481
+ }
8026
8482
  const documentEntry = entries.find((entry) => entry.path === "word/document.xml");
8027
8483
  if (!documentEntry) {
8028
8484
  return "";
@@ -8037,31 +8493,68 @@ var officeDocumentSectionCount = (entries) => {
8037
8493
  const count = [...decodeUtf8(documentEntry.data).matchAll(/<w:p\b/g)].length;
8038
8494
  return count > 0 ? count : undefined;
8039
8495
  };
8040
- var spreadsheetText = (entries) => {
8041
- const sharedStrings = entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
8042
- ...decodeUtf8(entry.data).matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)
8043
- ].map((match) => decodeHtmlEntities(match[1] ?? "")));
8044
- const sheetValues = entries.filter((entry) => entry.path.startsWith("xl/worksheets/") && entry.path.endsWith(".xml")).flatMap((entry) => [...decodeUtf8(entry.data).matchAll(/<v>([\s\S]*?)<\/v>/g)].map((match) => match[1] ?? "")).map((value) => {
8045
- const index = Number(value);
8046
- return Number.isInteger(index) && sharedStrings[index] ? sharedStrings[index] : value;
8496
/**
 * Builds the shared-string table from `xl/sharedStrings.xml`.
 *
 * Cells reference shared strings by the position of the `<si>` item, and a
 * rich-text item holds several `<t>` runs. Collecting every `<t>` as its own
 * entry shifted all indexes after the first rich-text item, so the table is
 * now built per `<si>`, concatenating its runs and skipping phonetic
 * (`<rPh>`) runs. When no `<si>` element is found (for example prefixed
 * element names) the previous per-`<t>` behaviour is kept.
 *
 * @param {{path: string, data: *}[]} entries - Archive entries of the workbook.
 * @returns {string[]} Shared strings in index order.
 */
var spreadsheetSharedStrings = (entries) => entries
  .filter((entry) => entry.path === "xl/sharedStrings.xml")
  .flatMap((entry) => {
    const xml = decodeUtf8(entry.data);
    // The first alternative handles a self-closing <si/> so it cannot swallow the next item.
    const items = [...xml.matchAll(/<si\b[^>]*?(?:\/>|>([\s\S]*?)<\/si>)/g)];
    if (items.length === 0) {
      return [...xml.matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)].map((match) => decodeHtmlEntities(match[1] ?? ""));
    }
    return items.map((item) => {
      const body = (item[1] ?? "").replace(/<rPh\b[\s\S]*?<\/rPh>/g, "");
      // The lookbehind keeps a self-closing <t .../> from being read as an opening tag.
      return [...body.matchAll(/<t(?:\s[^>]*)?(?<!\/)>([\s\S]*?)<\/t>/g)]
        .map((match) => decodeHtmlEntities(match[1] ?? ""))
        .join("");
    });
  });
8499
/**
 * Returns the upper-cased first run of letters in a cell reference
 * ("ab12" → "AB"), or "" when the reference is null/undefined or has no
 * letters.
 *
 * @param {string|undefined|null} reference - Cell reference such as "B7".
 * @returns {string} Column letters or "".
 */
var spreadsheetColumnLabel = (reference) => {
  if (reference == null) {
    return "";
  }
  const letters = reference.match(/([A-Z]+)/i);
  return letters === null ? "" : letters[1].toUpperCase();
};
8503
/**
 * Resolves the display text of one worksheet cell.
 *
 * Order of precedence: a non-empty inline string (`<is><t>`), then the `<v>`
 * value. A `<v>` of a cell typed `t="s"` is looked up in the shared-string
 * table.
 *
 * Fix: the lookup used a truthiness check, so an empty shared string made
 * the function return the numeric index itself (e.g. "3") as cell text. Any
 * string found in the table, including "", is now returned; only a missing
 * entry falls back to the raw value.
 *
 * @param {string} cellXml - Full XML of a `<c>` element.
 * @param {string[]} sharedStrings - Shared-string table in index order.
 * @returns {string} Cell text, or "" when the cell has no value.
 */
var spreadsheetResolveCellValue = (cellXml, sharedStrings) => {
  const inlineMatch = cellXml.match(/<is\b[^>]*>[\s\S]*?<t[^>]*>([\s\S]*?)<\/t>[\s\S]*?<\/is>/i);
  if (inlineMatch?.[1]) {
    return normalizeWhitespace(decodeHtmlEntities(inlineMatch[1]));
  }
  const valueMatch = cellXml.match(/<v>([\s\S]*?)<\/v>/i);
  if (!valueMatch?.[1]) {
    return "";
  }
  const rawValue = decodeHtmlEntities(valueMatch[1]);
  const typeMatch = cellXml.match(/\bt="([^"]+)"/i);
  if (typeMatch?.[1] === "s") {
    const index = Number(rawValue);
    const shared = Number.isInteger(index) ? sharedStrings[index] : undefined;
    return typeof shared === "string" ? shared : rawValue;
  }
  return normalizeWhitespace(rawValue);
};
8520
/**
 * Parses worksheet XML into rows of non-empty cells.
 *
 * Fix: self-closing elements are now recognised. Before, `<c r="A1" s="1"/>`
 * (a styled empty cell) was read as an opening tag and consumed everything
 * up to the next `</c>`, so the following cell's value was reported under
 * the wrong reference and column; `<row .../>` swallowed the next row the
 * same way.
 *
 * @param {string} worksheetXml - Contents of a worksheet part.
 * @param {string[]} sharedStrings - Shared-string table in index order.
 * @returns {{column: string, reference: (string|undefined), value: string}[][]}
 *   Rows that have at least one cell with a value, in document order.
 */
var spreadsheetWorksheetRows = (worksheetXml, sharedStrings) => [...worksheetXml.matchAll(/<row\b[^>]*?(?:\/>|>([\s\S]*?)<\/row>)/gi)]
  .map((rowMatch) => {
    // Group 1 is undefined for a self-closing row, which then has no cells.
    const rowXml = rowMatch[1] ?? "";
    return [...rowXml.matchAll(/<c\b([^>]*?)(?:\/>|>([\s\S]*?)<\/c>)/gi)]
      .map((cellMatch) => {
        const attributes = cellMatch[1] ?? "";
        const cellBody = cellMatch[2] ?? "";
        const reference = attributes.match(/\br="([^"]+)"/i)?.[1];
        const value = spreadsheetResolveCellValue(`<c${attributes}>${cellBody}</c>`, sharedStrings);
        return {
          column: spreadsheetColumnLabel(reference),
          reference,
          value
        };
      })
      .filter((cell) => cell.value);
  })
  .filter((row) => row.length > 0);
8536
/**
 * Renders one row as "label: value" pairs joined by " | ".
 *
 * The label for a cell is the header at the same position when that header
 * is truthy, otherwise the cell's column letters; a cell with neither is
 * rendered as its bare value.
 *
 * @param {{column: string, value: string}[]} row - Cells of the row.
 * @param {string[]} headers - Header texts by position; may be empty.
 * @returns {string} Normalized row text.
 */
var spreadsheetRowText = (row, headers) => {
  const rendered = [];
  row.forEach((cell, position) => {
    const label = headers[position] || cell.column;
    rendered.push(label ? `${label}: ${cell.value}` : cell.value);
  });
  return normalizeWhitespace(rendered.join(" | "));
};
8051
8546
  var spreadsheetSheetTexts = (entries) => {
8052
- const sharedStrings = entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
8053
- ...decodeUtf8(entry.data).matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)
8054
- ].map((match) => decodeHtmlEntities(match[1] ?? "")));
8547
+ const sharedStrings = spreadsheetSharedStrings(entries);
8055
8548
  const sheetNames = spreadsheetSheetNames(entries);
8056
8549
  const sheetEntries = entries.filter((entry) => entry.path.startsWith("xl/worksheets/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path));
8057
8550
  return sheetEntries.map((entry, index) => {
8058
- const values = [
8059
- ...decodeUtf8(entry.data).matchAll(/<v>([\s\S]*?)<\/v>/g)
8060
- ].map((match) => match[1] ?? "").map((value) => {
8061
- const sharedStringIndex = Number(value);
8062
- return Number.isInteger(sharedStringIndex) && sharedStrings[sharedStringIndex] ? sharedStrings[sharedStringIndex] : value;
8063
- });
8064
- const text = normalizeWhitespace(values.join(`
8551
+ const rows = spreadsheetWorksheetRows(decodeUtf8(entry.data), sharedStrings);
8552
+ if (rows.length === 0) {
8553
+ return null;
8554
+ }
8555
+ const headers = rows[0].map((cell) => cell.value);
8556
+ const rowTexts = rows.map((row, rowIndex) => normalizeWhitespace(`Row ${rowIndex + 1}. ${spreadsheetRowText(row, rowIndex === 0 ? [] : headers)}`));
8557
+ const text = normalizeWhitespace(rowTexts.join(`
8065
8558
  `));
8066
8559
  if (!text) {
8067
8560
  return null;
@@ -8072,19 +8565,38 @@ var spreadsheetSheetTexts = (entries) => {
8072
8565
  };
8073
8566
  }).filter((entry) => Boolean(entry));
8074
8567
  };
8568
/**
 * Renders the whole workbook as text: one "Sheet <name>" heading followed by
 * that sheet's text, with sheets separated by a blank line.
 *
 * @param {{path: string, data: *}[]} entries - Archive entries of the workbook.
 * @returns {string} Normalized workbook text.
 */
var spreadsheetText = (entries) => {
  const sections = spreadsheetSheetTexts(entries).map((sheet) => `Sheet ${sheet.name}\n${sheet.text}`);
  return normalizeWhitespace(sections.join("\n\n"));
};
8075
8572
/**
 * Lists the `name` attributes of `<sheet>` elements found in
 * `xl/workbook.xml`, in document order, skipping empty names.
 *
 * @param {{path: string, data: *}[]} entries - Archive entries of the workbook.
 * @returns {string[]} Sheet names.
 */
var spreadsheetSheetNames = (entries) => {
  const names = [];
  for (const entry of entries) {
    if (entry.path !== "xl/workbook.xml") {
      continue;
    }
    for (const match of decodeUtf8(entry.data).matchAll(/<sheet[^>]*name="([^"]+)"/g)) {
      const name = match[1] ?? "";
      if (name) {
        names.push(name);
      }
    }
  }
  return names;
};
8078
- var presentationText = (entries) => {
8079
- const slides = entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).map((entry) => extractXmlText(decodeUtf8(entry.data)));
8080
- return normalizeWhitespace(slides.join(`
8081
-
8575
/**
 * Maps a zero-based index to speaker-notes text for every
 * `ppt/notesSlides/*.xml` entry.
 *
 * The index is the number in the `notesSlideN.xml` file name minus one.
 * Entries without such a number (index below zero) or without text are left
 * out. Entries are processed in path order, so for duplicate indexes the
 * later path wins.
 *
 * @param {{path: string, data: *}[]} entries - Archive entries of the deck.
 * @returns {Map<number, string>} Notes text keyed by index.
 */
var presentationNotesByIndex = (entries) => {
  const noteEntries = entries
    .filter((entry) => entry.path.startsWith("ppt/notesSlides/") && entry.path.endsWith(".xml"))
    .sort((left, right) => left.path.localeCompare(right.path));
  const notes = new Map();
  for (const entry of noteEntries) {
    const numberMatch = entry.path.match(/notesSlide(\d+)\.xml$/i);
    const index = Number(numberMatch?.[1] ?? "0") - 1;
    const text = normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)));
    if (index >= 0 && Boolean(text)) {
      notes.set(index, text);
    }
  }
  return notes;
};
8583
/**
 * Returns the slides of a presentation with their text and speaker notes.
 *
 * Fix: slides were ordered with a plain string comparison of their paths,
 * which places `slide10.xml` before `slide2.xml`. For decks with ten or more
 * slides this gave the wrong slide order and paired notes (keyed by file
 * number) with the wrong slide. Slides are now ordered by the number in
 * their file name; paths without a number sort last, and ties fall back to
 * the path comparison.
 *
 * @param {{path: string, data: *}[]} entries - Archive entries of the deck.
 * @returns {{index: number, notesText: (string|undefined), text: string}[]}
 *   Slides that have text; `index` is the zero-based position in slide order.
 */
var presentationSlides = (entries) => {
  const notesByIndex = presentationNotesByIndex(entries);
  const slideNumber = (path) => {
    const match = path.match(/(\d+)\.xml$/i);
    return match ? Number(match[1]) : Number.POSITIVE_INFINITY;
  };
  return entries
    .filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml"))
    .sort((left, right) => {
      const leftNumber = slideNumber(left.path);
      const rightNumber = slideNumber(right.path);
      if (leftNumber !== rightNumber) {
        return leftNumber < rightNumber ? -1 : 1;
      }
      return left.path.localeCompare(right.path);
    })
    .map((entry, index) => {
      const slideText = normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)));
      const notesText = notesByIndex.get(index);
      const text = normalizeWhitespace([slideText, notesText ? `Speaker notes: ${notesText}` : ""].filter(Boolean).join("\n"));
      return {
        index,
        notesText,
        text
      };
    })
    .filter((slide) => Boolean(slide.text));
};
8084
- var presentationSlides = (entries) => entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path)).map((entry, index) => ({
8085
- index,
8086
- text: normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)))
8087
- })).filter((slide) => Boolean(slide.text));
8597
/**
 * Joins the text of every slide (as produced by presentationSlides) with a
 * blank line between slides.
 *
 * @param {{path: string, data: *}[]} entries - Archive entries of the deck.
 * @returns {string} Normalized presentation text.
 */
var presentationText = (entries) => {
  const slideTexts = presentationSlides(entries).map((slide) => slide.text);
  return normalizeWhitespace(slideTexts.join("\n\n"));
};
8088
8600
/**
 * Counts entries whose path starts with "ppt/slides/" and ends with ".xml".
 *
 * @param {{path: string}[]} entries - Archive entries of the deck.
 * @returns {number} Number of slide parts.
 */
var presentationSlideCount = (entries) => entries.reduce((count, entry) => {
  const isSlide = entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml");
  return isSlide ? count + 1 : count;
}, 0);
8089
8601
  var epubText = (entries) => {
8090
8602
  const htmlEntries = entries.filter((entry) => /\.(xhtml|html|htm)$/i.test(entry.path));
@@ -8092,17 +8604,113 @@ var epubText = (entries) => {
8092
8604
 
8093
8605
  `));
8094
8606
  };
8095
- var extractEmailText = (raw) => {
8607
/**
 * Splits a raw message at the first blank line.
 *
 * Line endings (`\r\n` and bare `\r`) are converted to `\n` first. When there
 * is no blank line, the whole normalized input is the header block and the
 * body is empty.
 *
 * @param {string} raw - Raw message or MIME part.
 * @returns {{body: string, headerBlock: string}} Both with `\n` line endings.
 */
var splitEmailMessage = (raw) => {
  const normalized = raw.replace(/\r\n?/g, "\n");
  const blankLineAt = normalized.indexOf("\n\n");
  const hasBody = blankLineAt >= 0;
  return {
    body: hasBody ? normalized.slice(blankLineAt + 2) : "",
    headerBlock: hasBody ? normalized.slice(0, blankLineAt) : normalized
  };
};
8624
/**
 * Parses a header block into a Map of lower-cased header name → trimmed value.
 *
 * Continuation lines (a newline followed by spaces or tabs) are unfolded into
 * a single space. Lines without a colon are ignored. A repeated header keeps
 * the value of its last occurrence.
 *
 * @param {string} headerBlock - Header lines separated by `\n`.
 * @returns {Map<string, string>} Parsed headers.
 */
var parseHeaderBlock = (headerBlock) => {
  const headers = new Map();
  headerBlock
    .replace(/\n[ \t]+/g, " ")
    .split("\n")
    .forEach((line) => {
      const colonAt = line.indexOf(":");
      if (colonAt >= 0) {
        const name = line.slice(0, colonAt).trim().toLowerCase();
        headers.set(name, line.slice(colonAt + 1).trim());
      }
    });
  return headers;
};
8637
/**
 * Decodes quoted-printable text: soft line breaks (`=` at end of line) are
 * removed and each `=XX` hex escape becomes the character with that code.
 *
 * @param {string} value - Quoted-printable encoded text.
 * @returns {string} Decoded string, one character per escaped byte.
 */
var decodeQuotedPrintable = (value) => {
  const withoutSoftBreaks = value.replace(/=\r?\n/g, "");
  return withoutSoftBreaks.replace(/=([0-9A-F]{2})/gi, (_match, hex) => String.fromCharCode(parseInt(hex, 16)));
};
8638
/**
 * Decodes the body of a MIME part into bytes according to its
 * Content-Transfer-Encoding.
 *
 * - "base64": whitespace is stripped and the rest is base64-decoded.
 * - "quoted-printable": soft line breaks are removed and `=XX` escapes are
 *   written as the raw bytes they denote.
 * - anything else: the body is encoded as UTF-8.
 *
 * Fix: quoted-printable escapes were first turned into characters and then
 * re-encoded as UTF-8, so every byte ≥ 0x80 became two bytes and UTF-8 text
 * such as `=C3=A9` decoded to "Ã©" instead of "é". Escaped bytes are now
 * emitted unchanged; unescaped text is still encoded as UTF-8.
 *
 * @param {string} body - Encoded part body.
 * @param {string|undefined} encoding - Content-Transfer-Encoding value
 *   (compared case-insensitively).
 * @returns {Uint8Array} Decoded bytes.
 */
var decodeEmailPartBody = (body, encoding) => {
  const normalizedEncoding = encoding?.toLowerCase();
  if (normalizedEncoding === "base64") {
    return new Uint8Array(Buffer.from(body.trim().replace(/\s+/g, ""), "base64"));
  }
  if (normalizedEncoding === "quoted-printable") {
    const unwrapped = body.replace(/=\r?\n/g, "");
    const chunks = [];
    let cursor = 0;
    // Each match is a run of consecutive =XX escapes, i.e. a run of raw bytes.
    for (const match of unwrapped.matchAll(/(?:=[0-9A-F]{2})+/gi)) {
      if (match.index > cursor) {
        chunks.push(Buffer.from(unwrapped.slice(cursor, match.index), "utf8"));
      }
      chunks.push(Buffer.from(match[0].replace(/=/g, ""), "hex"));
      cursor = match.index + match[0].length;
    }
    if (cursor < unwrapped.length) {
      chunks.push(Buffer.from(unwrapped.slice(cursor), "utf8"));
    }
    return new Uint8Array(Buffer.concat(chunks));
  }
  return new Uint8Array(Buffer.from(body, "utf8"));
};
8649
/**
 * Reads the `boundary` parameter from a Content-Type value.
 *
 * @param {string|undefined|null} contentType - Content-Type header value.
 * @returns {string|undefined} The boundary without surrounding quotes, or
 *   undefined when the value is missing or has no boundary parameter.
 */
var parseMimeBoundary = (contentType) => {
  if (contentType == null) {
    return undefined;
  }
  const found = contentType.match(/boundary="?([^";]+)"?/i);
  return found === null ? undefined : found[1];
};
8653
/**
 * Splits a message body into text, HTML and attachments.
 *
 * Without a boundary in `contentType` the body is treated as a single part:
 * an `<html>...</html>` span becomes `bodyHtml`, otherwise the whole body is
 * `bodyText`. With a boundary, each part is parsed: parts that carry a file
 * name (Content-Disposition `filename`, or Content-Type `name`) become
 * attachments; `text/html` and `text/plain` parts fill `bodyHtml` and
 * `bodyText` (a later part of the same kind replaces an earlier one).
 *
 * Fix: a part that is itself a multipart container (for example
 * `multipart/alternative` inside `multipart/mixed`, the usual layout of a
 * message with attachments) was ignored, so its text, HTML and attachments
 * were lost. Such parts are now parsed recursively and merged.
 *
 * @param {string} body - Message or part body with `\n` line endings.
 * @param {string|undefined} contentType - Content-Type of the enclosing entity.
 * @returns {{attachments: {contentType: (string|undefined), data: Uint8Array, fileName: string}[],
 *   bodyHtml: (string|undefined), bodyText: (string|undefined)}}
 */
var parseEmailMimeParts = (body, contentType) => {
  const boundary = parseMimeBoundary(contentType);
  if (!boundary) {
    const htmlMatch = body.match(/<html[\s\S]*<\/html>/i);
    return {
      attachments: [],
      bodyHtml: htmlMatch?.[0],
      bodyText: htmlMatch ? undefined : body
    };
  }
  const attachments = [];
  let bodyText;
  let bodyHtml;
  for (const rawPart of body.split(`--${boundary}`)) {
    const trimmed = rawPart.trim();
    // Skip empty fragments and the "--" left behind by the closing delimiter.
    if (!trimmed || trimmed === "--") {
      continue;
    }
    const { body: partBody, headerBlock } = splitEmailMessage(trimmed);
    const headers = parseHeaderBlock(headerBlock);
    const partContentType = headers.get("content-type");
    const disposition = headers.get("content-disposition");
    const transferEncoding = headers.get("content-transfer-encoding");
    const filename = disposition?.match(/filename="?([^";]+)"?/i)?.[1] ?? partContentType?.match(/name="?([^";]+)"?/i)?.[1];
    if (filename) {
      attachments.push({
        contentType: partContentType,
        data: decodeEmailPartBody(partBody, transferEncoding),
        fileName: filename
      });
      continue;
    }
    const loweredType = partContentType?.toLowerCase() ?? "";
    if (loweredType.startsWith("multipart/") && parseMimeBoundary(partContentType)) {
      // Nested container: merge whatever it holds into this level's result.
      const nested = parseEmailMimeParts(partBody, partContentType);
      attachments.push(...nested.attachments);
      bodyHtml = nested.bodyHtml ?? bodyHtml;
      bodyText = nested.bodyText ?? bodyText;
      continue;
    }
    const decoded = Buffer.from(decodeEmailPartBody(partBody, transferEncoding)).toString("utf8");
    if (loweredType.includes("text/html")) {
      bodyHtml = decoded;
      continue;
    }
    if (loweredType.includes("text/plain")) {
      bodyText = decoded;
    }
  }
  return {
    attachments,
    bodyHtml,
    bodyText
  };
};
8701
+ var extractEmailText = (raw) => {
8702
+ const { body, headerBlock } = splitEmailMessage(raw);
8703
+ const headers = parseHeaderBlock(headerBlock);
8704
+ const parsed = parseEmailMimeParts(body, headers.get("content-type"));
8705
+ if (parsed.bodyHtml) {
8706
+ return stripHtml(parsed.bodyHtml);
8707
+ }
8708
+ if (parsed.bodyText) {
8709
+ return normalizeWhitespace(parsed.bodyText);
8710
+ }
8104
8711
  if (!body) {
8105
- return normalizeWhitespace(normalized);
8712
+ return normalizeWhitespace(raw.replace(/\r\n?/g, `
8713
+ `));
8106
8714
  }
8107
8715
  const htmlMatch = body.match(/<html[\s\S]*<\/html>/i);
8108
8716
  if (htmlMatch) {
@@ -8111,17 +8719,15 @@ var extractEmailText = (raw) => {
8111
8719
  return normalizeWhitespace(body);
8112
8720
  };
8113
8721
  var parseEmailHeaders = (raw) => {
8114
- const normalized = raw.replace(/\r\n?/g, `
8115
- `);
8116
- const [headerBlock = ""] = normalized.split(`
8117
-
8118
- `);
8119
- const getHeader = (name) => {
8120
- const match = headerBlock.match(new RegExp(`^${name}:\\s*(.+)$`, "im"));
8121
- return match?.[1]?.trim();
8122
- };
8722
+ const { headerBlock } = splitEmailMessage(raw);
8723
+ const headers = parseHeaderBlock(headerBlock);
8724
+ const getHeader = (name) => headers.get(name.toLowerCase());
8123
8725
  return {
8726
+ contentType: getHeader("Content-Type"),
8124
8727
  from: getHeader("From"),
8728
+ inReplyTo: getHeader("In-Reply-To"),
8729
+ messageId: getHeader("Message-ID"),
8730
+ references: getHeader("References"),
8125
8731
  subject: getHeader("Subject"),
8126
8732
  threadTopic: getHeader("Thread-Topic") ?? getHeader("Subject"),
8127
8733
  to: getHeader("To")
@@ -8142,6 +8748,87 @@ var extractPrintableStrings = (data) => {
8142
8748
  return unique.join(`
8143
8749
  `);
8144
8750
  };
8751
/**
 * Builds OCR metadata from an OCR result.
 *
 * Regions whose normalized text is empty are dropped. The average confidence
 * is the mean of every numeric confidence available: the result-level value
 * and each kept region's value. Keys from `result.metadata` are copied first
 * and may be overwritten by the `ocr*` keys.
 *
 * @param {{confidence?: number, regions?: object[], metadata?: object}} result
 * @returns {object} Metadata with ocrConfidence, ocrRegionCount, ocrRegions
 *   and ocrAverageConfidence (undefined where not available).
 */
var ocrMetadata = (result) => {
  const regions = result.regions?.filter((region) => normalizeWhitespace(region.text ?? "").length > 0);
  const regionConfidences = (regions ?? []).map((region) => region.confidence);
  const numericConfidences = [result.confidence, ...regionConfidences].filter((value) => typeof value === "number");
  let averageConfidence;
  if (numericConfidences.length > 0) {
    const total = numericConfidences.reduce((sum, value) => sum + value, 0);
    averageConfidence = total / numericConfidences.length;
  }
  return {
    ...result.metadata ?? {},
    ocrConfidence: result.confidence,
    ocrRegionCount: regions?.length,
    ocrRegions: regions,
    ocrAverageConfidence: averageConfidence
  };
};
8766
/**
 * Builds one text document per PDF page from OCR regions.
 *
 * Regions with empty normalized text, a non-numeric page or a page below 1
 * are ignored. The remaining regions are grouped by page, pages are emitted
 * in ascending order, and each document's text is a heading line followed by
 * the page's region texts, one per line.
 *
 * @param {{regions?: object[]}} result - OCR result.
 * @param {object} input - Extractor input (chunking, contentType, metadata,
 *   source/path/name, title are read).
 * @param {object} baseMetadata - Metadata merged over `input.metadata`.
 * @returns {object[]} Page documents; empty when no region qualifies.
 */
var ocrPageDocuments = (result, input, baseMetadata) => {
  const regionsByPage = new Map();
  (result.regions ?? []).forEach((region) => {
    const text = normalizeWhitespace(region.text ?? "");
    if (!text || typeof region.page !== "number" || region.page < 1) {
      return;
    }
    if (!regionsByPage.has(region.page)) {
      regionsByPage.set(region.page, []);
    }
    regionsByPage.get(region.page).push({ ...region, text });
  });
  const pages = [...regionsByPage.entries()].sort((left, right) => left[0] - right[0]);
  return pages.map(([pageNumber, regions]) => {
    const documentLabel = input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME;
    const pageLines = regions.map((region) => region.text).join("\n");
    return {
      chunking: input.chunking,
      contentType: input.contentType,
      format: "text",
      metadata: {
        ...input.metadata ?? {},
        ...baseMetadata,
        ocrRegionCount: regions.length,
        ocrRegions: regions,
        pageNumber,
        pageIndex: pageNumber - 1,
        sourceNativeKind: "pdf_page"
      },
      source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
      text: normalizeWhitespace(`PDF page ${pageNumber} from ${documentLabel}.\n${pageLines}`),
      title: input.title ? `${input.title} \xB7 Page ${pageNumber}` : `Page ${pageNumber}`
    };
  });
};
8797
/**
 * Builds one text document per usable OCR region.
 *
 * Regions with empty normalized text, a non-numeric page or a page below 1
 * are skipped. `regionIndex` is the region's position in `result.regions`
 * (skipped regions still count) and `regionNumber` is that position plus one.
 *
 * @param {{regions?: object[]}} result - OCR result.
 * @param {object} input - Extractor input (chunking, contentType, metadata,
 *   source/path/name, title are read).
 * @param {object} baseMetadata - Metadata merged over `input.metadata`.
 * @returns {object[]} Region documents in region order.
 */
var ocrRegionDocuments = (result, input, baseMetadata) => (result.regions ?? []).flatMap((region, index) => {
  const text = normalizeWhitespace(region.text ?? "");
  if (!text || typeof region.page !== "number" || region.page < 1) {
    return [];
  }
  const pageNumber = region.page;
  const regionNumber = index + 1;
  const documentLabel = input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME;
  const regionLabel = `Page ${pageNumber} Region ${regionNumber}`;
  return [{
    chunking: input.chunking,
    contentType: input.contentType,
    format: "text",
    metadata: {
      ...input.metadata ?? {},
      ...baseMetadata,
      ocrRegionConfidence: region.confidence,
      ocrRegionHeight: region.height,
      ocrRegionWidth: region.width,
      ocrRegionX: region.x,
      ocrRegionY: region.y,
      pageNumber,
      pageIndex: pageNumber - 1,
      regionIndex: index,
      regionNumber,
      sourceNativeKind: "pdf_region"
    },
    source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
    text: normalizeWhitespace(`PDF page ${pageNumber} region ${regionNumber} from ${documentLabel}.\n${text}`),
    title: input.title ? `${input.title} \xB7 ${regionLabel}` : regionLabel
  }];
});
8145
8832
  var textExtractorSupports = (input) => {
8146
8833
  if (input.format) {
8147
8834
  return true;
@@ -8227,24 +8914,52 @@ var createBuiltinArchiveExpander = () => ({
8227
8914
/**
 * Creates the built-in e-mail extractor.
 *
 * `extract` decodes the input as UTF-8, parses headers and MIME parts, and
 * returns the message document followed by the documents extracted from each
 * attachment (via extractRAGFileDocuments). Attachment documents inherit the
 * message metadata with `emailKind: "attachment"` plus their index and name.
 *
 * Fix: `threadTopic` metadata was always the subject. It now uses
 * `headers.threadTopic` and falls back to the subject only when that is
 * missing. NOTE(review): this assumes parseEmailHeaders exposes the
 * Thread-Topic header as `threadTopic`; confirm the subject was not intended.
 *
 * @returns {{name: string, supports: Function, extract: Function}}
 */
var createEmailExtractor = () => ({
  name: "absolute_email",
  supports: emailExtractorSupports,
  extract: async (input) => {
    const raw = decodeUtf8(input.data);
    const headers = parseEmailHeaders(raw);
    const { body } = splitEmailMessage(raw);
    const parsed = parseEmailMimeParts(body, headers.contentType);
    const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.eml`;
    const messageMetadata = {
      ...input.metadata ?? {},
      emailKind: "message",
      fileKind: "email",
      from: headers.from,
      inReplyTo: headers.inReplyTo,
      messageId: headers.messageId,
      references: headers.references,
      threadTopic: headers.threadTopic ?? headers.subject,
      to: headers.to,
      hasAttachments: parsed.attachments.length > 0
    };
    // Attachments are extracted concurrently; result order follows attachment order.
    const attachmentDocuments = await Promise.all(parsed.attachments.map((attachment, index) => extractRAGFileDocuments({
      chunking: input.chunking,
      contentType: attachment.contentType,
      data: attachment.data,
      format: inferFormatFromContentType(attachment.contentType ?? null) ?? inferFormatFromName(attachment.fileName),
      metadata: {
        ...messageMetadata,
        attachmentIndex: index,
        attachmentName: attachment.fileName,
        emailKind: "attachment"
      },
      name: attachment.fileName,
      source: `${source}#attachments/${attachment.fileName}`,
      title: headers.subject ? `${headers.subject} \xB7 ${attachment.fileName}` : attachment.fileName
    })));
    const messageDocument = {
      chunking: input.chunking,
      contentType: input.contentType,
      format: "text",
      metadata: messageMetadata,
      source,
      text: extractEmailText(raw),
      title: input.title ?? headers.subject
    };
    return [messageDocument, ...attachmentDocuments.flat()];
  }
});
8250
8965
  var createEPUBExtractor = () => ({
@@ -8388,7 +9103,7 @@ var createRAGImageOCRExtractor = (provider) => ({
8388
9103
  format: "text",
8389
9104
  metadata: {
8390
9105
  ...input.metadata ?? {},
8391
- ...result.metadata ?? {},
9106
+ ...ocrMetadata(result),
8392
9107
  fileKind: "image"
8393
9108
  },
8394
9109
  source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.image.txt`,
@@ -8476,6 +9191,9 @@ var expandArchiveEntry = async (entry, archiveInput, extractors) => {
8476
9191
  metadata: {
8477
9192
  ...archiveInput.metadata ?? {},
8478
9193
  ...entry.metadata ?? {},
9194
+ archiveEntryName: basename(entry.path),
9195
+ archiveParentName: archiveInput.name ?? archiveInput.path?.split(/[/\\]/).pop() ?? archiveInput.source,
9196
+ archiveParentSource: archiveInput.source ?? archiveInput.path ?? archiveInput.name,
8479
9197
  archivePath: entry.path,
8480
9198
  fileKind: "archive_entry"
8481
9199
  },
@@ -8551,21 +9269,27 @@ var createRAGPDFOCRExtractor = (options) => ({
8551
9269
  ...input,
8552
9270
  contentType: input.contentType ?? "application/pdf"
8553
9271
  });
8554
- return {
9272
+ const baseMetadata = {
9273
+ ...ocrMetadata(ocr),
9274
+ fileKind: "pdf",
9275
+ pageCount: estimatePDFPageCount(input.data),
9276
+ pdfTextMode: "ocr"
9277
+ };
9278
+ const summaryDocument = {
8555
9279
  chunking: input.chunking,
8556
9280
  contentType: input.contentType ?? "application/pdf",
8557
9281
  format: "text",
8558
9282
  metadata: {
8559
9283
  ...input.metadata ?? {},
8560
- ...ocr.metadata ?? {},
8561
- fileKind: "pdf",
8562
- pageCount: estimatePDFPageCount(input.data),
8563
- pdfTextMode: "ocr"
9284
+ ...baseMetadata
8564
9285
  },
8565
9286
  source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
8566
9287
  text: ocr.text,
8567
9288
  title: ocr.title ?? input.title
8568
9289
  };
9290
+ const pageDocuments = ocrPageDocuments(ocr, input, baseMetadata);
9291
+ const regionDocuments = ocrRegionDocuments(ocr, input, baseMetadata);
9292
+ return [summaryDocument, ...pageDocuments, ...regionDocuments];
8569
9293
  }
8570
9294
  });
8571
9295
  var DEFAULT_FILE_EXTRACTORS = [
@@ -8632,7 +9356,7 @@ var fixedUnits = (text, maxChunkLength) => {
8632
9356
  return units;
8633
9357
  };
8634
9358
  var sourceAwareUnits = (document, format, normalizedText) => {
8635
- const resolveStructuredUnits = (sections) => sections.length > 0 ? sections : paragraphUnits(normalizedText);
9359
+ const resolveStructuredUnits = (sections) => sections.length > 0 ? sections : paragraphUnits(normalizedText).map((text) => ({ text }));
8636
9360
  switch (format) {
8637
9361
  case "markdown": {
8638
9362
  const sections = markdownStructureUnits(document.text);
@@ -8644,7 +9368,7 @@ var sourceAwareUnits = (document, format, normalizedText) => {
8644
9368
  }
8645
9369
  case "text":
8646
9370
  default:
8647
- return paragraphUnits(normalizedText);
9371
+ return paragraphUnits(normalizedText).map((text) => ({ text }));
8648
9372
  }
8649
9373
  };
8650
9374
  var overlapTail = (value, overlap) => {
@@ -8708,10 +9432,13 @@ var chunkFromUnits = (units, maxChunkLength, chunkOverlap, minChunkLength) => {
8708
9432
  return merged;
8709
9433
  };
8710
9434
/**
 * Splits a structured unit that exceeds the maximum chunk length.
 *
 * A unit whose text fits is returned unchanged as a single-element array.
 * Otherwise its text is split into paragraph-based chunks, and each chunk is
 * returned as a copy of the unit with only `text` replaced.
 *
 * @param {{text: string}} unit - Unit to split; extra properties are kept.
 * @param {{maxChunkLength: number, chunkOverlap: number, minChunkLength: number}} options
 * @returns {{text: string}[]} One or more units.
 */
var chunkSourceAwareUnit = (unit, options) => {
  if (unit.text.length > options.maxChunkLength) {
    const pieces = chunkFromUnits(paragraphUnits(unit.text), options.maxChunkLength, options.chunkOverlap, options.minChunkLength);
    return pieces.map((text) => ({ ...unit, text }));
  }
  return [unit];
};
8716
9443
  var resolveChunkingUnits = (text, options) => {
8717
9444
  if (options.strategy === "fixed") {
@@ -8734,15 +9461,15 @@ var resolveChunkingOptions = (document, defaults) => {
8734
9461
  strategy
8735
9462
  };
8736
9463
  };
8737
- var createChunkTexts = (document, format, text, options) => {
9464
/**
 * Produces the chunk entries (`{ text, ... }` objects) for a document.
 *
 * - "source_aware" strategy: always split by structure, then split any unit
 *   that is still too long.
 * - other strategies: text that already fits becomes a single entry;
 *   otherwise it is split into units and merged into chunks.
 *
 * @param {object} document - Source document (passed to sourceAwareUnits).
 * @param {string} format - Detected document format.
 * @param {string} text - Normalized document text.
 * @param {{strategy: string, maxChunkLength: number, chunkOverlap: number, minChunkLength: number}} options
 * @returns {{text: string}[]} Chunk entries in document order.
 */
var createChunkEntries = (document, format, text, options) => {
  const isSourceAware = options.strategy === "source_aware";
  if (text.length <= options.maxChunkLength && !isSourceAware) {
    return [{ text }];
  }
  if (isSourceAware) {
    return sourceAwareUnits(document, format, text).flatMap((unit) => chunkSourceAwareUnit(unit, options));
  }
  const chunkTexts = chunkFromUnits(resolveChunkingUnits(text, options), options.maxChunkLength, options.chunkOverlap, options.minChunkLength);
  return chunkTexts.map((entry) => ({ text: entry }));
};
8747
9474
  var prepareRAGDocument = (document, defaultChunking) => {
8748
9475
  const format = inferFormat(document);
@@ -8764,18 +9491,46 @@ var prepareRAGDocument = (document, defaultChunking) => {
8764
9491
  source,
8765
9492
  title
8766
9493
  };
8767
- const chunkTexts = createChunkTexts(document, format, normalizedText, chunking);
8768
- const chunks = chunkTexts.map((text, index) => ({
8769
- chunkId: `${documentId}:${String(index + 1).padStart(RAG_CHUNK_ID_PAD_LENGTH, "0")}`,
8770
- metadata: {
8771
- ...metadata,
8772
- chunkCount: chunkTexts.length,
8773
- chunkIndex: index
8774
- },
8775
- source,
8776
- text,
8777
- title
8778
- }));
9494
+ const chunkEntries = createChunkEntries(document, format, normalizedText, chunking);
9495
+ const chunks = chunkEntries.map((entry, index) => {
9496
+ const sectionPath = Array.isArray(entry.sectionPath) ? entry.sectionPath.filter((value) => typeof value === "string" && value.length > 0) : undefined;
9497
+ const sectionTitle = typeof entry.sectionTitle === "string" && entry.sectionTitle.length > 0 ? entry.sectionTitle : sectionPath?.at(-1);
9498
+ const chunkTitle = sectionTitle && sectionTitle !== title ? `${title} \xB7 ${sectionTitle}` : title;
9499
+ const sectionChunkId = sectionPath && sectionPath.length > 0 ? `${documentId}:section:${slugify(sectionPath.join(" "))}` : undefined;
9500
+ const sectionSiblingIndexes = sectionChunkId === undefined ? [index] : chunkEntries.reduce((indexes, candidate, candidateIndex) => {
9501
+ const candidatePath = Array.isArray(candidate.sectionPath) ? candidate.sectionPath.filter((value) => typeof value === "string" && value.length > 0) : undefined;
9502
+ const candidateSectionId = candidatePath && candidatePath.length > 0 ? `${documentId}:section:${slugify(candidatePath.join(" "))}` : undefined;
9503
+ if (candidateSectionId === sectionChunkId) {
9504
+ indexes.push(candidateIndex);
9505
+ }
9506
+ return indexes;
9507
+ }, []);
9508
+ const sectionChunkIndex = sectionSiblingIndexes.indexOf(index);
9509
+ const previousChunkId = index > 0 ? `${documentId}:${String(index).padStart(RAG_CHUNK_ID_PAD_LENGTH, "0")}` : undefined;
9510
+ const nextChunkId = index + 1 < chunkEntries.length ? `${documentId}:${String(index + 2).padStart(RAG_CHUNK_ID_PAD_LENGTH, "0")}` : undefined;
9511
+ return {
9512
+ chunkId: `${documentId}:${String(index + 1).padStart(RAG_CHUNK_ID_PAD_LENGTH, "0")}`,
9513
+ metadata: {
9514
+ ...metadata,
9515
+ chunkCount: chunkEntries.length,
9516
+ chunkIndex: index,
9517
+ ...sectionTitle ? { sectionTitle } : {},
9518
+ ...sectionPath && sectionPath.length > 0 ? { sectionPath } : {},
9519
+ ...typeof entry.sectionDepth === "number" ? { sectionDepth: entry.sectionDepth } : {},
9520
+ ...entry.sectionKind ? { sectionKind: entry.sectionKind } : {},
9521
+ ...sectionChunkId ? { sectionChunkId } : {},
9522
+ ...sectionChunkId && sectionChunkIndex >= 0 ? {
9523
+ sectionChunkCount: sectionSiblingIndexes.length,
9524
+ sectionChunkIndex
9525
+ } : {},
9526
+ ...previousChunkId ? { previousChunkId } : {},
9527
+ ...nextChunkId ? { nextChunkId } : {}
9528
+ },
9529
+ source,
9530
+ text: entry.text,
9531
+ title: chunkTitle
9532
+ };
9533
+ });
8779
9534
  return {
8780
9535
  chunks,
8781
9536
  documentId,
@@ -9421,6 +10176,30 @@ var searchDocuments = async (collection, input) => collection.search(input);
9421
10176
  // src/ai/rag/htmxWorkflowRenderers.ts
9422
10177
  init_constants();
9423
10178
/**
 * Escapes `&`, `<`, `>` and `"` for insertion into HTML.
 *
 * @param {string} text - Untrusted text.
 * @returns {string} Text with those four characters replaced by entities.
 */
var escapeHtml2 = (text) => {
  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;" };
  return text.replace(/[&<>"]/g, (character) => entities[character]);
};
10179
/**
 * Renders the context / location / provenance labels of a source as an HTML
 * list. Labels that are missing or empty are omitted; with no labels (or no
 * input) the result is an empty string.
 *
 * @param {{contextLabel?: string, locatorLabel?: string, provenanceLabel?: string}|undefined} input
 * @returns {string} `<ul class="rag-source-labels">…</ul>` or "".
 */
var renderSourceLabels = (input) => {
  if (!input) {
    return "";
  }
  const items = [
    ["Context", input.contextLabel],
    ["Location", input.locatorLabel],
    ["Provenance", input.provenanceLabel]
  ]
    .filter(([, value]) => Boolean(value))
    .map(([heading, value]) => `<li><strong>${heading}</strong> ${escapeHtml2(value)}</li>`);
  if (items.length === 0) {
    return "";
  }
  return `<ul class="rag-source-labels">${items.join("")}</ul>`;
};
10190
/**
 * Renders a chunk's structural information as an HTML list: section title,
 * section path (only when it has more than one element), position within the
 * section, and previous/next chunk ids. Rows without data are omitted; with
 * no rows (or no input) the result is an empty string.
 *
 * @param {{section?: {title?: string, path?: string[]},
 *   sequence?: {sectionChunkIndex?: number, sectionChunkCount?: number,
 *   previousChunkId?: string, nextChunkId?: string}}|undefined} structure
 * @returns {string} `<ul class="rag-chunk-structure">…</ul>` or "".
 */
var renderChunkStructure = (structure) => {
  if (!structure) {
    return "";
  }
  const { section, sequence } = structure;
  const items = [];
  if (section?.title) {
    items.push(`<li><strong>Section</strong> ${escapeHtml2(section.title)}</li>`);
  }
  if (section?.path && section.path.length > 1) {
    items.push(`<li><strong>Section path</strong> ${escapeHtml2(section.path.join(" > "))}</li>`);
  }
  if (typeof sequence?.sectionChunkIndex === "number" && typeof sequence?.sectionChunkCount === "number") {
    items.push(`<li><strong>Section chunk</strong> ${sequence.sectionChunkIndex + 1} of ${sequence.sectionChunkCount}</li>`);
  }
  if (sequence?.previousChunkId) {
    items.push(`<li><strong>Previous</strong> ${escapeHtml2(sequence.previousChunkId)}</li>`);
  }
  if (sequence?.nextChunkId) {
    items.push(`<li><strong>Next</strong> ${escapeHtml2(sequence.nextChunkId)}</li>`);
  }
  return items.length > 0 ? `<ul class="rag-chunk-structure">${items.join("")}</ul>` : "";
};
9424
10203
  var renderEmptyState = (kind) => {
9425
10204
  switch (kind) {
9426
10205
  case "documents":
@@ -9460,17 +10239,41 @@ var defaultStatus = ({
9460
10239
  }
9461
10240
  return `<dl class="rag-status">` + `<div><dt>Backend</dt><dd>${escapeHtml2(status.backend)}</dd></div>` + `<div><dt>Vector mode</dt><dd>${escapeHtml2(status.vectorMode)}</dd></div>` + `<div><dt>Embedding dimensions</dt><dd>${status.dimensions ?? "n/a"}</dd></div>` + `<div><dt>Vector acceleration</dt><dd>${status.native?.active ? "active" : "inactive"}</dd></div>` + `<div><dt>Documents</dt><dd>${documents?.total ?? "n/a"}</dd></div>` + `<div><dt>Total chunks</dt><dd>${documents?.chunkCount ?? "n/a"}</dd></div>` + `<div><dt>Seed docs</dt><dd>${documents?.byKind.seed ?? 0}</dd></div>` + `<div><dt>Custom docs</dt><dd>${documents?.byKind.custom ?? 0}</dd></div>` + `</dl>${renderCapabilityList(capabilities)}`;
9462
10241
  };
9463
- var defaultSearchResultItem = (source, index) => '<article class="rag-search-result">' + `<h3>${escapeHtml2(source.title ?? source.chunkId ?? `Result ${index + 1}`)}</h3>` + `<p class="rag-search-source">${escapeHtml2(source.source ?? "unknown source")}</p>` + `<p class="rag-search-score">score ${source.score.toFixed(RAG_SEARCH_SCORE_DECIMAL_PLACES)}</p>` + `<p class="rag-search-text">${escapeHtml2(source.text)}</p>` + "</article>";
10242
// Default HTML for one search result, returned as an <article class="rag-search-result"> string.
// source: result object; reads title, chunkId, source, labels, structure, score and text.
// index: zero-based position, used only for the fallback heading "Result N" (index plus one).
// Heading falls back from title to chunkId to "Result N"; a missing source renders "unknown source".
// Labels and structure are rendered by renderSourceLabels and renderChunkStructure.
// The score is formatted with toFixed(RAG_SEARCH_SCORE_DECIMAL_PLACES), so score is expected to be a number.
+ var defaultSearchResultItem = (source, index) => '<article class="rag-search-result">' + `<h3>${escapeHtml2(source.title ?? source.chunkId ?? `Result ${index + 1}`)}</h3>` + `<p class="rag-search-source">${escapeHtml2(source.source ?? "unknown source")}</p>` + renderSourceLabels(source.labels) + renderChunkStructure(source.structure) + `<p class="rag-search-score">score ${source.score.toFixed(RAG_SEARCH_SCORE_DECIMAL_PLACES)}</p>` + `<p class="rag-search-text">${escapeHtml2(source.text)}</p>` + "</article>";
9464
10243
  // Default HTML for a whole search response.
  // Takes one object with query (string), results (array) and an optional trace.
  // An empty results array returns renderEmptyState("searchResults").
  // Otherwise returns a <section class="rag-search-results"> holding a summary line with the
  // result count and the escaped query, an extra summary line with the trace mode and its
  // final/vector/lexical counts when trace is truthy, and one defaultSearchResultItem per result.
  var defaultSearchResults = ({
9465
10244
  query,
9466
10245
  results,
9467
10246
  trace
9468
10247
  }) => results.length === 0 ? renderEmptyState("searchResults") : `<section class="rag-search-results">` + `<p class="rag-search-summary">${results.length} results for ${escapeHtml2(query)}</p>` + (trace ? `<p class="rag-search-summary">mode=${escapeHtml2(trace.mode)} \xB7 final=${trace.resultCounts.final} \xB7 vector=${trace.resultCounts.vector} \xB7 lexical=${trace.resultCounts.lexical}</p>` : "") + `${results.map((result, index) => defaultSearchResultItem(result, index)).join("")}</section>`;
9469
- var defaultDocumentItem = (document, index) => '<article class="rag-document">' + `<h3>${escapeHtml2(document.title || `Document ${index + 1}`)}</h3>` + `<p class="rag-document-id">${escapeHtml2(document.id)}</p>` + `<p class="rag-document-source">${escapeHtml2(document.source)}</p>` + `<p class="rag-document-meta">${escapeHtml2(document.format ?? "text")} \xB7 ${escapeHtml2(document.chunkStrategy ?? "paragraphs")} \xB7 ${document.chunkCount ?? 0} chunks</p>` + "</article>";
10248
// Default HTML for one indexed document, returned as an <article class="rag-document"> string.
// document: reads title, id, source, labels, format, chunkStrategy and chunkCount.
// index: zero-based position, used only for the fallback heading "Document N" (index plus one).
// The title falls back with ||, so an empty-string title also gets the fallback heading.
// Missing format, chunkStrategy and chunkCount render as "text", "paragraphs" and 0.
// Labels are rendered by renderSourceLabels between the source line and the meta line.
+ var defaultDocumentItem = (document, index) => '<article class="rag-document">' + `<h3>${escapeHtml2(document.title || `Document ${index + 1}`)}</h3>` + `<p class="rag-document-id">${escapeHtml2(document.id)}</p>` + `<p class="rag-document-source">${escapeHtml2(document.source)}</p>` + renderSourceLabels(document.labels) + `<p class="rag-document-meta">${escapeHtml2(document.format ?? "text")} \xB7 ${escapeHtml2(document.chunkStrategy ?? "paragraphs")} \xB7 ${document.chunkCount ?? 0} chunks</p>` + "</article>";
9470
10249
  // Default HTML for the document list.
  // Takes one object with a documents array.
  // An empty array returns renderEmptyState("documents"); otherwise returns a
  // <section class="rag-documents"> holding one defaultDocumentItem per document, in input order.
  var defaultDocuments = ({
9471
10250
  documents
9472
10251
  }) => documents.length === 0 ? renderEmptyState("documents") : `<section class="rag-documents">${documents.map((document, index) => defaultDocumentItem(document, index)).join("")}</section>`;
9473
- var defaultChunkPreview = (input) => `<section class="rag-chunk-preview">` + `<h3>${escapeHtml2(input.document.title)}</h3>` + `<p class="rag-chunk-preview-source">${escapeHtml2(input.document.source)}</p>` + `<article class="rag-chunk-normalized">` + `<h4>Normalized text</h4>` + `<pre>${escapeHtml2(input.normalizedText)}</pre>` + `</article>${input.chunks.map((chunk) => '<article class="rag-chunk">' + `<h4>${escapeHtml2(chunk.chunkId)}</h4>` + `<p class="rag-chunk-meta">chunk ${typeof chunk.metadata?.chunkIndex === "number" ? chunk.metadata.chunkIndex : 0} of ${typeof chunk.metadata?.chunkCount === "number" ? chunk.metadata.chunkCount : input.chunks.length}</p>` + `<pre>${escapeHtml2(chunk.text)}</pre>` + "</article>").join("")}</section>`;
10252
// Default HTML for a chunk preview, returned as a <section class="rag-chunk-preview"> string.
// input: reads document (title, source, labels), normalizedText and chunks.
// Chunks are grouped in first-seen order:
//   - kind is metadata.sourceNativeKind when it is a string, otherwise "document_chunk".
//   - For pdf_page, pdf_region, spreadsheet_sheet, presentation_slide, attachment and
//     archive_entry the group title is the chunk locator label, or a per-kind default
//     when there is no locator label. Every other kind gets the title "Chunks".
//   - The group key is "document_chunk" for plain chunks and kind:title otherwise, so two
//     different unrecognised kinds form separate groups that are both titled "Chunks".
// Output order: document title, source, document labels, normalized text, then one
// <section class="rag-chunk-group"> per group with an <article class="rag-chunk"> per chunk.
// NOTE(review): the chunk meta line prints metadata.chunkIndex without adding one, while
// renderChunkStructure adds one to sectionChunkIndex - confirm whether chunkIndex is zero-based.
// NOTE(review): the group lookup is a linear find per chunk; presumably fine for preview-sized
// inputs - confirm expected chunk counts.
+ var defaultChunkPreview = (input) => {
10253
+ const groups = input.chunks.reduce((acc, chunk) => {
10254
+ const metadata = chunk.metadata ?? {};
10255
+ const kind = typeof metadata.sourceNativeKind === "string" ? metadata.sourceNativeKind : "document_chunk";
10256
+ const locator = chunk.labels?.locatorLabel ?? "";
10257
+ const title = kind === "pdf_page" ? locator || "PDF pages" : kind === "pdf_region" ? locator || "PDF regions" : kind === "spreadsheet_sheet" ? locator || "Spreadsheet sheets" : kind === "presentation_slide" ? locator || "Presentation slides" : kind === "attachment" ? locator || "Attachments" : kind === "archive_entry" ? locator || "Archive entries" : "Chunks";
10258
+ const key = kind === "document_chunk" ? "document_chunk" : `${kind}:${title}`;
10259
+ const existing = acc.find((entry) => entry.key === key);
10260
+ if (existing) {
10261
+ existing.chunks.push(chunk);
10262
+ return acc;
10263
+ }
10264
+ acc.push({
10265
+ chunks: [chunk],
10266
+ key,
10267
+ title
10268
+ });
10269
+ return acc;
10270
+ }, []);
10271
+ const groupHtml = groups.map((group) => {
10272
+ const chunkHtml = group.chunks.map((chunk) => '<article class="rag-chunk">' + `<h5>${escapeHtml2(chunk.chunkId)}</h5>` + `<p class="rag-chunk-meta">chunk ${typeof chunk.metadata?.chunkIndex === "number" ? chunk.metadata.chunkIndex : 0} of ${typeof chunk.metadata?.chunkCount === "number" ? chunk.metadata.chunkCount : input.chunks.length}</p>` + renderSourceLabels(chunk.labels) + renderChunkStructure(chunk.structure) + `<pre>${escapeHtml2(chunk.text)}</pre>` + "</article>").join("");
10273
+ return `<section class="rag-chunk-group"><h4>${escapeHtml2(group.title)}</h4>${chunkHtml}</section>`;
10274
+ }).join("");
10275
+ return `<section class="rag-chunk-preview">` + `<h3>${escapeHtml2(input.document.title)}</h3>` + `<p class="rag-chunk-preview-source">${escapeHtml2(input.document.source)}</p>` + renderSourceLabels(input.document.labels) + `<article class="rag-chunk-normalized">` + `<h4>Normalized text</h4>` + `<pre>${escapeHtml2(input.normalizedText)}</pre>` + `</article>${groupHtml}</section>`;
10276
+ };
9474
10277
  var defaultMutationResult = (input) => {
9475
10278
  if (!input.ok) {
9476
10279
  return `<div class="rag-mutation error">${escapeHtml2(input.error ?? "Request failed")}</div>`;
@@ -9533,6 +10336,10 @@ var buildRAGContextLocatorLabel = (metadata, source, title) => {
9533
10336
  return;
9534
10337
  }
9535
10338
  const page = getContextNumber3(metadata.page) ?? getContextNumber3(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
10339
+ const region = getContextNumber3(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
10340
+ if (page && region) {
10341
+ return `Page ${page} \xB7 Region ${region}`;
10342
+ }
9536
10343
  if (page) {
9537
10344
  return `Page ${page}`;
9538
10345
  }
@@ -9574,9 +10381,11 @@ var buildRAGContextProvenanceLabel = (metadata) => {
9574
10381
  const threadTopic = getContextString3(metadata.threadTopic);
9575
10382
  const from = getContextString3(metadata.from);
9576
10383
  const speaker = getContextString3(metadata.speaker);
10384
+ const ocrConfidence = getContextNumber3(metadata.ocrRegionConfidence) ?? getContextNumber3(metadata.ocrConfidence);
9577
10385
  const labels = [
9578
10386
  pdfTextMode ? `PDF ${pdfTextMode}` : "",
9579
10387
  ocrEngine ? `OCR ${ocrEngine}` : "",
10388
+ typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
9580
10389
  mediaKind ? `Media ${mediaKind}` : "",
9581
10390
  transcriptSource ? `Transcript ${transcriptSource}` : "",
9582
10391
  threadTopic ? `Thread ${threadTopic}` : "",
@@ -9886,9 +10695,15 @@ var isRAGDocumentUrlArray = (value) => Array.isArray(value) && value.every((entr
9886
10695
  var isRAGDocumentChunkArray = (value) => Array.isArray(value) && value.every((entry) => isRAGDocumentChunk(entry));
9887
10696
  // Maps raw retrieval results to the source objects handed to clients and renderers.
  // results: array whose entries are read for chunkId, metadata, score, source, chunkText and title.
  // Each output object copies chunkId, metadata, source and title unchanged and adds:
  //   - labels: buildRAGSourceLabels called with the result metadata, source and title.
  //   - score: the result score passed through normalizeScore.
  //   - structure: buildRAGChunkStructure called with the result metadata.
  //   - text: the result chunkText, exposed under the name text.
  var buildSources2 = (results) => results.map((result) => ({
9888
10697
  chunkId: result.chunkId,
10698
+ labels: buildRAGSourceLabels({
10699
+ metadata: result.metadata,
10700
+ source: result.source,
10701
+ title: result.title
10702
+ }),
9889
10703
  metadata: result.metadata,
9890
10704
  score: normalizeScore(result.score),
9891
10705
  source: result.source,
10706
+ structure: buildRAGChunkStructure(result.metadata),
9892
10707
  text: result.chunkText,
9893
10708
  title: result.title
9894
10709
  }));
@@ -13616,6 +14431,11 @@ var ragChat = (config) => {
13616
14431
  let documentsWithoutChunkPreview = 0;
13617
14432
  let inspectedDocuments = 0;
13618
14433
  let inspectedChunks = 0;
14434
+ let documentsWithSourceLabels = 0;
14435
+ let chunksWithSourceLabels = 0;
14436
+ const sourceNativeKinds = new Map;
14437
+ const sampleDocuments = [];
14438
+ const sampleChunks = [];
13619
14439
  let oldestDocumentAgeMs;
13620
14440
  let newestDocumentAgeMs;
13621
14441
  const staleDocuments = [];
@@ -13656,6 +14476,27 @@ var ragChat = (config) => {
13656
14476
  if ((document.chunkCount ?? 0) === 0) {
13657
14477
  emptyDocuments += 1;
13658
14478
  }
14479
+ const documentLabels = buildRAGSourceLabels({
14480
+ metadata: document.metadata,
14481
+ source: document.source,
14482
+ title: document.title
14483
+ });
14484
+ if (documentLabels) {
14485
+ documentsWithSourceLabels += 1;
14486
+ }
14487
+ const documentSourceNativeKind = typeof document.metadata?.sourceNativeKind === "string" ? document.metadata.sourceNativeKind : undefined;
14488
+ if (documentSourceNativeKind) {
14489
+ sourceNativeKinds.set(documentSourceNativeKind, (sourceNativeKinds.get(documentSourceNativeKind) ?? 0) + 1);
14490
+ }
14491
+ if (sampleDocuments.length < 5 && (documentLabels || documentSourceNativeKind)) {
14492
+ sampleDocuments.push({
14493
+ id: document.id,
14494
+ labels: documentLabels,
14495
+ source: document.source,
14496
+ sourceNativeKind: documentSourceNativeKind,
14497
+ title: document.title
14498
+ });
14499
+ }
13659
14500
  if (indexManager?.getDocumentChunks) {
13660
14501
  const preview = await indexManager.getDocumentChunks(document.id);
13661
14502
  if (!preview) {
@@ -13665,6 +14506,27 @@ var ragChat = (config) => {
13665
14506
  inspectedDocuments += 1;
13666
14507
  for (const chunk of preview.chunks) {
13667
14508
  inspectedChunks += 1;
14509
+ const chunkLabels = buildRAGSourceLabels({
14510
+ metadata: chunk.metadata,
14511
+ source: chunk.source ?? preview.document.source,
14512
+ title: chunk.title ?? preview.document.title
14513
+ });
14514
+ if (chunkLabels) {
14515
+ chunksWithSourceLabels += 1;
14516
+ }
14517
+ const chunkSourceNativeKind = typeof chunk.metadata?.sourceNativeKind === "string" ? chunk.metadata.sourceNativeKind : undefined;
14518
+ if (chunkSourceNativeKind) {
14519
+ sourceNativeKinds.set(chunkSourceNativeKind, (sourceNativeKinds.get(chunkSourceNativeKind) ?? 0) + 1);
14520
+ }
14521
+ if (sampleChunks.length < 8 && (chunkLabels || chunkSourceNativeKind)) {
14522
+ sampleChunks.push({
14523
+ chunkId: chunk.chunkId,
14524
+ documentId: document.id,
14525
+ labels: chunkLabels,
14526
+ source: chunk.source ?? preview.document.source,
14527
+ sourceNativeKind: chunkSourceNativeKind
14528
+ });
14529
+ }
13668
14530
  const normalized = chunk.text.trim();
13669
14531
  if (!normalized) {
13670
14532
  emptyChunks += 1;
@@ -13721,6 +14583,13 @@ var ragChat = (config) => {
13721
14583
  failuresByInputKind: Object.fromEntries(failuresByInputKind.entries()),
13722
14584
  inspectedChunks,
13723
14585
  inspectedDocuments,
14586
+ inspection: {
14587
+ chunksWithSourceLabels,
14588
+ documentsWithSourceLabels,
14589
+ sampleChunks,
14590
+ sampleDocuments,
14591
+ sourceNativeKinds: Object.fromEntries(sourceNativeKinds.entries())
14592
+ },
13724
14593
  lowSignalChunks,
13725
14594
  newestDocumentAgeMs,
13726
14595
  oldestDocumentAgeMs,
@@ -14901,7 +15770,14 @@ var ragChat = (config) => {
14901
15770
  }
14902
15771
  const documents = await indexManager.listDocuments({ kind });
14903
15772
  return {
14904
- documents,
15773
+ documents: documents.map((document) => ({
15774
+ ...document,
15775
+ labels: buildRAGSourceLabels({
15776
+ metadata: document.metadata,
15777
+ source: document.source,
15778
+ title: document.title
15779
+ })
15780
+ })),
14905
15781
  ok: true
14906
15782
  };
14907
15783
  };
@@ -14961,7 +15837,24 @@ var ragChat = (config) => {
14961
15837
  }
14962
15838
  return {
14963
15839
  ok: true,
14964
- ...preview
15840
+ ...preview,
15841
+ document: {
15842
+ ...preview.document,
15843
+ labels: buildRAGSourceLabels({
15844
+ metadata: preview.document.metadata,
15845
+ source: preview.document.source,
15846
+ title: preview.document.title
15847
+ })
15848
+ },
15849
+ chunks: preview.chunks.map((chunk) => ({
15850
+ ...chunk,
15851
+ labels: buildRAGSourceLabels({
15852
+ metadata: chunk.metadata,
15853
+ source: chunk.source ?? preview.document.source,
15854
+ title: chunk.title ?? preview.document.title
15855
+ }),
15856
+ structure: buildRAGChunkStructure(chunk.metadata)
15857
+ }))
14965
15858
  };
14966
15859
  };
14967
15860
  const handleDeleteDocument = async (id) => {
@@ -20557,5 +21450,5 @@ export {
20557
21450
  aiChat
20558
21451
  };
20559
21452
 
20560
- //# debugId=E76E681490B6CFE564756E2164756E21
21453
+ //# debugId=DE5EC1314BD5A9F664756E2164756E21
20561
21454
  //# sourceMappingURL=index.js.map