@absolutejs/absolute 0.19.0-beta.602 → 0.19.0-beta.604

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/dist/ai/client/index.js +59 -5
  2. package/dist/ai/client/index.js.map +4 -4
  3. package/dist/ai/client/ui.js +59 -5
  4. package/dist/ai/client/ui.js.map +4 -4
  5. package/dist/ai/index.js +720 -80
  6. package/dist/ai/index.js.map +8 -8
  7. package/dist/ai/rag/quality.js +11 -1
  8. package/dist/ai/rag/quality.js.map +3 -3
  9. package/dist/ai/rag/ui.js +59 -5
  10. package/dist/ai/rag/ui.js.map +4 -4
  11. package/dist/ai-client/angular/ai/index.js +58 -4
  12. package/dist/ai-client/react/ai/index.js +58 -4
  13. package/dist/ai-client/vue/ai/index.js +58 -4
  14. package/dist/angular/ai/index.js +59 -5
  15. package/dist/angular/ai/index.js.map +4 -4
  16. package/dist/angular/index.js +2 -2
  17. package/dist/angular/index.js.map +1 -1
  18. package/dist/angular/server.js +2 -2
  19. package/dist/angular/server.js.map +1 -1
  20. package/dist/build.js +2 -2
  21. package/dist/build.js.map +1 -1
  22. package/dist/index.js +2 -2
  23. package/dist/index.js.map +1 -1
  24. package/dist/react/ai/index.js +59 -5
  25. package/dist/react/ai/index.js.map +4 -4
  26. package/dist/src/ai/rag/presentation.d.ts +6 -1
  27. package/dist/src/vue/ai/useRAG.d.ts +60 -0
  28. package/dist/src/vue/ai/useRAGChunkPreview.d.ts +20 -0
  29. package/dist/src/vue/ai/useRAGDocuments.d.ts +20 -0
  30. package/dist/src/vue/ai/useRAGIndexAdmin.d.ts +10 -0
  31. package/dist/src/vue/ai/useRAGSearch.d.ts +10 -0
  32. package/dist/svelte/ai/index.js +59 -5
  33. package/dist/svelte/ai/index.js.map +4 -4
  34. package/dist/types/ai.d.ts +42 -1
  35. package/dist/vue/ai/index.js +59 -5
  36. package/dist/vue/ai/index.js.map +4 -4
  37. package/package.json +53 -7
package/dist/ai/index.js CHANGED
@@ -216,6 +216,10 @@ var buildContextLabel = (metadata) => {
216
216
  return from ? `Message from ${from}` : "Message evidence";
217
217
  }
218
218
  const page = getContextNumber(metadata.page) ?? getContextNumber(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
219
+ const region = getContextNumber(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
220
+ if (page && region) {
221
+ return `Page ${page} region ${region}`;
222
+ }
219
223
  if (page) {
220
224
  return `Page ${page}`;
221
225
  }
@@ -256,6 +260,10 @@ var buildLocatorLabel = (metadata, source, title) => {
256
260
  return;
257
261
  }
258
262
  const page = getContextNumber(metadata.page) ?? getContextNumber(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
263
+ const region = getContextNumber(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
264
+ if (page && region) {
265
+ return `Page ${page} \xB7 Region ${region}`;
266
+ }
259
267
  if (page) {
260
268
  return `Page ${page}`;
261
269
  }
@@ -308,9 +316,11 @@ var buildProvenanceLabel = (metadata) => {
308
316
  const transcriptSource = getContextString(metadata.transcriptSource);
309
317
  const pdfTextMode = getContextString(metadata.pdfTextMode);
310
318
  const ocrEngine = getContextString(metadata.ocrEngine);
319
+ const ocrConfidence = getContextNumber(metadata.ocrRegionConfidence) ?? getContextNumber(metadata.ocrConfidence);
311
320
  const labels = [
312
321
  pdfTextMode ? `PDF ${pdfTextMode}` : "",
313
322
  ocrEngine ? `OCR ${ocrEngine}` : "",
323
+ typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
314
324
  mediaKind ? `Media ${mediaKind}` : "",
315
325
  transcriptSource ? `Transcript ${transcriptSource}` : "",
316
326
  threadTopic ? `Thread ${threadTopic}` : "",
@@ -3977,6 +3987,10 @@ var buildContextLabel2 = (metadata) => {
3977
3987
  return from ? `Message from ${from}` : "Message evidence";
3978
3988
  }
3979
3989
  const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
3990
+ const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
3991
+ if (page && region) {
3992
+ return `Page ${page} region ${region}`;
3993
+ }
3980
3994
  if (page) {
3981
3995
  return `Page ${page}`;
3982
3996
  }
@@ -4007,6 +4021,10 @@ var buildLocatorLabel2 = (metadata, source, title) => {
4007
4021
  return;
4008
4022
  }
4009
4023
  const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
4024
+ const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
4025
+ if (page && region) {
4026
+ return `Page ${page} \xB7 Region ${region}`;
4027
+ }
4010
4028
  if (page) {
4011
4029
  return `Page ${page}`;
4012
4030
  }
@@ -4049,9 +4067,11 @@ var buildProvenanceLabel2 = (metadata) => {
4049
4067
  const transcriptSource = getContextString2(metadata.transcriptSource);
4050
4068
  const pdfTextMode = getContextString2(metadata.pdfTextMode);
4051
4069
  const ocrEngine = getContextString2(metadata.ocrEngine);
4070
+ const ocrConfidence = getContextNumber2(metadata.ocrRegionConfidence) ?? getContextNumber2(metadata.ocrConfidence);
4052
4071
  const labels = [
4053
4072
  pdfTextMode ? `PDF ${pdfTextMode}` : "",
4054
4073
  ocrEngine ? `OCR ${ocrEngine}` : "",
4074
+ typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
4055
4075
  mediaKind ? `Media ${mediaKind}` : "",
4056
4076
  transcriptSource ? `Transcript ${transcriptSource}` : "",
4057
4077
  threadTopic ? `Thread ${threadTopic}` : "",
@@ -4061,6 +4081,23 @@ var buildProvenanceLabel2 = (metadata) => {
4061
4081
  ].filter((value) => value.length > 0);
4062
4082
  return labels.length > 0 ? labels.join(" \xB7 ") : undefined;
4063
4083
  };
4084
+ var buildRAGSourceLabels = ({
4085
+ metadata,
4086
+ source,
4087
+ title
4088
+ }) => {
4089
+ const contextLabel = buildContextLabel2(metadata);
4090
+ const locatorLabel = buildLocatorLabel2(metadata, source, title);
4091
+ const provenanceLabel = buildProvenanceLabel2(metadata);
4092
+ if (!contextLabel && !locatorLabel && !provenanceLabel) {
4093
+ return;
4094
+ }
4095
+ return {
4096
+ contextLabel,
4097
+ locatorLabel,
4098
+ provenanceLabel
4099
+ };
4100
+ };
4064
4101
  var buildExcerpt2 = (text, maxLength = 160) => {
4065
4102
  const normalized = text.replaceAll(/\s+/g, " ").trim();
4066
4103
  if (normalized.length <= maxLength) {
@@ -4102,13 +4139,13 @@ var buildRAGSourceSummaries = (sources) => {
4102
4139
  citationNumbers: groupCitations.map((citation) => citationReferenceMap[citation.chunkId] ?? 0),
4103
4140
  citations: groupCitations,
4104
4141
  chunkIds: group.chunks.map((chunk) => chunk.chunkId),
4105
- contextLabel: buildContextLabel2(leadChunk?.metadata),
4142
+ contextLabel: leadChunk?.labels?.contextLabel ?? buildContextLabel2(leadChunk?.metadata),
4106
4143
  count: group.count,
4107
4144
  excerpt: buildExcerpt2(leadChunk?.text ?? ""),
4108
4145
  key: group.key,
4109
4146
  label: group.label,
4110
- locatorLabel: buildLocatorLabel2(leadChunk?.metadata, leadChunk?.source, leadChunk?.title),
4111
- provenanceLabel: buildProvenanceLabel2(leadChunk?.metadata),
4147
+ locatorLabel: leadChunk?.labels?.locatorLabel ?? buildLocatorLabel2(leadChunk?.metadata, leadChunk?.source, leadChunk?.title),
4148
+ provenanceLabel: leadChunk?.labels?.provenanceLabel ?? buildProvenanceLabel2(leadChunk?.metadata),
4112
4149
  source: group.source,
4113
4150
  title: group.title
4114
4151
  };
@@ -4232,6 +4269,11 @@ var buildSourceGroup = (source, key) => ({
4232
4269
  count: 1,
4233
4270
  key,
4234
4271
  label: buildSourceLabel2(source),
4272
+ labels: source.labels ?? buildRAGSourceLabels({
4273
+ metadata: source.metadata,
4274
+ source: source.source,
4275
+ title: source.title
4276
+ }),
4235
4277
  source: source.source,
4236
4278
  title: source.title
4237
4279
  });
@@ -4242,7 +4284,19 @@ var updateSourceGroup = (groups, source) => {
4242
4284
  groups.set(key, buildSourceGroup(source, key));
4243
4285
  return;
4244
4286
  }
4245
- existing.bestScore = Math.max(existing.bestScore, source.score);
4287
+ if (source.score > existing.bestScore) {
4288
+ existing.bestScore = source.score;
4289
+ existing.label = buildSourceLabel2(source);
4290
+ existing.labels = source.labels ?? buildRAGSourceLabels({
4291
+ metadata: source.metadata,
4292
+ source: source.source,
4293
+ title: source.title
4294
+ });
4295
+ existing.source = source.source;
4296
+ existing.title = source.title;
4297
+ } else {
4298
+ existing.bestScore = Math.max(existing.bestScore, source.score);
4299
+ }
4246
4300
  existing.count += 1;
4247
4301
  existing.chunks.push(source);
4248
4302
  };
@@ -7787,11 +7841,71 @@ var decodeHtmlEntities = (value) => {
7787
7841
  output = output.replace(/&#(\d+);/g, (_, code) => String.fromCodePoint(Number(code)));
7788
7842
  return output.replace(/&#x([0-9a-f]+);/gi, (_, code) => String.fromCodePoint(parseInt(code, 16)));
7789
7843
  };
7790
- var stripHtml = (value) => {
7791
- const withoutTags = value.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, " ").replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, " ").replace(/<br\s*\/?>/gi, `
7844
+ var formatHtmlLinkContext = (href) => {
7845
+ const decoded = decodeHtmlEntities(href.trim());
7846
+ if (!decoded) {
7847
+ return;
7848
+ }
7849
+ if (decoded.startsWith("#")) {
7850
+ return decoded;
7851
+ }
7852
+ if (/^[a-z]+:/i.test(decoded)) {
7853
+ try {
7854
+ const url = new URL(decoded);
7855
+ const path = url.pathname === "/" ? "" : url.pathname;
7856
+ return `${url.hostname}${path}`;
7857
+ } catch {
7858
+ return decoded;
7859
+ }
7860
+ }
7861
+ return decoded;
7862
+ };
7863
+ var stripHtmlTags = (value) => {
7864
+ const withoutTags = value.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, " ").replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, " ").replace(/<a\b[^>]*href=(['"])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi, (_match, _quote, href, inner) => {
7865
+ const label = normalizeWhitespace(stripHtmlTags(inner));
7866
+ const context = formatHtmlLinkContext(href);
7867
+ if (!label) {
7868
+ return context ?? " ";
7869
+ }
7870
+ if (!context || context === label) {
7871
+ return label;
7872
+ }
7873
+ return `${label} (${context})`;
7874
+ }).replace(/<br\s*\/?>/gi, `
7792
7875
  `).replace(/<\/(p|div|section|article|li|ul|ol|h[1-6]|table|tr)>/gi, `
7793
7876
  `).replace(/<li\b[^>]*>/gi, "- ").replace(/<[^>]+>/g, " ");
7794
- return normalizeWhitespace(decodeHtmlEntities(withoutTags));
7877
+ return decodeHtmlEntities(withoutTags);
7878
+ };
7879
+ var extractMainHtmlContent = (value) => {
7880
+ const trimmed = value.trim();
7881
+ if (!/<html\b|<body\b|<main\b|<article\b/i.test(trimmed)) {
7882
+ return value;
7883
+ }
7884
+ const boilerplateStripped = trimmed.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, " ").replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, " ").replace(/<(nav|footer|header|aside|form)\b[^>]*>[\s\S]*?<\/\1>/gi, " ");
7885
+ const mainMatch = boilerplateStripped.match(/<main\b[^>]*>([\s\S]*?)<\/main>/i);
7886
+ if (mainMatch?.[1]) {
7887
+ return mainMatch[1];
7888
+ }
7889
+ const articleMatches = [
7890
+ ...boilerplateStripped.matchAll(/<article\b[^>]*>([\s\S]*?)<\/article>/gi)
7891
+ ].map((match) => match[1]?.trim()).filter(Boolean);
7892
+ if (articleMatches.length > 0) {
7893
+ return articleMatches.join(`
7894
+ `);
7895
+ }
7896
+ const roleMainMatch = boilerplateStripped.match(/<([a-z0-9:_-]+)\b[^>]*\brole=(['"])main\2[^>]*>([\s\S]*?)<\/\1>/i);
7897
+ if (roleMainMatch?.[3]) {
7898
+ return roleMainMatch[3];
7899
+ }
7900
+ const bodyMatch = boilerplateStripped.match(/<body\b[^>]*>([\s\S]*?)<\/body>/i);
7901
+ if (bodyMatch?.[1]) {
7902
+ return bodyMatch[1];
7903
+ }
7904
+ return boilerplateStripped;
7905
+ };
7906
+ var stripHtml = (value) => {
7907
+ const focused = extractMainHtmlContent(value);
7908
+ return normalizeWhitespace(stripHtmlTags(focused));
7795
7909
  };
7796
7910
  var stripMarkdown = (value) => {
7797
7911
  const withoutCodeBlocks = value.replace(/```[\s\S]*?```/g, (block) => {
@@ -7828,14 +7942,49 @@ var markdownStructureUnits = (value) => {
7828
7942
  flushCurrentSection();
7829
7943
  return sections.map((section) => stripMarkdown(section)).map((section) => normalizeWhitespace(section)).filter(Boolean);
7830
7944
  };
7831
- var htmlStructureUnits = (value) => {
7832
- const marked = value.replace(/<(section|article|main|aside|nav|h[1-6])\b[^>]*>/gi, `
7833
-
7834
- __ABS_SECTION_BREAK__ `).replace(/<\/(section|article|main|aside|nav|h[1-6])>/gi, `
7835
-
7945
+ var joinHtmlHeadingSection = (headings, content) => {
7946
+ const normalizedHeadings = headings.map((heading) => normalizeWhitespace(heading));
7947
+ const combined = [...normalizedHeadings, content].filter(Boolean).join(`
7836
7948
  `);
7837
- const normalized = stripHtml(marked);
7838
- return normalized.split(/__ABS_SECTION_BREAK__/).map((section) => normalizeWhitespace(section)).filter(Boolean);
7949
+ return normalizeWhitespace(combined);
7950
+ };
7951
+ var htmlStructureUnits = (value) => {
7952
+ const focused = extractMainHtmlContent(value);
7953
+ const headingPattern = /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi;
7954
+ const sections = [];
7955
+ const headingStack = [];
7956
+ let cursor = 0;
7957
+ let currentContentStart = 0;
7958
+ let activeHeadings = [];
7959
+ const flushSection = (end) => {
7960
+ const content = normalizeWhitespace(stripHtmlTags(focused.slice(currentContentStart, end)));
7961
+ if (!content) {
7962
+ return;
7963
+ }
7964
+ const section = joinHtmlHeadingSection(activeHeadings, content);
7965
+ if (section) {
7966
+ sections.push(section);
7967
+ }
7968
+ };
7969
+ for (const match of focused.matchAll(headingPattern)) {
7970
+ const fullMatch = match[0];
7971
+ const start = match.index ?? cursor;
7972
+ flushSection(start);
7973
+ const level = Number.parseInt(match[1] ?? "1", 10);
7974
+ const headingText = normalizeWhitespace(stripHtmlTags(match[2] ?? ""));
7975
+ if (headingText) {
7976
+ headingStack[level - 1] = headingText;
7977
+ headingStack.length = level;
7978
+ activeHeadings = [...headingStack];
7979
+ }
7980
+ cursor = start + fullMatch.length;
7981
+ currentContentStart = cursor;
7982
+ }
7983
+ flushSection(focused.length);
7984
+ if (sections.length > 0) {
7985
+ return sections;
7986
+ }
7987
+ return [normalizeWhitespace(stripHtmlTags(focused))].filter(Boolean);
7839
7988
  };
7840
7989
  var inferFormat = (document) => {
7841
7990
  if (document.format) {
@@ -7927,10 +8076,77 @@ var isLikelyTextData = (data) => {
7927
8076
  };
7928
8077
  var decodePdfLiteral = (value) => value.replace(/\\([\\()])/g, "$1").replace(/\\n/g, `
7929
8078
  `).replace(/\\r/g, "\r").replace(/\\t/g, "\t").replace(/\\b/g, "\b").replace(/\\f/g, "\f").replace(/\\([0-7]{1,3})/g, (_match, octal) => String.fromCharCode(parseInt(octal, 8)));
8079
+ var PDF_TABLE_GAP_THRESHOLD = 120;
8080
+ var extractPdfArrayText = (value) => {
8081
+ const parts = [];
8082
+ const tokenPattern = /\(((?:\\.|[^\\)])*)\)|([-+]?\d*\.?\d+)/g;
8083
+ let pendingColumnGap = false;
8084
+ for (const match of value.matchAll(tokenPattern)) {
8085
+ if (match[1] !== undefined) {
8086
+ const decoded = decodePdfLiteral(match[1]);
8087
+ if (pendingColumnGap && decoded && !/^\s/.test(decoded) && parts.at(-1) !== " | ") {
8088
+ parts.push(" | ");
8089
+ }
8090
+ parts.push(decoded);
8091
+ pendingColumnGap = false;
8092
+ continue;
8093
+ }
8094
+ const gap = Number(match[2]);
8095
+ if (Number.isFinite(gap) && gap >= PDF_TABLE_GAP_THRESHOLD) {
8096
+ pendingColumnGap = true;
8097
+ }
8098
+ }
8099
+ return normalizeWhitespace(parts.join("")).replace(/\s+\|\s+/g, " | ").trim();
8100
+ };
8101
+ var appendPdfText = (parts, value) => {
8102
+ if (!value) {
8103
+ return;
8104
+ }
8105
+ parts.push(value);
8106
+ };
8107
+ var appendPdfLineBreak = (parts) => {
8108
+ const last = parts.at(-1);
8109
+ if (!last || last.endsWith(`
8110
+ `)) {
8111
+ return;
8112
+ }
8113
+ parts.push(`
8114
+ `);
8115
+ };
8116
+ var PDF_TEXT_OPERATOR_PATTERN = /(\[((?:\\.|[^\]])*)\]\s*TJ)|(\(((?:\\.|[^\\)])*)\)\s*Tj)|([-+]?\d*\.?\d+\s+[-+]?\d*\.?\d+\s+\(((?:\\.|[^\\)])*)\)\s*")|(\(((?:\\.|[^\\)])*)\)\s*')|((?:[-+]?\d*\.?\d+\s+){2}(?:Td|TD))|(T\*)|((?:[-+]?\d*\.?\d+\s+){6}Tm)/g;
8117
+ var extractTextFromPDFTextObject = (value) => {
8118
+ const parts = [];
8119
+ for (const match of value.matchAll(PDF_TEXT_OPERATOR_PATTERN)) {
8120
+ if (match[2] !== undefined) {
8121
+ appendPdfText(parts, extractPdfArrayText(match[2]));
8122
+ continue;
8123
+ }
8124
+ if (match[4] !== undefined) {
8125
+ appendPdfText(parts, decodePdfLiteral(match[4]));
8126
+ continue;
8127
+ }
8128
+ if (match[6] !== undefined) {
8129
+ appendPdfLineBreak(parts);
8130
+ appendPdfText(parts, decodePdfLiteral(match[6]));
8131
+ continue;
8132
+ }
8133
+ if (match[8] !== undefined) {
8134
+ appendPdfLineBreak(parts);
8135
+ appendPdfText(parts, decodePdfLiteral(match[8]));
8136
+ continue;
8137
+ }
8138
+ if (match[9] !== undefined || match[10] !== undefined || match[11] !== undefined) {
8139
+ appendPdfLineBreak(parts);
8140
+ }
8141
+ }
8142
+ return parts.join("");
8143
+ };
7930
8144
  var extractTextFromPDFBytes = (data) => {
7931
8145
  const raw = Buffer.from(data).toString("latin1");
7932
- const matches = [...raw.matchAll(/\(((?:\\.|[^\\)])*)\)\s*Tj/g)];
7933
- const combined = matches.map((match) => decodePdfLiteral(match[1] ?? "")).join(`
8146
+ const textObjects = [...raw.matchAll(/BT([\s\S]*?)ET/g)].map((match) => extractTextFromPDFTextObject(match[1] ?? "")).filter(Boolean);
8147
+ const combined = textObjects.length > 0 ? textObjects.join(`
8148
+
8149
+ `) : [...raw.matchAll(/\(((?:\\.|[^\\)])*)\)\s*Tj/g)].map((match) => decodePdfLiteral(match[1] ?? "")).join(`
7934
8150
  `);
7935
8151
  return normalizeWhitespace(combined);
7936
8152
  };
@@ -8022,7 +8238,40 @@ var decodeGzipEntries = (data, input) => {
8022
8238
  ];
8023
8239
  };
8024
8240
  var extractXmlText = (value) => normalizeWhitespace(decodeHtmlEntities(value.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ")));
8241
+ var extractOfficeParagraphText = (value) => normalizeWhitespace(decodeHtmlEntities(value.replace(/<w:tab\b[^>]*\/>/gi, "\t").replace(/<w:br\b[^>]*\/>/gi, `
8242
+ `).replace(/<[^>]+>/g, " ")));
8243
+ var officeDocumentParagraphs = (entries) => {
8244
+ const documentEntry = entries.find((entry) => entry.path === "word/document.xml");
8245
+ if (!documentEntry) {
8246
+ return [];
8247
+ }
8248
+ const xml = decodeUtf8(documentEntry.data);
8249
+ const paragraphs = [...xml.matchAll(/<w:p\b[\s\S]*?<\/w:p>/g)];
8250
+ return paragraphs.map((match) => {
8251
+ const paragraphXml = match[0] ?? "";
8252
+ const text = extractOfficeParagraphText(paragraphXml);
8253
+ if (!text) {
8254
+ return "";
8255
+ }
8256
+ const styleMatch = paragraphXml.match(/<w:pStyle\b[^>]*w:val="([^"]+)"[^>]*\/?>/i);
8257
+ const style = (styleMatch?.[1] ?? "").toLowerCase();
8258
+ if (style === "title") {
8259
+ return text;
8260
+ }
8261
+ const headingMatch = style.match(/^heading([1-6])$/);
8262
+ if (headingMatch) {
8263
+ return text;
8264
+ }
8265
+ return text;
8266
+ }).filter(Boolean);
8267
+ };
8025
8268
  var officeDocumentText = (entries) => {
8269
+ const paragraphs = officeDocumentParagraphs(entries);
8270
+ if (paragraphs.length > 0) {
8271
+ return normalizeWhitespace(paragraphs.join(`
8272
+
8273
+ `));
8274
+ }
8026
8275
  const documentEntry = entries.find((entry) => entry.path === "word/document.xml");
8027
8276
  if (!documentEntry) {
8028
8277
  return "";
@@ -8037,31 +8286,68 @@ var officeDocumentSectionCount = (entries) => {
8037
8286
  const count = [...decodeUtf8(documentEntry.data).matchAll(/<w:p\b/g)].length;
8038
8287
  return count > 0 ? count : undefined;
8039
8288
  };
8040
- var spreadsheetText = (entries) => {
8041
- const sharedStrings = entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
8042
- ...decodeUtf8(entry.data).matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)
8043
- ].map((match) => decodeHtmlEntities(match[1] ?? "")));
8044
- const sheetValues = entries.filter((entry) => entry.path.startsWith("xl/worksheets/") && entry.path.endsWith(".xml")).flatMap((entry) => [...decodeUtf8(entry.data).matchAll(/<v>([\s\S]*?)<\/v>/g)].map((match) => match[1] ?? "")).map((value) => {
8045
- const index = Number(value);
8046
- return Number.isInteger(index) && sharedStrings[index] ? sharedStrings[index] : value;
8289
+ var spreadsheetSharedStrings = (entries) => entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
8290
+ ...decodeUtf8(entry.data).matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)
8291
+ ].map((match) => decodeHtmlEntities(match[1] ?? "")));
8292
+ var spreadsheetColumnLabel = (reference) => {
8293
+ const match = reference?.match(/([A-Z]+)/i);
8294
+ return match?.[1]?.toUpperCase() ?? "";
8295
+ };
8296
+ var spreadsheetResolveCellValue = (cellXml, sharedStrings) => {
8297
+ const inlineMatch = cellXml.match(/<is\b[^>]*>[\s\S]*?<t[^>]*>([\s\S]*?)<\/t>[\s\S]*?<\/is>/i);
8298
+ if (inlineMatch?.[1]) {
8299
+ return normalizeWhitespace(decodeHtmlEntities(inlineMatch[1]));
8300
+ }
8301
+ const valueMatch = cellXml.match(/<v>([\s\S]*?)<\/v>/i);
8302
+ if (!valueMatch?.[1]) {
8303
+ return "";
8304
+ }
8305
+ const rawValue = decodeHtmlEntities(valueMatch[1]);
8306
+ const typeMatch = cellXml.match(/\bt="([^"]+)"/i);
8307
+ if (typeMatch?.[1] === "s") {
8308
+ const index = Number(rawValue);
8309
+ return Number.isInteger(index) && sharedStrings[index] ? sharedStrings[index] : rawValue;
8310
+ }
8311
+ return normalizeWhitespace(rawValue);
8312
+ };
8313
+ var spreadsheetWorksheetRows = (worksheetXml, sharedStrings) => [...worksheetXml.matchAll(/<row\b[^>]*>([\s\S]*?)<\/row>/gi)].map((rowMatch) => {
8314
+ const rowXml = rowMatch[1] ?? "";
8315
+ const cells = [...rowXml.matchAll(/<c\b([^>]*)>([\s\S]*?)<\/c>/gi)].map((cellMatch) => {
8316
+ const attributes = cellMatch[1] ?? "";
8317
+ const cellBody = cellMatch[2] ?? "";
8318
+ const referenceMatch = attributes.match(/\br="([^"]+)"/i);
8319
+ const reference = referenceMatch?.[1];
8320
+ const value = spreadsheetResolveCellValue(`<c${attributes}>${cellBody}</c>`, sharedStrings);
8321
+ return {
8322
+ column: spreadsheetColumnLabel(reference),
8323
+ reference,
8324
+ value
8325
+ };
8326
+ }).filter((cell) => cell.value);
8327
+ return cells;
8328
+ }).filter((row) => row.length > 0);
8329
+ var spreadsheetRowText = (row, headers) => {
8330
+ const entries = row.map((cell, index) => {
8331
+ const header = headers[index];
8332
+ if (header) {
8333
+ return `${header}: ${cell.value}`;
8334
+ }
8335
+ return cell.column ? `${cell.column}: ${cell.value}` : cell.value;
8047
8336
  });
8048
- return normalizeWhitespace(sheetValues.join(`
8049
- `));
8337
+ return normalizeWhitespace(entries.join(" | "));
8050
8338
  };
8051
8339
  var spreadsheetSheetTexts = (entries) => {
8052
- const sharedStrings = entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
8053
- ...decodeUtf8(entry.data).matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)
8054
- ].map((match) => decodeHtmlEntities(match[1] ?? "")));
8340
+ const sharedStrings = spreadsheetSharedStrings(entries);
8055
8341
  const sheetNames = spreadsheetSheetNames(entries);
8056
8342
  const sheetEntries = entries.filter((entry) => entry.path.startsWith("xl/worksheets/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path));
8057
8343
  return sheetEntries.map((entry, index) => {
8058
- const values = [
8059
- ...decodeUtf8(entry.data).matchAll(/<v>([\s\S]*?)<\/v>/g)
8060
- ].map((match) => match[1] ?? "").map((value) => {
8061
- const sharedStringIndex = Number(value);
8062
- return Number.isInteger(sharedStringIndex) && sharedStrings[sharedStringIndex] ? sharedStrings[sharedStringIndex] : value;
8063
- });
8064
- const text = normalizeWhitespace(values.join(`
8344
+ const rows = spreadsheetWorksheetRows(decodeUtf8(entry.data), sharedStrings);
8345
+ if (rows.length === 0) {
8346
+ return null;
8347
+ }
8348
+ const headers = rows[0].map((cell) => cell.value);
8349
+ const rowTexts = rows.map((row, rowIndex) => normalizeWhitespace(`Row ${rowIndex + 1}. ${spreadsheetRowText(row, rowIndex === 0 ? [] : headers)}`));
8350
+ const text = normalizeWhitespace(rowTexts.join(`
8065
8351
  `));
8066
8352
  if (!text) {
8067
8353
  return null;
@@ -8072,19 +8358,38 @@ var spreadsheetSheetTexts = (entries) => {
8072
8358
  };
8073
8359
  }).filter((entry) => Boolean(entry));
8074
8360
  };
8361
+ var spreadsheetText = (entries) => normalizeWhitespace(spreadsheetSheetTexts(entries).map((sheet) => `Sheet ${sheet.name}
8362
+ ${sheet.text}`).join(`
8363
+
8364
+ `));
8075
8365
  var spreadsheetSheetNames = (entries) => entries.filter((entry) => entry.path === "xl/workbook.xml").flatMap((entry) => [
8076
8366
  ...decodeUtf8(entry.data).matchAll(/<sheet[^>]*name="([^"]+)"/g)
8077
8367
  ].map((match) => match[1] ?? "")).filter(Boolean);
8078
- var presentationText = (entries) => {
8079
- const slides = entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).map((entry) => extractXmlText(decodeUtf8(entry.data)));
8080
- return normalizeWhitespace(slides.join(`
8081
-
8368
+ var presentationNotesByIndex = (entries) => new Map(entries.filter((entry) => entry.path.startsWith("ppt/notesSlides/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path)).map((entry) => {
8369
+ const indexMatch = entry.path.match(/notesSlide(\d+)\.xml$/i);
8370
+ const index = Number(indexMatch?.[1] ?? "0") - 1;
8371
+ return [
8372
+ index,
8373
+ normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)))
8374
+ ];
8375
+ }).filter((entry) => entry[0] >= 0 && Boolean(entry[1])));
8376
+ var presentationSlides = (entries) => {
8377
+ const notesByIndex = presentationNotesByIndex(entries);
8378
+ return entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path)).map((entry, index) => {
8379
+ const slideText = normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)));
8380
+ const notesText = notesByIndex.get(index);
8381
+ const text = normalizeWhitespace([slideText, notesText ? `Speaker notes: ${notesText}` : ""].filter(Boolean).join(`
8082
8382
  `));
8383
+ return {
8384
+ index,
8385
+ notesText,
8386
+ text
8387
+ };
8388
+ }).filter((slide) => Boolean(slide.text));
8083
8389
  };
8084
- var presentationSlides = (entries) => entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path)).map((entry, index) => ({
8085
- index,
8086
- text: normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)))
8087
- })).filter((slide) => Boolean(slide.text));
8390
+ var presentationText = (entries) => normalizeWhitespace(presentationSlides(entries).map((slide) => slide.text).join(`
8391
+
8392
+ `));
8088
8393
  var presentationSlideCount = (entries) => entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).length;
8089
8394
  var epubText = (entries) => {
8090
8395
  const htmlEntries = entries.filter((entry) => /\.(xhtml|html|htm)$/i.test(entry.path));
@@ -8092,17 +8397,113 @@ var epubText = (entries) => {
8092
8397
 
8093
8398
  `));
8094
8399
  };
8095
- var extractEmailText = (raw) => {
8400
+ var splitEmailMessage = (raw) => {
8096
8401
  const normalized = raw.replace(/\r\n?/g, `
8097
8402
  `);
8098
- const [, ...bodyParts] = normalized.split(`
8099
-
8100
- `);
8101
- const body = bodyParts.join(`
8403
+ const separator = normalized.indexOf(`
8102
8404
 
8103
8405
  `);
8406
+ if (separator < 0) {
8407
+ return {
8408
+ body: "",
8409
+ headerBlock: normalized
8410
+ };
8411
+ }
8412
+ return {
8413
+ body: normalized.slice(separator + 2),
8414
+ headerBlock: normalized.slice(0, separator)
8415
+ };
8416
+ };
8417
+ var parseHeaderBlock = (headerBlock) => {
8418
+ const unfolded = headerBlock.replace(/\n[ \t]+/g, " ");
8419
+ const headers = new Map;
8420
+ for (const line of unfolded.split(`
8421
+ `)) {
8422
+ const separator = line.indexOf(":");
8423
+ if (separator < 0) {
8424
+ continue;
8425
+ }
8426
+ headers.set(line.slice(0, separator).trim().toLowerCase(), line.slice(separator + 1).trim());
8427
+ }
8428
+ return headers;
8429
+ };
8430
+ var decodeQuotedPrintable = (value) => value.replace(/=\r?\n/g, "").replace(/=([0-9A-F]{2})/gi, (_match, hex) => String.fromCharCode(parseInt(hex, 16)));
8431
+ var decodeEmailPartBody = (body, encoding) => {
8432
+ const normalizedEncoding = encoding?.toLowerCase();
8433
+ const trimmed = body.trim();
8434
+ if (normalizedEncoding === "base64") {
8435
+ return new Uint8Array(Buffer.from(trimmed.replace(/\s+/g, ""), "base64"));
8436
+ }
8437
+ if (normalizedEncoding === "quoted-printable") {
8438
+ return new Uint8Array(Buffer.from(decodeQuotedPrintable(body), "utf8"));
8439
+ }
8440
+ return new Uint8Array(Buffer.from(body, "utf8"));
8441
+ };
8442
+ var parseMimeBoundary = (contentType) => {
8443
+ const match = contentType?.match(/boundary="?([^";]+)"?/i);
8444
+ return match?.[1];
8445
+ };
8446
+ var parseEmailMimeParts = (body, contentType) => {
8447
+ const boundary = parseMimeBoundary(contentType);
8448
+ if (!boundary) {
8449
+ const htmlMatch = body.match(/<html[\s\S]*<\/html>/i);
8450
+ return {
8451
+ attachments: [],
8452
+ bodyHtml: htmlMatch?.[0],
8453
+ bodyText: htmlMatch ? undefined : body
8454
+ };
8455
+ }
8456
+ const attachments = [];
8457
+ let bodyText;
8458
+ let bodyHtml;
8459
+ const parts = body.split(`--${boundary}`);
8460
+ for (const rawPart of parts) {
8461
+ const trimmed = rawPart.trim();
8462
+ if (!trimmed || trimmed === "--") {
8463
+ continue;
8464
+ }
8465
+ const { body: partBody, headerBlock } = splitEmailMessage(trimmed);
8466
+ const headers = parseHeaderBlock(headerBlock);
8467
+ const partContentType = headers.get("content-type");
8468
+ const disposition = headers.get("content-disposition");
8469
+ const transferEncoding = headers.get("content-transfer-encoding");
8470
+ const filename = disposition?.match(/filename="?([^";]+)"?/i)?.[1] ?? partContentType?.match(/name="?([^";]+)"?/i)?.[1];
8471
+ if (filename) {
8472
+ attachments.push({
8473
+ contentType: partContentType,
8474
+ data: decodeEmailPartBody(partBody, transferEncoding),
8475
+ fileName: filename
8476
+ });
8477
+ continue;
8478
+ }
8479
+ const decoded = Buffer.from(decodeEmailPartBody(partBody, transferEncoding)).toString("utf8");
8480
+ if (partContentType?.toLowerCase().includes("text/html")) {
8481
+ bodyHtml = decoded;
8482
+ continue;
8483
+ }
8484
+ if (partContentType?.toLowerCase().includes("text/plain")) {
8485
+ bodyText = decoded;
8486
+ }
8487
+ }
8488
+ return {
8489
+ attachments,
8490
+ bodyHtml,
8491
+ bodyText
8492
+ };
8493
+ };
8494
+ var extractEmailText = (raw) => {
8495
+ const { body, headerBlock } = splitEmailMessage(raw);
8496
+ const headers = parseHeaderBlock(headerBlock);
8497
+ const parsed = parseEmailMimeParts(body, headers.get("content-type"));
8498
+ if (parsed.bodyHtml) {
8499
+ return stripHtml(parsed.bodyHtml);
8500
+ }
8501
+ if (parsed.bodyText) {
8502
+ return normalizeWhitespace(parsed.bodyText);
8503
+ }
8104
8504
  if (!body) {
8105
- return normalizeWhitespace(normalized);
8505
+ return normalizeWhitespace(raw.replace(/\r\n?/g, `
8506
+ `));
8106
8507
  }
8107
8508
  const htmlMatch = body.match(/<html[\s\S]*<\/html>/i);
8108
8509
  if (htmlMatch) {
@@ -8111,17 +8512,15 @@ var extractEmailText = (raw) => {
8111
8512
  return normalizeWhitespace(body);
8112
8513
  };
8113
8514
  var parseEmailHeaders = (raw) => {
8114
- const normalized = raw.replace(/\r\n?/g, `
8115
- `);
8116
- const [headerBlock = ""] = normalized.split(`
8117
-
8118
- `);
8119
- const getHeader = (name) => {
8120
- const match = headerBlock.match(new RegExp(`^${name}:\\s*(.+)$`, "im"));
8121
- return match?.[1]?.trim();
8122
- };
8515
+ const { headerBlock } = splitEmailMessage(raw);
8516
+ const headers = parseHeaderBlock(headerBlock);
8517
+ const getHeader = (name) => headers.get(name.toLowerCase());
8123
8518
  return {
8519
+ contentType: getHeader("Content-Type"),
8124
8520
  from: getHeader("From"),
8521
+ inReplyTo: getHeader("In-Reply-To"),
8522
+ messageId: getHeader("Message-ID"),
8523
+ references: getHeader("References"),
8125
8524
  subject: getHeader("Subject"),
8126
8525
  threadTopic: getHeader("Thread-Topic") ?? getHeader("Subject"),
8127
8526
  to: getHeader("To")
@@ -8142,6 +8541,87 @@ var extractPrintableStrings = (data) => {
8142
8541
  return unique.join(`
8143
8542
  `);
8144
8543
  };
8544
/**
 * Merge the metadata carried by an OCR result with derived OCR fields.
 *
 * @param {object} result - OCR result; reads the optional `metadata`,
 *   `confidence` and `regions` properties.
 * @returns {object} `result.metadata` (if any) spread first, followed by:
 *   - `ocrConfidence`: `result.confidence` as given,
 *   - `ocrRegionCount` / `ocrRegions`: the regions whose whitespace-normalized
 *     text is non-empty (both `undefined` when `result.regions` is absent),
 *   - `ocrAverageConfidence`: mean of the overall confidence and the kept
 *     regions' confidences, or `undefined` when none is usable.
 */
var ocrMetadata = (result) => {
  const regions = result.regions?.filter((region) => normalizeWhitespace(region.text ?? "").length > 0);
  // `typeof NaN === "number"`, so a plain typeof check would let NaN/Infinity
  // through and turn the whole average into NaN/Infinity. Keep finite values only.
  const confidenceValues = [
    result.confidence,
    ...(regions ?? []).map((region) => region.confidence)
  ].filter((value) => typeof value === "number" && Number.isFinite(value));
  const averageConfidence = confidenceValues.length > 0
    ? confidenceValues.reduce((sum, value) => sum + value, 0) / confidenceValues.length
    : undefined;
  return {
    ...result.metadata ?? {},
    ocrConfidence: result.confidence,
    ocrRegionCount: regions?.length,
    ocrRegions: regions,
    ocrAverageConfidence: averageConfidence
  };
};
8559
/**
 * Build one text document per page from the regions of an OCR result.
 *
 * Regions are grouped by `region.page`; regions with empty (after whitespace
 * normalization) text or without a usable page number are ignored. Documents
 * are returned in ascending page order.
 *
 * @param {object} result - OCR result; only `result.regions` is read.
 * @param {object} input - Extractor input; reads `chunking`, `contentType`,
 *   `metadata`, `source`, `path`, `name` and `title`.
 * @param {object} baseMetadata - Metadata spread into every page document
 *   after `input.metadata`.
 * @returns {object[]} Page documents (empty array when no region qualifies).
 */
var ocrPageDocuments = (result, input, baseMetadata) => {
  const regionsByPage = new Map();
  for (const region of result.regions ?? []) {
    // Number.isInteger rejects NaN, Infinity, fractions and non-numbers.
    // A bare `page < 1` test lets NaN through (every comparison with NaN is
    // false), which would yield a "Page NaN" document and a NaN comparator.
    if (!Number.isInteger(region.page) || region.page < 1) {
      continue;
    }
    const text = normalizeWhitespace(region.text ?? "");
    if (!text) {
      continue;
    }
    const bucket = regionsByPage.get(region.page) ?? [];
    bucket.push({ ...region, text });
    regionsByPage.set(region.page, bucket);
  }
  if (regionsByPage.size === 0) {
    return [];
  }
  // Loop-invariant values, computed once instead of once per page.
  const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`;
  const documentName = input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME;
  return [...regionsByPage.entries()]
    .sort(([leftPage], [rightPage]) => leftPage - rightPage)
    .map(([pageNumber, regions]) => ({
      chunking: input.chunking,
      contentType: input.contentType,
      format: "text",
      metadata: {
        ...input.metadata ?? {},
        ...baseMetadata,
        ocrRegionCount: regions.length,
        ocrRegions: regions,
        pageNumber,
        pageIndex: pageNumber - 1,
        sourceNativeKind: "pdf_page"
      },
      source,
      text: normalizeWhitespace(`PDF page ${pageNumber} from ${documentName}.\n${regions.map((region) => region.text).join("\n")}`),
      title: input.title ? `${input.title} \xB7 Page ${pageNumber}` : `Page ${pageNumber}`
    }));
};
8590
/**
 * Build one text document per OCR region.
 *
 * Regions with empty (after whitespace normalization) text or without a usable
 * page number are skipped. `regionIndex` / `regionNumber` refer to the region's
 * position in the original `result.regions` array, so skipped regions still
 * consume an index.
 *
 * @param {object} result - OCR result; only `result.regions` is read.
 * @param {object} input - Extractor input; reads `chunking`, `contentType`,
 *   `metadata`, `source`, `path`, `name` and `title`.
 * @param {object} baseMetadata - Metadata spread into every region document
 *   after `input.metadata`.
 * @returns {object[]} Region documents in original region order.
 */
var ocrRegionDocuments = (result, input, baseMetadata) => {
  const documents = [];
  // Resolved lazily so nothing is computed when no region qualifies.
  let source;
  let documentName;
  for (const [index, region] of (result.regions ?? []).entries()) {
    // Number.isInteger rejects NaN, Infinity, fractions and non-numbers.
    // A bare `page < 1` test lets NaN through (every comparison with NaN is false).
    if (!Number.isInteger(region.page) || region.page < 1) {
      continue;
    }
    const text = normalizeWhitespace(region.text ?? "");
    if (!text) {
      continue;
    }
    source ??= input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`;
    documentName ??= input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME;
    const pageNumber = region.page;
    const regionNumber = index + 1;
    const label = `Page ${pageNumber} Region ${regionNumber}`;
    documents.push({
      chunking: input.chunking,
      contentType: input.contentType,
      format: "text",
      metadata: {
        ...input.metadata ?? {},
        ...baseMetadata,
        ocrRegionConfidence: region.confidence,
        ocrRegionHeight: region.height,
        ocrRegionWidth: region.width,
        ocrRegionX: region.x,
        ocrRegionY: region.y,
        pageNumber,
        pageIndex: pageNumber - 1,
        regionIndex: index,
        regionNumber,
        sourceNativeKind: "pdf_region"
      },
      source,
      text: normalizeWhitespace(`PDF page ${pageNumber} region ${regionNumber} from ${documentName}.\n${text}`),
      title: input.title ? `${input.title} \xB7 ${label}` : label
    });
  }
  return documents;
};
8145
8625
  var textExtractorSupports = (input) => {
8146
8626
  if (input.format) {
8147
8627
  return true;
@@ -8227,24 +8707,52 @@ var createBuiltinArchiveExpander = () => ({
8227
8707
  var createEmailExtractor = () => ({
8228
8708
  name: "absolute_email",
8229
8709
  supports: emailExtractorSupports,
8230
- extract: (input) => {
8710
+ extract: async (input) => {
8231
8711
  const raw = decodeUtf8(input.data);
8232
8712
  const headers = parseEmailHeaders(raw);
8233
- return {
8713
+ const { body } = splitEmailMessage(raw);
8714
+ const parsed = parseEmailMimeParts(body, headers.contentType);
8715
+ const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.eml`;
8716
+ const messageMetadata = {
8717
+ ...input.metadata ?? {},
8718
+ emailKind: "message",
8719
+ fileKind: "email",
8720
+ from: headers.from,
8721
+ inReplyTo: headers.inReplyTo,
8722
+ messageId: headers.messageId,
8723
+ references: headers.references,
8724
+ threadTopic: headers.subject,
8725
+ to: headers.to,
8726
+ hasAttachments: parsed.attachments.length > 0
8727
+ };
8728
+ const attachmentDocuments = await Promise.all(parsed.attachments.map(async (attachment, index) => {
8729
+ const documents = await extractRAGFileDocuments({
8730
+ chunking: input.chunking,
8731
+ contentType: attachment.contentType,
8732
+ data: attachment.data,
8733
+ format: inferFormatFromContentType(attachment.contentType ?? null) ?? inferFormatFromName(attachment.fileName),
8734
+ metadata: {
8735
+ ...messageMetadata,
8736
+ attachmentIndex: index,
8737
+ attachmentName: attachment.fileName,
8738
+ emailKind: "attachment"
8739
+ },
8740
+ name: attachment.fileName,
8741
+ source: `${source}#attachments/${attachment.fileName}`,
8742
+ title: headers.subject ? `${headers.subject} \xB7 ${attachment.fileName}` : attachment.fileName
8743
+ });
8744
+ return documents;
8745
+ }));
8746
+ const messageDocument = {
8234
8747
  chunking: input.chunking,
8235
8748
  contentType: input.contentType,
8236
8749
  format: "text",
8237
- metadata: {
8238
- ...input.metadata ?? {},
8239
- fileKind: "email",
8240
- from: headers.from,
8241
- threadTopic: headers.subject,
8242
- to: headers.to
8243
- },
8244
- source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.eml`,
8750
+ metadata: messageMetadata,
8751
+ source,
8245
8752
  text: extractEmailText(raw),
8246
8753
  title: input.title ?? headers.subject
8247
8754
  };
8755
+ return [messageDocument, ...attachmentDocuments.flat()];
8248
8756
  }
8249
8757
  });
8250
8758
  var createEPUBExtractor = () => ({
@@ -8388,7 +8896,7 @@ var createRAGImageOCRExtractor = (provider) => ({
8388
8896
  format: "text",
8389
8897
  metadata: {
8390
8898
  ...input.metadata ?? {},
8391
- ...result.metadata ?? {},
8899
+ ...ocrMetadata(result),
8392
8900
  fileKind: "image"
8393
8901
  },
8394
8902
  source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.image.txt`,
@@ -8476,6 +8984,9 @@ var expandArchiveEntry = async (entry, archiveInput, extractors) => {
8476
8984
  metadata: {
8477
8985
  ...archiveInput.metadata ?? {},
8478
8986
  ...entry.metadata ?? {},
8987
+ archiveEntryName: basename(entry.path),
8988
+ archiveParentName: archiveInput.name ?? archiveInput.path?.split(/[/\\]/).pop() ?? archiveInput.source,
8989
+ archiveParentSource: archiveInput.source ?? archiveInput.path ?? archiveInput.name,
8479
8990
  archivePath: entry.path,
8480
8991
  fileKind: "archive_entry"
8481
8992
  },
@@ -8551,21 +9062,27 @@ var createRAGPDFOCRExtractor = (options) => ({
8551
9062
  ...input,
8552
9063
  contentType: input.contentType ?? "application/pdf"
8553
9064
  });
8554
- return {
9065
+ const baseMetadata = {
9066
+ ...ocrMetadata(ocr),
9067
+ fileKind: "pdf",
9068
+ pageCount: estimatePDFPageCount(input.data),
9069
+ pdfTextMode: "ocr"
9070
+ };
9071
+ const summaryDocument = {
8555
9072
  chunking: input.chunking,
8556
9073
  contentType: input.contentType ?? "application/pdf",
8557
9074
  format: "text",
8558
9075
  metadata: {
8559
9076
  ...input.metadata ?? {},
8560
- ...ocr.metadata ?? {},
8561
- fileKind: "pdf",
8562
- pageCount: estimatePDFPageCount(input.data),
8563
- pdfTextMode: "ocr"
9077
+ ...baseMetadata
8564
9078
  },
8565
9079
  source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
8566
9080
  text: ocr.text,
8567
9081
  title: ocr.title ?? input.title
8568
9082
  };
9083
+ const pageDocuments = ocrPageDocuments(ocr, input, baseMetadata);
9084
+ const regionDocuments = ocrRegionDocuments(ocr, input, baseMetadata);
9085
+ return [summaryDocument, ...pageDocuments, ...regionDocuments];
8569
9086
  }
8570
9087
  });
8571
9088
  var DEFAULT_FILE_EXTRACTORS = [
@@ -9421,6 +9938,17 @@ var searchDocuments = async (collection, input) => collection.search(input);
9421
9938
  // src/ai/rag/htmxWorkflowRenderers.ts
9422
9939
  init_constants();
9423
9940
  var escapeHtml2 = (text) => text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
9941
/**
 * Render the source labels of a document, chunk or search result as an HTML
 * list.
 *
 * @param {object} [input] - Labels object; reads `contextLabel`,
 *   `locatorLabel` and `provenanceLabel`.
 * @returns {string} A `<ul class="rag-source-labels">` with one `<li>` per
 *   truthy label (values are HTML-escaped), or an empty string when `input`
 *   is falsy or carries no truthy label.
 */
var renderSourceLabels = (input) => {
  if (!input) {
    return "";
  }
  const items = [];
  if (input.contextLabel) {
    items.push(`<li><strong>Context</strong> ${escapeHtml2(input.contextLabel)}</li>`);
  }
  if (input.locatorLabel) {
    items.push(`<li><strong>Location</strong> ${escapeHtml2(input.locatorLabel)}</li>`);
  }
  if (input.provenanceLabel) {
    items.push(`<li><strong>Provenance</strong> ${escapeHtml2(input.provenanceLabel)}</li>`);
  }
  if (items.length === 0) {
    return "";
  }
  return `<ul class="rag-source-labels">${items.join("")}</ul>`;
};
9424
9952
  var renderEmptyState = (kind) => {
9425
9953
  switch (kind) {
9426
9954
  case "documents":
@@ -9460,17 +9988,41 @@ var defaultStatus = ({
9460
9988
  }
9461
9989
  return `<dl class="rag-status">` + `<div><dt>Backend</dt><dd>${escapeHtml2(status.backend)}</dd></div>` + `<div><dt>Vector mode</dt><dd>${escapeHtml2(status.vectorMode)}</dd></div>` + `<div><dt>Embedding dimensions</dt><dd>${status.dimensions ?? "n/a"}</dd></div>` + `<div><dt>Vector acceleration</dt><dd>${status.native?.active ? "active" : "inactive"}</dd></div>` + `<div><dt>Documents</dt><dd>${documents?.total ?? "n/a"}</dd></div>` + `<div><dt>Total chunks</dt><dd>${documents?.chunkCount ?? "n/a"}</dd></div>` + `<div><dt>Seed docs</dt><dd>${documents?.byKind.seed ?? 0}</dd></div>` + `<div><dt>Custom docs</dt><dd>${documents?.byKind.custom ?? 0}</dd></div>` + `</dl>${renderCapabilityList(capabilities)}`;
9462
9990
  };
9463
- var defaultSearchResultItem = (source, index) => '<article class="rag-search-result">' + `<h3>${escapeHtml2(source.title ?? source.chunkId ?? `Result ${index + 1}`)}</h3>` + `<p class="rag-search-source">${escapeHtml2(source.source ?? "unknown source")}</p>` + `<p class="rag-search-score">score ${source.score.toFixed(RAG_SEARCH_SCORE_DECIMAL_PLACES)}</p>` + `<p class="rag-search-text">${escapeHtml2(source.text)}</p>` + "</article>";
9991
/**
 * Default HTML renderer for a single search result.
 *
 * @param {object} source - Search result; reads `title`, `chunkId`, `source`,
 *   `labels`, `score` (must be a number) and `text`.
 * @param {number} index - Zero-based position, used for the fallback heading
 *   `Result <index + 1>` when neither `title` nor `chunkId` is set.
 * @returns {string} An `<article class="rag-search-result">` fragment.
 */
var defaultSearchResultItem = (source, index) => {
  const heading = source.title ?? source.chunkId ?? `Result ${index + 1}`;
  const origin = source.source ?? "unknown source";
  return [
    '<article class="rag-search-result">',
    `<h3>${escapeHtml2(heading)}</h3>`,
    `<p class="rag-search-source">${escapeHtml2(origin)}</p>`,
    renderSourceLabels(source.labels),
    `<p class="rag-search-score">score ${source.score.toFixed(RAG_SEARCH_SCORE_DECIMAL_PLACES)}</p>`,
    `<p class="rag-search-text">${escapeHtml2(source.text)}</p>`,
    "</article>"
  ].join("");
};
9464
9992
  var defaultSearchResults = ({
9465
9993
  query,
9466
9994
  results,
9467
9995
  trace
9468
9996
  }) => results.length === 0 ? renderEmptyState("searchResults") : `<section class="rag-search-results">` + `<p class="rag-search-summary">${results.length} results for ${escapeHtml2(query)}</p>` + (trace ? `<p class="rag-search-summary">mode=${escapeHtml2(trace.mode)} \xB7 final=${trace.resultCounts.final} \xB7 vector=${trace.resultCounts.vector} \xB7 lexical=${trace.resultCounts.lexical}</p>` : "") + `${results.map((result, index) => defaultSearchResultItem(result, index)).join("")}</section>`;
9469
- var defaultDocumentItem = (document, index) => '<article class="rag-document">' + `<h3>${escapeHtml2(document.title || `Document ${index + 1}`)}</h3>` + `<p class="rag-document-id">${escapeHtml2(document.id)}</p>` + `<p class="rag-document-source">${escapeHtml2(document.source)}</p>` + `<p class="rag-document-meta">${escapeHtml2(document.format ?? "text")} \xB7 ${escapeHtml2(document.chunkStrategy ?? "paragraphs")} \xB7 ${document.chunkCount ?? 0} chunks</p>` + "</article>";
9997
/**
 * Default HTML renderer for a single indexed document.
 *
 * @param {object} document - Document record; reads `title`, `id`, `source`,
 *   `labels`, `format`, `chunkStrategy` and `chunkCount`.
 * @param {number} index - Zero-based position, used for the fallback heading
 *   `Document <index + 1>` when `title` is falsy (including an empty string).
 * @returns {string} An `<article class="rag-document">` fragment.
 */
var defaultDocumentItem = (document, index) => {
  const heading = document.title || `Document ${index + 1}`;
  const format = document.format ?? "text";
  const strategy = document.chunkStrategy ?? "paragraphs";
  const chunkCount = document.chunkCount ?? 0;
  return [
    '<article class="rag-document">',
    `<h3>${escapeHtml2(heading)}</h3>`,
    `<p class="rag-document-id">${escapeHtml2(document.id)}</p>`,
    `<p class="rag-document-source">${escapeHtml2(document.source)}</p>`,
    renderSourceLabels(document.labels),
    `<p class="rag-document-meta">${escapeHtml2(format)} \xB7 ${escapeHtml2(strategy)} \xB7 ${chunkCount} chunks</p>`,
    "</article>"
  ].join("");
};
9470
9998
  var defaultDocuments = ({
9471
9999
  documents
9472
10000
  }) => documents.length === 0 ? renderEmptyState("documents") : `<section class="rag-documents">${documents.map((document, index) => defaultDocumentItem(document, index)).join("")}</section>`;
9473
- var defaultChunkPreview = (input) => `<section class="rag-chunk-preview">` + `<h3>${escapeHtml2(input.document.title)}</h3>` + `<p class="rag-chunk-preview-source">${escapeHtml2(input.document.source)}</p>` + `<article class="rag-chunk-normalized">` + `<h4>Normalized text</h4>` + `<pre>${escapeHtml2(input.normalizedText)}</pre>` + `</article>${input.chunks.map((chunk) => '<article class="rag-chunk">' + `<h4>${escapeHtml2(chunk.chunkId)}</h4>` + `<p class="rag-chunk-meta">chunk ${typeof chunk.metadata?.chunkIndex === "number" ? chunk.metadata.chunkIndex : 0} of ${typeof chunk.metadata?.chunkCount === "number" ? chunk.metadata.chunkCount : input.chunks.length}</p>` + `<pre>${escapeHtml2(chunk.text)}</pre>` + "</article>").join("")}</section>`;
10001
/**
 * Default HTML renderer for a document's chunk preview.
 *
 * Chunks are grouped by `metadata.sourceNativeKind` (falling back to
 * "document_chunk") and, for the kinds listed in `groupTitles`, by the chunk's
 * locator label. Groups appear in order of first occurrence and chunks keep
 * their input order inside each group.
 *
 * @param {object} input - Reads `document` (`title`, `source`, `labels`),
 *   `normalizedText` and `chunks` (each with `chunkId`, `text`, optional
 *   `metadata` and `labels`).
 * @returns {string} A `<section class="rag-chunk-preview">` fragment.
 */
var defaultChunkPreview = (input) => {
  // Default group heading per known source kind; the locator label wins when set.
  const groupTitles = new Map([
    ["pdf_page", "PDF pages"],
    ["pdf_region", "PDF regions"],
    ["spreadsheet_sheet", "Spreadsheet sheets"],
    ["presentation_slide", "Presentation slides"],
    ["attachment", "Attachments"],
    ["archive_entry", "Archive entries"]
  ]);
  // A Map keeps insertion order and gives O(1) lookup per chunk, replacing a
  // linear scan over the groups for every chunk.
  const groups = new Map();
  for (const chunk of input.chunks) {
    const metadata = chunk.metadata ?? {};
    const kind = typeof metadata.sourceNativeKind === "string" ? metadata.sourceNativeKind : "document_chunk";
    const locator = chunk.labels?.locatorLabel ?? "";
    const title = groupTitles.has(kind) ? locator || groupTitles.get(kind) : "Chunks";
    const key = kind === "document_chunk" ? "document_chunk" : `${kind}:${title}`;
    const group = groups.get(key);
    if (group) {
      group.chunks.push(chunk);
    } else {
      groups.set(key, { chunks: [chunk], title });
    }
  }
  const renderChunk = (chunk) => {
    const chunkIndex = typeof chunk.metadata?.chunkIndex === "number" ? chunk.metadata.chunkIndex : 0;
    const chunkCount = typeof chunk.metadata?.chunkCount === "number" ? chunk.metadata.chunkCount : input.chunks.length;
    return '<article class="rag-chunk">' + `<h5>${escapeHtml2(chunk.chunkId)}</h5>` + `<p class="rag-chunk-meta">chunk ${chunkIndex} of ${chunkCount}</p>` + renderSourceLabels(chunk.labels) + `<pre>${escapeHtml2(chunk.text)}</pre>` + "</article>";
  };
  const groupHtml = [...groups.values()]
    .map((group) => `<section class="rag-chunk-group"><h4>${escapeHtml2(group.title)}</h4>${group.chunks.map(renderChunk).join("")}</section>`)
    .join("");
  return `<section class="rag-chunk-preview">` + `<h3>${escapeHtml2(input.document.title)}</h3>` + `<p class="rag-chunk-preview-source">${escapeHtml2(input.document.source)}</p>` + renderSourceLabels(input.document.labels) + `<article class="rag-chunk-normalized">` + `<h4>Normalized text</h4>` + `<pre>${escapeHtml2(input.normalizedText)}</pre>` + `</article>${groupHtml}</section>`;
};
9474
10026
  var defaultMutationResult = (input) => {
9475
10027
  if (!input.ok) {
9476
10028
  return `<div class="rag-mutation error">${escapeHtml2(input.error ?? "Request failed")}</div>`;
@@ -9533,6 +10085,10 @@ var buildRAGContextLocatorLabel = (metadata, source, title) => {
9533
10085
  return;
9534
10086
  }
9535
10087
  const page = getContextNumber3(metadata.page) ?? getContextNumber3(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
10088
+ const region = getContextNumber3(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
10089
+ if (page && region) {
10090
+ return `Page ${page} \xB7 Region ${region}`;
10091
+ }
9536
10092
  if (page) {
9537
10093
  return `Page ${page}`;
9538
10094
  }
@@ -9574,9 +10130,11 @@ var buildRAGContextProvenanceLabel = (metadata) => {
9574
10130
  const threadTopic = getContextString3(metadata.threadTopic);
9575
10131
  const from = getContextString3(metadata.from);
9576
10132
  const speaker = getContextString3(metadata.speaker);
10133
+ const ocrConfidence = getContextNumber3(metadata.ocrRegionConfidence) ?? getContextNumber3(metadata.ocrConfidence);
9577
10134
  const labels = [
9578
10135
  pdfTextMode ? `PDF ${pdfTextMode}` : "",
9579
10136
  ocrEngine ? `OCR ${ocrEngine}` : "",
10137
+ typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
9580
10138
  mediaKind ? `Media ${mediaKind}` : "",
9581
10139
  transcriptSource ? `Transcript ${transcriptSource}` : "",
9582
10140
  threadTopic ? `Thread ${threadTopic}` : "",
@@ -9886,6 +10444,11 @@ var isRAGDocumentUrlArray = (value) => Array.isArray(value) && value.every((entr
9886
10444
  var isRAGDocumentChunkArray = (value) => Array.isArray(value) && value.every((entry) => isRAGDocumentChunk(entry));
9887
10445
  var buildSources2 = (results) => results.map((result) => ({
9888
10446
  chunkId: result.chunkId,
10447
+ labels: buildRAGSourceLabels({
10448
+ metadata: result.metadata,
10449
+ source: result.source,
10450
+ title: result.title
10451
+ }),
9889
10452
  metadata: result.metadata,
9890
10453
  score: normalizeScore(result.score),
9891
10454
  source: result.source,
@@ -13616,6 +14179,11 @@ var ragChat = (config) => {
13616
14179
  let documentsWithoutChunkPreview = 0;
13617
14180
  let inspectedDocuments = 0;
13618
14181
  let inspectedChunks = 0;
14182
+ let documentsWithSourceLabels = 0;
14183
+ let chunksWithSourceLabels = 0;
14184
+ const sourceNativeKinds = new Map;
14185
+ const sampleDocuments = [];
14186
+ const sampleChunks = [];
13619
14187
  let oldestDocumentAgeMs;
13620
14188
  let newestDocumentAgeMs;
13621
14189
  const staleDocuments = [];
@@ -13656,6 +14224,27 @@ var ragChat = (config) => {
13656
14224
  if ((document.chunkCount ?? 0) === 0) {
13657
14225
  emptyDocuments += 1;
13658
14226
  }
14227
+ const documentLabels = buildRAGSourceLabels({
14228
+ metadata: document.metadata,
14229
+ source: document.source,
14230
+ title: document.title
14231
+ });
14232
+ if (documentLabels) {
14233
+ documentsWithSourceLabels += 1;
14234
+ }
14235
+ const documentSourceNativeKind = typeof document.metadata?.sourceNativeKind === "string" ? document.metadata.sourceNativeKind : undefined;
14236
+ if (documentSourceNativeKind) {
14237
+ sourceNativeKinds.set(documentSourceNativeKind, (sourceNativeKinds.get(documentSourceNativeKind) ?? 0) + 1);
14238
+ }
14239
+ if (sampleDocuments.length < 5 && (documentLabels || documentSourceNativeKind)) {
14240
+ sampleDocuments.push({
14241
+ id: document.id,
14242
+ labels: documentLabels,
14243
+ source: document.source,
14244
+ sourceNativeKind: documentSourceNativeKind,
14245
+ title: document.title
14246
+ });
14247
+ }
13659
14248
  if (indexManager?.getDocumentChunks) {
13660
14249
  const preview = await indexManager.getDocumentChunks(document.id);
13661
14250
  if (!preview) {
@@ -13665,6 +14254,27 @@ var ragChat = (config) => {
13665
14254
  inspectedDocuments += 1;
13666
14255
  for (const chunk of preview.chunks) {
13667
14256
  inspectedChunks += 1;
14257
+ const chunkLabels = buildRAGSourceLabels({
14258
+ metadata: chunk.metadata,
14259
+ source: chunk.source ?? preview.document.source,
14260
+ title: chunk.title ?? preview.document.title
14261
+ });
14262
+ if (chunkLabels) {
14263
+ chunksWithSourceLabels += 1;
14264
+ }
14265
+ const chunkSourceNativeKind = typeof chunk.metadata?.sourceNativeKind === "string" ? chunk.metadata.sourceNativeKind : undefined;
14266
+ if (chunkSourceNativeKind) {
14267
+ sourceNativeKinds.set(chunkSourceNativeKind, (sourceNativeKinds.get(chunkSourceNativeKind) ?? 0) + 1);
14268
+ }
14269
+ if (sampleChunks.length < 8 && (chunkLabels || chunkSourceNativeKind)) {
14270
+ sampleChunks.push({
14271
+ chunkId: chunk.chunkId,
14272
+ documentId: document.id,
14273
+ labels: chunkLabels,
14274
+ source: chunk.source ?? preview.document.source,
14275
+ sourceNativeKind: chunkSourceNativeKind
14276
+ });
14277
+ }
13668
14278
  const normalized = chunk.text.trim();
13669
14279
  if (!normalized) {
13670
14280
  emptyChunks += 1;
@@ -13721,6 +14331,13 @@ var ragChat = (config) => {
13721
14331
  failuresByInputKind: Object.fromEntries(failuresByInputKind.entries()),
13722
14332
  inspectedChunks,
13723
14333
  inspectedDocuments,
14334
+ inspection: {
14335
+ chunksWithSourceLabels,
14336
+ documentsWithSourceLabels,
14337
+ sampleChunks,
14338
+ sampleDocuments,
14339
+ sourceNativeKinds: Object.fromEntries(sourceNativeKinds.entries())
14340
+ },
13724
14341
  lowSignalChunks,
13725
14342
  newestDocumentAgeMs,
13726
14343
  oldestDocumentAgeMs,
@@ -14901,7 +15518,14 @@ var ragChat = (config) => {
14901
15518
  }
14902
15519
  const documents = await indexManager.listDocuments({ kind });
14903
15520
  return {
14904
- documents,
15521
+ documents: documents.map((document) => ({
15522
+ ...document,
15523
+ labels: buildRAGSourceLabels({
15524
+ metadata: document.metadata,
15525
+ source: document.source,
15526
+ title: document.title
15527
+ })
15528
+ })),
14905
15529
  ok: true
14906
15530
  };
14907
15531
  };
@@ -14961,7 +15585,23 @@ var ragChat = (config) => {
14961
15585
  }
14962
15586
  return {
14963
15587
  ok: true,
14964
- ...preview
15588
+ ...preview,
15589
+ document: {
15590
+ ...preview.document,
15591
+ labels: buildRAGSourceLabels({
15592
+ metadata: preview.document.metadata,
15593
+ source: preview.document.source,
15594
+ title: preview.document.title
15595
+ })
15596
+ },
15597
+ chunks: preview.chunks.map((chunk) => ({
15598
+ ...chunk,
15599
+ labels: buildRAGSourceLabels({
15600
+ metadata: chunk.metadata,
15601
+ source: chunk.source ?? preview.document.source,
15602
+ title: chunk.title ?? preview.document.title
15603
+ })
15604
+ }))
14965
15605
  };
14966
15606
  };
14967
15607
  const handleDeleteDocument = async (id) => {
@@ -20557,5 +21197,5 @@ export {
20557
21197
  aiChat
20558
21198
  };
20559
21199
 
20560
- //# debugId=E76E681490B6CFE564756E2164756E21
21200
+ //# debugId=5C4A7D98C1C2BE6B64756E2164756E21
20561
21201
  //# sourceMappingURL=index.js.map