@absolutejs/absolute 0.19.0-beta.603 → 0.19.0-beta.604
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/client/index.js +59 -5
- package/dist/ai/client/index.js.map +4 -4
- package/dist/ai/client/ui.js +59 -5
- package/dist/ai/client/ui.js.map +4 -4
- package/dist/ai/index.js +720 -80
- package/dist/ai/index.js.map +8 -8
- package/dist/ai/rag/quality.js +11 -1
- package/dist/ai/rag/quality.js.map +3 -3
- package/dist/ai/rag/ui.js +59 -5
- package/dist/ai/rag/ui.js.map +4 -4
- package/dist/ai-client/angular/ai/index.js +58 -4
- package/dist/ai-client/react/ai/index.js +58 -4
- package/dist/ai-client/vue/ai/index.js +58 -4
- package/dist/angular/ai/index.js +59 -5
- package/dist/angular/ai/index.js.map +4 -4
- package/dist/angular/index.js +2 -2
- package/dist/angular/index.js.map +1 -1
- package/dist/angular/server.js +2 -2
- package/dist/angular/server.js.map +1 -1
- package/dist/build.js +2 -2
- package/dist/build.js.map +1 -1
- package/dist/index.js +2 -2
- package/dist/index.js.map +1 -1
- package/dist/react/ai/index.js +59 -5
- package/dist/react/ai/index.js.map +4 -4
- package/dist/src/ai/rag/presentation.d.ts +6 -1
- package/dist/src/vue/ai/useRAG.d.ts +60 -0
- package/dist/src/vue/ai/useRAGChunkPreview.d.ts +20 -0
- package/dist/src/vue/ai/useRAGDocuments.d.ts +20 -0
- package/dist/src/vue/ai/useRAGIndexAdmin.d.ts +10 -0
- package/dist/src/vue/ai/useRAGSearch.d.ts +10 -0
- package/dist/svelte/ai/index.js +59 -5
- package/dist/svelte/ai/index.js.map +4 -4
- package/dist/types/ai.d.ts +42 -1
- package/dist/vue/ai/index.js +59 -5
- package/dist/vue/ai/index.js.map +4 -4
- package/package.json +1 -1
package/dist/ai/index.js
CHANGED
|
@@ -216,6 +216,10 @@ var buildContextLabel = (metadata) => {
|
|
|
216
216
|
return from ? `Message from ${from}` : "Message evidence";
|
|
217
217
|
}
|
|
218
218
|
const page = getContextNumber(metadata.page) ?? getContextNumber(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
|
|
219
|
+
const region = getContextNumber(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
|
|
220
|
+
if (page && region) {
|
|
221
|
+
return `Page ${page} region ${region}`;
|
|
222
|
+
}
|
|
219
223
|
if (page) {
|
|
220
224
|
return `Page ${page}`;
|
|
221
225
|
}
|
|
@@ -256,6 +260,10 @@ var buildLocatorLabel = (metadata, source, title) => {
|
|
|
256
260
|
return;
|
|
257
261
|
}
|
|
258
262
|
const page = getContextNumber(metadata.page) ?? getContextNumber(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
|
|
263
|
+
const region = getContextNumber(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
|
|
264
|
+
if (page && region) {
|
|
265
|
+
return `Page ${page} \xB7 Region ${region}`;
|
|
266
|
+
}
|
|
259
267
|
if (page) {
|
|
260
268
|
return `Page ${page}`;
|
|
261
269
|
}
|
|
@@ -308,9 +316,11 @@ var buildProvenanceLabel = (metadata) => {
|
|
|
308
316
|
const transcriptSource = getContextString(metadata.transcriptSource);
|
|
309
317
|
const pdfTextMode = getContextString(metadata.pdfTextMode);
|
|
310
318
|
const ocrEngine = getContextString(metadata.ocrEngine);
|
|
319
|
+
const ocrConfidence = getContextNumber(metadata.ocrRegionConfidence) ?? getContextNumber(metadata.ocrConfidence);
|
|
311
320
|
const labels = [
|
|
312
321
|
pdfTextMode ? `PDF ${pdfTextMode}` : "",
|
|
313
322
|
ocrEngine ? `OCR ${ocrEngine}` : "",
|
|
323
|
+
typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
|
|
314
324
|
mediaKind ? `Media ${mediaKind}` : "",
|
|
315
325
|
transcriptSource ? `Transcript ${transcriptSource}` : "",
|
|
316
326
|
threadTopic ? `Thread ${threadTopic}` : "",
|
|
@@ -3977,6 +3987,10 @@ var buildContextLabel2 = (metadata) => {
|
|
|
3977
3987
|
return from ? `Message from ${from}` : "Message evidence";
|
|
3978
3988
|
}
|
|
3979
3989
|
const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
|
|
3990
|
+
const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
|
|
3991
|
+
if (page && region) {
|
|
3992
|
+
return `Page ${page} region ${region}`;
|
|
3993
|
+
}
|
|
3980
3994
|
if (page) {
|
|
3981
3995
|
return `Page ${page}`;
|
|
3982
3996
|
}
|
|
@@ -4007,6 +4021,10 @@ var buildLocatorLabel2 = (metadata, source, title) => {
|
|
|
4007
4021
|
return;
|
|
4008
4022
|
}
|
|
4009
4023
|
const page = getContextNumber2(metadata.page) ?? getContextNumber2(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
|
|
4024
|
+
const region = getContextNumber2(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
|
|
4025
|
+
if (page && region) {
|
|
4026
|
+
return `Page ${page} \xB7 Region ${region}`;
|
|
4027
|
+
}
|
|
4010
4028
|
if (page) {
|
|
4011
4029
|
return `Page ${page}`;
|
|
4012
4030
|
}
|
|
@@ -4049,9 +4067,11 @@ var buildProvenanceLabel2 = (metadata) => {
|
|
|
4049
4067
|
const transcriptSource = getContextString2(metadata.transcriptSource);
|
|
4050
4068
|
const pdfTextMode = getContextString2(metadata.pdfTextMode);
|
|
4051
4069
|
const ocrEngine = getContextString2(metadata.ocrEngine);
|
|
4070
|
+
const ocrConfidence = getContextNumber2(metadata.ocrRegionConfidence) ?? getContextNumber2(metadata.ocrConfidence);
|
|
4052
4071
|
const labels = [
|
|
4053
4072
|
pdfTextMode ? `PDF ${pdfTextMode}` : "",
|
|
4054
4073
|
ocrEngine ? `OCR ${ocrEngine}` : "",
|
|
4074
|
+
typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
|
|
4055
4075
|
mediaKind ? `Media ${mediaKind}` : "",
|
|
4056
4076
|
transcriptSource ? `Transcript ${transcriptSource}` : "",
|
|
4057
4077
|
threadTopic ? `Thread ${threadTopic}` : "",
|
|
@@ -4061,6 +4081,23 @@ var buildProvenanceLabel2 = (metadata) => {
|
|
|
4061
4081
|
].filter((value) => value.length > 0);
|
|
4062
4082
|
return labels.length > 0 ? labels.join(" \xB7 ") : undefined;
|
|
4063
4083
|
};
|
|
4084
|
+
var buildRAGSourceLabels = ({
|
|
4085
|
+
metadata,
|
|
4086
|
+
source,
|
|
4087
|
+
title
|
|
4088
|
+
}) => {
|
|
4089
|
+
const contextLabel = buildContextLabel2(metadata);
|
|
4090
|
+
const locatorLabel = buildLocatorLabel2(metadata, source, title);
|
|
4091
|
+
const provenanceLabel = buildProvenanceLabel2(metadata);
|
|
4092
|
+
if (!contextLabel && !locatorLabel && !provenanceLabel) {
|
|
4093
|
+
return;
|
|
4094
|
+
}
|
|
4095
|
+
return {
|
|
4096
|
+
contextLabel,
|
|
4097
|
+
locatorLabel,
|
|
4098
|
+
provenanceLabel
|
|
4099
|
+
};
|
|
4100
|
+
};
|
|
4064
4101
|
var buildExcerpt2 = (text, maxLength = 160) => {
|
|
4065
4102
|
const normalized = text.replaceAll(/\s+/g, " ").trim();
|
|
4066
4103
|
if (normalized.length <= maxLength) {
|
|
@@ -4102,13 +4139,13 @@ var buildRAGSourceSummaries = (sources) => {
|
|
|
4102
4139
|
citationNumbers: groupCitations.map((citation) => citationReferenceMap[citation.chunkId] ?? 0),
|
|
4103
4140
|
citations: groupCitations,
|
|
4104
4141
|
chunkIds: group.chunks.map((chunk) => chunk.chunkId),
|
|
4105
|
-
contextLabel: buildContextLabel2(leadChunk?.metadata),
|
|
4142
|
+
contextLabel: leadChunk?.labels?.contextLabel ?? buildContextLabel2(leadChunk?.metadata),
|
|
4106
4143
|
count: group.count,
|
|
4107
4144
|
excerpt: buildExcerpt2(leadChunk?.text ?? ""),
|
|
4108
4145
|
key: group.key,
|
|
4109
4146
|
label: group.label,
|
|
4110
|
-
locatorLabel: buildLocatorLabel2(leadChunk?.metadata, leadChunk?.source, leadChunk?.title),
|
|
4111
|
-
provenanceLabel: buildProvenanceLabel2(leadChunk?.metadata),
|
|
4147
|
+
locatorLabel: leadChunk?.labels?.locatorLabel ?? buildLocatorLabel2(leadChunk?.metadata, leadChunk?.source, leadChunk?.title),
|
|
4148
|
+
provenanceLabel: leadChunk?.labels?.provenanceLabel ?? buildProvenanceLabel2(leadChunk?.metadata),
|
|
4112
4149
|
source: group.source,
|
|
4113
4150
|
title: group.title
|
|
4114
4151
|
};
|
|
@@ -4232,6 +4269,11 @@ var buildSourceGroup = (source, key) => ({
|
|
|
4232
4269
|
count: 1,
|
|
4233
4270
|
key,
|
|
4234
4271
|
label: buildSourceLabel2(source),
|
|
4272
|
+
labels: source.labels ?? buildRAGSourceLabels({
|
|
4273
|
+
metadata: source.metadata,
|
|
4274
|
+
source: source.source,
|
|
4275
|
+
title: source.title
|
|
4276
|
+
}),
|
|
4235
4277
|
source: source.source,
|
|
4236
4278
|
title: source.title
|
|
4237
4279
|
});
|
|
@@ -4242,7 +4284,19 @@ var updateSourceGroup = (groups, source) => {
|
|
|
4242
4284
|
groups.set(key, buildSourceGroup(source, key));
|
|
4243
4285
|
return;
|
|
4244
4286
|
}
|
|
4245
|
-
|
|
4287
|
+
if (source.score > existing.bestScore) {
|
|
4288
|
+
existing.bestScore = source.score;
|
|
4289
|
+
existing.label = buildSourceLabel2(source);
|
|
4290
|
+
existing.labels = source.labels ?? buildRAGSourceLabels({
|
|
4291
|
+
metadata: source.metadata,
|
|
4292
|
+
source: source.source,
|
|
4293
|
+
title: source.title
|
|
4294
|
+
});
|
|
4295
|
+
existing.source = source.source;
|
|
4296
|
+
existing.title = source.title;
|
|
4297
|
+
} else {
|
|
4298
|
+
existing.bestScore = Math.max(existing.bestScore, source.score);
|
|
4299
|
+
}
|
|
4246
4300
|
existing.count += 1;
|
|
4247
4301
|
existing.chunks.push(source);
|
|
4248
4302
|
};
|
|
@@ -7787,11 +7841,71 @@ var decodeHtmlEntities = (value) => {
|
|
|
7787
7841
|
output = output.replace(/&#(\d+);/g, (_, code) => String.fromCodePoint(Number(code)));
|
|
7788
7842
|
return output.replace(/&#x([0-9a-f]+);/gi, (_, code) => String.fromCodePoint(parseInt(code, 16)));
|
|
7789
7843
|
};
|
|
7790
|
-
var
|
|
7791
|
-
const
|
|
7844
|
+
var formatHtmlLinkContext = (href) => {
|
|
7845
|
+
const decoded = decodeHtmlEntities(href.trim());
|
|
7846
|
+
if (!decoded) {
|
|
7847
|
+
return;
|
|
7848
|
+
}
|
|
7849
|
+
if (decoded.startsWith("#")) {
|
|
7850
|
+
return decoded;
|
|
7851
|
+
}
|
|
7852
|
+
if (/^[a-z]+:/i.test(decoded)) {
|
|
7853
|
+
try {
|
|
7854
|
+
const url = new URL(decoded);
|
|
7855
|
+
const path = url.pathname === "/" ? "" : url.pathname;
|
|
7856
|
+
return `${url.hostname}${path}`;
|
|
7857
|
+
} catch {
|
|
7858
|
+
return decoded;
|
|
7859
|
+
}
|
|
7860
|
+
}
|
|
7861
|
+
return decoded;
|
|
7862
|
+
};
|
|
7863
|
+
var stripHtmlTags = (value) => {
|
|
7864
|
+
const withoutTags = value.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, " ").replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, " ").replace(/<a\b[^>]*href=(['"])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi, (_match, _quote, href, inner) => {
|
|
7865
|
+
const label = normalizeWhitespace(stripHtmlTags(inner));
|
|
7866
|
+
const context = formatHtmlLinkContext(href);
|
|
7867
|
+
if (!label) {
|
|
7868
|
+
return context ?? " ";
|
|
7869
|
+
}
|
|
7870
|
+
if (!context || context === label) {
|
|
7871
|
+
return label;
|
|
7872
|
+
}
|
|
7873
|
+
return `${label} (${context})`;
|
|
7874
|
+
}).replace(/<br\s*\/?>/gi, `
|
|
7792
7875
|
`).replace(/<\/(p|div|section|article|li|ul|ol|h[1-6]|table|tr)>/gi, `
|
|
7793
7876
|
`).replace(/<li\b[^>]*>/gi, "- ").replace(/<[^>]+>/g, " ");
|
|
7794
|
-
return
|
|
7877
|
+
return decodeHtmlEntities(withoutTags);
|
|
7878
|
+
};
|
|
7879
|
+
var extractMainHtmlContent = (value) => {
|
|
7880
|
+
const trimmed = value.trim();
|
|
7881
|
+
if (!/<html\b|<body\b|<main\b|<article\b/i.test(trimmed)) {
|
|
7882
|
+
return value;
|
|
7883
|
+
}
|
|
7884
|
+
const boilerplateStripped = trimmed.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, " ").replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, " ").replace(/<(nav|footer|header|aside|form)\b[^>]*>[\s\S]*?<\/\1>/gi, " ");
|
|
7885
|
+
const mainMatch = boilerplateStripped.match(/<main\b[^>]*>([\s\S]*?)<\/main>/i);
|
|
7886
|
+
if (mainMatch?.[1]) {
|
|
7887
|
+
return mainMatch[1];
|
|
7888
|
+
}
|
|
7889
|
+
const articleMatches = [
|
|
7890
|
+
...boilerplateStripped.matchAll(/<article\b[^>]*>([\s\S]*?)<\/article>/gi)
|
|
7891
|
+
].map((match) => match[1]?.trim()).filter(Boolean);
|
|
7892
|
+
if (articleMatches.length > 0) {
|
|
7893
|
+
return articleMatches.join(`
|
|
7894
|
+
`);
|
|
7895
|
+
}
|
|
7896
|
+
const roleMainMatch = boilerplateStripped.match(/<([a-z0-9:_-]+)\b[^>]*\brole=(['"])main\2[^>]*>([\s\S]*?)<\/\1>/i);
|
|
7897
|
+
if (roleMainMatch?.[3]) {
|
|
7898
|
+
return roleMainMatch[3];
|
|
7899
|
+
}
|
|
7900
|
+
const bodyMatch = boilerplateStripped.match(/<body\b[^>]*>([\s\S]*?)<\/body>/i);
|
|
7901
|
+
if (bodyMatch?.[1]) {
|
|
7902
|
+
return bodyMatch[1];
|
|
7903
|
+
}
|
|
7904
|
+
return boilerplateStripped;
|
|
7905
|
+
};
|
|
7906
|
+
var stripHtml = (value) => {
|
|
7907
|
+
const focused = extractMainHtmlContent(value);
|
|
7908
|
+
return normalizeWhitespace(stripHtmlTags(focused));
|
|
7795
7909
|
};
|
|
7796
7910
|
var stripMarkdown = (value) => {
|
|
7797
7911
|
const withoutCodeBlocks = value.replace(/```[\s\S]*?```/g, (block) => {
|
|
@@ -7828,14 +7942,49 @@ var markdownStructureUnits = (value) => {
|
|
|
7828
7942
|
flushCurrentSection();
|
|
7829
7943
|
return sections.map((section) => stripMarkdown(section)).map((section) => normalizeWhitespace(section)).filter(Boolean);
|
|
7830
7944
|
};
|
|
7831
|
-
var
|
|
7832
|
-
const
|
|
7833
|
-
|
|
7834
|
-
__ABS_SECTION_BREAK__ `).replace(/<\/(section|article|main|aside|nav|h[1-6])>/gi, `
|
|
7835
|
-
|
|
7945
|
+
var joinHtmlHeadingSection = (headings, content) => {
|
|
7946
|
+
const normalizedHeadings = headings.map((heading) => normalizeWhitespace(heading));
|
|
7947
|
+
const combined = [...normalizedHeadings, content].filter(Boolean).join(`
|
|
7836
7948
|
`);
|
|
7837
|
-
|
|
7838
|
-
|
|
7949
|
+
return normalizeWhitespace(combined);
|
|
7950
|
+
};
|
|
7951
|
+
var htmlStructureUnits = (value) => {
|
|
7952
|
+
const focused = extractMainHtmlContent(value);
|
|
7953
|
+
const headingPattern = /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi;
|
|
7954
|
+
const sections = [];
|
|
7955
|
+
const headingStack = [];
|
|
7956
|
+
let cursor = 0;
|
|
7957
|
+
let currentContentStart = 0;
|
|
7958
|
+
let activeHeadings = [];
|
|
7959
|
+
const flushSection = (end) => {
|
|
7960
|
+
const content = normalizeWhitespace(stripHtmlTags(focused.slice(currentContentStart, end)));
|
|
7961
|
+
if (!content) {
|
|
7962
|
+
return;
|
|
7963
|
+
}
|
|
7964
|
+
const section = joinHtmlHeadingSection(activeHeadings, content);
|
|
7965
|
+
if (section) {
|
|
7966
|
+
sections.push(section);
|
|
7967
|
+
}
|
|
7968
|
+
};
|
|
7969
|
+
for (const match of focused.matchAll(headingPattern)) {
|
|
7970
|
+
const fullMatch = match[0];
|
|
7971
|
+
const start = match.index ?? cursor;
|
|
7972
|
+
flushSection(start);
|
|
7973
|
+
const level = Number.parseInt(match[1] ?? "1", 10);
|
|
7974
|
+
const headingText = normalizeWhitespace(stripHtmlTags(match[2] ?? ""));
|
|
7975
|
+
if (headingText) {
|
|
7976
|
+
headingStack[level - 1] = headingText;
|
|
7977
|
+
headingStack.length = level;
|
|
7978
|
+
activeHeadings = [...headingStack];
|
|
7979
|
+
}
|
|
7980
|
+
cursor = start + fullMatch.length;
|
|
7981
|
+
currentContentStart = cursor;
|
|
7982
|
+
}
|
|
7983
|
+
flushSection(focused.length);
|
|
7984
|
+
if (sections.length > 0) {
|
|
7985
|
+
return sections;
|
|
7986
|
+
}
|
|
7987
|
+
return [normalizeWhitespace(stripHtmlTags(focused))].filter(Boolean);
|
|
7839
7988
|
};
|
|
7840
7989
|
var inferFormat = (document) => {
|
|
7841
7990
|
if (document.format) {
|
|
@@ -7927,10 +8076,77 @@ var isLikelyTextData = (data) => {
|
|
|
7927
8076
|
};
|
|
7928
8077
|
var decodePdfLiteral = (value) => value.replace(/\\([\\()])/g, "$1").replace(/\\n/g, `
|
|
7929
8078
|
`).replace(/\\r/g, "\r").replace(/\\t/g, "\t").replace(/\\b/g, "\b").replace(/\\f/g, "\f").replace(/\\([0-7]{1,3})/g, (_match, octal) => String.fromCharCode(parseInt(octal, 8)));
|
|
8079
|
+
var PDF_TABLE_GAP_THRESHOLD = 120;
|
|
8080
|
+
var extractPdfArrayText = (value) => {
|
|
8081
|
+
const parts = [];
|
|
8082
|
+
const tokenPattern = /\(((?:\\.|[^\\)])*)\)|([-+]?\d*\.?\d+)/g;
|
|
8083
|
+
let pendingColumnGap = false;
|
|
8084
|
+
for (const match of value.matchAll(tokenPattern)) {
|
|
8085
|
+
if (match[1] !== undefined) {
|
|
8086
|
+
const decoded = decodePdfLiteral(match[1]);
|
|
8087
|
+
if (pendingColumnGap && decoded && !/^\s/.test(decoded) && parts.at(-1) !== " | ") {
|
|
8088
|
+
parts.push(" | ");
|
|
8089
|
+
}
|
|
8090
|
+
parts.push(decoded);
|
|
8091
|
+
pendingColumnGap = false;
|
|
8092
|
+
continue;
|
|
8093
|
+
}
|
|
8094
|
+
const gap = Number(match[2]);
|
|
8095
|
+
if (Number.isFinite(gap) && gap >= PDF_TABLE_GAP_THRESHOLD) {
|
|
8096
|
+
pendingColumnGap = true;
|
|
8097
|
+
}
|
|
8098
|
+
}
|
|
8099
|
+
return normalizeWhitespace(parts.join("")).replace(/\s+\|\s+/g, " | ").trim();
|
|
8100
|
+
};
|
|
8101
|
+
var appendPdfText = (parts, value) => {
|
|
8102
|
+
if (!value) {
|
|
8103
|
+
return;
|
|
8104
|
+
}
|
|
8105
|
+
parts.push(value);
|
|
8106
|
+
};
|
|
8107
|
+
var appendPdfLineBreak = (parts) => {
|
|
8108
|
+
const last = parts.at(-1);
|
|
8109
|
+
if (!last || last.endsWith(`
|
|
8110
|
+
`)) {
|
|
8111
|
+
return;
|
|
8112
|
+
}
|
|
8113
|
+
parts.push(`
|
|
8114
|
+
`);
|
|
8115
|
+
};
|
|
8116
|
+
var PDF_TEXT_OPERATOR_PATTERN = /(\[((?:\\.|[^\]])*)\]\s*TJ)|(\(((?:\\.|[^\\)])*)\)\s*Tj)|([-+]?\d*\.?\d+\s+[-+]?\d*\.?\d+\s+\(((?:\\.|[^\\)])*)\)\s*")|(\(((?:\\.|[^\\)])*)\)\s*')|((?:[-+]?\d*\.?\d+\s+){2}(?:Td|TD))|(T\*)|((?:[-+]?\d*\.?\d+\s+){6}Tm)/g;
|
|
8117
|
+
var extractTextFromPDFTextObject = (value) => {
|
|
8118
|
+
const parts = [];
|
|
8119
|
+
for (const match of value.matchAll(PDF_TEXT_OPERATOR_PATTERN)) {
|
|
8120
|
+
if (match[2] !== undefined) {
|
|
8121
|
+
appendPdfText(parts, extractPdfArrayText(match[2]));
|
|
8122
|
+
continue;
|
|
8123
|
+
}
|
|
8124
|
+
if (match[4] !== undefined) {
|
|
8125
|
+
appendPdfText(parts, decodePdfLiteral(match[4]));
|
|
8126
|
+
continue;
|
|
8127
|
+
}
|
|
8128
|
+
if (match[6] !== undefined) {
|
|
8129
|
+
appendPdfLineBreak(parts);
|
|
8130
|
+
appendPdfText(parts, decodePdfLiteral(match[6]));
|
|
8131
|
+
continue;
|
|
8132
|
+
}
|
|
8133
|
+
if (match[8] !== undefined) {
|
|
8134
|
+
appendPdfLineBreak(parts);
|
|
8135
|
+
appendPdfText(parts, decodePdfLiteral(match[8]));
|
|
8136
|
+
continue;
|
|
8137
|
+
}
|
|
8138
|
+
if (match[9] !== undefined || match[10] !== undefined || match[11] !== undefined) {
|
|
8139
|
+
appendPdfLineBreak(parts);
|
|
8140
|
+
}
|
|
8141
|
+
}
|
|
8142
|
+
return parts.join("");
|
|
8143
|
+
};
|
|
7930
8144
|
var extractTextFromPDFBytes = (data) => {
|
|
7931
8145
|
const raw = Buffer.from(data).toString("latin1");
|
|
7932
|
-
const
|
|
7933
|
-
const combined =
|
|
8146
|
+
const textObjects = [...raw.matchAll(/BT([\s\S]*?)ET/g)].map((match) => extractTextFromPDFTextObject(match[1] ?? "")).filter(Boolean);
|
|
8147
|
+
const combined = textObjects.length > 0 ? textObjects.join(`
|
|
8148
|
+
|
|
8149
|
+
`) : [...raw.matchAll(/\(((?:\\.|[^\\)])*)\)\s*Tj/g)].map((match) => decodePdfLiteral(match[1] ?? "")).join(`
|
|
7934
8150
|
`);
|
|
7935
8151
|
return normalizeWhitespace(combined);
|
|
7936
8152
|
};
|
|
@@ -8022,7 +8238,40 @@ var decodeGzipEntries = (data, input) => {
|
|
|
8022
8238
|
];
|
|
8023
8239
|
};
|
|
8024
8240
|
var extractXmlText = (value) => normalizeWhitespace(decodeHtmlEntities(value.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ")));
|
|
8241
|
+
var extractOfficeParagraphText = (value) => normalizeWhitespace(decodeHtmlEntities(value.replace(/<w:tab\b[^>]*\/>/gi, "\t").replace(/<w:br\b[^>]*\/>/gi, `
|
|
8242
|
+
`).replace(/<[^>]+>/g, " ")));
|
|
8243
|
+
var officeDocumentParagraphs = (entries) => {
|
|
8244
|
+
const documentEntry = entries.find((entry) => entry.path === "word/document.xml");
|
|
8245
|
+
if (!documentEntry) {
|
|
8246
|
+
return [];
|
|
8247
|
+
}
|
|
8248
|
+
const xml = decodeUtf8(documentEntry.data);
|
|
8249
|
+
const paragraphs = [...xml.matchAll(/<w:p\b[\s\S]*?<\/w:p>/g)];
|
|
8250
|
+
return paragraphs.map((match) => {
|
|
8251
|
+
const paragraphXml = match[0] ?? "";
|
|
8252
|
+
const text = extractOfficeParagraphText(paragraphXml);
|
|
8253
|
+
if (!text) {
|
|
8254
|
+
return "";
|
|
8255
|
+
}
|
|
8256
|
+
const styleMatch = paragraphXml.match(/<w:pStyle\b[^>]*w:val="([^"]+)"[^>]*\/?>/i);
|
|
8257
|
+
const style = (styleMatch?.[1] ?? "").toLowerCase();
|
|
8258
|
+
if (style === "title") {
|
|
8259
|
+
return text;
|
|
8260
|
+
}
|
|
8261
|
+
const headingMatch = style.match(/^heading([1-6])$/);
|
|
8262
|
+
if (headingMatch) {
|
|
8263
|
+
return text;
|
|
8264
|
+
}
|
|
8265
|
+
return text;
|
|
8266
|
+
}).filter(Boolean);
|
|
8267
|
+
};
|
|
8025
8268
|
var officeDocumentText = (entries) => {
|
|
8269
|
+
const paragraphs = officeDocumentParagraphs(entries);
|
|
8270
|
+
if (paragraphs.length > 0) {
|
|
8271
|
+
return normalizeWhitespace(paragraphs.join(`
|
|
8272
|
+
|
|
8273
|
+
`));
|
|
8274
|
+
}
|
|
8026
8275
|
const documentEntry = entries.find((entry) => entry.path === "word/document.xml");
|
|
8027
8276
|
if (!documentEntry) {
|
|
8028
8277
|
return "";
|
|
@@ -8037,31 +8286,68 @@ var officeDocumentSectionCount = (entries) => {
|
|
|
8037
8286
|
const count = [...decodeUtf8(documentEntry.data).matchAll(/<w:p\b/g)].length;
|
|
8038
8287
|
return count > 0 ? count : undefined;
|
|
8039
8288
|
};
|
|
8040
|
-
var
|
|
8041
|
-
|
|
8042
|
-
|
|
8043
|
-
|
|
8044
|
-
const
|
|
8045
|
-
|
|
8046
|
-
|
|
8289
|
+
var spreadsheetSharedStrings = (entries) => entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
|
|
8290
|
+
...decodeUtf8(entry.data).matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)
|
|
8291
|
+
].map((match) => decodeHtmlEntities(match[1] ?? "")));
|
|
8292
|
+
var spreadsheetColumnLabel = (reference) => {
|
|
8293
|
+
const match = reference?.match(/([A-Z]+)/i);
|
|
8294
|
+
return match?.[1]?.toUpperCase() ?? "";
|
|
8295
|
+
};
|
|
8296
|
+
var spreadsheetResolveCellValue = (cellXml, sharedStrings) => {
|
|
8297
|
+
const inlineMatch = cellXml.match(/<is\b[^>]*>[\s\S]*?<t[^>]*>([\s\S]*?)<\/t>[\s\S]*?<\/is>/i);
|
|
8298
|
+
if (inlineMatch?.[1]) {
|
|
8299
|
+
return normalizeWhitespace(decodeHtmlEntities(inlineMatch[1]));
|
|
8300
|
+
}
|
|
8301
|
+
const valueMatch = cellXml.match(/<v>([\s\S]*?)<\/v>/i);
|
|
8302
|
+
if (!valueMatch?.[1]) {
|
|
8303
|
+
return "";
|
|
8304
|
+
}
|
|
8305
|
+
const rawValue = decodeHtmlEntities(valueMatch[1]);
|
|
8306
|
+
const typeMatch = cellXml.match(/\bt="([^"]+)"/i);
|
|
8307
|
+
if (typeMatch?.[1] === "s") {
|
|
8308
|
+
const index = Number(rawValue);
|
|
8309
|
+
return Number.isInteger(index) && sharedStrings[index] ? sharedStrings[index] : rawValue;
|
|
8310
|
+
}
|
|
8311
|
+
return normalizeWhitespace(rawValue);
|
|
8312
|
+
};
|
|
8313
|
+
var spreadsheetWorksheetRows = (worksheetXml, sharedStrings) => [...worksheetXml.matchAll(/<row\b[^>]*>([\s\S]*?)<\/row>/gi)].map((rowMatch) => {
|
|
8314
|
+
const rowXml = rowMatch[1] ?? "";
|
|
8315
|
+
const cells = [...rowXml.matchAll(/<c\b([^>]*)>([\s\S]*?)<\/c>/gi)].map((cellMatch) => {
|
|
8316
|
+
const attributes = cellMatch[1] ?? "";
|
|
8317
|
+
const cellBody = cellMatch[2] ?? "";
|
|
8318
|
+
const referenceMatch = attributes.match(/\br="([^"]+)"/i);
|
|
8319
|
+
const reference = referenceMatch?.[1];
|
|
8320
|
+
const value = spreadsheetResolveCellValue(`<c${attributes}>${cellBody}</c>`, sharedStrings);
|
|
8321
|
+
return {
|
|
8322
|
+
column: spreadsheetColumnLabel(reference),
|
|
8323
|
+
reference,
|
|
8324
|
+
value
|
|
8325
|
+
};
|
|
8326
|
+
}).filter((cell) => cell.value);
|
|
8327
|
+
return cells;
|
|
8328
|
+
}).filter((row) => row.length > 0);
|
|
8329
|
+
var spreadsheetRowText = (row, headers) => {
|
|
8330
|
+
const entries = row.map((cell, index) => {
|
|
8331
|
+
const header = headers[index];
|
|
8332
|
+
if (header) {
|
|
8333
|
+
return `${header}: ${cell.value}`;
|
|
8334
|
+
}
|
|
8335
|
+
return cell.column ? `${cell.column}: ${cell.value}` : cell.value;
|
|
8047
8336
|
});
|
|
8048
|
-
return normalizeWhitespace(
|
|
8049
|
-
`));
|
|
8337
|
+
return normalizeWhitespace(entries.join(" | "));
|
|
8050
8338
|
};
|
|
8051
8339
|
var spreadsheetSheetTexts = (entries) => {
|
|
8052
|
-
const sharedStrings = entries
|
|
8053
|
-
...decodeUtf8(entry.data).matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)
|
|
8054
|
-
].map((match) => decodeHtmlEntities(match[1] ?? "")));
|
|
8340
|
+
const sharedStrings = spreadsheetSharedStrings(entries);
|
|
8055
8341
|
const sheetNames = spreadsheetSheetNames(entries);
|
|
8056
8342
|
const sheetEntries = entries.filter((entry) => entry.path.startsWith("xl/worksheets/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path));
|
|
8057
8343
|
return sheetEntries.map((entry, index) => {
|
|
8058
|
-
const
|
|
8059
|
-
|
|
8060
|
-
|
|
8061
|
-
|
|
8062
|
-
|
|
8063
|
-
});
|
|
8064
|
-
const text = normalizeWhitespace(
|
|
8344
|
+
const rows = spreadsheetWorksheetRows(decodeUtf8(entry.data), sharedStrings);
|
|
8345
|
+
if (rows.length === 0) {
|
|
8346
|
+
return null;
|
|
8347
|
+
}
|
|
8348
|
+
const headers = rows[0].map((cell) => cell.value);
|
|
8349
|
+
const rowTexts = rows.map((row, rowIndex) => normalizeWhitespace(`Row ${rowIndex + 1}. ${spreadsheetRowText(row, rowIndex === 0 ? [] : headers)}`));
|
|
8350
|
+
const text = normalizeWhitespace(rowTexts.join(`
|
|
8065
8351
|
`));
|
|
8066
8352
|
if (!text) {
|
|
8067
8353
|
return null;
|
|
@@ -8072,19 +8358,38 @@ var spreadsheetSheetTexts = (entries) => {
|
|
|
8072
8358
|
};
|
|
8073
8359
|
}).filter((entry) => Boolean(entry));
|
|
8074
8360
|
};
|
|
8361
|
+
var spreadsheetText = (entries) => normalizeWhitespace(spreadsheetSheetTexts(entries).map((sheet) => `Sheet ${sheet.name}
|
|
8362
|
+
${sheet.text}`).join(`
|
|
8363
|
+
|
|
8364
|
+
`));
|
|
8075
8365
|
var spreadsheetSheetNames = (entries) => entries.filter((entry) => entry.path === "xl/workbook.xml").flatMap((entry) => [
|
|
8076
8366
|
...decodeUtf8(entry.data).matchAll(/<sheet[^>]*name="([^"]+)"/g)
|
|
8077
8367
|
].map((match) => match[1] ?? "")).filter(Boolean);
|
|
8078
|
-
var
|
|
8079
|
-
const
|
|
8080
|
-
|
|
8081
|
-
|
|
8368
|
+
var presentationNotesByIndex = (entries) => new Map(entries.filter((entry) => entry.path.startsWith("ppt/notesSlides/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path)).map((entry) => {
|
|
8369
|
+
const indexMatch = entry.path.match(/notesSlide(\d+)\.xml$/i);
|
|
8370
|
+
const index = Number(indexMatch?.[1] ?? "0") - 1;
|
|
8371
|
+
return [
|
|
8372
|
+
index,
|
|
8373
|
+
normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)))
|
|
8374
|
+
];
|
|
8375
|
+
}).filter((entry) => entry[0] >= 0 && Boolean(entry[1])));
|
|
8376
|
+
var presentationSlides = (entries) => {
|
|
8377
|
+
const notesByIndex = presentationNotesByIndex(entries);
|
|
8378
|
+
return entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path)).map((entry, index) => {
|
|
8379
|
+
const slideText = normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)));
|
|
8380
|
+
const notesText = notesByIndex.get(index);
|
|
8381
|
+
const text = normalizeWhitespace([slideText, notesText ? `Speaker notes: ${notesText}` : ""].filter(Boolean).join(`
|
|
8082
8382
|
`));
|
|
8383
|
+
return {
|
|
8384
|
+
index,
|
|
8385
|
+
notesText,
|
|
8386
|
+
text
|
|
8387
|
+
};
|
|
8388
|
+
}).filter((slide) => Boolean(slide.text));
|
|
8083
8389
|
};
|
|
8084
|
-
var
|
|
8085
|
-
|
|
8086
|
-
|
|
8087
|
-
})).filter((slide) => Boolean(slide.text));
|
|
8390
|
+
var presentationText = (entries) => normalizeWhitespace(presentationSlides(entries).map((slide) => slide.text).join(`
|
|
8391
|
+
|
|
8392
|
+
`));
|
|
8088
8393
|
var presentationSlideCount = (entries) => entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).length;
|
|
8089
8394
|
var epubText = (entries) => {
|
|
8090
8395
|
const htmlEntries = entries.filter((entry) => /\.(xhtml|html|htm)$/i.test(entry.path));
|
|
@@ -8092,17 +8397,113 @@ var epubText = (entries) => {
|
|
|
8092
8397
|
|
|
8093
8398
|
`));
|
|
8094
8399
|
};
|
|
8095
|
-
var
|
|
8400
|
+
var splitEmailMessage = (raw) => {
|
|
8096
8401
|
const normalized = raw.replace(/\r\n?/g, `
|
|
8097
8402
|
`);
|
|
8098
|
-
const
|
|
8099
|
-
|
|
8100
|
-
`);
|
|
8101
|
-
const body = bodyParts.join(`
|
|
8403
|
+
const separator = normalized.indexOf(`
|
|
8102
8404
|
|
|
8103
8405
|
`);
|
|
8406
|
+
if (separator < 0) {
|
|
8407
|
+
return {
|
|
8408
|
+
body: "",
|
|
8409
|
+
headerBlock: normalized
|
|
8410
|
+
};
|
|
8411
|
+
}
|
|
8412
|
+
return {
|
|
8413
|
+
body: normalized.slice(separator + 2),
|
|
8414
|
+
headerBlock: normalized.slice(0, separator)
|
|
8415
|
+
};
|
|
8416
|
+
};
|
|
8417
|
+
var parseHeaderBlock = (headerBlock) => {
|
|
8418
|
+
const unfolded = headerBlock.replace(/\n[ \t]+/g, " ");
|
|
8419
|
+
const headers = new Map;
|
|
8420
|
+
for (const line of unfolded.split(`
|
|
8421
|
+
`)) {
|
|
8422
|
+
const separator = line.indexOf(":");
|
|
8423
|
+
if (separator < 0) {
|
|
8424
|
+
continue;
|
|
8425
|
+
}
|
|
8426
|
+
headers.set(line.slice(0, separator).trim().toLowerCase(), line.slice(separator + 1).trim());
|
|
8427
|
+
}
|
|
8428
|
+
return headers;
|
|
8429
|
+
};
|
|
8430
|
+
var decodeQuotedPrintable = (value) => value.replace(/=\r?\n/g, "").replace(/=([0-9A-F]{2})/gi, (_match, hex) => String.fromCharCode(parseInt(hex, 16)));
|
|
8431
|
+
var decodeEmailPartBody = (body, encoding) => {
|
|
8432
|
+
const normalizedEncoding = encoding?.toLowerCase();
|
|
8433
|
+
const trimmed = body.trim();
|
|
8434
|
+
if (normalizedEncoding === "base64") {
|
|
8435
|
+
return new Uint8Array(Buffer.from(trimmed.replace(/\s+/g, ""), "base64"));
|
|
8436
|
+
}
|
|
8437
|
+
if (normalizedEncoding === "quoted-printable") {
|
|
8438
|
+
return new Uint8Array(Buffer.from(decodeQuotedPrintable(body), "utf8"));
|
|
8439
|
+
}
|
|
8440
|
+
return new Uint8Array(Buffer.from(body, "utf8"));
|
|
8441
|
+
};
|
|
8442
|
+
var parseMimeBoundary = (contentType) => {
|
|
8443
|
+
const match = contentType?.match(/boundary="?([^";]+)"?/i);
|
|
8444
|
+
return match?.[1];
|
|
8445
|
+
};
|
|
8446
|
+
/**
 * Splits an email body into its HTML body, plain-text body and attachments.
 *
 * Without a multipart boundary the body is treated as a single part: if it
 * contains an `<html>…</html>` span that span becomes `bodyHtml`, otherwise
 * the whole body becomes `bodyText`.
 *
 * With a boundary, each part is split into headers and body. Parts that carry
 * a file name (Content-Disposition `filename`, or Content-Type `name`) become
 * attachments; `text/html` and `text/plain` parts become the bodies (a later
 * part of the same type overwrites an earlier one). Parts that are themselves
 * multipart (e.g. `multipart/alternative` inside `multipart/mixed`) are
 * parsed recursively and merged into the result.
 *
 * NOTE(review): parts are found with a plain substring split on
 * `--<boundary>`, so a nested boundary that begins with the outer boundary
 * string would be split early — confirm whether that matters for real input.
 *
 * @param {string} body - Message body (everything after the header block).
 * @param {string | undefined} contentType - Content-Type of the enclosing entity.
 * @returns {{attachments: Array<{contentType: (string|undefined), data: Uint8Array, fileName: string}>, bodyHtml: (string|undefined), bodyText: (string|undefined)}}
 */
var parseEmailMimeParts = (body, contentType) => {
  const boundary = parseMimeBoundary(contentType);
  if (!boundary) {
    const htmlMatch = body.match(/<html[\s\S]*<\/html>/i);
    return {
      attachments: [],
      bodyHtml: htmlMatch?.[0],
      bodyText: htmlMatch ? undefined : body
    };
  }
  const attachments = [];
  let bodyText;
  let bodyHtml;
  for (const rawPart of body.split(`--${boundary}`)) {
    const trimmed = rawPart.trim();
    // Skip empty fragments and the closing "--" left by the final delimiter.
    if (!trimmed || trimmed === "--") {
      continue;
    }
    const { body: partBody, headerBlock } = splitEmailMessage(trimmed);
    const headers = parseHeaderBlock(headerBlock);
    const partContentType = headers.get("content-type");
    const loweredContentType = partContentType?.toLowerCase();
    // Nested multipart container: descend instead of dropping its content.
    if (loweredContentType?.includes("multipart/") && parseMimeBoundary(partContentType)) {
      const nested = parseEmailMimeParts(partBody, partContentType);
      attachments.push(...nested.attachments);
      bodyHtml = nested.bodyHtml ?? bodyHtml;
      bodyText = nested.bodyText ?? bodyText;
      continue;
    }
    const disposition = headers.get("content-disposition");
    const transferEncoding = headers.get("content-transfer-encoding");
    const filename = disposition?.match(/filename="?([^";]+)"?/i)?.[1] ?? partContentType?.match(/name="?([^";]+)"?/i)?.[1];
    if (filename) {
      attachments.push({
        contentType: partContentType,
        data: decodeEmailPartBody(partBody, transferEncoding),
        fileName: filename
      });
      continue;
    }
    const decoded = Buffer.from(decodeEmailPartBody(partBody, transferEncoding)).toString("utf8");
    if (loweredContentType?.includes("text/html")) {
      bodyHtml = decoded;
      continue;
    }
    if (loweredContentType?.includes("text/plain")) {
      bodyText = decoded;
    }
  }
  return {
    attachments,
    bodyHtml,
    bodyText
  };
};
|
|
8494
|
+
var extractEmailText = (raw) => {
|
|
8495
|
+
const { body, headerBlock } = splitEmailMessage(raw);
|
|
8496
|
+
const headers = parseHeaderBlock(headerBlock);
|
|
8497
|
+
const parsed = parseEmailMimeParts(body, headers.get("content-type"));
|
|
8498
|
+
if (parsed.bodyHtml) {
|
|
8499
|
+
return stripHtml(parsed.bodyHtml);
|
|
8500
|
+
}
|
|
8501
|
+
if (parsed.bodyText) {
|
|
8502
|
+
return normalizeWhitespace(parsed.bodyText);
|
|
8503
|
+
}
|
|
8104
8504
|
if (!body) {
|
|
8105
|
-
return normalizeWhitespace(
|
|
8505
|
+
return normalizeWhitespace(raw.replace(/\r\n?/g, `
|
|
8506
|
+
`));
|
|
8106
8507
|
}
|
|
8107
8508
|
const htmlMatch = body.match(/<html[\s\S]*<\/html>/i);
|
|
8108
8509
|
if (htmlMatch) {
|
|
@@ -8111,17 +8512,15 @@ var extractEmailText = (raw) => {
|
|
|
8111
8512
|
return normalizeWhitespace(body);
|
|
8112
8513
|
};
|
|
8113
8514
|
var parseEmailHeaders = (raw) => {
|
|
8114
|
-
const
|
|
8115
|
-
|
|
8116
|
-
const
|
|
8117
|
-
|
|
8118
|
-
`);
|
|
8119
|
-
const getHeader = (name) => {
|
|
8120
|
-
const match = headerBlock.match(new RegExp(`^${name}:\\s*(.+)$`, "im"));
|
|
8121
|
-
return match?.[1]?.trim();
|
|
8122
|
-
};
|
|
8515
|
+
const { headerBlock } = splitEmailMessage(raw);
|
|
8516
|
+
const headers = parseHeaderBlock(headerBlock);
|
|
8517
|
+
const getHeader = (name) => headers.get(name.toLowerCase());
|
|
8123
8518
|
return {
|
|
8519
|
+
contentType: getHeader("Content-Type"),
|
|
8124
8520
|
from: getHeader("From"),
|
|
8521
|
+
inReplyTo: getHeader("In-Reply-To"),
|
|
8522
|
+
messageId: getHeader("Message-ID"),
|
|
8523
|
+
references: getHeader("References"),
|
|
8125
8524
|
subject: getHeader("Subject"),
|
|
8126
8525
|
threadTopic: getHeader("Thread-Topic") ?? getHeader("Subject"),
|
|
8127
8526
|
to: getHeader("To")
|
|
@@ -8142,6 +8541,87 @@ var extractPrintableStrings = (data) => {
|
|
|
8142
8541
|
return unique.join(`
|
|
8143
8542
|
`);
|
|
8144
8543
|
};
|
|
8544
|
+
/**
 * Builds the OCR-related metadata for an OCR result.
 *
 * Regions whose whitespace-normalized text is empty are dropped. The average
 * confidence is the mean of every numeric confidence available: the overall
 * result confidence (if numeric) followed by each kept region's confidence.
 *
 * @param {object} result - OCR result; reads `metadata`, `confidence`, `regions`.
 * @returns {object} `result.metadata` entries plus `ocrConfidence`,
 *   `ocrRegionCount`, `ocrRegions` and `ocrAverageConfidence`.
 */
var ocrMetadata = (result) => {
  const hasText = (region) => normalizeWhitespace(region.text ?? "").length > 0;
  const regions = result.regions?.filter(hasText);
  const confidenceValues = [];
  if (typeof result.confidence === "number") {
    confidenceValues.push(result.confidence);
  }
  for (const region of regions ?? []) {
    if (typeof region.confidence === "number") {
      confidenceValues.push(region.confidence);
    }
  }
  let averageConfidence;
  if (confidenceValues.length > 0) {
    let total = 0;
    for (const confidence of confidenceValues) {
      total = total + confidence;
    }
    averageConfidence = total / confidenceValues.length;
  }
  return {
    ...(result.metadata ?? {}),
    ocrConfidence: result.confidence,
    ocrRegionCount: regions?.length,
    ocrRegions: regions,
    ocrAverageConfidence: averageConfidence
  };
};
|
|
8559
|
+
/**
 * Produces one text document per PDF page from OCR regions.
 *
 * Regions are kept only when their normalized text is non-empty and `page`
 * is a number that is not below 1. Kept regions are grouped by page and the
 * pages are emitted in ascending page order.
 *
 * @param {object} result - OCR result; reads `regions`.
 * @param {object} input - Extraction input; reads `chunking`, `contentType`,
 *   `metadata`, `source`, `path`, `name`, `title`.
 * @param {object} baseMetadata - Metadata merged over `input.metadata`.
 * @returns {object[]} Page documents tagged `sourceNativeKind: "pdf_page"`.
 */
var ocrPageDocuments = (result, input, baseMetadata) => {
  const regionsByPage = new Map();
  for (const region of result.regions ?? []) {
    const text = normalizeWhitespace(region.text ?? "");
    if (!text || typeof region.page !== "number" || region.page < 1) {
      continue;
    }
    const normalizedRegion = { ...region, text };
    const pageRegions = regionsByPage.get(region.page);
    if (pageRegions) {
      pageRegions.push(normalizedRegion);
    } else {
      regionsByPage.set(region.page, [normalizedRegion]);
    }
  }
  const pageNumbers = [...regionsByPage.keys()].sort((left, right) => left - right);
  return pageNumbers.map((pageNumber) => {
    const regions = regionsByPage.get(pageNumber);
    const metadata = {
      ...(input.metadata ?? {}),
      ...baseMetadata,
      ocrRegionCount: regions.length,
      ocrRegions: regions,
      pageNumber,
      pageIndex: pageNumber - 1,
      sourceNativeKind: "pdf_page"
    };
    const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`;
    const documentName = input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME;
    const pageText = regions.map((region) => region.text).join("\n");
    const text = normalizeWhitespace(`PDF page ${pageNumber} from ${documentName}.\n${pageText}`);
    const title = input.title ? `${input.title} \xB7 Page ${pageNumber}` : `Page ${pageNumber}`;
    return {
      chunking: input.chunking,
      contentType: input.contentType,
      format: "text",
      metadata,
      source,
      text,
      title
    };
  });
};
|
|
8590
|
+
/**
 * Produces one text document per usable OCR region.
 *
 * A region is skipped when its normalized text is empty or when `page` is not
 * a number or is below 1. `regionIndex` is the region's position in
 * `result.regions` (skipped regions still consume an index) and
 * `regionNumber` is that index plus one.
 *
 * @param {object} result - OCR result; reads `regions`.
 * @param {object} input - Extraction input; reads `chunking`, `contentType`,
 *   `metadata`, `source`, `path`, `name`, `title`.
 * @param {object} baseMetadata - Metadata merged over `input.metadata`.
 * @returns {object[]} Region documents tagged `sourceNativeKind: "pdf_region"`.
 */
var ocrRegionDocuments = (result, input, baseMetadata) => {
  const documents = [];
  const allRegions = result.regions ?? [];
  for (const [index, region] of allRegions.entries()) {
    const text = normalizeWhitespace(region.text ?? "");
    const unusable = !text || typeof region.page !== "number" || region.page < 1;
    if (unusable) {
      continue;
    }
    const pageNumber = region.page;
    const regionNumber = index + 1;
    const locator = `Page ${pageNumber} Region ${regionNumber}`;
    const metadata = {
      ...(input.metadata ?? {}),
      ...baseMetadata,
      ocrRegionConfidence: region.confidence,
      ocrRegionHeight: region.height,
      ocrRegionWidth: region.width,
      ocrRegionX: region.x,
      ocrRegionY: region.y,
      pageNumber,
      pageIndex: pageNumber - 1,
      regionIndex: index,
      regionNumber,
      sourceNativeKind: "pdf_region"
    };
    const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`;
    const documentName = input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME;
    documents.push({
      chunking: input.chunking,
      contentType: input.contentType,
      format: "text",
      metadata,
      source,
      text: normalizeWhitespace(`PDF page ${pageNumber} region ${regionNumber} from ${documentName}.\n${text}`),
      title: input.title ? `${input.title} \xB7 ${locator}` : locator
    });
  }
  return documents;
};
|
|
8145
8625
|
var textExtractorSupports = (input) => {
|
|
8146
8626
|
if (input.format) {
|
|
8147
8627
|
return true;
|
|
@@ -8227,24 +8707,52 @@ var createBuiltinArchiveExpander = () => ({
|
|
|
8227
8707
|
var createEmailExtractor = () => ({
|
|
8228
8708
|
name: "absolute_email",
|
|
8229
8709
|
supports: emailExtractorSupports,
|
|
8230
|
-
extract: (input) => {
|
|
8710
|
+
extract: async (input) => {
|
|
8231
8711
|
const raw = decodeUtf8(input.data);
|
|
8232
8712
|
const headers = parseEmailHeaders(raw);
|
|
8233
|
-
|
|
8713
|
+
const { body } = splitEmailMessage(raw);
|
|
8714
|
+
const parsed = parseEmailMimeParts(body, headers.contentType);
|
|
8715
|
+
const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.eml`;
|
|
8716
|
+
const messageMetadata = {
|
|
8717
|
+
...input.metadata ?? {},
|
|
8718
|
+
emailKind: "message",
|
|
8719
|
+
fileKind: "email",
|
|
8720
|
+
from: headers.from,
|
|
8721
|
+
inReplyTo: headers.inReplyTo,
|
|
8722
|
+
messageId: headers.messageId,
|
|
8723
|
+
references: headers.references,
|
|
8724
|
+
threadTopic: headers.subject,
|
|
8725
|
+
to: headers.to,
|
|
8726
|
+
hasAttachments: parsed.attachments.length > 0
|
|
8727
|
+
};
|
|
8728
|
+
const attachmentDocuments = await Promise.all(parsed.attachments.map(async (attachment, index) => {
|
|
8729
|
+
const documents = await extractRAGFileDocuments({
|
|
8730
|
+
chunking: input.chunking,
|
|
8731
|
+
contentType: attachment.contentType,
|
|
8732
|
+
data: attachment.data,
|
|
8733
|
+
format: inferFormatFromContentType(attachment.contentType ?? null) ?? inferFormatFromName(attachment.fileName),
|
|
8734
|
+
metadata: {
|
|
8735
|
+
...messageMetadata,
|
|
8736
|
+
attachmentIndex: index,
|
|
8737
|
+
attachmentName: attachment.fileName,
|
|
8738
|
+
emailKind: "attachment"
|
|
8739
|
+
},
|
|
8740
|
+
name: attachment.fileName,
|
|
8741
|
+
source: `${source}#attachments/${attachment.fileName}`,
|
|
8742
|
+
title: headers.subject ? `${headers.subject} \xB7 ${attachment.fileName}` : attachment.fileName
|
|
8743
|
+
});
|
|
8744
|
+
return documents;
|
|
8745
|
+
}));
|
|
8746
|
+
const messageDocument = {
|
|
8234
8747
|
chunking: input.chunking,
|
|
8235
8748
|
contentType: input.contentType,
|
|
8236
8749
|
format: "text",
|
|
8237
|
-
metadata:
|
|
8238
|
-
|
|
8239
|
-
fileKind: "email",
|
|
8240
|
-
from: headers.from,
|
|
8241
|
-
threadTopic: headers.subject,
|
|
8242
|
-
to: headers.to
|
|
8243
|
-
},
|
|
8244
|
-
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.eml`,
|
|
8750
|
+
metadata: messageMetadata,
|
|
8751
|
+
source,
|
|
8245
8752
|
text: extractEmailText(raw),
|
|
8246
8753
|
title: input.title ?? headers.subject
|
|
8247
8754
|
};
|
|
8755
|
+
return [messageDocument, ...attachmentDocuments.flat()];
|
|
8248
8756
|
}
|
|
8249
8757
|
});
|
|
8250
8758
|
var createEPUBExtractor = () => ({
|
|
@@ -8388,7 +8896,7 @@ var createRAGImageOCRExtractor = (provider) => ({
|
|
|
8388
8896
|
format: "text",
|
|
8389
8897
|
metadata: {
|
|
8390
8898
|
...input.metadata ?? {},
|
|
8391
|
-
...result
|
|
8899
|
+
...ocrMetadata(result),
|
|
8392
8900
|
fileKind: "image"
|
|
8393
8901
|
},
|
|
8394
8902
|
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.image.txt`,
|
|
@@ -8476,6 +8984,9 @@ var expandArchiveEntry = async (entry, archiveInput, extractors) => {
|
|
|
8476
8984
|
metadata: {
|
|
8477
8985
|
...archiveInput.metadata ?? {},
|
|
8478
8986
|
...entry.metadata ?? {},
|
|
8987
|
+
archiveEntryName: basename(entry.path),
|
|
8988
|
+
archiveParentName: archiveInput.name ?? archiveInput.path?.split(/[/\\]/).pop() ?? archiveInput.source,
|
|
8989
|
+
archiveParentSource: archiveInput.source ?? archiveInput.path ?? archiveInput.name,
|
|
8479
8990
|
archivePath: entry.path,
|
|
8480
8991
|
fileKind: "archive_entry"
|
|
8481
8992
|
},
|
|
@@ -8551,21 +9062,27 @@ var createRAGPDFOCRExtractor = (options) => ({
|
|
|
8551
9062
|
...input,
|
|
8552
9063
|
contentType: input.contentType ?? "application/pdf"
|
|
8553
9064
|
});
|
|
8554
|
-
|
|
9065
|
+
const baseMetadata = {
|
|
9066
|
+
...ocrMetadata(ocr),
|
|
9067
|
+
fileKind: "pdf",
|
|
9068
|
+
pageCount: estimatePDFPageCount(input.data),
|
|
9069
|
+
pdfTextMode: "ocr"
|
|
9070
|
+
};
|
|
9071
|
+
const summaryDocument = {
|
|
8555
9072
|
chunking: input.chunking,
|
|
8556
9073
|
contentType: input.contentType ?? "application/pdf",
|
|
8557
9074
|
format: "text",
|
|
8558
9075
|
metadata: {
|
|
8559
9076
|
...input.metadata ?? {},
|
|
8560
|
-
...
|
|
8561
|
-
fileKind: "pdf",
|
|
8562
|
-
pageCount: estimatePDFPageCount(input.data),
|
|
8563
|
-
pdfTextMode: "ocr"
|
|
9077
|
+
...baseMetadata
|
|
8564
9078
|
},
|
|
8565
9079
|
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
|
|
8566
9080
|
text: ocr.text,
|
|
8567
9081
|
title: ocr.title ?? input.title
|
|
8568
9082
|
};
|
|
9083
|
+
const pageDocuments = ocrPageDocuments(ocr, input, baseMetadata);
|
|
9084
|
+
const regionDocuments = ocrRegionDocuments(ocr, input, baseMetadata);
|
|
9085
|
+
return [summaryDocument, ...pageDocuments, ...regionDocuments];
|
|
8569
9086
|
}
|
|
8570
9087
|
});
|
|
8571
9088
|
var DEFAULT_FILE_EXTRACTORS = [
|
|
@@ -9421,6 +9938,17 @@ var searchDocuments = async (collection, input) => collection.search(input);
|
|
|
9421
9938
|
// src/ai/rag/htmxWorkflowRenderers.ts
|
|
9422
9939
|
init_constants();
|
|
9423
9940
|
/**
 * Escapes the characters `&`, `<`, `>` and `"` so the text can be embedded in
 * HTML element content or a double-quoted attribute value. `&` is replaced
 * first so the entities produced by the later replacements are not re-escaped.
 *
 * @param {string} text - Raw text.
 * @returns {string} HTML-escaped text.
 */
var escapeHtml2 = (text) => text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
|
|
9941
|
+
/**
 * Renders the context / location / provenance labels of a source as an HTML
 * list. Labels that are missing or empty are omitted; when nothing remains
 * (or `input` itself is falsy) an empty string is returned.
 *
 * @param {object | undefined} input - Reads `contextLabel`, `locatorLabel`, `provenanceLabel`.
 * @returns {string} A `<ul class="rag-source-labels">` fragment or "".
 */
var renderSourceLabels = (input) => {
  if (!input) {
    return "";
  }
  const fields = [
    ["Context", input.contextLabel],
    ["Location", input.locatorLabel],
    ["Provenance", input.provenanceLabel]
  ];
  let items = "";
  for (const [heading, value] of fields) {
    if (value) {
      items += `<li><strong>${heading}</strong> ${escapeHtml2(value)}</li>`;
    }
  }
  if (items.length === 0) {
    return "";
  }
  return `<ul class="rag-source-labels">${items}</ul>`;
};
|
|
9424
9952
|
var renderEmptyState = (kind) => {
|
|
9425
9953
|
switch (kind) {
|
|
9426
9954
|
case "documents":
|
|
@@ -9460,17 +9988,41 @@ var defaultStatus = ({
|
|
|
9460
9988
|
}
|
|
9461
9989
|
return `<dl class="rag-status">` + `<div><dt>Backend</dt><dd>${escapeHtml2(status.backend)}</dd></div>` + `<div><dt>Vector mode</dt><dd>${escapeHtml2(status.vectorMode)}</dd></div>` + `<div><dt>Embedding dimensions</dt><dd>${status.dimensions ?? "n/a"}</dd></div>` + `<div><dt>Vector acceleration</dt><dd>${status.native?.active ? "active" : "inactive"}</dd></div>` + `<div><dt>Documents</dt><dd>${documents?.total ?? "n/a"}</dd></div>` + `<div><dt>Total chunks</dt><dd>${documents?.chunkCount ?? "n/a"}</dd></div>` + `<div><dt>Seed docs</dt><dd>${documents?.byKind.seed ?? 0}</dd></div>` + `<div><dt>Custom docs</dt><dd>${documents?.byKind.custom ?? 0}</dd></div>` + `</dl>${renderCapabilityList(capabilities)}`;
|
|
9462
9990
|
};
|
|
9463
|
-
var defaultSearchResultItem = (source, index) => '<article class="rag-search-result">' + `<h3>${escapeHtml2(source.title ?? source.chunkId ?? `Result ${index + 1}`)}</h3>` + `<p class="rag-search-source">${escapeHtml2(source.source ?? "unknown source")}</p>` + `<p class="rag-search-score">score ${source.score.toFixed(RAG_SEARCH_SCORE_DECIMAL_PLACES)}</p>` + `<p class="rag-search-text">${escapeHtml2(source.text)}</p>` + "</article>";
|
|
9991
|
+
/**
 * Renders a single search result as an `<article class="rag-search-result">`.
 * The heading falls back from the title to the chunk id to `Result <n>`
 * (1-based), and the source falls back to "unknown source".
 *
 * @param {object} source - Reads `title`, `chunkId`, `source`, `labels`, `score`, `text`.
 * @param {number} index - Zero-based position of the result.
 * @returns {string} HTML fragment.
 */
var defaultSearchResultItem = (source, index) => {
  const heading = escapeHtml2(source.title ?? source.chunkId ?? `Result ${index + 1}`);
  const origin = escapeHtml2(source.source ?? "unknown source");
  const labels = renderSourceLabels(source.labels);
  const score = source.score.toFixed(RAG_SEARCH_SCORE_DECIMAL_PLACES);
  const excerpt = escapeHtml2(source.text);
  return '<article class="rag-search-result">'
    + `<h3>${heading}</h3>`
    + `<p class="rag-search-source">${origin}</p>`
    + labels
    + `<p class="rag-search-score">score ${score}</p>`
    + `<p class="rag-search-text">${excerpt}</p>`
    + "</article>";
};
|
|
9464
9992
|
/**
 * Renders the search results section: a summary line with the result count
 * and query, an optional trace line (mode and final/vector/lexical counts),
 * then one item per result. With no results the "searchResults" empty state
 * is rendered instead.
 *
 * @param {{query: string, results: object[], trace?: object}} input
 * @returns {string} HTML fragment.
 */
var defaultSearchResults = ({ query, results, trace }) => {
  if (results.length === 0) {
    return renderEmptyState("searchResults");
  }
  const summary = `<p class="rag-search-summary">${results.length} results for ${escapeHtml2(query)}</p>`;
  let traceSummary = "";
  if (trace) {
    traceSummary = `<p class="rag-search-summary">mode=${escapeHtml2(trace.mode)} \xB7 final=${trace.resultCounts.final} \xB7 vector=${trace.resultCounts.vector} \xB7 lexical=${trace.resultCounts.lexical}</p>`;
  }
  const items = results.map((result, index) => defaultSearchResultItem(result, index)).join("");
  return `<section class="rag-search-results">${summary}${traceSummary}${items}</section>`;
};
|
|
9469
|
-
var defaultDocumentItem = (document, index) => '<article class="rag-document">' + `<h3>${escapeHtml2(document.title || `Document ${index + 1}`)}</h3>` + `<p class="rag-document-id">${escapeHtml2(document.id)}</p>` + `<p class="rag-document-source">${escapeHtml2(document.source)}</p>` + `<p class="rag-document-meta">${escapeHtml2(document.format ?? "text")} \xB7 ${escapeHtml2(document.chunkStrategy ?? "paragraphs")} \xB7 ${document.chunkCount ?? 0} chunks</p>` + "</article>";
|
|
9997
|
+
/**
 * Renders a single indexed document as an `<article class="rag-document">`.
 * An empty or missing title falls back to `Document <n>` (1-based); format,
 * chunk strategy and chunk count fall back to "text", "paragraphs" and 0.
 *
 * @param {object} document - Reads `title`, `id`, `source`, `labels`,
 *   `format`, `chunkStrategy`, `chunkCount`.
 * @param {number} index - Zero-based position of the document.
 * @returns {string} HTML fragment.
 */
var defaultDocumentItem = (document, index) => {
  const heading = escapeHtml2(document.title || `Document ${index + 1}`);
  const id = escapeHtml2(document.id);
  const origin = escapeHtml2(document.source);
  const labels = renderSourceLabels(document.labels);
  const meta = `${escapeHtml2(document.format ?? "text")} \xB7 ${escapeHtml2(document.chunkStrategy ?? "paragraphs")} \xB7 ${document.chunkCount ?? 0} chunks`;
  return '<article class="rag-document">'
    + `<h3>${heading}</h3>`
    + `<p class="rag-document-id">${id}</p>`
    + `<p class="rag-document-source">${origin}</p>`
    + labels
    + `<p class="rag-document-meta">${meta}</p>`
    + "</article>";
};
|
|
9470
9998
|
/**
 * Renders the document list section, or the "documents" empty state when the
 * list is empty.
 *
 * @param {{documents: object[]}} input
 * @returns {string} HTML fragment.
 */
var defaultDocuments = ({ documents }) => {
  if (documents.length === 0) {
    return renderEmptyState("documents");
  }
  const items = documents.map((document, index) => defaultDocumentItem(document, index)).join("");
  return `<section class="rag-documents">${items}</section>`;
};
|
|
9473
|
-
var defaultChunkPreview = (input) =>
|
|
10001
|
+
/**
 * Renders the chunk preview for a document: title, source, source labels,
 * the normalized text, and the chunks grouped by their native source kind.
 *
 * Grouping: a chunk's kind is `metadata.sourceNativeKind` when it is a
 * string, otherwise "document_chunk". For the known kinds below the group
 * title is the chunk's locator label, falling back to a generic plural; any
 * other kind is titled "Chunks". Groups appear in first-seen order. A Map
 * keyed by group key replaces a per-chunk linear search over the groups.
 *
 * @param {object} input - Reads `chunks`, `document` (`title`, `source`,
 *   `labels`) and `normalizedText`.
 * @returns {string} HTML fragment.
 */
var defaultChunkPreview = (input) => {
  // Generic titles used when a chunk of a known kind has no locator label.
  const fallbackTitles = new Map([
    ["pdf_page", "PDF pages"],
    ["pdf_region", "PDF regions"],
    ["spreadsheet_sheet", "Spreadsheet sheets"],
    ["presentation_slide", "Presentation slides"],
    ["attachment", "Attachments"],
    ["archive_entry", "Archive entries"]
  ]);
  const groupsByKey = new Map();
  input.chunks.forEach((chunk) => {
    const metadata = chunk.metadata ?? {};
    const kind = typeof metadata.sourceNativeKind === "string" ? metadata.sourceNativeKind : "document_chunk";
    const locator = chunk.labels?.locatorLabel ?? "";
    const fallbackTitle = fallbackTitles.get(kind);
    const title = fallbackTitle === undefined ? "Chunks" : locator || fallbackTitle;
    const key = kind === "document_chunk" ? "document_chunk" : `${kind}:${title}`;
    const existing = groupsByKey.get(key);
    if (existing) {
      existing.chunks.push(chunk);
      return;
    }
    groupsByKey.set(key, { chunks: [chunk], key, title });
  });
  const renderChunk = (chunk) => {
    const chunkIndex = typeof chunk.metadata?.chunkIndex === "number" ? chunk.metadata.chunkIndex : 0;
    // Without a recorded count, fall back to the number of chunks in this preview.
    const chunkCount = typeof chunk.metadata?.chunkCount === "number" ? chunk.metadata.chunkCount : input.chunks.length;
    return '<article class="rag-chunk">' + `<h5>${escapeHtml2(chunk.chunkId)}</h5>` + `<p class="rag-chunk-meta">chunk ${chunkIndex} of ${chunkCount}</p>` + renderSourceLabels(chunk.labels) + `<pre>${escapeHtml2(chunk.text)}</pre>` + "</article>";
  };
  const groupHtml = [...groupsByKey.values()].map((group) => {
    const chunkHtml = group.chunks.map(renderChunk).join("");
    return `<section class="rag-chunk-group"><h4>${escapeHtml2(group.title)}</h4>${chunkHtml}</section>`;
  }).join("");
  return `<section class="rag-chunk-preview">` + `<h3>${escapeHtml2(input.document.title)}</h3>` + `<p class="rag-chunk-preview-source">${escapeHtml2(input.document.source)}</p>` + renderSourceLabels(input.document.labels) + `<article class="rag-chunk-normalized">` + `<h4>Normalized text</h4>` + `<pre>${escapeHtml2(input.normalizedText)}</pre>` + `</article>${groupHtml}</section>`;
};
|
|
9474
10026
|
var defaultMutationResult = (input) => {
|
|
9475
10027
|
if (!input.ok) {
|
|
9476
10028
|
return `<div class="rag-mutation error">${escapeHtml2(input.error ?? "Request failed")}</div>`;
|
|
@@ -9533,6 +10085,10 @@ var buildRAGContextLocatorLabel = (metadata, source, title) => {
|
|
|
9533
10085
|
return;
|
|
9534
10086
|
}
|
|
9535
10087
|
const page = getContextNumber3(metadata.page) ?? getContextNumber3(metadata.pageNumber) ?? (typeof metadata.pageIndex === "number" ? metadata.pageIndex + 1 : undefined);
|
|
10088
|
+
const region = getContextNumber3(metadata.regionNumber) ?? (typeof metadata.regionIndex === "number" ? metadata.regionIndex + 1 : undefined);
|
|
10089
|
+
if (page && region) {
|
|
10090
|
+
return `Page ${page} \xB7 Region ${region}`;
|
|
10091
|
+
}
|
|
9536
10092
|
if (page) {
|
|
9537
10093
|
return `Page ${page}`;
|
|
9538
10094
|
}
|
|
@@ -9574,9 +10130,11 @@ var buildRAGContextProvenanceLabel = (metadata) => {
|
|
|
9574
10130
|
const threadTopic = getContextString3(metadata.threadTopic);
|
|
9575
10131
|
const from = getContextString3(metadata.from);
|
|
9576
10132
|
const speaker = getContextString3(metadata.speaker);
|
|
10133
|
+
const ocrConfidence = getContextNumber3(metadata.ocrRegionConfidence) ?? getContextNumber3(metadata.ocrConfidence);
|
|
9577
10134
|
const labels = [
|
|
9578
10135
|
pdfTextMode ? `PDF ${pdfTextMode}` : "",
|
|
9579
10136
|
ocrEngine ? `OCR ${ocrEngine}` : "",
|
|
10137
|
+
typeof ocrConfidence === "number" ? `Confidence ${ocrConfidence.toFixed(2)}` : "",
|
|
9580
10138
|
mediaKind ? `Media ${mediaKind}` : "",
|
|
9581
10139
|
transcriptSource ? `Transcript ${transcriptSource}` : "",
|
|
9582
10140
|
threadTopic ? `Thread ${threadTopic}` : "",
|
|
@@ -9886,6 +10444,11 @@ var isRAGDocumentUrlArray = (value) => Array.isArray(value) && value.every((entr
|
|
|
9886
10444
|
/**
 * Type guard: true when `value` is an array and no element fails
 * `isRAGDocumentChunk` (so an empty array passes).
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean}
 */
var isRAGDocumentChunkArray = (value) => {
  if (!Array.isArray(value)) {
    return false;
  }
  return !value.some((candidate) => !isRAGDocumentChunk(candidate));
};
|
|
9887
10445
|
var buildSources2 = (results) => results.map((result) => ({
|
|
9888
10446
|
chunkId: result.chunkId,
|
|
10447
|
+
labels: buildRAGSourceLabels({
|
|
10448
|
+
metadata: result.metadata,
|
|
10449
|
+
source: result.source,
|
|
10450
|
+
title: result.title
|
|
10451
|
+
}),
|
|
9889
10452
|
metadata: result.metadata,
|
|
9890
10453
|
score: normalizeScore(result.score),
|
|
9891
10454
|
source: result.source,
|
|
@@ -13616,6 +14179,11 @@ var ragChat = (config) => {
|
|
|
13616
14179
|
let documentsWithoutChunkPreview = 0;
|
|
13617
14180
|
let inspectedDocuments = 0;
|
|
13618
14181
|
let inspectedChunks = 0;
|
|
14182
|
+
let documentsWithSourceLabels = 0;
|
|
14183
|
+
let chunksWithSourceLabels = 0;
|
|
14184
|
+
const sourceNativeKinds = new Map;
|
|
14185
|
+
const sampleDocuments = [];
|
|
14186
|
+
const sampleChunks = [];
|
|
13619
14187
|
let oldestDocumentAgeMs;
|
|
13620
14188
|
let newestDocumentAgeMs;
|
|
13621
14189
|
const staleDocuments = [];
|
|
@@ -13656,6 +14224,27 @@ var ragChat = (config) => {
|
|
|
13656
14224
|
if ((document.chunkCount ?? 0) === 0) {
|
|
13657
14225
|
emptyDocuments += 1;
|
|
13658
14226
|
}
|
|
14227
|
+
const documentLabels = buildRAGSourceLabels({
|
|
14228
|
+
metadata: document.metadata,
|
|
14229
|
+
source: document.source,
|
|
14230
|
+
title: document.title
|
|
14231
|
+
});
|
|
14232
|
+
if (documentLabels) {
|
|
14233
|
+
documentsWithSourceLabels += 1;
|
|
14234
|
+
}
|
|
14235
|
+
const documentSourceNativeKind = typeof document.metadata?.sourceNativeKind === "string" ? document.metadata.sourceNativeKind : undefined;
|
|
14236
|
+
if (documentSourceNativeKind) {
|
|
14237
|
+
sourceNativeKinds.set(documentSourceNativeKind, (sourceNativeKinds.get(documentSourceNativeKind) ?? 0) + 1);
|
|
14238
|
+
}
|
|
14239
|
+
if (sampleDocuments.length < 5 && (documentLabels || documentSourceNativeKind)) {
|
|
14240
|
+
sampleDocuments.push({
|
|
14241
|
+
id: document.id,
|
|
14242
|
+
labels: documentLabels,
|
|
14243
|
+
source: document.source,
|
|
14244
|
+
sourceNativeKind: documentSourceNativeKind,
|
|
14245
|
+
title: document.title
|
|
14246
|
+
});
|
|
14247
|
+
}
|
|
13659
14248
|
if (indexManager?.getDocumentChunks) {
|
|
13660
14249
|
const preview = await indexManager.getDocumentChunks(document.id);
|
|
13661
14250
|
if (!preview) {
|
|
@@ -13665,6 +14254,27 @@ var ragChat = (config) => {
|
|
|
13665
14254
|
inspectedDocuments += 1;
|
|
13666
14255
|
for (const chunk of preview.chunks) {
|
|
13667
14256
|
inspectedChunks += 1;
|
|
14257
|
+
const chunkLabels = buildRAGSourceLabels({
|
|
14258
|
+
metadata: chunk.metadata,
|
|
14259
|
+
source: chunk.source ?? preview.document.source,
|
|
14260
|
+
title: chunk.title ?? preview.document.title
|
|
14261
|
+
});
|
|
14262
|
+
if (chunkLabels) {
|
|
14263
|
+
chunksWithSourceLabels += 1;
|
|
14264
|
+
}
|
|
14265
|
+
const chunkSourceNativeKind = typeof chunk.metadata?.sourceNativeKind === "string" ? chunk.metadata.sourceNativeKind : undefined;
|
|
14266
|
+
if (chunkSourceNativeKind) {
|
|
14267
|
+
sourceNativeKinds.set(chunkSourceNativeKind, (sourceNativeKinds.get(chunkSourceNativeKind) ?? 0) + 1);
|
|
14268
|
+
}
|
|
14269
|
+
if (sampleChunks.length < 8 && (chunkLabels || chunkSourceNativeKind)) {
|
|
14270
|
+
sampleChunks.push({
|
|
14271
|
+
chunkId: chunk.chunkId,
|
|
14272
|
+
documentId: document.id,
|
|
14273
|
+
labels: chunkLabels,
|
|
14274
|
+
source: chunk.source ?? preview.document.source,
|
|
14275
|
+
sourceNativeKind: chunkSourceNativeKind
|
|
14276
|
+
});
|
|
14277
|
+
}
|
|
13668
14278
|
const normalized = chunk.text.trim();
|
|
13669
14279
|
if (!normalized) {
|
|
13670
14280
|
emptyChunks += 1;
|
|
@@ -13721,6 +14331,13 @@ var ragChat = (config) => {
|
|
|
13721
14331
|
failuresByInputKind: Object.fromEntries(failuresByInputKind.entries()),
|
|
13722
14332
|
inspectedChunks,
|
|
13723
14333
|
inspectedDocuments,
|
|
14334
|
+
inspection: {
|
|
14335
|
+
chunksWithSourceLabels,
|
|
14336
|
+
documentsWithSourceLabels,
|
|
14337
|
+
sampleChunks,
|
|
14338
|
+
sampleDocuments,
|
|
14339
|
+
sourceNativeKinds: Object.fromEntries(sourceNativeKinds.entries())
|
|
14340
|
+
},
|
|
13724
14341
|
lowSignalChunks,
|
|
13725
14342
|
newestDocumentAgeMs,
|
|
13726
14343
|
oldestDocumentAgeMs,
|
|
@@ -14901,7 +15518,14 @@ var ragChat = (config) => {
|
|
|
14901
15518
|
}
|
|
14902
15519
|
const documents = await indexManager.listDocuments({ kind });
|
|
14903
15520
|
return {
|
|
14904
|
-
documents
|
|
15521
|
+
documents: documents.map((document) => ({
|
|
15522
|
+
...document,
|
|
15523
|
+
labels: buildRAGSourceLabels({
|
|
15524
|
+
metadata: document.metadata,
|
|
15525
|
+
source: document.source,
|
|
15526
|
+
title: document.title
|
|
15527
|
+
})
|
|
15528
|
+
})),
|
|
14905
15529
|
ok: true
|
|
14906
15530
|
};
|
|
14907
15531
|
};
|
|
@@ -14961,7 +15585,23 @@ var ragChat = (config) => {
|
|
|
14961
15585
|
}
|
|
14962
15586
|
return {
|
|
14963
15587
|
ok: true,
|
|
14964
|
-
...preview
|
|
15588
|
+
...preview,
|
|
15589
|
+
document: {
|
|
15590
|
+
...preview.document,
|
|
15591
|
+
labels: buildRAGSourceLabels({
|
|
15592
|
+
metadata: preview.document.metadata,
|
|
15593
|
+
source: preview.document.source,
|
|
15594
|
+
title: preview.document.title
|
|
15595
|
+
})
|
|
15596
|
+
},
|
|
15597
|
+
chunks: preview.chunks.map((chunk) => ({
|
|
15598
|
+
...chunk,
|
|
15599
|
+
labels: buildRAGSourceLabels({
|
|
15600
|
+
metadata: chunk.metadata,
|
|
15601
|
+
source: chunk.source ?? preview.document.source,
|
|
15602
|
+
title: chunk.title ?? preview.document.title
|
|
15603
|
+
})
|
|
15604
|
+
}))
|
|
14965
15605
|
};
|
|
14966
15606
|
};
|
|
14967
15607
|
const handleDeleteDocument = async (id) => {
|
|
@@ -20557,5 +21197,5 @@ export {
|
|
|
20557
21197
|
aiChat
|
|
20558
21198
|
};
|
|
20559
21199
|
|
|
20560
|
-
//# debugId=
|
|
21200
|
+
//# debugId=5C4A7D98C1C2BE6B64756E2164756E21
|
|
20561
21201
|
//# sourceMappingURL=index.js.map
|