@absolutejs/absolute 0.19.0-beta.494 → 0.19.0-beta.496
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/index.js +157 -30
- package/dist/ai/index.js.map +4 -4
- package/dist/src/ai/rag/ingestion.d.ts +22 -2
- package/package.json +1 -1
package/dist/ai/index.js
CHANGED
|
@@ -2199,8 +2199,10 @@ var extractWeightedLexicalFields = (result) => {
|
|
|
2199
2199
|
const archivePath = typeof metadata.archivePath === "string" ? metadata.archivePath : source.includes("#") ? source.split("#")[1] ?? "" : "";
|
|
2200
2200
|
const mediaSegments = Array.isArray(metadata.mediaSegments) ? metadata.mediaSegments.map((segment) => segment && typeof segment === "object" ? toFieldText(segment) : "").filter(Boolean).join(" ") : "";
|
|
2201
2201
|
const metadataFocus = [
|
|
2202
|
+
metadata.sourceNativeKind,
|
|
2202
2203
|
metadata.sheetName,
|
|
2203
2204
|
metadata.sheetNames,
|
|
2205
|
+
metadata.slideNumber,
|
|
2204
2206
|
metadata.slideTitle,
|
|
2205
2207
|
metadata.slideTitles,
|
|
2206
2208
|
metadata.threadTopic,
|
|
@@ -3232,6 +3234,30 @@ var spreadsheetText = (entries) => {
|
|
|
3232
3234
|
return normalizeWhitespace(sheetValues.join(`
|
|
3233
3235
|
`));
|
|
3234
3236
|
};
|
|
3237
|
+
var spreadsheetSheetTexts = (entries) => {
|
|
3238
|
+
const sharedStrings = entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
|
|
3239
|
+
...decodeUtf8(entry.data).matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)
|
|
3240
|
+
].map((match) => decodeHtmlEntities(match[1] ?? "")));
|
|
3241
|
+
const sheetNames = spreadsheetSheetNames(entries);
|
|
3242
|
+
const sheetEntries = entries.filter((entry) => entry.path.startsWith("xl/worksheets/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path));
|
|
3243
|
+
return sheetEntries.map((entry, index) => {
|
|
3244
|
+
const values = [
|
|
3245
|
+
...decodeUtf8(entry.data).matchAll(/<v>([\s\S]*?)<\/v>/g)
|
|
3246
|
+
].map((match) => match[1] ?? "").map((value) => {
|
|
3247
|
+
const sharedStringIndex = Number(value);
|
|
3248
|
+
return Number.isInteger(sharedStringIndex) && sharedStrings[sharedStringIndex] ? sharedStrings[sharedStringIndex] : value;
|
|
3249
|
+
});
|
|
3250
|
+
const text = normalizeWhitespace(values.join(`
|
|
3251
|
+
`));
|
|
3252
|
+
if (!text) {
|
|
3253
|
+
return null;
|
|
3254
|
+
}
|
|
3255
|
+
return {
|
|
3256
|
+
name: sheetNames[index] ?? `Sheet ${index + 1}`,
|
|
3257
|
+
text
|
|
3258
|
+
};
|
|
3259
|
+
}).filter((entry) => Boolean(entry));
|
|
3260
|
+
};
|
|
3235
3261
|
var spreadsheetSheetNames = (entries) => entries.filter((entry) => entry.path === "xl/workbook.xml").flatMap((entry) => [
|
|
3236
3262
|
...decodeUtf8(entry.data).matchAll(/<sheet[^>]*name="([^"]+)"/g)
|
|
3237
3263
|
].map((match) => match[1] ?? "")).filter(Boolean);
|
|
@@ -3241,6 +3267,10 @@ var presentationText = (entries) => {
|
|
|
3241
3267
|
|
|
3242
3268
|
`));
|
|
3243
3269
|
};
|
|
3270
|
+
var presentationSlides = (entries) => entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path)).map((entry, index) => ({
|
|
3271
|
+
index,
|
|
3272
|
+
text: normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)))
|
|
3273
|
+
})).filter((slide) => Boolean(slide.text));
|
|
3244
3274
|
var presentationSlideCount = (entries) => entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).length;
|
|
3245
3275
|
var epubText = (entries) => {
|
|
3246
3276
|
const htmlEntries = entries.filter((entry) => /\.(xhtml|html|htm)$/i.test(entry.path));
|
|
@@ -3458,6 +3488,7 @@ var createOfficeDocumentExtractor = () => ({
|
|
|
3458
3488
|
const entries = unzipEntries(input.data);
|
|
3459
3489
|
let text = "";
|
|
3460
3490
|
let officeMetadata = {};
|
|
3491
|
+
let structuredDocuments = [];
|
|
3461
3492
|
if (extension === ".docx" || extension === ".odt") {
|
|
3462
3493
|
text = officeDocumentText(entries);
|
|
3463
3494
|
officeMetadata = {
|
|
@@ -3465,19 +3496,55 @@ var createOfficeDocumentExtractor = () => ({
|
|
|
3465
3496
|
};
|
|
3466
3497
|
} else if (extension === ".xlsx" || extension === ".ods") {
|
|
3467
3498
|
text = spreadsheetText(entries);
|
|
3499
|
+
const sheets = spreadsheetSheetTexts(entries);
|
|
3468
3500
|
officeMetadata = {
|
|
3469
3501
|
sheetNames: spreadsheetSheetNames(entries)
|
|
3470
3502
|
};
|
|
3503
|
+
structuredDocuments = sheets.map((sheet, index) => ({
|
|
3504
|
+
chunking: input.chunking,
|
|
3505
|
+
contentType: input.contentType,
|
|
3506
|
+
format: "text",
|
|
3507
|
+
metadata: {
|
|
3508
|
+
...input.metadata ?? {},
|
|
3509
|
+
fileKind: "office",
|
|
3510
|
+
...officeMetadata,
|
|
3511
|
+
sourceNativeKind: "spreadsheet_sheet",
|
|
3512
|
+
sheetIndex: index,
|
|
3513
|
+
sheetName: sheet.name
|
|
3514
|
+
},
|
|
3515
|
+
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}${extension || ".office"}`,
|
|
3516
|
+
text: normalizeWhitespace(`Spreadsheet sheet ${sheet.name} from ${input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME}.
|
|
3517
|
+
${sheet.text}`),
|
|
3518
|
+
title: input.title ? `${input.title} \xB7 ${sheet.name}` : sheet.name
|
|
3519
|
+
}));
|
|
3471
3520
|
} else if (extension === ".pptx" || extension === ".odp") {
|
|
3472
3521
|
text = presentationText(entries);
|
|
3522
|
+
const slides = presentationSlides(entries);
|
|
3473
3523
|
officeMetadata = {
|
|
3474
3524
|
slideCount: presentationSlideCount(entries)
|
|
3475
3525
|
};
|
|
3526
|
+
structuredDocuments = slides.map((slide) => ({
|
|
3527
|
+
chunking: input.chunking,
|
|
3528
|
+
contentType: input.contentType,
|
|
3529
|
+
format: "text",
|
|
3530
|
+
metadata: {
|
|
3531
|
+
...input.metadata ?? {},
|
|
3532
|
+
fileKind: "office",
|
|
3533
|
+
...officeMetadata,
|
|
3534
|
+
sourceNativeKind: "presentation_slide",
|
|
3535
|
+
slideIndex: slide.index,
|
|
3536
|
+
slideNumber: slide.index + 1
|
|
3537
|
+
},
|
|
3538
|
+
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}${extension || ".office"}`,
|
|
3539
|
+
text: normalizeWhitespace(`Presentation slide ${slide.index + 1} from ${input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME}.
|
|
3540
|
+
${slide.text}`),
|
|
3541
|
+
title: input.title ? `${input.title} \xB7 Slide ${slide.index + 1}` : `Slide ${slide.index + 1}`
|
|
3542
|
+
}));
|
|
3476
3543
|
}
|
|
3477
3544
|
if (!text) {
|
|
3478
3545
|
throw new Error(`AbsoluteJS could not extract readable text from ${inferNameFromInput(input)}`);
|
|
3479
3546
|
}
|
|
3480
|
-
|
|
3547
|
+
const summaryDocument = {
|
|
3481
3548
|
chunking: input.chunking,
|
|
3482
3549
|
contentType: input.contentType,
|
|
3483
3550
|
format: "text",
|
|
@@ -3490,6 +3557,7 @@ var createOfficeDocumentExtractor = () => ({
|
|
|
3490
3557
|
text,
|
|
3491
3558
|
title: input.title
|
|
3492
3559
|
};
|
|
3560
|
+
return [summaryDocument, ...structuredDocuments];
|
|
3493
3561
|
}
|
|
3494
3562
|
});
|
|
3495
3563
|
var createRAGArchiveExpander = (expander) => expander;
|
|
@@ -3519,7 +3587,37 @@ var createRAGMediaFileExtractor = (transcriber) => ({
|
|
|
3519
3587
|
supports: mediaExtractorSupports,
|
|
3520
3588
|
extract: async (input) => {
|
|
3521
3589
|
const result = await transcriber.transcribe(input);
|
|
3522
|
-
|
|
3590
|
+
const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.media.txt`;
|
|
3591
|
+
const segmentDocuments = [];
|
|
3592
|
+
for (const [index, segment] of (result.segments ?? []).entries()) {
|
|
3593
|
+
const text = normalizeWhitespace(segment.text ?? "");
|
|
3594
|
+
if (!text) {
|
|
3595
|
+
continue;
|
|
3596
|
+
}
|
|
3597
|
+
const startMs = typeof segment.startMs === "number" ? segment.startMs : undefined;
|
|
3598
|
+
const endMs = typeof segment.endMs === "number" ? segment.endMs : undefined;
|
|
3599
|
+
segmentDocuments.push({
|
|
3600
|
+
chunking: input.chunking,
|
|
3601
|
+
contentType: input.contentType,
|
|
3602
|
+
format: "text",
|
|
3603
|
+
metadata: {
|
|
3604
|
+
...input.metadata ?? {},
|
|
3605
|
+
...result.metadata ?? {},
|
|
3606
|
+
fileKind: "media",
|
|
3607
|
+
sourceNativeKind: "media_segment",
|
|
3608
|
+
mediaSegmentIndex: index,
|
|
3609
|
+
mediaSegmentStartMs: startMs,
|
|
3610
|
+
mediaSegmentEndMs: endMs,
|
|
3611
|
+
mediaSegments: [segment],
|
|
3612
|
+
speaker: typeof segment.speaker === "string" ? segment.speaker : undefined
|
|
3613
|
+
},
|
|
3614
|
+
source,
|
|
3615
|
+
text: normalizeWhitespace(`Media transcript segment${typeof startMs === "number" ? ` ${startMs}-${endMs ?? startMs}ms` : ""} from ${input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME}.
|
|
3616
|
+
${text}`),
|
|
3617
|
+
title: input.title ? `${input.title} \xB7 Segment ${index + 1}` : `Segment ${index + 1}`
|
|
3618
|
+
});
|
|
3619
|
+
}
|
|
3620
|
+
const summaryDocument = {
|
|
3523
3621
|
chunking: input.chunking,
|
|
3524
3622
|
contentType: input.contentType,
|
|
3525
3623
|
format: "text",
|
|
@@ -3529,10 +3627,11 @@ var createRAGMediaFileExtractor = (transcriber) => ({
|
|
|
3529
3627
|
fileKind: "media",
|
|
3530
3628
|
mediaSegments: result.segments
|
|
3531
3629
|
},
|
|
3532
|
-
source
|
|
3630
|
+
source,
|
|
3533
3631
|
text: result.text,
|
|
3534
3632
|
title: result.title ?? input.title
|
|
3535
3633
|
};
|
|
3634
|
+
return [summaryDocument, ...segmentDocuments];
|
|
3536
3635
|
}
|
|
3537
3636
|
});
|
|
3538
3637
|
var createRAGMediaTranscriber = (transcriber) => transcriber;
|
|
@@ -3564,7 +3663,7 @@ var expandArchiveEntry = async (entry, archiveInput, extractors) => {
|
|
|
3564
3663
|
},
|
|
3565
3664
|
name: basename(entry.path),
|
|
3566
3665
|
source: archiveInput.source && !archiveInput.source.startsWith("http") ? `${archiveInput.source}#${entry.path}` : entry.path,
|
|
3567
|
-
title:
|
|
3666
|
+
title: basename(entry.path)
|
|
3568
3667
|
}, extractors);
|
|
3569
3668
|
return documents;
|
|
3570
3669
|
};
|
|
@@ -3693,6 +3792,7 @@ var getFirstExtractedDocument = (documents, label) => {
|
|
|
3693
3792
|
}
|
|
3694
3793
|
return document;
|
|
3695
3794
|
};
|
|
3795
|
+
var loadExtractedDocuments = async (input, extractors) => extractRAGFileDocuments(input, extractors);
|
|
3696
3796
|
var sentenceUnits = (text) => {
|
|
3697
3797
|
const matches = text.match(/[^.!?\n]+(?:[.!?]+|$)/g);
|
|
3698
3798
|
if (!matches) {
|
|
@@ -3915,32 +4015,55 @@ var loadRAGDocumentFromURL = async (input) => {
|
|
|
3915
4015
|
};
|
|
3916
4016
|
var loadRAGDocumentsFromUploads = async (input) => {
|
|
3917
4017
|
const documents = await Promise.all(input.uploads.map(async (upload) => {
|
|
3918
|
-
const loaded = await
|
|
3919
|
-
|
|
3920
|
-
|
|
3921
|
-
|
|
3922
|
-
|
|
3923
|
-
|
|
3924
|
-
|
|
3925
|
-
|
|
4018
|
+
const loaded = await loadExtractedDocuments({
|
|
4019
|
+
chunking: upload.chunking,
|
|
4020
|
+
contentType: upload.contentType,
|
|
4021
|
+
data: decodeUploadContent(upload),
|
|
4022
|
+
format: upload.format,
|
|
4023
|
+
metadata: upload.metadata,
|
|
4024
|
+
name: upload.name,
|
|
4025
|
+
source: upload.source ?? upload.name,
|
|
4026
|
+
title: upload.title
|
|
4027
|
+
}, input.extractors);
|
|
4028
|
+
return loaded.map((document) => ({
|
|
4029
|
+
...document,
|
|
4030
|
+
metadata: mergeMetadata(document.metadata, { uploadFile: upload.name }, input.baseMetadata)
|
|
4031
|
+
}));
|
|
3926
4032
|
}));
|
|
3927
4033
|
return {
|
|
3928
4034
|
defaultChunking: input.defaultChunking,
|
|
3929
|
-
documents
|
|
4035
|
+
documents: documents.flat()
|
|
3930
4036
|
};
|
|
3931
4037
|
};
|
|
3932
4038
|
var loadRAGDocumentsFromURLs = async (input) => {
|
|
3933
|
-
const documents = await Promise.all(input.urls.map(async (urlInput) =>
|
|
3934
|
-
|
|
3935
|
-
|
|
3936
|
-
|
|
3937
|
-
}
|
|
3938
|
-
|
|
3939
|
-
|
|
3940
|
-
|
|
4039
|
+
const documents = await Promise.all(input.urls.map(async (urlInput) => {
|
|
4040
|
+
const url = urlInput.url.trim();
|
|
4041
|
+
if (!url) {
|
|
4042
|
+
throw new Error("RAG URL is required");
|
|
4043
|
+
}
|
|
4044
|
+
const response = await fetch(url);
|
|
4045
|
+
if (!response.ok) {
|
|
4046
|
+
throw new Error(`Failed to fetch RAG URL ${url}: ${response.status} ${response.statusText}`);
|
|
4047
|
+
}
|
|
4048
|
+
const data = new Uint8Array(await response.arrayBuffer());
|
|
4049
|
+
const loaded = await loadExtractedDocuments({
|
|
4050
|
+
chunking: urlInput.chunking,
|
|
4051
|
+
contentType: urlInput.contentType ?? response.headers.get("content-type") ?? undefined,
|
|
4052
|
+
data,
|
|
4053
|
+
format: urlInput.format ?? inferFormatFromUrl(url),
|
|
4054
|
+
metadata: urlInput.metadata,
|
|
4055
|
+
name: basename(new URL(url).pathname),
|
|
4056
|
+
source: urlInput.source ?? url,
|
|
4057
|
+
title: urlInput.title
|
|
4058
|
+
}, urlInput.extractors ?? input.extractors);
|
|
4059
|
+
return loaded.map((document) => ({
|
|
4060
|
+
...document,
|
|
4061
|
+
metadata: mergeMetadata(document.metadata, { sourceUrl: urlInput.url }, input.baseMetadata)
|
|
4062
|
+
}));
|
|
4063
|
+
}));
|
|
3941
4064
|
return {
|
|
3942
4065
|
defaultChunking: input.defaultChunking,
|
|
3943
|
-
documents
|
|
4066
|
+
documents: documents.flat()
|
|
3944
4067
|
};
|
|
3945
4068
|
};
|
|
3946
4069
|
var loadRAGDocumentUpload = async (input) => {
|
|
@@ -4020,21 +4143,25 @@ var loadRAGDocumentsFromDirectory = async (input) => {
|
|
|
4020
4143
|
const files = await collectDirectoryFiles(root, input.recursive !== false, includeExtensions);
|
|
4021
4144
|
const documents = await Promise.all(files.map(async (path) => {
|
|
4022
4145
|
const source = relative(root, path).replace(/\\/g, "/");
|
|
4023
|
-
const
|
|
4146
|
+
const data = await readFile(path);
|
|
4147
|
+
const loaded = await loadExtractedDocuments({
|
|
4148
|
+
chunking: input.defaultChunking,
|
|
4149
|
+
data,
|
|
4024
4150
|
metadata: {
|
|
4025
|
-
...input.baseMetadata ?? {},
|
|
4026
4151
|
fileName: basename(path),
|
|
4027
4152
|
relativePath: source
|
|
4028
4153
|
},
|
|
4029
4154
|
path,
|
|
4030
|
-
source
|
|
4031
|
-
|
|
4032
|
-
|
|
4033
|
-
|
|
4155
|
+
source
|
|
4156
|
+
}, input.extractors);
|
|
4157
|
+
return loaded.map((document) => ({
|
|
4158
|
+
...document,
|
|
4159
|
+
metadata: mergeMetadata(document.metadata, undefined, input.baseMetadata)
|
|
4160
|
+
}));
|
|
4034
4161
|
}));
|
|
4035
4162
|
return {
|
|
4036
4163
|
defaultChunking: input.defaultChunking,
|
|
4037
|
-
documents
|
|
4164
|
+
documents: documents.flat()
|
|
4038
4165
|
};
|
|
4039
4166
|
};
|
|
4040
4167
|
var prepareRAGDirectoryDocuments = async (input) => prepareRAGDocuments(await loadRAGDocumentsFromDirectory(input));
|
|
@@ -8773,5 +8900,5 @@ export {
|
|
|
8773
8900
|
aiChat
|
|
8774
8901
|
};
|
|
8775
8902
|
|
|
8776
|
-
//# debugId=
|
|
8903
|
+
//# debugId=7CB5678D47F7B89564756E2164756E21
|
|
8777
8904
|
//# sourceMappingURL=index.js.map
|