@absolutejs/absolute 0.19.0-beta.494 → 0.19.0-beta.495
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/index.js +152 -30
- package/dist/ai/index.js.map +3 -3
- package/dist/src/ai/rag/ingestion.d.ts +22 -2
- package/package.json +1 -1
package/dist/ai/index.js
CHANGED
|
@@ -3232,6 +3232,30 @@ var spreadsheetText = (entries) => {
|
|
|
3232
3232
|
return normalizeWhitespace(sheetValues.join(`
|
|
3233
3233
|
`));
|
|
3234
3234
|
};
|
|
3235
|
+
/**
 * Extracts the cell text of each worksheet in an unzipped XLSX archive.
 *
 * @param {Array<{path: string, data: *}>} entries - Archive entries; `data` is
 *   handed to `decodeUtf8` unchanged.
 * @returns {Array<{name: string, text: string}>} One record per worksheet that
 *   produced non-empty text, in numeric path order.
 */
var spreadsheetSheetTexts = (entries) => {
  // One <si> element is one shared-string slot. Rich text splits a slot into
  // several <t> runs, so the runs are joined per <si>; indexing by individual
  // <t> would shift every later slot. Phonetic runs (<rPh>) are dropped because
  // they are reading hints rather than cell text.
  const sharedStrings = entries
    .filter((entry) => entry.path === "xl/sharedStrings.xml")
    .flatMap((entry) => [
      ...decodeUtf8(entry.data).matchAll(/<si\b[^>]*?(?:\/>|>([\s\S]*?)<\/si>)/g)
    ].map((item) => {
      const visibleRuns = (item[1] ?? "").replace(/<rPh\b[\s\S]*?<\/rPh>/g, "");
      return [...visibleRuns.matchAll(/<t\b[^>]*?(?:\/>|>([\s\S]*?)<\/t>)/g)]
        .map((run) => decodeHtmlEntities(run[1] ?? ""))
        .join("");
    }));
  const sheetNames = spreadsheetSheetNames(entries);
  // Numeric collation keeps sheet2.xml ahead of sheet10.xml.
  // NOTE(review): names are still paired with files by position; this assumes
  // the workbook lists sheets in the same order as the part numbering — confirm.
  const sheetEntries = entries
    .filter((entry) => entry.path.startsWith("xl/worksheets/") && entry.path.endsWith(".xml"))
    .sort((left, right) => left.path.localeCompare(right.path, undefined, { numeric: true }));
  return sheetEntries.map((entry, index) => {
    const cells = [
      ...decodeUtf8(entry.data).matchAll(/<c\b([^>]*?)(?:\/>|>([\s\S]*?)<\/c>)/g)
    ];
    const values = cells.flatMap((cell) => {
      const rawValue = /<v\b[^>]*>([\s\S]*?)<\/v>/.exec(cell[2] ?? "")?.[1] ?? "";
      if (rawValue.trim() === "") {
        // Self-closing or value-less cells carry no text.
        return [];
      }
      const cellType = /\bt=["']([^"']*)["']/.exec(cell[1] ?? "")?.[1];
      if (cellType !== "s") {
        // Only t="s" cells hold a shared-string index; any other value is the
        // literal cell content and must not be looked up in the string table.
        return [decodeHtmlEntities(rawValue)];
      }
      const sharedStringIndex = Number(rawValue);
      return [
        Number.isInteger(sharedStringIndex)
          ? (sharedStrings[sharedStringIndex] ?? rawValue)
          : rawValue
      ];
    });
    const text = normalizeWhitespace(values.join("\n"));
    if (!text) {
      return null;
    }
    return {
      name: sheetNames[index] ?? `Sheet ${index + 1}`,
      text
    };
  }).filter((sheet) => Boolean(sheet));
};
|
|
3235
3259
|
/**
 * Lists the sheet names declared in `xl/workbook.xml`, in document order.
 *
 * Names are attribute values, so XML entities are decoded (`R&amp;D` becomes
 * `R&D`), and both double- and single-quoted attributes are accepted.
 *
 * @param {Array<{path: string, data: *}>} entries - Archive entries; `data` is
 *   handed to `decodeUtf8` unchanged.
 * @returns {string[]} Non-empty sheet names; empty when no workbook part exists.
 */
var spreadsheetSheetNames = (entries) => entries
  .filter((entry) => entry.path === "xl/workbook.xml")
  .flatMap((entry) => [
    // `\b` keeps the enclosing <sheets> element from being treated as a sheet.
    ...decodeUtf8(entry.data).matchAll(/<sheet\b[^>]*?\sname=(?:"([^"]+)"|'([^']+)')/g)
  ].map((match) => decodeHtmlEntities(match[1] ?? match[2] ?? "")))
  .filter(Boolean);
|
|
@@ -3241,6 +3265,10 @@ var presentationText = (entries) => {
|
|
|
3241
3265
|
|
|
3242
3266
|
`));
|
|
3243
3267
|
};
|
|
3268
|
+
/**
 * Extracts the text of each slide part under `ppt/slides/`.
 *
 * Paths are ordered with numeric collation so that `slide2.xml` precedes
 * `slide10.xml`; a plain lexicographic sort would give the wrong `index` to
 * every slide in a deck of ten or more.
 *
 * @param {Array<{path: string, data: *}>} entries - Archive entries; `data` is
 *   handed to `decodeUtf8` unchanged.
 * @returns {Array<{index: number, text: string}>} Slides with non-empty text.
 *   `index` is the zero-based position among all slide parts and is assigned
 *   before empty slides are dropped, so gaps are possible.
 */
var presentationSlides = (entries) => entries
  .filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml"))
  .sort((left, right) => left.path.localeCompare(right.path, undefined, { numeric: true }))
  .map((entry, index) => ({
    index,
    text: normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)))
  }))
  .filter((slide) => Boolean(slide.text));
|
|
3244
3272
|
/**
 * Counts the archive entries whose path is under `ppt/slides/` and ends in `.xml`.
 *
 * @param {Array<{path: string}>} entries - Archive entries.
 * @returns {number} Number of matching entries.
 */
var presentationSlideCount = (entries) => entries.reduce((slideTotal, entry) => {
  const isSlidePart = entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml");
  return isSlidePart ? slideTotal + 1 : slideTotal;
}, 0);
|
|
3245
3273
|
var epubText = (entries) => {
|
|
3246
3274
|
const htmlEntries = entries.filter((entry) => /\.(xhtml|html|htm)$/i.test(entry.path));
|
|
@@ -3458,6 +3486,7 @@ var createOfficeDocumentExtractor = () => ({
|
|
|
3458
3486
|
const entries = unzipEntries(input.data);
|
|
3459
3487
|
let text = "";
|
|
3460
3488
|
let officeMetadata = {};
|
|
3489
|
+
let structuredDocuments = [];
|
|
3461
3490
|
if (extension === ".docx" || extension === ".odt") {
|
|
3462
3491
|
text = officeDocumentText(entries);
|
|
3463
3492
|
officeMetadata = {
|
|
@@ -3465,19 +3494,53 @@ var createOfficeDocumentExtractor = () => ({
|
|
|
3465
3494
|
};
|
|
3466
3495
|
} else if (extension === ".xlsx" || extension === ".ods") {
|
|
3467
3496
|
text = spreadsheetText(entries);
|
|
3497
|
+
const sheets = spreadsheetSheetTexts(entries);
|
|
3468
3498
|
officeMetadata = {
|
|
3469
3499
|
sheetNames: spreadsheetSheetNames(entries)
|
|
3470
3500
|
};
|
|
3501
|
+
structuredDocuments = sheets.map((sheet, index) => ({
|
|
3502
|
+
chunking: input.chunking,
|
|
3503
|
+
contentType: input.contentType,
|
|
3504
|
+
format: "text",
|
|
3505
|
+
metadata: {
|
|
3506
|
+
...input.metadata ?? {},
|
|
3507
|
+
fileKind: "office",
|
|
3508
|
+
...officeMetadata,
|
|
3509
|
+
sheetIndex: index,
|
|
3510
|
+
sheetName: sheet.name
|
|
3511
|
+
},
|
|
3512
|
+
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}${extension || ".office"}`,
|
|
3513
|
+
text: `Sheet ${sheet.name}
|
|
3514
|
+
${sheet.text}`,
|
|
3515
|
+
title: input.title ? `${input.title} \xB7 ${sheet.name}` : sheet.name
|
|
3516
|
+
}));
|
|
3471
3517
|
} else if (extension === ".pptx" || extension === ".odp") {
|
|
3472
3518
|
text = presentationText(entries);
|
|
3519
|
+
const slides = presentationSlides(entries);
|
|
3473
3520
|
officeMetadata = {
|
|
3474
3521
|
slideCount: presentationSlideCount(entries)
|
|
3475
3522
|
};
|
|
3523
|
+
structuredDocuments = slides.map((slide) => ({
|
|
3524
|
+
chunking: input.chunking,
|
|
3525
|
+
contentType: input.contentType,
|
|
3526
|
+
format: "text",
|
|
3527
|
+
metadata: {
|
|
3528
|
+
...input.metadata ?? {},
|
|
3529
|
+
fileKind: "office",
|
|
3530
|
+
...officeMetadata,
|
|
3531
|
+
slideIndex: slide.index,
|
|
3532
|
+
slideNumber: slide.index + 1
|
|
3533
|
+
},
|
|
3534
|
+
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}${extension || ".office"}`,
|
|
3535
|
+
text: `Slide ${slide.index + 1}
|
|
3536
|
+
${slide.text}`,
|
|
3537
|
+
title: input.title ? `${input.title} \xB7 Slide ${slide.index + 1}` : `Slide ${slide.index + 1}`
|
|
3538
|
+
}));
|
|
3476
3539
|
}
|
|
3477
3540
|
if (!text) {
|
|
3478
3541
|
throw new Error(`AbsoluteJS could not extract readable text from ${inferNameFromInput(input)}`);
|
|
3479
3542
|
}
|
|
3480
|
-
|
|
3543
|
+
const summaryDocument = {
|
|
3481
3544
|
chunking: input.chunking,
|
|
3482
3545
|
contentType: input.contentType,
|
|
3483
3546
|
format: "text",
|
|
@@ -3490,6 +3553,7 @@ var createOfficeDocumentExtractor = () => ({
|
|
|
3490
3553
|
text,
|
|
3491
3554
|
title: input.title
|
|
3492
3555
|
};
|
|
3556
|
+
return [summaryDocument, ...structuredDocuments];
|
|
3493
3557
|
}
|
|
3494
3558
|
});
|
|
3495
3559
|
/**
 * Identity helper: hands back the archive expander it was given, unchanged.
 *
 * @param {*} expander - The expander definition.
 * @returns {*} The same `expander` reference.
 */
var createRAGArchiveExpander = (expander) => {
  return expander;
};
|
|
@@ -3519,7 +3583,36 @@ var createRAGMediaFileExtractor = (transcriber) => ({
|
|
|
3519
3583
|
supports: mediaExtractorSupports,
|
|
3520
3584
|
extract: async (input) => {
|
|
3521
3585
|
const result = await transcriber.transcribe(input);
|
|
3522
|
-
|
|
3586
|
+
const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.media.txt`;
|
|
3587
|
+
const segmentDocuments = [];
|
|
3588
|
+
for (const [index, segment] of (result.segments ?? []).entries()) {
|
|
3589
|
+
const text = normalizeWhitespace(segment.text ?? "");
|
|
3590
|
+
if (!text) {
|
|
3591
|
+
continue;
|
|
3592
|
+
}
|
|
3593
|
+
const startMs = typeof segment.startMs === "number" ? segment.startMs : undefined;
|
|
3594
|
+
const endMs = typeof segment.endMs === "number" ? segment.endMs : undefined;
|
|
3595
|
+
segmentDocuments.push({
|
|
3596
|
+
chunking: input.chunking,
|
|
3597
|
+
contentType: input.contentType,
|
|
3598
|
+
format: "text",
|
|
3599
|
+
metadata: {
|
|
3600
|
+
...input.metadata ?? {},
|
|
3601
|
+
...result.metadata ?? {},
|
|
3602
|
+
fileKind: "media",
|
|
3603
|
+
mediaSegmentIndex: index,
|
|
3604
|
+
mediaSegmentStartMs: startMs,
|
|
3605
|
+
mediaSegmentEndMs: endMs,
|
|
3606
|
+
mediaSegments: [segment],
|
|
3607
|
+
speaker: typeof segment.speaker === "string" ? segment.speaker : undefined
|
|
3608
|
+
},
|
|
3609
|
+
source,
|
|
3610
|
+
text: `Transcript segment${typeof startMs === "number" ? ` ${startMs}-${endMs ?? startMs}ms` : ""}
|
|
3611
|
+
${text}`,
|
|
3612
|
+
title: input.title ? `${input.title} \xB7 Segment ${index + 1}` : `Segment ${index + 1}`
|
|
3613
|
+
});
|
|
3614
|
+
}
|
|
3615
|
+
const summaryDocument = {
|
|
3523
3616
|
chunking: input.chunking,
|
|
3524
3617
|
contentType: input.contentType,
|
|
3525
3618
|
format: "text",
|
|
@@ -3529,10 +3622,11 @@ var createRAGMediaFileExtractor = (transcriber) => ({
|
|
|
3529
3622
|
fileKind: "media",
|
|
3530
3623
|
mediaSegments: result.segments
|
|
3531
3624
|
},
|
|
3532
|
-
source
|
|
3625
|
+
source,
|
|
3533
3626
|
text: result.text,
|
|
3534
3627
|
title: result.title ?? input.title
|
|
3535
3628
|
};
|
|
3629
|
+
return [summaryDocument, ...segmentDocuments];
|
|
3536
3630
|
}
|
|
3537
3631
|
});
|
|
3538
3632
|
/**
 * Identity helper: hands back the media transcriber it was given, unchanged.
 *
 * @param {*} transcriber - The transcriber definition.
 * @returns {*} The same `transcriber` reference.
 */
var createRAGMediaTranscriber = (transcriber) => {
  return transcriber;
};
|
|
@@ -3564,7 +3658,7 @@ var expandArchiveEntry = async (entry, archiveInput, extractors) => {
|
|
|
3564
3658
|
},
|
|
3565
3659
|
name: basename(entry.path),
|
|
3566
3660
|
source: archiveInput.source && !archiveInput.source.startsWith("http") ? `${archiveInput.source}#${entry.path}` : entry.path,
|
|
3567
|
-
title:
|
|
3661
|
+
title: basename(entry.path)
|
|
3568
3662
|
}, extractors);
|
|
3569
3663
|
return documents;
|
|
3570
3664
|
};
|
|
@@ -3693,6 +3787,7 @@ var getFirstExtractedDocument = (documents, label) => {
|
|
|
3693
3787
|
}
|
|
3694
3788
|
return document;
|
|
3695
3789
|
};
|
|
3790
|
+
/**
 * Thin async wrapper that delegates to `extractRAGFileDocuments`.
 *
 * @param {*} input - Passed through as the first argument.
 * @param {*} extractors - Passed through as the second argument.
 * @returns {Promise<*>} Resolves to whatever `extractRAGFileDocuments` yields.
 */
var loadExtractedDocuments = async (input, extractors) => {
  return extractRAGFileDocuments(input, extractors);
};
|
|
3696
3791
|
var sentenceUnits = (text) => {
|
|
3697
3792
|
const matches = text.match(/[^.!?\n]+(?:[.!?]+|$)/g);
|
|
3698
3793
|
if (!matches) {
|
|
@@ -3915,32 +4010,55 @@ var loadRAGDocumentFromURL = async (input) => {
|
|
|
3915
4010
|
};
|
|
3916
4011
|
/**
 * Runs every upload through `loadExtractedDocuments` concurrently and returns
 * all resulting documents as one flat list.
 *
 * Each document's metadata is rebuilt with `mergeMetadata`, receiving the
 * document's own metadata, `{ uploadFile: upload.name }`, and
 * `input.baseMetadata`, in that order.
 *
 * @param {object} input - Reads `uploads`, `extractors`, `baseMetadata` and
 *   `defaultChunking`.
 * @returns {Promise<{defaultChunking: *, documents: Array}>}
 */
var loadRAGDocumentsFromUploads = async (input) => {
  const extractUpload = async (upload) => {
    const extracted = await loadExtractedDocuments({
      chunking: upload.chunking,
      contentType: upload.contentType,
      data: decodeUploadContent(upload),
      format: upload.format,
      metadata: upload.metadata,
      name: upload.name,
      source: upload.source ?? upload.name,
      title: upload.title
    }, input.extractors);
    return extracted.map((document) => ({
      ...document,
      metadata: mergeMetadata(document.metadata, { uploadFile: upload.name }, input.baseMetadata)
    }));
  };
  const documentsPerUpload = await Promise.all(
    input.uploads.map((upload) => extractUpload(upload))
  );
  return {
    defaultChunking: input.defaultChunking,
    documents: documentsPerUpload.flat()
  };
};
|
|
3932
4033
|
/**
 * Fetches every URL concurrently, runs each response body through
 * `loadExtractedDocuments`, and returns all resulting documents as one flat list.
 *
 * Per-URL `extractors` take precedence over `input.extractors`. Each document's
 * metadata is rebuilt with `mergeMetadata`, receiving the document's own
 * metadata, `{ sourceUrl: urlInput.url }`, and `input.baseMetadata`.
 *
 * @param {object} input - Reads `urls`, `extractors`, `baseMetadata` and
 *   `defaultChunking`.
 * @returns {Promise<{defaultChunking: *, documents: Array}>}
 * @throws {Error} When a URL is blank after trimming, or a response is not ok.
 */
var loadRAGDocumentsFromURLs = async (input) => {
  const fetchAndExtract = async (urlInput) => {
    const url = urlInput.url.trim();
    if (url.length === 0) {
      throw new Error("RAG URL is required");
    }
    const response = await fetch(url);
    if (response.ok === false || !response.ok) {
      throw new Error(`Failed to fetch RAG URL ${url}: ${response.status} ${response.statusText}`);
    }
    const body = await response.arrayBuffer();
    const data = new Uint8Array(body);
    const extracted = await loadExtractedDocuments({
      chunking: urlInput.chunking,
      contentType: urlInput.contentType ?? response.headers.get("content-type") ?? undefined,
      data,
      format: urlInput.format ?? inferFormatFromUrl(url),
      metadata: urlInput.metadata,
      name: basename(new URL(url).pathname),
      source: urlInput.source ?? url,
      title: urlInput.title
    }, urlInput.extractors ?? input.extractors);
    return extracted.map((document) => ({
      ...document,
      metadata: mergeMetadata(document.metadata, { sourceUrl: urlInput.url }, input.baseMetadata)
    }));
  };
  const documentsPerUrl = await Promise.all(
    input.urls.map((urlInput) => fetchAndExtract(urlInput))
  );
  return {
    defaultChunking: input.defaultChunking,
    documents: documentsPerUrl.flat()
  };
};
|
|
3946
4064
|
var loadRAGDocumentUpload = async (input) => {
|
|
@@ -4020,21 +4138,25 @@ var loadRAGDocumentsFromDirectory = async (input) => {
|
|
|
4020
4138
|
const files = await collectDirectoryFiles(root, input.recursive !== false, includeExtensions);
|
|
4021
4139
|
const documents = await Promise.all(files.map(async (path) => {
|
|
4022
4140
|
const source = relative(root, path).replace(/\\/g, "/");
|
|
4023
|
-
const
|
|
4141
|
+
const data = await readFile(path);
|
|
4142
|
+
const loaded = await loadExtractedDocuments({
|
|
4143
|
+
chunking: input.defaultChunking,
|
|
4144
|
+
data,
|
|
4024
4145
|
metadata: {
|
|
4025
|
-
...input.baseMetadata ?? {},
|
|
4026
4146
|
fileName: basename(path),
|
|
4027
4147
|
relativePath: source
|
|
4028
4148
|
},
|
|
4029
4149
|
path,
|
|
4030
|
-
source
|
|
4031
|
-
|
|
4032
|
-
|
|
4033
|
-
|
|
4150
|
+
source
|
|
4151
|
+
}, input.extractors);
|
|
4152
|
+
return loaded.map((document) => ({
|
|
4153
|
+
...document,
|
|
4154
|
+
metadata: mergeMetadata(document.metadata, undefined, input.baseMetadata)
|
|
4155
|
+
}));
|
|
4034
4156
|
}));
|
|
4035
4157
|
return {
|
|
4036
4158
|
defaultChunking: input.defaultChunking,
|
|
4037
|
-
documents
|
|
4159
|
+
documents: documents.flat()
|
|
4038
4160
|
};
|
|
4039
4161
|
};
|
|
4040
4162
|
/**
 * Loads documents via `loadRAGDocumentsFromDirectory` and passes the result to
 * `prepareRAGDocuments`.
 *
 * @param {*} input - Forwarded to `loadRAGDocumentsFromDirectory`.
 * @returns {Promise<*>} Resolves to whatever `prepareRAGDocuments` returns.
 */
var prepareRAGDirectoryDocuments = async (input) => {
  const loadedDirectory = await loadRAGDocumentsFromDirectory(input);
  return prepareRAGDocuments(loadedDirectory);
};
|
|
@@ -8773,5 +8895,5 @@ export {
|
|
|
8773
8895
|
aiChat
|
|
8774
8896
|
};
|
|
8775
8897
|
|
|
8776
|
-
//# debugId=
|
|
8898
|
+
//# debugId=A1829EEFE0D80F9264756E2164756E21
|
|
8777
8899
|
//# sourceMappingURL=index.js.map
|