@absolutejs/absolute 0.19.0-beta.494 → 0.19.0-beta.495

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
package/dist/ai/index.js CHANGED
@@ -3232,6 +3232,30 @@ var spreadsheetText = (entries) => {
3232
3232
  return normalizeWhitespace(sheetValues.join(`
3233
3233
  `));
3234
3234
  };
3235
+ var spreadsheetSheetTexts = (entries) => {
3236
+ const sharedStrings = entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
3237
+ ...decodeUtf8(entry.data).matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)
3238
+ ].map((match) => decodeHtmlEntities(match[1] ?? "")));
3239
+ const sheetNames = spreadsheetSheetNames(entries);
3240
+ const sheetEntries = entries.filter((entry) => entry.path.startsWith("xl/worksheets/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path));
3241
+ return sheetEntries.map((entry, index) => {
3242
+ const values = [
3243
+ ...decodeUtf8(entry.data).matchAll(/<v>([\s\S]*?)<\/v>/g)
3244
+ ].map((match) => match[1] ?? "").map((value) => {
3245
+ const sharedStringIndex = Number(value);
3246
+ return Number.isInteger(sharedStringIndex) && sharedStrings[sharedStringIndex] ? sharedStrings[sharedStringIndex] : value;
3247
+ });
3248
+ const text = normalizeWhitespace(values.join(`
3249
+ `));
3250
+ if (!text) {
3251
+ return null;
3252
+ }
3253
+ return {
3254
+ name: sheetNames[index] ?? `Sheet ${index + 1}`,
3255
+ text
3256
+ };
3257
+ }).filter((entry) => Boolean(entry));
3258
+ };
3235
3259
  var spreadsheetSheetNames = (entries) => entries.filter((entry) => entry.path === "xl/workbook.xml").flatMap((entry) => [
3236
3260
  ...decodeUtf8(entry.data).matchAll(/<sheet[^>]*name="([^"]+)"/g)
3237
3261
  ].map((match) => match[1] ?? "")).filter(Boolean);
@@ -3241,6 +3265,10 @@ var presentationText = (entries) => {
3241
3265
 
3242
3266
  `));
3243
3267
  };
3268
+ var presentationSlides = (entries) => entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path)).map((entry, index) => ({
3269
+ index,
3270
+ text: normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)))
3271
+ })).filter((slide) => Boolean(slide.text));
3244
3272
  var presentationSlideCount = (entries) => entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).length;
3245
3273
  var epubText = (entries) => {
3246
3274
  const htmlEntries = entries.filter((entry) => /\.(xhtml|html|htm)$/i.test(entry.path));
@@ -3458,6 +3486,7 @@ var createOfficeDocumentExtractor = () => ({
3458
3486
  const entries = unzipEntries(input.data);
3459
3487
  let text = "";
3460
3488
  let officeMetadata = {};
3489
+ let structuredDocuments = [];
3461
3490
  if (extension === ".docx" || extension === ".odt") {
3462
3491
  text = officeDocumentText(entries);
3463
3492
  officeMetadata = {
@@ -3465,19 +3494,53 @@ var createOfficeDocumentExtractor = () => ({
3465
3494
  };
3466
3495
  } else if (extension === ".xlsx" || extension === ".ods") {
3467
3496
  text = spreadsheetText(entries);
3497
+ const sheets = spreadsheetSheetTexts(entries);
3468
3498
  officeMetadata = {
3469
3499
  sheetNames: spreadsheetSheetNames(entries)
3470
3500
  };
3501
+ structuredDocuments = sheets.map((sheet, index) => ({
3502
+ chunking: input.chunking,
3503
+ contentType: input.contentType,
3504
+ format: "text",
3505
+ metadata: {
3506
+ ...input.metadata ?? {},
3507
+ fileKind: "office",
3508
+ ...officeMetadata,
3509
+ sheetIndex: index,
3510
+ sheetName: sheet.name
3511
+ },
3512
+ source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}${extension || ".office"}`,
3513
+ text: `Sheet ${sheet.name}
3514
+ ${sheet.text}`,
3515
+ title: input.title ? `${input.title} \xB7 ${sheet.name}` : sheet.name
3516
+ }));
3471
3517
  } else if (extension === ".pptx" || extension === ".odp") {
3472
3518
  text = presentationText(entries);
3519
+ const slides = presentationSlides(entries);
3473
3520
  officeMetadata = {
3474
3521
  slideCount: presentationSlideCount(entries)
3475
3522
  };
3523
+ structuredDocuments = slides.map((slide) => ({
3524
+ chunking: input.chunking,
3525
+ contentType: input.contentType,
3526
+ format: "text",
3527
+ metadata: {
3528
+ ...input.metadata ?? {},
3529
+ fileKind: "office",
3530
+ ...officeMetadata,
3531
+ slideIndex: slide.index,
3532
+ slideNumber: slide.index + 1
3533
+ },
3534
+ source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}${extension || ".office"}`,
3535
+ text: `Slide ${slide.index + 1}
3536
+ ${slide.text}`,
3537
+ title: input.title ? `${input.title} \xB7 Slide ${slide.index + 1}` : `Slide ${slide.index + 1}`
3538
+ }));
3476
3539
  }
3477
3540
  if (!text) {
3478
3541
  throw new Error(`AbsoluteJS could not extract readable text from ${inferNameFromInput(input)}`);
3479
3542
  }
3480
- return {
3543
+ const summaryDocument = {
3481
3544
  chunking: input.chunking,
3482
3545
  contentType: input.contentType,
3483
3546
  format: "text",
@@ -3490,6 +3553,7 @@ var createOfficeDocumentExtractor = () => ({
3490
3553
  text,
3491
3554
  title: input.title
3492
3555
  };
3556
+ return [summaryDocument, ...structuredDocuments];
3493
3557
  }
3494
3558
  });
3495
3559
  var createRAGArchiveExpander = (expander) => expander;
@@ -3519,7 +3583,36 @@ var createRAGMediaFileExtractor = (transcriber) => ({
3519
3583
  supports: mediaExtractorSupports,
3520
3584
  extract: async (input) => {
3521
3585
  const result = await transcriber.transcribe(input);
3522
- return {
3586
+ const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.media.txt`;
3587
+ const segmentDocuments = [];
3588
+ for (const [index, segment] of (result.segments ?? []).entries()) {
3589
+ const text = normalizeWhitespace(segment.text ?? "");
3590
+ if (!text) {
3591
+ continue;
3592
+ }
3593
+ const startMs = typeof segment.startMs === "number" ? segment.startMs : undefined;
3594
+ const endMs = typeof segment.endMs === "number" ? segment.endMs : undefined;
3595
+ segmentDocuments.push({
3596
+ chunking: input.chunking,
3597
+ contentType: input.contentType,
3598
+ format: "text",
3599
+ metadata: {
3600
+ ...input.metadata ?? {},
3601
+ ...result.metadata ?? {},
3602
+ fileKind: "media",
3603
+ mediaSegmentIndex: index,
3604
+ mediaSegmentStartMs: startMs,
3605
+ mediaSegmentEndMs: endMs,
3606
+ mediaSegments: [segment],
3607
+ speaker: typeof segment.speaker === "string" ? segment.speaker : undefined
3608
+ },
3609
+ source,
3610
+ text: `Transcript segment${typeof startMs === "number" ? ` ${startMs}-${endMs ?? startMs}ms` : ""}
3611
+ ${text}`,
3612
+ title: input.title ? `${input.title} \xB7 Segment ${index + 1}` : `Segment ${index + 1}`
3613
+ });
3614
+ }
3615
+ const summaryDocument = {
3523
3616
  chunking: input.chunking,
3524
3617
  contentType: input.contentType,
3525
3618
  format: "text",
@@ -3529,10 +3622,11 @@ var createRAGMediaFileExtractor = (transcriber) => ({
3529
3622
  fileKind: "media",
3530
3623
  mediaSegments: result.segments
3531
3624
  },
3532
- source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.media.txt`,
3625
+ source,
3533
3626
  text: result.text,
3534
3627
  title: result.title ?? input.title
3535
3628
  };
3629
+ return [summaryDocument, ...segmentDocuments];
3536
3630
  }
3537
3631
  });
3538
3632
  var createRAGMediaTranscriber = (transcriber) => transcriber;
@@ -3564,7 +3658,7 @@ var expandArchiveEntry = async (entry, archiveInput, extractors) => {
3564
3658
  },
3565
3659
  name: basename(entry.path),
3566
3660
  source: archiveInput.source && !archiveInput.source.startsWith("http") ? `${archiveInput.source}#${entry.path}` : entry.path,
3567
- title: archiveInput.title
3661
+ title: basename(entry.path)
3568
3662
  }, extractors);
3569
3663
  return documents;
3570
3664
  };
@@ -3693,6 +3787,7 @@ var getFirstExtractedDocument = (documents, label) => {
3693
3787
  }
3694
3788
  return document;
3695
3789
  };
3790
+ var loadExtractedDocuments = async (input, extractors) => extractRAGFileDocuments(input, extractors);
3696
3791
  var sentenceUnits = (text) => {
3697
3792
  const matches = text.match(/[^.!?\n]+(?:[.!?]+|$)/g);
3698
3793
  if (!matches) {
@@ -3915,32 +4010,55 @@ var loadRAGDocumentFromURL = async (input) => {
3915
4010
  };
3916
4011
  var loadRAGDocumentsFromUploads = async (input) => {
3917
4012
  const documents = await Promise.all(input.uploads.map(async (upload) => {
3918
- const loaded = await loadRAGDocumentUpload({
3919
- ...upload,
3920
- extractors: input.extractors
3921
- });
3922
- return {
3923
- ...loaded,
3924
- metadata: mergeMetadata(loaded.metadata, { uploadFile: upload.name }, input.baseMetadata)
3925
- };
4013
+ const loaded = await loadExtractedDocuments({
4014
+ chunking: upload.chunking,
4015
+ contentType: upload.contentType,
4016
+ data: decodeUploadContent(upload),
4017
+ format: upload.format,
4018
+ metadata: upload.metadata,
4019
+ name: upload.name,
4020
+ source: upload.source ?? upload.name,
4021
+ title: upload.title
4022
+ }, input.extractors);
4023
+ return loaded.map((document) => ({
4024
+ ...document,
4025
+ metadata: mergeMetadata(document.metadata, { uploadFile: upload.name }, input.baseMetadata)
4026
+ }));
3926
4027
  }));
3927
4028
  return {
3928
4029
  defaultChunking: input.defaultChunking,
3929
- documents
4030
+ documents: documents.flat()
3930
4031
  };
3931
4032
  };
3932
4033
  var loadRAGDocumentsFromURLs = async (input) => {
3933
- const documents = await Promise.all(input.urls.map(async (urlInput) => loadRAGDocumentFromURL({
3934
- ...urlInput,
3935
- metadata: mergeMetadata(urlInput.metadata, {
3936
- sourceUrl: urlInput.url
3937
- }, input.baseMetadata),
3938
- contentType: urlInput.contentType,
3939
- extractors: urlInput.extractors ?? input.extractors
3940
- })));
4034
+ const documents = await Promise.all(input.urls.map(async (urlInput) => {
4035
+ const url = urlInput.url.trim();
4036
+ if (!url) {
4037
+ throw new Error("RAG URL is required");
4038
+ }
4039
+ const response = await fetch(url);
4040
+ if (!response.ok) {
4041
+ throw new Error(`Failed to fetch RAG URL ${url}: ${response.status} ${response.statusText}`);
4042
+ }
4043
+ const data = new Uint8Array(await response.arrayBuffer());
4044
+ const loaded = await loadExtractedDocuments({
4045
+ chunking: urlInput.chunking,
4046
+ contentType: urlInput.contentType ?? response.headers.get("content-type") ?? undefined,
4047
+ data,
4048
+ format: urlInput.format ?? inferFormatFromUrl(url),
4049
+ metadata: urlInput.metadata,
4050
+ name: basename(new URL(url).pathname),
4051
+ source: urlInput.source ?? url,
4052
+ title: urlInput.title
4053
+ }, urlInput.extractors ?? input.extractors);
4054
+ return loaded.map((document) => ({
4055
+ ...document,
4056
+ metadata: mergeMetadata(document.metadata, { sourceUrl: urlInput.url }, input.baseMetadata)
4057
+ }));
4058
+ }));
3941
4059
  return {
3942
4060
  defaultChunking: input.defaultChunking,
3943
- documents
4061
+ documents: documents.flat()
3944
4062
  };
3945
4063
  };
3946
4064
  var loadRAGDocumentUpload = async (input) => {
@@ -4020,21 +4138,25 @@ var loadRAGDocumentsFromDirectory = async (input) => {
4020
4138
  const files = await collectDirectoryFiles(root, input.recursive !== false, includeExtensions);
4021
4139
  const documents = await Promise.all(files.map(async (path) => {
4022
4140
  const source = relative(root, path).replace(/\\/g, "/");
4023
- const loaded = await loadRAGDocumentFile({
4141
+ const data = await readFile(path);
4142
+ const loaded = await loadExtractedDocuments({
4143
+ chunking: input.defaultChunking,
4144
+ data,
4024
4145
  metadata: {
4025
- ...input.baseMetadata ?? {},
4026
4146
  fileName: basename(path),
4027
4147
  relativePath: source
4028
4148
  },
4029
4149
  path,
4030
- source,
4031
- extractors: input.extractors
4032
- });
4033
- return loaded;
4150
+ source
4151
+ }, input.extractors);
4152
+ return loaded.map((document) => ({
4153
+ ...document,
4154
+ metadata: mergeMetadata(document.metadata, undefined, input.baseMetadata)
4155
+ }));
4034
4156
  }));
4035
4157
  return {
4036
4158
  defaultChunking: input.defaultChunking,
4037
- documents
4159
+ documents: documents.flat()
4038
4160
  };
4039
4161
  };
4040
4162
  var prepareRAGDirectoryDocuments = async (input) => prepareRAGDocuments(await loadRAGDocumentsFromDirectory(input));
@@ -8773,5 +8895,5 @@ export {
8773
8895
  aiChat
8774
8896
  };
8775
8897
 
8776
- //# debugId=F37A373F20F3691864756E2164756E21
8898
+ //# debugId=A1829EEFE0D80F9264756E2164756E21
8777
8899
  //# sourceMappingURL=index.js.map