@absolutejs/absolute 0.19.0-beta.494 → 0.19.0-beta.496

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/ai/index.js CHANGED
@@ -2199,8 +2199,10 @@ var extractWeightedLexicalFields = (result) => {
2199
2199
  const archivePath = typeof metadata.archivePath === "string" ? metadata.archivePath : source.includes("#") ? source.split("#")[1] ?? "" : "";
2200
2200
  const mediaSegments = Array.isArray(metadata.mediaSegments) ? metadata.mediaSegments.map((segment) => segment && typeof segment === "object" ? toFieldText(segment) : "").filter(Boolean).join(" ") : "";
2201
2201
  const metadataFocus = [
2202
+ metadata.sourceNativeKind,
2202
2203
  metadata.sheetName,
2203
2204
  metadata.sheetNames,
2205
+ metadata.slideNumber,
2204
2206
  metadata.slideTitle,
2205
2207
  metadata.slideTitles,
2206
2208
  metadata.threadTopic,
@@ -3232,6 +3234,30 @@ var spreadsheetText = (entries) => {
3232
3234
  return normalizeWhitespace(sheetValues.join(`
3233
3235
  `));
3234
3236
  };
3237
/**
 * Extracts the cell text of every worksheet in an XLSX archive, one record per sheet.
 *
 * @param {Array<{path: string, data: unknown}>} entries - Unzipped archive entries;
 *   `data` is handed to `decodeUtf8`.
 * @returns {Array<{name: string, text: string}>} One record per worksheet that yields
 *   non-empty text. `name` comes from `spreadsheetSheetNames(entries)` by position and
 *   falls back to `Sheet N` when no name exists at that position.
 */
var spreadsheetSheetTexts = (entries) => {
  // Cells refer to shared strings by their position in this list.
  const sharedStrings = entries
    .filter((entry) => entry.path === "xl/sharedStrings.xml")
    .flatMap((entry) => [
      ...decodeUtf8(entry.data).matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)
    ].map((match) => decodeHtmlEntities(match[1] ?? "")));
  const sheetNames = spreadsheetSheetNames(entries);
  // Numeric-aware ordering: a plain lexicographic compare puts sheet10.xml before
  // sheet2.xml, which would pair worksheets with the wrong entry of `sheetNames`.
  const sheetEntries = entries
    .filter((entry) => entry.path.startsWith("xl/worksheets/") && entry.path.endsWith(".xml"))
    .sort((left, right) => left.path.localeCompare(right.path, undefined, { numeric: true }));
  return sheetEntries.map((entry, index) => {
    const values = [
      ...decodeUtf8(entry.data).matchAll(/<v>([\s\S]*?)<\/v>/g)
    ].map((match) => {
      const value = match[1] ?? "";
      // Only a bare run of digits can be a shared-string index. Number("") is 0 and
      // Number("1e2") is 100, so a looser check would swap empty or scientific-notation
      // values for unrelated shared strings.
      if (!/^\d+$/.test(value)) {
        return value;
      }
      // NOTE(review): the cell's `t` attribute is not inspected here, so a numeric cell
      // whose value happens to be a valid index is still replaced — confirm whether
      // cell-type-aware decoding is wanted.
      return sharedStrings[Number(value)] || value;
    });
    const text = normalizeWhitespace(values.join("\n"));
    if (!text) {
      return null;
    }
    return {
      // `index` is the position before empty sheets are dropped, so names stay aligned.
      name: sheetNames[index] ?? `Sheet ${index + 1}`,
      text
    };
  }).filter((entry) => Boolean(entry));
};
3235
3261
  var spreadsheetSheetNames = (entries) => entries.filter((entry) => entry.path === "xl/workbook.xml").flatMap((entry) => [
3236
3262
  ...decodeUtf8(entry.data).matchAll(/<sheet[^>]*name="([^"]+)"/g)
3237
3263
  ].map((match) => match[1] ?? "")).filter(Boolean);
@@ -3241,6 +3267,10 @@ var presentationText = (entries) => {
3241
3267
 
3242
3268
  `));
3243
3269
  };
3270
/**
 * Extracts the text of every slide in a PPTX archive, one record per slide.
 *
 * @param {Array<{path: string, data: unknown}>} entries - Unzipped archive entries;
 *   `data` is handed to `decodeUtf8`.
 * @returns {Array<{index: number, text: string}>} Slides that yield non-empty text.
 *   `index` is the zero-based position of the slide file in sorted path order.
 */
var presentationSlides = (entries) => entries
  .filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml"))
  // Numeric-aware ordering: a plain lexicographic compare puts slide10.xml before
  // slide2.xml, which would give every slide past the ninth the wrong index.
  .sort((left, right) => left.path.localeCompare(right.path, undefined, { numeric: true }))
  .map((entry, index) => ({
    index,
    text: normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)))
  }))
  // The index is assigned before this filter, so dropping an empty slide leaves a gap
  // instead of renumbering the slides after it.
  .filter((slide) => Boolean(slide.text));
3244
3274
  var presentationSlideCount = (entries) => entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).length;
3245
3275
  var epubText = (entries) => {
3246
3276
  const htmlEntries = entries.filter((entry) => /\.(xhtml|html|htm)$/i.test(entry.path));
@@ -3458,6 +3488,7 @@ var createOfficeDocumentExtractor = () => ({
3458
3488
  const entries = unzipEntries(input.data);
3459
3489
  let text = "";
3460
3490
  let officeMetadata = {};
3491
+ let structuredDocuments = [];
3461
3492
  if (extension === ".docx" || extension === ".odt") {
3462
3493
  text = officeDocumentText(entries);
3463
3494
  officeMetadata = {
@@ -3465,19 +3496,55 @@ var createOfficeDocumentExtractor = () => ({
3465
3496
  };
3466
3497
  } else if (extension === ".xlsx" || extension === ".ods") {
3467
3498
  text = spreadsheetText(entries);
3499
+ const sheets = spreadsheetSheetTexts(entries);
3468
3500
  officeMetadata = {
3469
3501
  sheetNames: spreadsheetSheetNames(entries)
3470
3502
  };
3503
+ structuredDocuments = sheets.map((sheet, index) => ({
3504
+ chunking: input.chunking,
3505
+ contentType: input.contentType,
3506
+ format: "text",
3507
+ metadata: {
3508
+ ...input.metadata ?? {},
3509
+ fileKind: "office",
3510
+ ...officeMetadata,
3511
+ sourceNativeKind: "spreadsheet_sheet",
3512
+ sheetIndex: index,
3513
+ sheetName: sheet.name
3514
+ },
3515
+ source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}${extension || ".office"}`,
3516
+ text: normalizeWhitespace(`Spreadsheet sheet ${sheet.name} from ${input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME}.
3517
+ ${sheet.text}`),
3518
+ title: input.title ? `${input.title} \xB7 ${sheet.name}` : sheet.name
3519
+ }));
3471
3520
  } else if (extension === ".pptx" || extension === ".odp") {
3472
3521
  text = presentationText(entries);
3522
+ const slides = presentationSlides(entries);
3473
3523
  officeMetadata = {
3474
3524
  slideCount: presentationSlideCount(entries)
3475
3525
  };
3526
+ structuredDocuments = slides.map((slide) => ({
3527
+ chunking: input.chunking,
3528
+ contentType: input.contentType,
3529
+ format: "text",
3530
+ metadata: {
3531
+ ...input.metadata ?? {},
3532
+ fileKind: "office",
3533
+ ...officeMetadata,
3534
+ sourceNativeKind: "presentation_slide",
3535
+ slideIndex: slide.index,
3536
+ slideNumber: slide.index + 1
3537
+ },
3538
+ source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}${extension || ".office"}`,
3539
+ text: normalizeWhitespace(`Presentation slide ${slide.index + 1} from ${input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME}.
3540
+ ${slide.text}`),
3541
+ title: input.title ? `${input.title} \xB7 Slide ${slide.index + 1}` : `Slide ${slide.index + 1}`
3542
+ }));
3476
3543
  }
3477
3544
  if (!text) {
3478
3545
  throw new Error(`AbsoluteJS could not extract readable text from ${inferNameFromInput(input)}`);
3479
3546
  }
3480
- return {
3547
+ const summaryDocument = {
3481
3548
  chunking: input.chunking,
3482
3549
  contentType: input.contentType,
3483
3550
  format: "text",
@@ -3490,6 +3557,7 @@ var createOfficeDocumentExtractor = () => ({
3490
3557
  text,
3491
3558
  title: input.title
3492
3559
  };
3560
+ return [summaryDocument, ...structuredDocuments];
3493
3561
  }
3494
3562
  });
3495
3563
  var createRAGArchiveExpander = (expander) => expander;
@@ -3519,7 +3587,37 @@ var createRAGMediaFileExtractor = (transcriber) => ({
3519
3587
  supports: mediaExtractorSupports,
3520
3588
  extract: async (input) => {
3521
3589
  const result = await transcriber.transcribe(input);
3522
- return {
3590
+ const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.media.txt`;
3591
+ const segmentDocuments = [];
3592
+ for (const [index, segment] of (result.segments ?? []).entries()) {
3593
+ const text = normalizeWhitespace(segment.text ?? "");
3594
+ if (!text) {
3595
+ continue;
3596
+ }
3597
+ const startMs = typeof segment.startMs === "number" ? segment.startMs : undefined;
3598
+ const endMs = typeof segment.endMs === "number" ? segment.endMs : undefined;
3599
+ segmentDocuments.push({
3600
+ chunking: input.chunking,
3601
+ contentType: input.contentType,
3602
+ format: "text",
3603
+ metadata: {
3604
+ ...input.metadata ?? {},
3605
+ ...result.metadata ?? {},
3606
+ fileKind: "media",
3607
+ sourceNativeKind: "media_segment",
3608
+ mediaSegmentIndex: index,
3609
+ mediaSegmentStartMs: startMs,
3610
+ mediaSegmentEndMs: endMs,
3611
+ mediaSegments: [segment],
3612
+ speaker: typeof segment.speaker === "string" ? segment.speaker : undefined
3613
+ },
3614
+ source,
3615
+ text: normalizeWhitespace(`Media transcript segment${typeof startMs === "number" ? ` ${startMs}-${endMs ?? startMs}ms` : ""} from ${input.title ?? input.name ?? input.path ?? DEFAULT_BINARY_NAME}.
3616
+ ${text}`),
3617
+ title: input.title ? `${input.title} \xB7 Segment ${index + 1}` : `Segment ${index + 1}`
3618
+ });
3619
+ }
3620
+ const summaryDocument = {
3523
3621
  chunking: input.chunking,
3524
3622
  contentType: input.contentType,
3525
3623
  format: "text",
@@ -3529,10 +3627,11 @@ var createRAGMediaFileExtractor = (transcriber) => ({
3529
3627
  fileKind: "media",
3530
3628
  mediaSegments: result.segments
3531
3629
  },
3532
- source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.media.txt`,
3630
+ source,
3533
3631
  text: result.text,
3534
3632
  title: result.title ?? input.title
3535
3633
  };
3634
+ return [summaryDocument, ...segmentDocuments];
3536
3635
  }
3537
3636
  });
3538
3637
  var createRAGMediaTranscriber = (transcriber) => transcriber;
@@ -3564,7 +3663,7 @@ var expandArchiveEntry = async (entry, archiveInput, extractors) => {
3564
3663
  },
3565
3664
  name: basename(entry.path),
3566
3665
  source: archiveInput.source && !archiveInput.source.startsWith("http") ? `${archiveInput.source}#${entry.path}` : entry.path,
3567
- title: archiveInput.title
3666
+ title: basename(entry.path)
3568
3667
  }, extractors);
3569
3668
  return documents;
3570
3669
  };
@@ -3693,6 +3792,7 @@ var getFirstExtractedDocument = (documents, label) => {
3693
3792
  }
3694
3793
  return document;
3695
3794
  };
3795
/**
 * Thin async wrapper that forwards to `extractRAGFileDocuments`.
 *
 * @param {*} input - File input forwarded unchanged.
 * @param {*} extractors - Extractors forwarded unchanged.
 * @returns {Promise<*>} Resolves with whatever `extractRAGFileDocuments` produces.
 */
var loadExtractedDocuments = async (input, extractors) => {
  return extractRAGFileDocuments(input, extractors);
};
3696
3796
  var sentenceUnits = (text) => {
3697
3797
  const matches = text.match(/[^.!?\n]+(?:[.!?]+|$)/g);
3698
3798
  if (!matches) {
@@ -3915,32 +4015,55 @@ var loadRAGDocumentFromURL = async (input) => {
3915
4015
  };
3916
4016
  var loadRAGDocumentsFromUploads = async (input) => {
3917
4017
  const documents = await Promise.all(input.uploads.map(async (upload) => {
3918
- const loaded = await loadRAGDocumentUpload({
3919
- ...upload,
3920
- extractors: input.extractors
3921
- });
3922
- return {
3923
- ...loaded,
3924
- metadata: mergeMetadata(loaded.metadata, { uploadFile: upload.name }, input.baseMetadata)
3925
- };
4018
+ const loaded = await loadExtractedDocuments({
4019
+ chunking: upload.chunking,
4020
+ contentType: upload.contentType,
4021
+ data: decodeUploadContent(upload),
4022
+ format: upload.format,
4023
+ metadata: upload.metadata,
4024
+ name: upload.name,
4025
+ source: upload.source ?? upload.name,
4026
+ title: upload.title
4027
+ }, input.extractors);
4028
+ return loaded.map((document) => ({
4029
+ ...document,
4030
+ metadata: mergeMetadata(document.metadata, { uploadFile: upload.name }, input.baseMetadata)
4031
+ }));
3926
4032
  }));
3927
4033
  return {
3928
4034
  defaultChunking: input.defaultChunking,
3929
- documents
4035
+ documents: documents.flat()
3930
4036
  };
3931
4037
  };
3932
4038
  var loadRAGDocumentsFromURLs = async (input) => {
3933
- const documents = await Promise.all(input.urls.map(async (urlInput) => loadRAGDocumentFromURL({
3934
- ...urlInput,
3935
- metadata: mergeMetadata(urlInput.metadata, {
3936
- sourceUrl: urlInput.url
3937
- }, input.baseMetadata),
3938
- contentType: urlInput.contentType,
3939
- extractors: urlInput.extractors ?? input.extractors
3940
- })));
4039
+ const documents = await Promise.all(input.urls.map(async (urlInput) => {
4040
+ const url = urlInput.url.trim();
4041
+ if (!url) {
4042
+ throw new Error("RAG URL is required");
4043
+ }
4044
+ const response = await fetch(url);
4045
+ if (!response.ok) {
4046
+ throw new Error(`Failed to fetch RAG URL ${url}: ${response.status} ${response.statusText}`);
4047
+ }
4048
+ const data = new Uint8Array(await response.arrayBuffer());
4049
+ const loaded = await loadExtractedDocuments({
4050
+ chunking: urlInput.chunking,
4051
+ contentType: urlInput.contentType ?? response.headers.get("content-type") ?? undefined,
4052
+ data,
4053
+ format: urlInput.format ?? inferFormatFromUrl(url),
4054
+ metadata: urlInput.metadata,
4055
+ name: basename(new URL(url).pathname),
4056
+ source: urlInput.source ?? url,
4057
+ title: urlInput.title
4058
+ }, urlInput.extractors ?? input.extractors);
4059
+ return loaded.map((document) => ({
4060
+ ...document,
4061
+ metadata: mergeMetadata(document.metadata, { sourceUrl: urlInput.url }, input.baseMetadata)
4062
+ }));
4063
+ }));
3941
4064
  return {
3942
4065
  defaultChunking: input.defaultChunking,
3943
- documents
4066
+ documents: documents.flat()
3944
4067
  };
3945
4068
  };
3946
4069
  var loadRAGDocumentUpload = async (input) => {
@@ -4020,21 +4143,25 @@ var loadRAGDocumentsFromDirectory = async (input) => {
4020
4143
  const files = await collectDirectoryFiles(root, input.recursive !== false, includeExtensions);
4021
4144
  const documents = await Promise.all(files.map(async (path) => {
4022
4145
  const source = relative(root, path).replace(/\\/g, "/");
4023
- const loaded = await loadRAGDocumentFile({
4146
+ const data = await readFile(path);
4147
+ const loaded = await loadExtractedDocuments({
4148
+ chunking: input.defaultChunking,
4149
+ data,
4024
4150
  metadata: {
4025
- ...input.baseMetadata ?? {},
4026
4151
  fileName: basename(path),
4027
4152
  relativePath: source
4028
4153
  },
4029
4154
  path,
4030
- source,
4031
- extractors: input.extractors
4032
- });
4033
- return loaded;
4155
+ source
4156
+ }, input.extractors);
4157
+ return loaded.map((document) => ({
4158
+ ...document,
4159
+ metadata: mergeMetadata(document.metadata, undefined, input.baseMetadata)
4160
+ }));
4034
4161
  }));
4035
4162
  return {
4036
4163
  defaultChunking: input.defaultChunking,
4037
- documents
4164
+ documents: documents.flat()
4038
4165
  };
4039
4166
  };
4040
4167
  var prepareRAGDirectoryDocuments = async (input) => prepareRAGDocuments(await loadRAGDocumentsFromDirectory(input));
@@ -8773,5 +8900,5 @@ export {
8773
8900
  aiChat
8774
8901
  };
8775
8902
 
8776
- //# debugId=F37A373F20F3691864756E2164756E21
8903
+ //# debugId=7CB5678D47F7B89564756E2164756E21
8777
8904
  //# sourceMappingURL=index.js.map