@absolutejs/absolute 0.19.0-beta.473 → 0.19.0-beta.474

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/ai/index.js CHANGED
@@ -2455,6 +2455,11 @@ var extractTextFromPDFBytes = (data) => {
2455
2455
  `);
2456
2456
  return normalizeWhitespace(combined);
2457
2457
  };
2458
/**
 * Estimates the number of pages in a PDF from its raw bytes.
 *
 * The bytes are decoded as latin1 and every `/Type /Page` marker is counted.
 * The trailing word boundary in the pattern keeps `/Type /Pages` from being
 * counted. When no marker is found the estimate falls back to 1.
 *
 * @param data - Raw PDF bytes (anything accepted by `Buffer.from`).
 * @returns The number of markers found, or 1 when there are none.
 */
var estimatePDFPageCount = (data) => {
  const pageMarker = /\/Type\s*\/Page\b/g;
  const latin1Text = Buffer.from(data).toString("latin1");
  let pages = 0;
  // Matches are never empty, so the global regex always advances.
  while (pageMarker.exec(latin1Text) !== null) {
    pages += 1;
  }
  if (pages > 0) {
    return pages;
  }
  return 1;
};
2458
2463
  var readUInt16LE = (data, offset) => data[offset] | data[offset + 1] << 8;
2459
2464
  var readUInt32LE = (data, offset) => (data[offset] | data[offset + 1] << 8 | data[offset + 2] << 16 | data[offset + 3] << 24) >>> 0;
2460
2465
  var decodeUtf8 = (data) => Buffer.from(data).toString("utf8");
@@ -2545,6 +2550,14 @@ var officeDocumentText = (entries) => {
2545
2550
  }
2546
2551
  return extractXmlText(decodeUtf8(documentEntry.data));
2547
2552
  };
2553
/**
 * Counts the `<w:p` opening tags in the `word/document.xml` entry of an
 * unzipped office document.
 *
 * Only the first entry whose path is exactly `word/document.xml` is examined.
 * The word boundary in the pattern keeps longer tag names such as `<w:pPr`
 * from being counted.
 *
 * @param entries - Unzipped archive entries exposing `path` and `data`.
 * @returns The tag count, or `undefined` when the entry is missing or contains
 *   no `<w:p` tags.
 */
var officeDocumentSectionCount = (entries) => {
  for (const entry of entries) {
    if (entry.path !== "word/document.xml") {
      continue;
    }
    const documentXml = decodeUtf8(entry.data);
    const paragraphTag = /<w:p\b/g;
    let paragraphs = 0;
    while (paragraphTag.exec(documentXml) !== null) {
      paragraphs += 1;
    }
    return paragraphs > 0 ? paragraphs : undefined;
  }
  return undefined;
};
2548
2561
  var spreadsheetText = (entries) => {
2549
2562
  const sharedStrings = entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
2550
2563
  ...decodeUtf8(entry.data).matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)
@@ -2558,12 +2571,16 @@ var spreadsheetText = (entries) => {
2558
2571
  return normalizeWhitespace(sheetValues.join(`
2559
2572
  `));
2560
2573
  };
2574
/**
 * Lists the sheet names declared in the `xl/workbook.xml` entry of an
 * unzipped spreadsheet.
 *
 * Names are read from the `name="..."` attribute of `<sheet ...>` tags. The
 * attribute value is XML-escaped in the file, so the predefined entities
 * (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`) and numeric character
 * references are decoded; a sheet called `P&L` is returned as `P&L` rather
 * than `P&amp;L`.
 *
 * @param entries - Unzipped archive entries exposing `path` and `data`.
 * @returns Sheet names in document order; empty when there is no workbook
 *   entry or it declares no named sheets.
 */
var spreadsheetSheetNames = (entries) => {
  const namedEntities = { amp: "&", apos: "'", gt: ">", lt: "<", quot: '"' };
  // Decode in a single pass so "&amp;lt;" becomes "&lt;" and is not decoded twice.
  const decodeAttribute = (value) => value.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|amp|apos|gt|lt|quot);/g, (reference, body) => {
    if (body[0] !== "#") {
      return namedEntities[body];
    }
    const codePoint = body[1] === "x" ? Number.parseInt(body.slice(2), 16) : Number.parseInt(body.slice(1), 10);
    // Leave out-of-range references untouched instead of throwing a RangeError.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : reference;
  });
  return entries
    .filter((entry) => entry.path === "xl/workbook.xml")
    .flatMap((entry) => [...decodeUtf8(entry.data).matchAll(/<sheet[^>]*name="([^"]+)"/g)])
    .map((match) => decodeAttribute(match[1]))
    .filter(Boolean);
};
2561
2577
  var presentationText = (entries) => {
2562
2578
  const slides = entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).map((entry) => extractXmlText(decodeUtf8(entry.data)));
2563
2579
  return normalizeWhitespace(slides.join(`
2564
2580
 
2565
2581
  `));
2566
2582
  };
2583
/**
 * Counts the archive entries that live under `ppt/slides/` and end in `.xml`.
 *
 * @param entries - Unzipped archive entries exposing a string `path`.
 * @returns The number of matching entries (0 when there are none).
 */
var presentationSlideCount = (entries) => {
  let slideTotal = 0;
  for (const entry of entries) {
    if (entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")) {
      slideTotal += 1;
    }
  }
  return slideTotal;
};
2567
2584
  var epubText = (entries) => {
2568
2585
  const htmlEntries = entries.filter((entry) => /\.(xhtml|html|htm)$/i.test(entry.path));
2569
2586
  return normalizeWhitespace(htmlEntries.map((entry) => stripHtml(decodeUtf8(entry.data))).join(`
@@ -2588,6 +2605,23 @@ var extractEmailText = (raw) => {
2588
2605
  }
2589
2606
  return normalizeWhitespace(body);
2590
2607
  };
2608
/**
 * Extracts the From, Subject, Thread-Topic and To headers from a raw email.
 *
 * Line endings are normalized to "\n" and only the text before the first
 * blank line (the header block) is searched, so body lines that look like
 * headers are ignored. Folded header values (continuation lines that start
 * with a space or tab) are unfolded before matching. Header names match
 * case-insensitively.
 *
 * @param raw - The raw message text.
 * @returns An object with `from`, `subject`, `threadTopic` and `to`. Each is
 *   the trimmed header value, or `undefined` when the header is missing or
 *   empty. `threadTopic` falls back to the subject when there is no
 *   Thread-Topic header.
 */
var parseEmailHeaders = (raw) => {
  const normalized = raw.replace(/\r\n?/g, "\n");
  const [headerBlock = ""] = normalized.split("\n\n");
  // Unfold: drop a line break that is followed by whitespace, keeping the whitespace.
  const unfolded = headerBlock.replace(/\n(?=[ \t])/g, "");
  const getHeader = (name) => {
    // Only spaces/tabs may follow the colon; `\s` would also consume the
    // newline of an empty header and capture the next header line instead.
    const match = unfolded.match(new RegExp(`^${name}:[ \\t]*(.*)$`, "im"));
    const value = match?.[1]?.trim();
    // Treat an empty value as absent so `??` fallbacks apply.
    return value ? value : undefined;
  };
  const subject = getHeader("Subject");
  return {
    from: getHeader("From"),
    subject,
    threadTopic: getHeader("Thread-Topic") ?? subject,
    to: getHeader("To")
  };
};
2591
2625
  var stripRTF = (value) => {
2592
2626
  const withoutBinary = value.replace(/\\bin\d+ [\s\S]*?(?=[\\}])/g, " ");
2593
2627
  const withoutControls = withoutBinary.replace(/\\par[d]?/g, `
@@ -2750,12 +2784,22 @@ var createOfficeDocumentExtractor = () => ({
2750
2784
  const extension = inferExtensionFromInput(input);
2751
2785
  const entries = unzipEntries(input.data);
2752
2786
  let text = "";
2787
+ let officeMetadata = {};
2753
2788
  if (extension === ".docx" || extension === ".odt") {
2754
2789
  text = officeDocumentText(entries);
2790
+ officeMetadata = {
2791
+ sectionCount: officeDocumentSectionCount(entries)
2792
+ };
2755
2793
  } else if (extension === ".xlsx" || extension === ".ods") {
2756
2794
  text = spreadsheetText(entries);
2795
+ officeMetadata = {
2796
+ sheetNames: spreadsheetSheetNames(entries)
2797
+ };
2757
2798
  } else if (extension === ".pptx" || extension === ".odp") {
2758
2799
  text = presentationText(entries);
2800
+ officeMetadata = {
2801
+ slideCount: presentationSlideCount(entries)
2802
+ };
2759
2803
  }
2760
2804
  if (!text) {
2761
2805
  throw new Error(`AbsoluteJS could not extract readable text from ${inferNameFromInput(input)}`);
@@ -2766,7 +2810,8 @@ var createOfficeDocumentExtractor = () => ({
2766
2810
  format: "text",
2767
2811
  metadata: {
2768
2812
  ...input.metadata ?? {},
2769
- fileKind: "office"
2813
+ fileKind: "office",
2814
+ ...officeMetadata
2770
2815
  },
2771
2816
  source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}${extension || ".office"}`,
2772
2817
  text,
@@ -2801,18 +2846,21 @@ var createEmailExtractor = () => ({
2801
2846
  supports: emailExtractorSupports,
2802
2847
  extract: (input) => {
2803
2848
  const raw = decodeUtf8(input.data);
2804
- const subjectMatch = raw.match(/^Subject:\s*(.+)$/im);
2849
+ const headers = parseEmailHeaders(raw);
2805
2850
  return {
2806
2851
  chunking: input.chunking,
2807
2852
  contentType: input.contentType,
2808
2853
  format: "text",
2809
2854
  metadata: {
2810
2855
  ...input.metadata ?? {},
2811
- fileKind: "email"
2856
+ fileKind: "email",
2857
+ from: headers.from,
2858
+ threadTopic: headers.subject,
2859
+ to: headers.to
2812
2860
  },
2813
2861
  source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.eml`,
2814
2862
  text: extractEmailText(raw),
2815
- title: input.title ?? subjectMatch?.[1]?.trim()
2863
+ title: input.title ?? headers.subject
2816
2864
  };
2817
2865
  }
2818
2866
  });
@@ -2889,7 +2937,8 @@ var createPDFFileExtractor = () => ({
2889
2937
  format: "text",
2890
2938
  metadata: {
2891
2939
  ...input.metadata ?? {},
2892
- fileKind: "pdf"
2940
+ fileKind: "pdf",
2941
+ pageCount: estimatePDFPageCount(input.data)
2893
2942
  },
2894
2943
  source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
2895
2944
  text,
@@ -2912,6 +2961,7 @@ var createRAGPDFOCRExtractor = (options) => ({
2912
2961
  metadata: {
2913
2962
  ...input.metadata ?? {},
2914
2963
  fileKind: "pdf",
2964
+ pageCount: estimatePDFPageCount(input.data),
2915
2965
  pdfTextMode: "native"
2916
2966
  },
2917
2967
  source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
@@ -2931,6 +2981,7 @@ var createRAGPDFOCRExtractor = (options) => ({
2931
2981
  ...input.metadata ?? {},
2932
2982
  ...ocr.metadata ?? {},
2933
2983
  fileKind: "pdf",
2984
+ pageCount: estimatePDFPageCount(input.data),
2934
2985
  pdfTextMode: "ocr"
2935
2986
  },
2936
2987
  source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
@@ -6437,5 +6488,5 @@ export {
6437
6488
  aiChat
6438
6489
  };
6439
6490
 
6440
- //# debugId=C9D2C43A1F5E327064756E2164756E21
6491
+ //# debugId=4C8CFD94D17CA32664756E2164756E21
6441
6492
  //# sourceMappingURL=index.js.map