@absolutejs/absolute 0.19.0-beta.473 → 0.19.0-beta.474
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/index.js +57 -6
- package/dist/ai/index.js.map +3 -3
- package/dist/angular/index.js +2 -2
- package/dist/angular/index.js.map +1 -1
- package/dist/angular/server.js +2 -2
- package/dist/angular/server.js.map +1 -1
- package/dist/build.js +2 -2
- package/dist/build.js.map +1 -1
- package/dist/index.js +2 -2
- package/dist/index.js.map +1 -1
- package/package.json +7 -7
package/dist/ai/index.js
CHANGED
|
@@ -2455,6 +2455,11 @@ var extractTextFromPDFBytes = (data) => {
|
|
|
2455
2455
|
`);
|
|
2456
2456
|
return normalizeWhitespace(combined);
|
|
2457
2457
|
};
|
|
2458
|
+
/**
 * Estimates the number of pages in a PDF by scanning its raw bytes.
 *
 * The bytes are decoded as latin1 and every literal `/Type /Page` marker
 * (with optional whitespace between the two names) is counted. The trailing
 * `\b` in the pattern means `/Type /Pages` is not counted. Only markers that
 * appear literally in the bytes are seen.
 *
 * @param data - PDF bytes, in any form accepted by `Buffer.from`.
 * @returns The number of markers found, or 1 when none are found.
 */
var estimatePDFPageCount = (data) => {
  const pdfSource = Buffer.from(data).toString("latin1");
  const pageMarkers = pdfSource.match(/\/Type\s*\/Page\b/g);
  if (pageMarkers === null) {
    // No marker found: report a single page rather than zero.
    return 1;
  }
  return pageMarkers.length;
};
|
|
2458
2463
|
/**
 * Combines the two values at `offset` and `offset + 1` into a 16-bit
 * little-endian integer (the value at `offset` is the low byte).
 *
 * @param data - Indexable byte container (presumably a Uint8Array; confirm against callers).
 * @param offset - Index of the low byte.
 * @returns The combined integer.
 */
var readUInt16LE = (data, offset) => {
  const lowByte = data[offset];
  const highByte = data[offset + 1];
  return lowByte | highByte << 8;
};
|
|
2459
2464
|
/**
 * Combines the four values starting at `offset` into an unsigned 32-bit
 * little-endian integer (the value at `offset` is the least significant byte).
 *
 * @param data - Indexable byte container (presumably a Uint8Array; confirm against callers).
 * @param offset - Index of the least significant byte.
 * @returns The combined integer, forced into the unsigned 32-bit range.
 */
var readUInt32LE = (data, offset) => {
  const byte0 = data[offset];
  const byte1 = data[offset + 1];
  const byte2 = data[offset + 2];
  const byte3 = data[offset + 3];
  const signed = byte0 | byte1 << 8 | byte2 << 16 | byte3 << 24;
  // `>>> 0` reinterprets the signed 32-bit result as unsigned.
  return signed >>> 0;
};
|
|
2460
2465
|
/**
 * Decodes bytes as a UTF-8 string.
 *
 * @param data - Bytes in any form accepted by `Buffer.from`.
 * @returns The decoded text.
 */
var decodeUtf8 = (data) => {
  const bytes = Buffer.from(data);
  return bytes.toString("utf8");
};
|
|
@@ -2545,6 +2550,14 @@ var officeDocumentText = (entries) => {
|
|
|
2545
2550
|
}
|
|
2546
2551
|
return extractXmlText(decodeUtf8(documentEntry.data));
|
|
2547
2552
|
};
|
|
2553
|
+
/**
 * Counts the `<w:p` opening tags in an archive's `word/document.xml`.
 *
 * The `\b` after `p` means longer element names such as `<w:pPr` are not
 * counted.
 *
 * @param entries - Archive entries, each with a `path` and raw `data`.
 * @returns The number of tags found, or `undefined` when the entry is
 *   missing or contains no such tag.
 */
var officeDocumentSectionCount = (entries) => {
  for (const entry of entries) {
    if (entry.path !== "word/document.xml") {
      continue;
    }
    // Only the first matching entry is inspected.
    const paragraphTags = decodeUtf8(entry.data).match(/<w:p\b/g);
    return paragraphTags === null ? undefined : paragraphTags.length;
  }
  return undefined;
};
|
|
2548
2561
|
var spreadsheetText = (entries) => {
|
|
2549
2562
|
const sharedStrings = entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
|
|
2550
2563
|
...decodeUtf8(entry.data).matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)
|
|
@@ -2558,12 +2571,16 @@ var spreadsheetText = (entries) => {
|
|
|
2558
2571
|
return normalizeWhitespace(sheetValues.join(`
|
|
2559
2572
|
`));
|
|
2560
2573
|
};
|
|
2574
|
+
/**
 * Lists the worksheet names declared in an archive's `xl/workbook.xml`.
 *
 * Names are read from the `name="..."` attribute of each `<sheet ...>` tag.
 * Attribute values are XML-escaped in the file, so the five predefined
 * entities and numeric character references are decoded before returning;
 * a sheet stored as `P&amp;L` is reported as `P&L`.
 *
 * @param entries - Archive entries, each with a `path` and raw `data`.
 * @returns Sheet names in document order; empty when no workbook entry exists.
 */
var spreadsheetSheetNames = (entries) => {
  const predefinedEntities = { amp: "&", apos: "'", gt: ">", lt: "<", quot: '"' };
  // Decoding in a single pass keeps `&amp;lt;` as the literal text `&lt;`.
  const decodeAttribute = (value) => value.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|amp|apos|gt|lt|quot);/g, (reference, body) => {
    if (!body.startsWith("#")) {
      return predefinedEntities[body];
    }
    const codePoint = body[1] === "x" ? Number.parseInt(body.slice(2), 16) : Number.parseInt(body.slice(1), 10);
    // Leave out-of-range references untouched instead of throwing a RangeError.
    return codePoint <= 1114111 ? String.fromCodePoint(codePoint) : reference;
  });
  return entries.filter((entry) => entry.path === "xl/workbook.xml").flatMap((entry) => [
    ...decodeUtf8(entry.data).matchAll(/<sheet[^>]*name="([^"]+)"/g)
  ].map((match) => decodeAttribute(match[1] ?? ""))).filter(Boolean);
};
|
|
2561
2577
|
/**
 * Extracts the text of every slide in a presentation archive.
 *
 * Each entry under `ppt/slides/` ending in `.xml` is decoded and passed
 * through `extractXmlText`; the per-slide texts are joined with a blank line
 * and the result is passed through `normalizeWhitespace`.
 *
 * @param entries - Archive entries, each with a `path` and raw `data`.
 * @returns The combined slide text.
 */
var presentationText = (entries) => {
  const slideTexts = [];
  for (const entry of entries) {
    const isSlideXml = entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml");
    if (isSlideXml) {
      slideTexts.push(extractXmlText(decodeUtf8(entry.data)));
    }
  }
  return normalizeWhitespace(slideTexts.join("\n\n"));
};
|
|
2583
|
+
/**
 * Counts the archive entries whose path starts with `ppt/slides/` and ends
 * with `.xml`.
 *
 * @param entries - Archive entries, each with a `path`.
 * @returns The number of matching entries (0 when there are none).
 */
var presentationSlideCount = (entries) => {
  let slideCount = 0;
  for (const entry of entries) {
    if (entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")) {
      slideCount += 1;
    }
  }
  return slideCount;
};
|
|
2567
2584
|
var epubText = (entries) => {
|
|
2568
2585
|
const htmlEntries = entries.filter((entry) => /\.(xhtml|html|htm)$/i.test(entry.path));
|
|
2569
2586
|
return normalizeWhitespace(htmlEntries.map((entry) => stripHtml(decodeUtf8(entry.data))).join(`
|
|
@@ -2588,6 +2605,23 @@ var extractEmailText = (raw) => {
|
|
|
2588
2605
|
}
|
|
2589
2606
|
return normalizeWhitespace(body);
|
|
2590
2607
|
};
|
|
2608
|
+
/**
 * Extracts a few well-known headers from a raw email message.
 *
 * Line endings are normalized to `\n`, and everything before the first blank
 * line is treated as the header block. Folded header lines (continuation
 * lines that start with a space or tab) are unfolded before matching, so
 * multi-line values are returned whole.
 *
 * @param raw - The raw message text.
 * @returns An object with `from`, `subject`, `threadTopic` and `to`. A field
 *   is `undefined` when its header is missing or has an empty value.
 *   `threadTopic` falls back to the subject when there is no `Thread-Topic`
 *   header.
 */
var parseEmailHeaders = (raw) => {
  const normalized = raw.replace(/\r\n?/g, "\n");
  const [headerBlock = ""] = normalized.split("\n\n");
  // Unfold: a newline followed by a space or tab continues the previous header.
  const unfolded = headerBlock.replace(/\n(?=[ \t])/g, "");
  const getHeader = (name) => {
    const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // `[ \t]*` (not `\s*`) keeps an empty header from swallowing the next line.
    const match = unfolded.match(new RegExp(`^${escapedName}:[ \\t]*(.*)$`, "im"));
    const value = match?.[1]?.trim();
    return value ? value : undefined;
  };
  return {
    from: getHeader("From"),
    subject: getHeader("Subject"),
    threadTopic: getHeader("Thread-Topic") ?? getHeader("Subject"),
    to: getHeader("To")
  };
};
|
|
2591
2625
|
var stripRTF = (value) => {
|
|
2592
2626
|
const withoutBinary = value.replace(/\\bin\d+ [\s\S]*?(?=[\\}])/g, " ");
|
|
2593
2627
|
const withoutControls = withoutBinary.replace(/\\par[d]?/g, `
|
|
@@ -2750,12 +2784,22 @@ var createOfficeDocumentExtractor = () => ({
|
|
|
2750
2784
|
const extension = inferExtensionFromInput(input);
|
|
2751
2785
|
const entries = unzipEntries(input.data);
|
|
2752
2786
|
let text = "";
|
|
2787
|
+
let officeMetadata = {};
|
|
2753
2788
|
if (extension === ".docx" || extension === ".odt") {
|
|
2754
2789
|
text = officeDocumentText(entries);
|
|
2790
|
+
officeMetadata = {
|
|
2791
|
+
sectionCount: officeDocumentSectionCount(entries)
|
|
2792
|
+
};
|
|
2755
2793
|
} else if (extension === ".xlsx" || extension === ".ods") {
|
|
2756
2794
|
text = spreadsheetText(entries);
|
|
2795
|
+
officeMetadata = {
|
|
2796
|
+
sheetNames: spreadsheetSheetNames(entries)
|
|
2797
|
+
};
|
|
2757
2798
|
} else if (extension === ".pptx" || extension === ".odp") {
|
|
2758
2799
|
text = presentationText(entries);
|
|
2800
|
+
officeMetadata = {
|
|
2801
|
+
slideCount: presentationSlideCount(entries)
|
|
2802
|
+
};
|
|
2759
2803
|
}
|
|
2760
2804
|
if (!text) {
|
|
2761
2805
|
throw new Error(`AbsoluteJS could not extract readable text from ${inferNameFromInput(input)}`);
|
|
@@ -2766,7 +2810,8 @@ var createOfficeDocumentExtractor = () => ({
|
|
|
2766
2810
|
format: "text",
|
|
2767
2811
|
metadata: {
|
|
2768
2812
|
...input.metadata ?? {},
|
|
2769
|
-
fileKind: "office"
|
|
2813
|
+
fileKind: "office",
|
|
2814
|
+
...officeMetadata
|
|
2770
2815
|
},
|
|
2771
2816
|
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}${extension || ".office"}`,
|
|
2772
2817
|
text,
|
|
@@ -2801,18 +2846,21 @@ var createEmailExtractor = () => ({
|
|
|
2801
2846
|
supports: emailExtractorSupports,
|
|
2802
2847
|
/**
 * Builds the extraction result for an email input.
 *
 * The bytes are decoded as UTF-8, the headers are parsed once, and the
 * sender, recipient and thread topic are added to the metadata. The thread
 * topic comes from `headers.threadTopic`, which prefers the `Thread-Topic`
 * header and falls back to the subject. The title defaults to the subject
 * when the input has no title.
 *
 * @param input - The extractor input; `data` holds the raw message bytes.
 * @returns The extracted document: chunking, content type, metadata, source,
 *   text and title.
 */
extract: (input) => {
  const raw = decodeUtf8(input.data);
  const headers = parseEmailHeaders(raw);
  return {
    chunking: input.chunking,
    contentType: input.contentType,
    format: "text",
    metadata: {
      ...input.metadata ?? {},
      fileKind: "email",
      from: headers.from,
      // Use the parsed thread topic, not the subject, so Thread-Topic is honored.
      threadTopic: headers.threadTopic,
      to: headers.to
    },
    source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.eml`,
    text: extractEmailText(raw),
    title: input.title ?? headers.subject
  };
}
|
|
2818
2866
|
});
|
|
@@ -2889,7 +2937,8 @@ var createPDFFileExtractor = () => ({
|
|
|
2889
2937
|
format: "text",
|
|
2890
2938
|
metadata: {
|
|
2891
2939
|
...input.metadata ?? {},
|
|
2892
|
-
fileKind: "pdf"
|
|
2940
|
+
fileKind: "pdf",
|
|
2941
|
+
pageCount: estimatePDFPageCount(input.data)
|
|
2893
2942
|
},
|
|
2894
2943
|
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
|
|
2895
2944
|
text,
|
|
@@ -2912,6 +2961,7 @@ var createRAGPDFOCRExtractor = (options) => ({
|
|
|
2912
2961
|
metadata: {
|
|
2913
2962
|
...input.metadata ?? {},
|
|
2914
2963
|
fileKind: "pdf",
|
|
2964
|
+
pageCount: estimatePDFPageCount(input.data),
|
|
2915
2965
|
pdfTextMode: "native"
|
|
2916
2966
|
},
|
|
2917
2967
|
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
|
|
@@ -2931,6 +2981,7 @@ var createRAGPDFOCRExtractor = (options) => ({
|
|
|
2931
2981
|
...input.metadata ?? {},
|
|
2932
2982
|
...ocr.metadata ?? {},
|
|
2933
2983
|
fileKind: "pdf",
|
|
2984
|
+
pageCount: estimatePDFPageCount(input.data),
|
|
2934
2985
|
pdfTextMode: "ocr"
|
|
2935
2986
|
},
|
|
2936
2987
|
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.pdf`,
|
|
@@ -6437,5 +6488,5 @@ export {
|
|
|
6437
6488
|
aiChat
|
|
6438
6489
|
};
|
|
6439
6490
|
|
|
6440
|
-
//# debugId=
|
|
6491
|
+
//# debugId=4C8CFD94D17CA32664756E2164756E21
|
|
6441
6492
|
//# sourceMappingURL=index.js.map
|