any-extractor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,111 @@
1
+ import { Element, LiveNodeList } from "@xmldom/xmldom";
2
+ import { ERRORMSG } from "../constant";
3
+ import { AnyParserMethod } from "../types";
4
+ import { extractFiles, parseString } from "../util";
5
+
6
/**
 * Extracts plain text from Office Open XML spreadsheets (.xlsx).
 *
 * Text is collected, in this order, from worksheet cells, drawing text
 * (`a:p` / `a:t`) and cached chart values (`c:v`).
 */
export class ExcelParser implements AnyParserMethod {
  mimes = ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"];

  /**
   * Reads the spreadsheet archive and returns its text, one cell / paragraph /
   * chart value per line.
   *
   * @param file Raw bytes of the .xlsx archive.
   * @returns The extracted text joined with "\n".
   * @throws The `ERRORMSG.fileCorrupted` error when the archive contains no
   *   worksheet or a cell references a shared string that does not exist;
   *   any error raised while unzipping or parsing is logged and rethrown.
   */
  async apply(file: Buffer): Promise<string> {
    // Anchored, with the dot escaped, so only the exact part names match.
    const sheetsRegex = /^xl\/worksheets\/sheet\d+\.xml$/;
    const drawingsRegex = /^xl\/drawings\/drawing\d+\.xml$/;
    const chartsRegex = /^xl\/charts\/chart\d+\.xml$/;
    const stringsFilePath = 'xl/sharedStrings.xml';

    // Concatenates every <t> run below `node`, skipping phonetic (<rPh>) hints,
    // so a rich-text string split across several runs is returned as one value.
    const joinTextRuns = (node: Element): string =>
      Array.from(node.getElementsByTagName("t"))
        .filter(tNode => tNode.parentNode?.nodeName !== "rPh")
        .map(tNode => tNode.textContent ?? "")
        .join("");

    // Text of every paragraph in a DrawingML part; paragraphs without any
    // <a:t> run are skipped.
    const paragraphTexts = (xml: string): string[] =>
      Array.from(parseString(xml).getElementsByTagName("a:p"))
        .filter(paragraphNode => paragraphNode.getElementsByTagName("a:t").length != 0)
        .map(paragraphNode =>
          Array.from(paragraphNode.getElementsByTagName("a:t"))
            .map(textNode => textNode.childNodes[0]?.nodeValue ?? "")
            .join("")
        );

    try {
      const files = await extractFiles(file, entryPath =>
        entryPath === stringsFilePath
        || [sheetsRegex, drawingsRegex, chartsRegex].some(fileRegex => fileRegex.test(entryPath))
      );

      if (!files.some(entry => sheetsRegex.test(entry.path))) {
        throw ERRORMSG.fileCorrupted("TODO: figure this out");
      }

      const contentsMatching = (fileRegex: RegExp): string[] =>
        files.filter(entry => fileRegex.test(entry.path)).map(entry => entry.content);

      const sharedStringsFile = files.find(entry => entry.path === stringsFilePath)?.content;

      // A cell's shared-string index addresses the <si> items, not individual
      // <t> runs, so the table is built with one entry per <si>.
      const sharedStrings: string[] = sharedStringsFile === undefined
        ? []
        : Array.from(parseString(sharedStringsFile).getElementsByTagName("si"))
          .map(siNode => joinTextRuns(siNode));

      // Returns the text of one <c> cell, or undefined when it holds no value.
      const cellText = (cell: Element): string | undefined => {
        const cellType = cell.getAttribute("t");

        if (cellType === "inlineStr") {
          const inlineNode: Element | undefined = cell.getElementsByTagName("is")[0];
          const inlineText = inlineNode ? joinTextRuns(inlineNode) : "";
          if (inlineText !== "") return inlineText;
        }

        const valueNode: Element | undefined = cell.getElementsByTagName("v")[0];
        const rawValue = valueNode?.textContent ?? "";
        if (rawValue === "") return undefined;

        // Numbers, booleans, errors and formula strings are emitted verbatim;
        // coercing them to an integer would truncate decimals or yield NaN.
        if (cellType !== "s") return rawValue;

        const sharedIndex = parseInt(rawValue, 10);
        if (!Number.isInteger(sharedIndex) || sharedIndex < 0 || sharedIndex >= sharedStrings.length) {
          throw ERRORMSG.fileCorrupted("TODO: figure this out");
        }
        return sharedStrings[sharedIndex];
      };

      const responseText: string[] = [];

      for (const sheetXmlContent of contentsMatching(sheetsRegex)) {
        const cellNodes = Array.from(parseString(sheetXmlContent).getElementsByTagName("c"));
        responseText.push(
          cellNodes
            .map(cellNode => cellText(cellNode))
            .filter((text): text is string => text !== undefined)
            .join("\n")
        );
      }

      for (const drawingXmlContent of contentsMatching(drawingsRegex)) {
        responseText.push(paragraphTexts(drawingXmlContent).join("\n"));
      }

      for (const chartXmlContent of contentsMatching(chartsRegex)) {
        responseText.push(
          Array.from(parseString(chartXmlContent).getElementsByTagName("c:v"))
            .map(valueNode => valueNode.childNodes[0]?.nodeValue ?? "")
            .filter(text => text !== "")
            .join("\n")
        );
      }

      return responseText.join("\n");
    } catch (error) {
      console.error("Error parsing Excel file:", error);
      throw error;
    }
  }
}
@@ -0,0 +1,108 @@
1
+ import { ERRORMSG } from "../constant";
2
+ import { AnyParserMethod } from "../types";
3
+ import { extractFiles, parseString } from "../util";
4
+ import { Element, Node } from "@xmldom/xmldom";
5
+
6
/**
 * Extracts plain text from OpenDocument files by reading `content.xml` and
 * every embedded `Object N/content.xml`.
 *
 * Text found below a `presentation:notes` element is collected separately and
 * appended after all other text.
 */
export class OpenOfficeParser implements AnyParserMethod {
  mimes = [
    "application/vnd.oasis.opendocument.text",
    "application/vnd.oasis.opendocument.spreadsheet",
    "application/vnd.oasis.opendocument.presentation",
    "application/vnd.oasis.opendocument.graphics",
    "application/vnd.oasis.opendocument.formula",
  ];

  /**
   * @param file Raw bytes of the OpenDocument archive.
   * @returns Extracted text, blocks separated by "\n", notes text last.
   * @throws The `ERRORMSG.fileCorrupted` error when `content.xml` is missing;
   *   any other failure is logged and rethrown.
   */
  apply = async (file: Buffer): Promise<string> => {
    const mainContentFilePath = 'content.xml';
    const objectContentFilesRegex = /Object \d+\/content.xml/g;

    try {
      const files = await extractFiles(
        file,
        entryPath => entryPath == mainContentFilePath || !!entryPath.match(objectContentFilesRegex)
      );

      const hasMainContent = files.some(entry => entry.path == mainContentFilePath);
      if (!hasMainContent) {
        throw ERRORMSG.fileCorrupted("TODO: figure this out");
      }

      // Split the extracted entries into the main document and embedded objects.
      const mainContents: string[] = [];
      const objectContents: string[] = [];
      for (const entry of files) {
        if (entry.path == mainContentFilePath) mainContents.push(entry.content);
        if (entry.path.match(objectContentFilesRegex)) objectContents.push(entry.content);
      }

      const allowedTextTags = ["text:p", "text:h"];
      const notesTag = "presentation:notes";

      const notesText: string[] = [];
      const bodyText: string[] = [];

      // True when `start` or any of its ancestors is a presentation notes element.
      function isNotesNode(start: Element): boolean {
        for (let current: Node | null = start; current; current = current.parentNode) {
          if ((current as Element).tagName == notesTag) return true;
        }
        return false;
      }

      // True when `start` or any of its ancestors is itself a paragraph/heading,
      // i.e. a text block starting here would be nested inside another one.
      function isInsideTextBlock(start: Element): boolean {
        for (let current: Node | null = start; current; current = current.parentNode) {
          if (allowedTextTags.includes((current as Element).tagName)) return true;
        }
        return false;
      }

      // Depth-first walk that appends leaf text either to `sink` or, for nodes
      // below a notes element, to `notesText`.
      function traversal(node: Node, sink: string[], isFirstRecursion: boolean): void {
        const children = node.childNodes;
        if (children && children.length != 0) {
          for (let index = 0; index < children.length; index++) {
            traversal(children[index] as Element, sink, false);
          }
          return;
        }

        const parent = node.parentNode as Element | null;
        if (!parent || parent.tagName.indexOf('text') != 0 || !node.nodeValue) {
          return;
        }

        const target = isNotesNode(parent) ? notesText : sink;
        target.push(node.nodeValue);
        // A nested paragraph/heading gets a line break after its text.
        if (allowedTextTags.includes(parent.tagName) && !isFirstRecursion) {
          target.push("\n");
        }
      }

      // Concatenated (non-notes) text found below `root`.
      function extractAllTextsFromNode(root: Element): string {
        const pieces: string[] = [];
        for (const child of Array.from(root.childNodes)) {
          traversal(child, pieces, true);
        }
        return pieces.join("");
      }

      const documents = [mainContents[0], ...objectContents].map(xmlContent => parseString(xmlContent));

      for (const xmlDocument of documents) {
        // Only outermost paragraphs/headings are roots; nested ones are reached
        // through the traversal of their ancestor.
        const textBlocks = Array.from(xmlDocument.getElementsByTagName("*"))
          .filter(node => allowedTextTags.includes(node.tagName)
            && !isInsideTextBlock(node.parentNode as Element));

        const blockTexts: string[] = [];
        for (const block of textBlocks) {
          const text = extractAllTextsFromNode(block);
          if (text != "") blockTexts.push(text);
        }
        bodyText.push(blockTexts.join("\n"));
      }

      return [...bodyText, ...notesText].join("\n");

    } catch (error) {
      console.error("Error parsing OpenOffice file:", error);
      throw error;
    }
  }
}
@@ -0,0 +1,17 @@
1
+ import pdf from 'pdf-parse';
2
+ import { AnyParserMethod } from "../types";
3
+
4
/** Extracts text from PDF files by delegating to `pdf-parse`. */
export class PDFParser implements AnyParserMethod {
  mimes = ["application/pdf"];

  /**
   * @param file Raw bytes of the PDF document.
   * @returns The `text` field of the `pdf-parse` result.
   * @throws Whatever `pdf-parse` rejects with, after logging it.
   */
  apply = async (file: Buffer): Promise<string> => {
    try {
      const { text } = await pdf(file);
      return text;
    } catch (error) {
      console.error("Error parsing PDF file:", error);
      throw error;
    }
  };
}
@@ -0,0 +1,58 @@
1
+ import { ERRORMSG } from "../constant";
2
+ import { AnyParserMethod } from "../types";
3
+ import { extractFiles, parseString } from "../util";
4
+
5
/**
 * Extracts plain text from Office Open XML presentations (.pptx).
 *
 * Output order: all slides by ascending slide number, followed by all notes
 * slides by ascending number.
 */
export class PowerPointParser implements AnyParserMethod {
  mimes = ["application/vnd.openxmlformats-officedocument.presentationml.presentation"];

  /**
   * @param file Raw bytes of the .pptx archive.
   * @returns Paragraph texts joined with "\n".
   * @throws The `ERRORMSG.fileCorrupted` error when the archive contains no
   *   slide; any other failure is logged and rethrown.
   */
  async apply(file: Buffer): Promise<string> {
    // Anchored, with the dot escaped, so only the exact part names match.
    const slidesRegex = /^ppt\/slides\/slide\d+\.xml$/;
    const notesRegex = /^ppt\/notesSlides\/notesSlide\d+\.xml$/;
    const slideNumberRegex = /lide(\d+)\.xml$/;

    // Numeric suffix of a slide / notes file; unparsable names sort last.
    const slideNumber = (path: string): number => {
      const parsed = parseInt(path.match(slideNumberRegex)?.[1] ?? "", 10);
      return Number.isNaN(parsed) ? Number.MAX_SAFE_INTEGER : parsed;
    };

    try {
      const files = await extractFiles(file, entryPath =>
        slidesRegex.test(entryPath) || notesRegex.test(entryPath)
      );

      // Validate before doing any further work on the entries.
      if (!files.some(entry => slidesRegex.test(entry.path))) {
        throw ERRORMSG.fileCorrupted("TODO: figure this out");
      }

      // One comparator instead of two successive sorts: slides before notes,
      // then ascending slide number within each group.
      files.sort((a, b) =>
        Number(notesRegex.test(a.path)) - Number(notesRegex.test(b.path))
        || slideNumber(a.path) - slideNumber(b.path)
      );

      const responseText = files.map(entry =>
        Array.from(parseString(entry.content).getElementsByTagName("a:p"))
          // Paragraphs without any text run contribute nothing.
          .filter(paragraphNode => paragraphNode.getElementsByTagName("a:t").length != 0)
          .map(paragraphNode =>
            Array.from(paragraphNode.getElementsByTagName("a:t"))
              .map(textNode => textNode.childNodes[0]?.nodeValue ?? "")
              .join("")
          )
          .join("\n")
      );

      return responseText.join("\n");
    } catch (error) {
      console.error("Error parsing PowerPoint file:", error);
      throw error;
    }
  }
}
@@ -0,0 +1,50 @@
1
+ import { ERRORMSG } from "../constant";
2
+ import { AnyParserMethod } from "../types";
3
+ import { extractFiles, parseString } from "../util";
4
+
5
/**
 * Extracts plain text from Office Open XML word-processing documents (.docx),
 * reading the main document part plus footnotes and endnotes.
 */
export class WordParser implements AnyParserMethod {
  mimes = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"];

  /**
   * @param file Raw bytes of the .docx archive.
   * @returns Paragraph texts joined with "\n", parts in archive order.
   * @throws The `ERRORMSG.fileCorrupted` error when no main document part is
   *   found; any other failure is logged and rethrown.
   */
  async apply(file: Buffer): Promise<string> {
    // `\d*` accepts an optional numeric suffix of any length. The previous
    // `[\d+]?` was a character class matching a single digit or a literal '+',
    // so names such as `document10.xml` were never matched.
    const mainContentFileRegex = /^word\/document\d*\.xml$/;
    const footnotesFileRegex = /^word\/footnotes\d*\.xml$/;
    const endnotesFileRegex = /^word\/endnotes\d*\.xml$/;
    const wantedFileRegexes = [mainContentFileRegex, footnotesFileRegex, endnotesFileRegex];

    try {
      // extractFiles already restricts the result to the wanted parts, so no
      // second filtering pass is needed afterwards.
      const files = await extractFiles(file, entryPath =>
        wantedFileRegexes.some(fileRegex => fileRegex.test(entryPath))
      );

      if (!files.some(entry => mainContentFileRegex.test(entry.path))) {
        throw ERRORMSG.fileCorrupted("TODO: figure this out");
      }

      const responseText = files.map(entry =>
        Array.from(parseString(entry.content).getElementsByTagName("w:p"))
          // Paragraphs without any text run contribute nothing.
          .filter(paragraphNode => paragraphNode.getElementsByTagName("w:t").length != 0)
          .map(paragraphNode =>
            Array.from(paragraphNode.getElementsByTagName("w:t"))
              .map(textNode => textNode.childNodes[0]?.nodeValue ?? "")
              .join("")
          )
          .join("\n")
      );

      return responseText.join("\n");
    } catch (error) {
      console.error("Error parsing Word file:", error);
      throw error;
    }
  }
}
package/src/types.ts ADDED
@@ -0,0 +1,12 @@
1
/** Discriminator for the kind of input handed to the extractor. */
export type InputType =
  | 'buffer'
  | 'file'
  | 'fileurl';

/**
 * An extraction request: the input kind plus the input itself.
 * NOTE(review): presumably `input` is a path or URL string for 'file' /
 * 'fileurl' and a Buffer for 'buffer' — confirm against the caller.
 */
export type ExtractionPayload = {
  type: InputType;
  input: string | Buffer;
};

/** Contract implemented by every format-specific parser. */
export type AnyParserMethod = {
  /** MIME types listed by the parser. */
  mimes: string[];
  /** Turns a file's raw bytes into text. */
  apply: (file: Buffer) => Promise<string>;
};

/** One archive entry: its path and its content as a string. */
export type ExtractedFile = {
  path: string;
  content: string;
};
package/src/util.ts ADDED
@@ -0,0 +1,68 @@
1
+ import { readFile as read } from 'node:fs/promises';
2
+ import { fetch } from 'undici';
3
+ import yauzl from 'yauzl';
4
+ import { ERRORMSG } from './constant';
5
+ import { ExtractedFile } from './types';
6
+ import concat from 'concat-stream';
7
+ import { DOMParser } from '@xmldom/xmldom';
8
+
9
/**
 * Reads a file from disk into a Buffer.
 *
 * @param filePath Path of the file to read.
 * @returns The file's raw bytes.
 * @throws Whatever `fs/promises.readFile` rejects with (e.g. a missing file).
 */
export const readFile = async (filePath: string): Promise<Buffer> =>
  // Called without an encoding, fs/promises.readFile already resolves to a
  // Buffer, so no type assertion is needed.
  read(filePath);
11
+
12
/**
 * Downloads a resource and returns its body as a Buffer.
 *
 * @param url Address of the resource to fetch.
 * @returns The response body's raw bytes.
 * @throws Error when the response status is not OK; the message carries the
 *   numeric status because the status text may be empty.
 */
export const readFileUrl = async (url: string): Promise<Buffer> => {
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`Failed to fetch: ${res.status} ${res.statusText}`.trimEnd());
  }
  return Buffer.from(await res.arrayBuffer());
}
17
+
18
/**
 * Extracts the entries of a zip archive whose names are accepted by `filterFn`.
 *
 * Entries are read one at a time (yauzl `lazyEntries` mode) and decoded with
 * `Buffer.toString()`.
 *
 * @param zipInput The archive, either as a Buffer or as a path on disk.
 * @param filterFn Called with each entry name; return true to extract it.
 * @returns The accepted entries, in archive order.
 * @throws Rejects with the yauzl / stream error on any read failure, with the
 *   error thrown by `filterFn`, or with `ERRORMSG.invalidInput` when
 *   `zipInput` is neither a Buffer nor a string.
 */
export const extractFiles = (zipInput: Buffer | string, filterFn: (x: string) => boolean): Promise<ExtractedFile[]> => {
  return new Promise((res, rej) => {
    const processZipfile = (zipfile: yauzl.ZipFile) => {
      const extractedFiles: ExtractedFile[] = [];

      // Stop reading and release the archive before rejecting, so a failure
      // half-way through does not leave the file open.
      const fail = (err: unknown) => {
        zipfile.close();
        rej(err);
      };

      zipfile.on('entry', (entry: yauzl.Entry) => {
        let wanted: boolean;
        try {
          wanted = filterFn(entry.fileName);
        } catch (err) {
          // An exception inside an event handler would otherwise escape the
          // promise entirely.
          fail(err);
          return;
        }

        if (!wanted) {
          zipfile.readEntry();
          return;
        }

        zipfile.openReadStream(entry, (err, readStream) => {
          if (err)
            return fail(err);

          // pipe() does not forward source errors; without this handler a
          // corrupt entry would leave the promise pending forever.
          readStream.on('error', fail);
          readStream.pipe(concat((data: Buffer) => {
            extractedFiles.push({
              path: entry.fileName,
              content: data.toString()
            });
            zipfile.readEntry();
          }));
        });
      });
      zipfile.on('end', () => res(extractedFiles));
      zipfile.on('error', rej);

      // Request the first entry only after all listeners are attached.
      zipfile.readEntry();
    };

    if (Buffer.isBuffer(zipInput)) {
      yauzl.fromBuffer(zipInput, { lazyEntries: true }, (err, zipfile) => {
        if (err) return rej(err);
        processZipfile(zipfile);
      });
    }
    else if (typeof zipInput === 'string') {
      yauzl.open(zipInput, { lazyEntries: true }, (err, zipfile) => {
        if (err) return rej(err);
        processZipfile(zipfile);
      });
    }
    else
      rej(ERRORMSG.invalidInput);
  });
}
64
+
65
/**
 * Parses an XML string with a fresh xmldom `DOMParser`, using the "text/xml"
 * MIME type.
 *
 * @param xml XML source text.
 * @returns The document produced by `parseFromString`.
 */
export const parseString = (xml: string) =>
  new DOMParser().parseFromString(xml, "text/xml");
package/tsconfig.json ADDED
@@ -0,0 +1,17 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2020",
4
+ "module": "ESNext",
5
+ "moduleResolution": "Node",
6
+ "outDir": "dist",
7
+ "declaration": true,
8
+ "declarationDir": "dist",
9
+ "esModuleInterop": true,
10
+ "forceConsistentCasingInFileNames": true,
11
+ "skipLibCheck": true,
12
+ "strict": true,
13
+ "resolveJsonModule": true,
14
+ "sourceMap": true
15
+ },
16
+ "include": ["src"]
17
+ }
package/tsup.config.ts ADDED
@@ -0,0 +1,9 @@
1
+ import { defineConfig } from 'tsup'
2
+
3
// tsup build configuration: a single entry point emitted in both ESM and CJS
// formats, with declarations and source maps enabled and `clean` turned on.
export default defineConfig({
  entry: [
    'src/index.ts',
  ],
  format: [
    'esm',
    'cjs',
  ],
  clean: true,
  dts: true,
  sourcemap: true,
});