any-extractor 2.0.2 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -145
- package/dist/index.d.mts +4 -21
- package/dist/index.d.ts +4 -21
- package/dist/index.js +24 -212
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +24 -212
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -3
package/dist/index.js
CHANGED
|
@@ -343,11 +343,6 @@ var ConfluenceCrawler = class {
|
|
|
343
343
|
var AnyExtractor = class {
|
|
344
344
|
constructor(extractorConfig) {
|
|
345
345
|
this.extractorConfig = {
|
|
346
|
-
llm: {
|
|
347
|
-
llmProvider: "openai",
|
|
348
|
-
visionModel: "",
|
|
349
|
-
apikey: ""
|
|
350
|
-
},
|
|
351
346
|
confluence: {
|
|
352
347
|
baseUrl: "",
|
|
353
348
|
email: "",
|
|
@@ -361,11 +356,7 @@ var AnyExtractor = class {
|
|
|
361
356
|
});
|
|
362
357
|
return this;
|
|
363
358
|
};
|
|
364
|
-
this.parseFile = async (input, basicAuth = null
|
|
365
|
-
extractImages: false,
|
|
366
|
-
imageExtractionMethod: "ocr",
|
|
367
|
-
language: "eng"
|
|
368
|
-
}) => {
|
|
359
|
+
this.parseFile = async (input, basicAuth = null) => {
|
|
369
360
|
let preparedInput;
|
|
370
361
|
if (typeof input === "string") {
|
|
371
362
|
if (isValidUrl(input)) {
|
|
@@ -390,17 +381,11 @@ var AnyExtractor = class {
|
|
|
390
381
|
}
|
|
391
382
|
const extractor = this.mimeParserMap.get(mimeDetails.mime);
|
|
392
383
|
if (!extractor?.apply) {
|
|
393
|
-
|
|
394
|
-
throw new Error(message);
|
|
384
|
+
return "";
|
|
395
385
|
}
|
|
396
|
-
return await extractor.apply(preparedInput,
|
|
386
|
+
return await extractor.apply(preparedInput, this.extractorConfig);
|
|
397
387
|
};
|
|
398
|
-
this.parseConfluenceDoc = async (pageId
|
|
399
|
-
extractAttachments: false,
|
|
400
|
-
extractImages: false,
|
|
401
|
-
imageExtractionMethod: "ocr",
|
|
402
|
-
language: "eng"
|
|
403
|
-
}) => {
|
|
388
|
+
this.parseConfluenceDoc = async (pageId) => {
|
|
404
389
|
const { baseUrl, email, apiKey } = this.extractorConfig.confluence || {};
|
|
405
390
|
if (!baseUrl || !email || !apiKey) {
|
|
406
391
|
throw new Error("AnyExtractor: Confluence base URL, email, and API key are required");
|
|
@@ -409,20 +394,18 @@ var AnyExtractor = class {
|
|
|
409
394
|
const content = await confCrawler.extractPageContent(pageId);
|
|
410
395
|
let textContent = "";
|
|
411
396
|
for (const item of content) {
|
|
412
|
-
if (item.type === "image"
|
|
397
|
+
if (item.type === "image") {
|
|
413
398
|
const parsedFile = await this.parseFile(
|
|
414
399
|
item.content,
|
|
415
|
-
`Basic ${Buffer.from(`${email}:${apiKey}`).toString("base64")}
|
|
416
|
-
extractingOptions
|
|
400
|
+
`Basic ${Buffer.from(`${email}:${apiKey}`).toString("base64")}`
|
|
417
401
|
);
|
|
418
402
|
textContent += parsedFile ? `
|
|
419
403
|
(Image): ${parsedFile}
|
|
420
404
|
` : "";
|
|
421
|
-
} else if (item.type === "view-file"
|
|
405
|
+
} else if (item.type === "view-file") {
|
|
422
406
|
const parsedFile = await this.parseFile(
|
|
423
407
|
item.content,
|
|
424
|
-
`Basic ${Buffer.from(`${email}:${apiKey}`).toString("base64")}
|
|
425
|
-
extractingOptions
|
|
408
|
+
`Basic ${Buffer.from(`${email}:${apiKey}`).toString("base64")}`
|
|
426
409
|
);
|
|
427
410
|
textContent += parsedFile ? `
|
|
428
411
|
[Attachment]: ${parsedFile}
|
|
@@ -467,7 +450,7 @@ var ExcelParser = class {
|
|
|
467
450
|
this.mimes = ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"];
|
|
468
451
|
this.anyExtractor = anyExtractor;
|
|
469
452
|
}
|
|
470
|
-
async apply(file
|
|
453
|
+
async apply(file) {
|
|
471
454
|
const patterns = {
|
|
472
455
|
sheets: /xl\/worksheets\/sheet\d+.xml/g,
|
|
473
456
|
drawings: /xl\/drawings\/drawing\d+.xml/g,
|
|
@@ -501,7 +484,7 @@ var ExcelParser = class {
|
|
|
501
484
|
} else if (patterns.charts.test(file2.path)) {
|
|
502
485
|
return this.extractChartText([file2.content.toString()]);
|
|
503
486
|
} else if (patterns.images.test(file2.path)) {
|
|
504
|
-
return await this.extractImageText([file2]
|
|
487
|
+
return await this.extractImageText([file2]);
|
|
505
488
|
}
|
|
506
489
|
return null;
|
|
507
490
|
}).filter(Boolean);
|
|
@@ -538,11 +521,11 @@ var ExcelParser = class {
|
|
|
538
521
|
return Array.from(vNodes).map((node) => node.childNodes[0]?.nodeValue ?? "").join("\n");
|
|
539
522
|
}).join("\n");
|
|
540
523
|
}
|
|
541
|
-
async extractImageText(imageFiles
|
|
524
|
+
async extractImageText(imageFiles) {
|
|
542
525
|
const texts = await Promise.all(
|
|
543
526
|
imageFiles.map(async (file) => {
|
|
544
527
|
try {
|
|
545
|
-
return await this.anyExtractor.parseFile(file.content, null
|
|
528
|
+
return await this.anyExtractor.parseFile(file.content, null);
|
|
546
529
|
} catch (e) {
|
|
547
530
|
console.log(`AnyExtractor: Error extracting text from image ${file.path}:`, e);
|
|
548
531
|
return "";
|
|
@@ -584,165 +567,6 @@ var ExcelParser = class {
|
|
|
584
567
|
}
|
|
585
568
|
};
|
|
586
569
|
|
|
587
|
-
// src/file-parser/image-parser.ts
|
|
588
|
-
var import_tesseract = __toESM(require("tesseract.js"));
|
|
589
|
-
var import_undici3 = require("undici");
|
|
590
|
-
var import_file_type_mime2 = require("file-type-mime");
|
|
591
|
-
var ImageParser = class {
|
|
592
|
-
constructor() {
|
|
593
|
-
this.mimes = ["image/jpeg", "image/png", "image/webp"];
|
|
594
|
-
this.apply = async (file, extractingOptions, extractorConfig) => {
|
|
595
|
-
const { extractImages, imageExtractionMethod, language } = extractingOptions;
|
|
596
|
-
if (!extractImages) {
|
|
597
|
-
return "";
|
|
598
|
-
}
|
|
599
|
-
const mimeDetails = (0, import_file_type_mime2.parse)(
|
|
600
|
-
file.buffer.slice(file.byteOffset, file.byteOffset + file.byteLength)
|
|
601
|
-
);
|
|
602
|
-
if (!mimeDetails) {
|
|
603
|
-
throw new Error("AnyExtractor: Unable to parse MIME type");
|
|
604
|
-
}
|
|
605
|
-
const mimeType = mimeDetails.mime;
|
|
606
|
-
if (!this.mimes.includes(mimeType)) {
|
|
607
|
-
return "";
|
|
608
|
-
}
|
|
609
|
-
if (imageExtractionMethod === "ocr") {
|
|
610
|
-
return await this.performOCR(file, language);
|
|
611
|
-
}
|
|
612
|
-
const { llmProvider, visionModel, apikey } = extractorConfig.llm || {};
|
|
613
|
-
if (!llmProvider || !visionModel || !apikey) {
|
|
614
|
-
throw new Error(
|
|
615
|
-
"AnyExtractor: LLM provider, vision model and API key are required for image extraction"
|
|
616
|
-
);
|
|
617
|
-
}
|
|
618
|
-
const base64Image = file.toString("base64");
|
|
619
|
-
switch (llmProvider) {
|
|
620
|
-
case "openai":
|
|
621
|
-
return this.handleOpenAI(base64Image, mimeType, visionModel, apikey);
|
|
622
|
-
case "google":
|
|
623
|
-
return this.handleGoogle(base64Image, mimeType, visionModel, apikey);
|
|
624
|
-
case "anthropic":
|
|
625
|
-
return this.handleAnthropic(base64Image, mimeType, visionModel, apikey);
|
|
626
|
-
default:
|
|
627
|
-
throw new Error(`ImageParser: Unsupported LLM provider '${llmProvider}'`);
|
|
628
|
-
}
|
|
629
|
-
};
|
|
630
|
-
this.performOCR = async (file, language) => {
|
|
631
|
-
const worker = await import_tesseract.default.createWorker(language);
|
|
632
|
-
const {
|
|
633
|
-
data: { text }
|
|
634
|
-
} = await worker.recognize(file);
|
|
635
|
-
await worker.terminate();
|
|
636
|
-
return text;
|
|
637
|
-
};
|
|
638
|
-
this.handleOpenAI = async (base64Image, mimeType, visionModel, apikey) => {
|
|
639
|
-
const response = await (0, import_undici3.fetch)("https://api.openai.com/v1/chat/completions", {
|
|
640
|
-
method: "POST",
|
|
641
|
-
headers: {
|
|
642
|
-
"Content-Type": "application/json",
|
|
643
|
-
Authorization: `Bearer ${apikey}`
|
|
644
|
-
},
|
|
645
|
-
body: JSON.stringify({
|
|
646
|
-
model: visionModel,
|
|
647
|
-
messages: [
|
|
648
|
-
{
|
|
649
|
-
role: "user",
|
|
650
|
-
content: [
|
|
651
|
-
{
|
|
652
|
-
type: "text",
|
|
653
|
-
text: "Provide a concise summary of the image for semantic search. Exclude any introductions, labels, or formatting \u2014 just return the core content. Also include visible text and contextual details about layout, content type, or purpose."
|
|
654
|
-
},
|
|
655
|
-
{
|
|
656
|
-
type: "image_url",
|
|
657
|
-
image_url: {
|
|
658
|
-
url: `data:${mimeType};base64,${base64Image}`
|
|
659
|
-
}
|
|
660
|
-
}
|
|
661
|
-
]
|
|
662
|
-
}
|
|
663
|
-
]
|
|
664
|
-
})
|
|
665
|
-
});
|
|
666
|
-
if (!response.ok) {
|
|
667
|
-
throw new Error(`ImageParser: OpenAI API error ${response.status}`);
|
|
668
|
-
}
|
|
669
|
-
const data = await response.json();
|
|
670
|
-
return data.choices[0].message.content;
|
|
671
|
-
};
|
|
672
|
-
this.handleGoogle = async (base64Image, mimeType, visionModel, apikey) => {
|
|
673
|
-
const response = await (0, import_undici3.fetch)(
|
|
674
|
-
`https://generativelanguage.googleapis.com/v1beta/models/${visionModel}:generateContent?key=${apikey}`,
|
|
675
|
-
{
|
|
676
|
-
method: "POST",
|
|
677
|
-
headers: {
|
|
678
|
-
"Content-Type": "application/json"
|
|
679
|
-
},
|
|
680
|
-
body: JSON.stringify({
|
|
681
|
-
contents: [
|
|
682
|
-
{
|
|
683
|
-
parts: [
|
|
684
|
-
{
|
|
685
|
-
text: "Provide a concise summary of the image for semantic search. Exclude any introductions, labels, or formatting \u2014 just return the core content. Also include visible text and contextual details about layout, content type, or purpose."
|
|
686
|
-
},
|
|
687
|
-
{
|
|
688
|
-
inlineData: {
|
|
689
|
-
mimeType,
|
|
690
|
-
data: base64Image
|
|
691
|
-
}
|
|
692
|
-
}
|
|
693
|
-
]
|
|
694
|
-
}
|
|
695
|
-
]
|
|
696
|
-
})
|
|
697
|
-
}
|
|
698
|
-
);
|
|
699
|
-
if (!response.ok) {
|
|
700
|
-
throw new Error(`Google Gemini error: ${response.statusText}`);
|
|
701
|
-
}
|
|
702
|
-
const data = await response.json();
|
|
703
|
-
return data.candidates[0].content.parts[0].text;
|
|
704
|
-
};
|
|
705
|
-
this.handleAnthropic = async (base64Image, mimeType, visionModel, apikey) => {
|
|
706
|
-
const response = await (0, import_undici3.fetch)("https://api.anthropic.com/v1/messages", {
|
|
707
|
-
method: "POST",
|
|
708
|
-
headers: {
|
|
709
|
-
"Content-Type": "application/json",
|
|
710
|
-
"x-api-key": apikey,
|
|
711
|
-
"anthropic-version": "2023-06-01"
|
|
712
|
-
},
|
|
713
|
-
body: JSON.stringify({
|
|
714
|
-
model: visionModel,
|
|
715
|
-
max_tokens: 300,
|
|
716
|
-
messages: [
|
|
717
|
-
{
|
|
718
|
-
role: "user",
|
|
719
|
-
content: [
|
|
720
|
-
{
|
|
721
|
-
type: "text",
|
|
722
|
-
text: "Provide a concise summary of the image for semantic search. Exclude any introductions, labels, or formatting \u2014 just return the core content. Also include visible text and contextual details about layout, content type, or purpose."
|
|
723
|
-
},
|
|
724
|
-
{
|
|
725
|
-
type: "image",
|
|
726
|
-
source: {
|
|
727
|
-
type: "base64",
|
|
728
|
-
media_type: mimeType,
|
|
729
|
-
data: base64Image
|
|
730
|
-
}
|
|
731
|
-
}
|
|
732
|
-
]
|
|
733
|
-
}
|
|
734
|
-
]
|
|
735
|
-
})
|
|
736
|
-
});
|
|
737
|
-
if (!response.ok) {
|
|
738
|
-
throw new Error(`Anthropic Claude error: ${response.statusText}`);
|
|
739
|
-
}
|
|
740
|
-
const data = await response.json();
|
|
741
|
-
return data.content[0].text;
|
|
742
|
-
};
|
|
743
|
-
}
|
|
744
|
-
};
|
|
745
|
-
|
|
746
570
|
// src/file-parser/openoffice-paser.ts
|
|
747
571
|
var OpenOfficeParser = class {
|
|
748
572
|
constructor() {
|
|
@@ -842,7 +666,7 @@ var PowerPointParser = class {
|
|
|
842
666
|
this.mimes = ["application/vnd.openxmlformats-officedocument.presentationml.presentation"];
|
|
843
667
|
this.anyExtractor = anyExtractor;
|
|
844
668
|
}
|
|
845
|
-
async apply(file
|
|
669
|
+
async apply(file) {
|
|
846
670
|
const fileMatchRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+\.xml|ppt\/media\/image\d+\..+|ppt\/slides\/_rels\/slide\d+\.xml.rels/i;
|
|
847
671
|
const slideNumberRegex = /slide(\d+)\.xml/;
|
|
848
672
|
const imageRegex = /^ppt\/media\/image\d+\..+$/i;
|
|
@@ -873,7 +697,7 @@ var PowerPointParser = class {
|
|
|
873
697
|
const imageFullPath = `ppt/${imagePath.replace(/^(\.\.\/)+/, "")}`;
|
|
874
698
|
const imageBuffer = imageBuffers[imageFullPath];
|
|
875
699
|
if (imageBuffer) {
|
|
876
|
-
const imageDescription = await this.convertImageToText(imageBuffer
|
|
700
|
+
const imageDescription = await this.convertImageToText(imageBuffer);
|
|
877
701
|
if (imageDescription) {
|
|
878
702
|
results.push(`[Image]: ${imageDescription}`);
|
|
879
703
|
}
|
|
@@ -898,8 +722,8 @@ var PowerPointParser = class {
|
|
|
898
722
|
const rels = parseString(relsXml).getElementsByTagName("Relationship");
|
|
899
723
|
return Array.from(rels).filter((rel) => rel.getAttribute("Type")?.includes("/image") && rel.getAttribute("Target")).map((rel) => rel.getAttribute("Target"));
|
|
900
724
|
}
|
|
901
|
-
async convertImageToText(imageBuffer
|
|
902
|
-
return this.anyExtractor.parseFile(imageBuffer, null
|
|
725
|
+
async convertImageToText(imageBuffer) {
|
|
726
|
+
return this.anyExtractor.parseFile(imageBuffer, null);
|
|
903
727
|
}
|
|
904
728
|
};
|
|
905
729
|
|
|
@@ -919,7 +743,7 @@ var WordParser = class {
|
|
|
919
743
|
this.mimes = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"];
|
|
920
744
|
this.anyExtractor = anyExtractor;
|
|
921
745
|
}
|
|
922
|
-
async apply(file
|
|
746
|
+
async apply(file) {
|
|
923
747
|
const mainRegex = /word\/document[\d+]?.xml/;
|
|
924
748
|
const footnotesRegex = /word\/footnotes[\d+]?.xml/;
|
|
925
749
|
const endnotesRegex = /word\/endnotes[\d+]?.xml/;
|
|
@@ -949,21 +773,10 @@ var WordParser = class {
|
|
|
949
773
|
const mainText = await this.extractTextAndImages(
|
|
950
774
|
mainDoc.content.toString(),
|
|
951
775
|
embedMap,
|
|
952
|
-
mediaFiles
|
|
953
|
-
extractingOptions
|
|
776
|
+
mediaFiles
|
|
954
777
|
);
|
|
955
|
-
const footnotesText = footnotesDoc ? await this.extractTextAndImages(
|
|
956
|
-
|
|
957
|
-
embedMap,
|
|
958
|
-
mediaFiles,
|
|
959
|
-
extractingOptions
|
|
960
|
-
) : "";
|
|
961
|
-
const endnotesText = endnotesDoc ? await this.extractTextAndImages(
|
|
962
|
-
endnotesDoc.content.toString(),
|
|
963
|
-
embedMap,
|
|
964
|
-
mediaFiles,
|
|
965
|
-
extractingOptions
|
|
966
|
-
) : "";
|
|
778
|
+
const footnotesText = footnotesDoc ? await this.extractTextAndImages(footnotesDoc.content.toString(), embedMap, mediaFiles) : "";
|
|
779
|
+
const endnotesText = endnotesDoc ? await this.extractTextAndImages(endnotesDoc.content.toString(), embedMap, mediaFiles) : "";
|
|
967
780
|
return [
|
|
968
781
|
mainText,
|
|
969
782
|
footnotesText ? "\n--- Footnotes ---\n" + footnotesText : "",
|
|
@@ -988,7 +801,7 @@ var WordParser = class {
|
|
|
988
801
|
}
|
|
989
802
|
return map;
|
|
990
803
|
}
|
|
991
|
-
async extractTextAndImages(xmlContent, embedMap, mediaFiles
|
|
804
|
+
async extractTextAndImages(xmlContent, embedMap, mediaFiles) {
|
|
992
805
|
const doc = parseString(xmlContent);
|
|
993
806
|
const paragraphs = Array.from(doc.getElementsByTagName("w:p"));
|
|
994
807
|
const parts = [];
|
|
@@ -1004,7 +817,7 @@ var WordParser = class {
|
|
|
1004
817
|
const imageFile = mediaFiles[embedMap[embedId]];
|
|
1005
818
|
if (imageFile) {
|
|
1006
819
|
const imageBuffer = imageFile.content;
|
|
1007
|
-
const imageDescription = await this.convertImageToText(imageBuffer
|
|
820
|
+
const imageDescription = await this.convertImageToText(imageBuffer);
|
|
1008
821
|
paragraphText += `
|
|
1009
822
|
[Image: ${imageDescription}]`;
|
|
1010
823
|
}
|
|
@@ -1016,8 +829,8 @@ var WordParser = class {
|
|
|
1016
829
|
}
|
|
1017
830
|
return parts.join("\n");
|
|
1018
831
|
}
|
|
1019
|
-
async convertImageToText(imageBuffer
|
|
1020
|
-
return await this.anyExtractor.parseFile(imageBuffer, null
|
|
832
|
+
async convertImageToText(imageBuffer) {
|
|
833
|
+
return await this.anyExtractor.parseFile(imageBuffer, null);
|
|
1021
834
|
}
|
|
1022
835
|
};
|
|
1023
836
|
|
|
@@ -1026,7 +839,6 @@ var getAnyExtractor = (config) => {
|
|
|
1026
839
|
const anyExtractor = new AnyExtractor(config);
|
|
1027
840
|
const parsers = [
|
|
1028
841
|
new ExcelParser(anyExtractor),
|
|
1029
|
-
new ImageParser(),
|
|
1030
842
|
new OpenOfficeParser(),
|
|
1031
843
|
new PDFParser(),
|
|
1032
844
|
new PowerPointParser(anyExtractor),
|