any-extractor 2.0.2 → 2.0.4

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -343,11 +343,6 @@ var ConfluenceCrawler = class {
343
343
  var AnyExtractor = class {
344
344
  constructor(extractorConfig) {
345
345
  this.extractorConfig = {
346
- llm: {
347
- llmProvider: "openai",
348
- visionModel: "",
349
- apikey: ""
350
- },
351
346
  confluence: {
352
347
  baseUrl: "",
353
348
  email: "",
@@ -361,11 +356,7 @@ var AnyExtractor = class {
361
356
  });
362
357
  return this;
363
358
  };
364
- this.parseFile = async (input, basicAuth = null, extractingOptions = {
365
- extractImages: false,
366
- imageExtractionMethod: "ocr",
367
- language: "eng"
368
- }) => {
359
+ this.parseFile = async (input, basicAuth = null) => {
369
360
  let preparedInput;
370
361
  if (typeof input === "string") {
371
362
  if (isValidUrl(input)) {
@@ -390,17 +381,11 @@ var AnyExtractor = class {
390
381
  }
391
382
  const extractor = this.mimeParserMap.get(mimeDetails.mime);
392
383
  if (!extractor?.apply) {
393
- const message = `AnyExtractor: No extraction method registered for MIME type '${mimeDetails.mime}'`;
394
- throw new Error(message);
384
+ return "";
395
385
  }
396
- return await extractor.apply(preparedInput, extractingOptions, this.extractorConfig);
386
+ return await extractor.apply(preparedInput, this.extractorConfig);
397
387
  };
398
- this.parseConfluenceDoc = async (pageId, extractingOptions = {
399
- extractAttachments: false,
400
- extractImages: false,
401
- imageExtractionMethod: "ocr",
402
- language: "eng"
403
- }) => {
388
+ this.parseConfluenceDoc = async (pageId) => {
404
389
  const { baseUrl, email, apiKey } = this.extractorConfig.confluence || {};
405
390
  if (!baseUrl || !email || !apiKey) {
406
391
  throw new Error("AnyExtractor: Confluence base URL, email, and API key are required");
@@ -409,20 +394,18 @@ var AnyExtractor = class {
409
394
  const content = await confCrawler.extractPageContent(pageId);
410
395
  let textContent = "";
411
396
  for (const item of content) {
412
- if (item.type === "image" && extractingOptions.extractImages) {
397
+ if (item.type === "image") {
413
398
  const parsedFile = await this.parseFile(
414
399
  item.content,
415
- `Basic ${Buffer.from(`${email}:${apiKey}`).toString("base64")}`,
416
- extractingOptions
400
+ `Basic ${Buffer.from(`${email}:${apiKey}`).toString("base64")}`
417
401
  );
418
402
  textContent += parsedFile ? `
419
403
  (Image): ${parsedFile}
420
404
  ` : "";
421
- } else if (item.type === "view-file" && extractingOptions.extractAttachments) {
405
+ } else if (item.type === "view-file") {
422
406
  const parsedFile = await this.parseFile(
423
407
  item.content,
424
- `Basic ${Buffer.from(`${email}:${apiKey}`).toString("base64")}`,
425
- extractingOptions
408
+ `Basic ${Buffer.from(`${email}:${apiKey}`).toString("base64")}`
426
409
  );
427
410
  textContent += parsedFile ? `
428
411
  [Attachment]: ${parsedFile}
@@ -467,7 +450,7 @@ var ExcelParser = class {
467
450
  this.mimes = ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"];
468
451
  this.anyExtractor = anyExtractor;
469
452
  }
470
- async apply(file, extractingOptions) {
453
+ async apply(file) {
471
454
  const patterns = {
472
455
  sheets: /xl\/worksheets\/sheet\d+.xml/g,
473
456
  drawings: /xl\/drawings\/drawing\d+.xml/g,
@@ -501,7 +484,7 @@ var ExcelParser = class {
501
484
  } else if (patterns.charts.test(file2.path)) {
502
485
  return this.extractChartText([file2.content.toString()]);
503
486
  } else if (patterns.images.test(file2.path)) {
504
- return await this.extractImageText([file2], extractingOptions);
487
+ return await this.extractImageText([file2]);
505
488
  }
506
489
  return null;
507
490
  }).filter(Boolean);
@@ -538,11 +521,11 @@ var ExcelParser = class {
538
521
  return Array.from(vNodes).map((node) => node.childNodes[0]?.nodeValue ?? "").join("\n");
539
522
  }).join("\n");
540
523
  }
541
- async extractImageText(imageFiles, extractingOptions) {
524
+ async extractImageText(imageFiles) {
542
525
  const texts = await Promise.all(
543
526
  imageFiles.map(async (file) => {
544
527
  try {
545
- return await this.anyExtractor.parseFile(file.content, null, extractingOptions);
528
+ return await this.anyExtractor.parseFile(file.content, null);
546
529
  } catch (e) {
547
530
  console.log(`AnyExtractor: Error extracting text from image ${file.path}:`, e);
548
531
  return "";
@@ -584,165 +567,6 @@ var ExcelParser = class {
584
567
  }
585
568
  };
586
569
 
587
- // src/file-parser/image-parser.ts
588
- var import_tesseract = __toESM(require("tesseract.js"));
589
- var import_undici3 = require("undici");
590
- var import_file_type_mime2 = require("file-type-mime");
591
- var ImageParser = class {
592
- constructor() {
593
- this.mimes = ["image/jpeg", "image/png", "image/webp"];
594
- this.apply = async (file, extractingOptions, extractorConfig) => {
595
- const { extractImages, imageExtractionMethod, language } = extractingOptions;
596
- if (!extractImages) {
597
- return "";
598
- }
599
- const mimeDetails = (0, import_file_type_mime2.parse)(
600
- file.buffer.slice(file.byteOffset, file.byteOffset + file.byteLength)
601
- );
602
- if (!mimeDetails) {
603
- throw new Error("AnyExtractor: Unable to parse MIME type");
604
- }
605
- const mimeType = mimeDetails.mime;
606
- if (!this.mimes.includes(mimeType)) {
607
- return "";
608
- }
609
- if (imageExtractionMethod === "ocr") {
610
- return await this.performOCR(file, language);
611
- }
612
- const { llmProvider, visionModel, apikey } = extractorConfig.llm || {};
613
- if (!llmProvider || !visionModel || !apikey) {
614
- throw new Error(
615
- "AnyExtractor: LLM provider, vision model and API key are required for image extraction"
616
- );
617
- }
618
- const base64Image = file.toString("base64");
619
- switch (llmProvider) {
620
- case "openai":
621
- return this.handleOpenAI(base64Image, mimeType, visionModel, apikey);
622
- case "google":
623
- return this.handleGoogle(base64Image, mimeType, visionModel, apikey);
624
- case "anthropic":
625
- return this.handleAnthropic(base64Image, mimeType, visionModel, apikey);
626
- default:
627
- throw new Error(`ImageParser: Unsupported LLM provider '${llmProvider}'`);
628
- }
629
- };
630
- this.performOCR = async (file, language) => {
631
- const worker = await import_tesseract.default.createWorker(language);
632
- const {
633
- data: { text }
634
- } = await worker.recognize(file);
635
- await worker.terminate();
636
- return text;
637
- };
638
- this.handleOpenAI = async (base64Image, mimeType, visionModel, apikey) => {
639
- const response = await (0, import_undici3.fetch)("https://api.openai.com/v1/chat/completions", {
640
- method: "POST",
641
- headers: {
642
- "Content-Type": "application/json",
643
- Authorization: `Bearer ${apikey}`
644
- },
645
- body: JSON.stringify({
646
- model: visionModel,
647
- messages: [
648
- {
649
- role: "user",
650
- content: [
651
- {
652
- type: "text",
653
- text: "Provide a concise summary of the image for semantic search. Exclude any introductions, labels, or formatting \u2014 just return the core content. Also include visible text and contextual details about layout, content type, or purpose."
654
- },
655
- {
656
- type: "image_url",
657
- image_url: {
658
- url: `data:${mimeType};base64,${base64Image}`
659
- }
660
- }
661
- ]
662
- }
663
- ]
664
- })
665
- });
666
- if (!response.ok) {
667
- throw new Error(`ImageParser: OpenAI API error ${response.status}`);
668
- }
669
- const data = await response.json();
670
- return data.choices[0].message.content;
671
- };
672
- this.handleGoogle = async (base64Image, mimeType, visionModel, apikey) => {
673
- const response = await (0, import_undici3.fetch)(
674
- `https://generativelanguage.googleapis.com/v1beta/models/${visionModel}:generateContent?key=${apikey}`,
675
- {
676
- method: "POST",
677
- headers: {
678
- "Content-Type": "application/json"
679
- },
680
- body: JSON.stringify({
681
- contents: [
682
- {
683
- parts: [
684
- {
685
- text: "Provide a concise summary of the image for semantic search. Exclude any introductions, labels, or formatting \u2014 just return the core content. Also include visible text and contextual details about layout, content type, or purpose."
686
- },
687
- {
688
- inlineData: {
689
- mimeType,
690
- data: base64Image
691
- }
692
- }
693
- ]
694
- }
695
- ]
696
- })
697
- }
698
- );
699
- if (!response.ok) {
700
- throw new Error(`Google Gemini error: ${response.statusText}`);
701
- }
702
- const data = await response.json();
703
- return data.candidates[0].content.parts[0].text;
704
- };
705
- this.handleAnthropic = async (base64Image, mimeType, visionModel, apikey) => {
706
- const response = await (0, import_undici3.fetch)("https://api.anthropic.com/v1/messages", {
707
- method: "POST",
708
- headers: {
709
- "Content-Type": "application/json",
710
- "x-api-key": apikey,
711
- "anthropic-version": "2023-06-01"
712
- },
713
- body: JSON.stringify({
714
- model: visionModel,
715
- max_tokens: 300,
716
- messages: [
717
- {
718
- role: "user",
719
- content: [
720
- {
721
- type: "text",
722
- text: "Provide a concise summary of the image for semantic search. Exclude any introductions, labels, or formatting \u2014 just return the core content. Also include visible text and contextual details about layout, content type, or purpose."
723
- },
724
- {
725
- type: "image",
726
- source: {
727
- type: "base64",
728
- media_type: mimeType,
729
- data: base64Image
730
- }
731
- }
732
- ]
733
- }
734
- ]
735
- })
736
- });
737
- if (!response.ok) {
738
- throw new Error(`Anthropic Claude error: ${response.statusText}`);
739
- }
740
- const data = await response.json();
741
- return data.content[0].text;
742
- };
743
- }
744
- };
745
-
746
570
  // src/file-parser/openoffice-paser.ts
747
571
  var OpenOfficeParser = class {
748
572
  constructor() {
@@ -842,7 +666,7 @@ var PowerPointParser = class {
842
666
  this.mimes = ["application/vnd.openxmlformats-officedocument.presentationml.presentation"];
843
667
  this.anyExtractor = anyExtractor;
844
668
  }
845
- async apply(file, extractingOptions) {
669
+ async apply(file) {
846
670
  const fileMatchRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+\.xml|ppt\/media\/image\d+\..+|ppt\/slides\/_rels\/slide\d+\.xml.rels/i;
847
671
  const slideNumberRegex = /slide(\d+)\.xml/;
848
672
  const imageRegex = /^ppt\/media\/image\d+\..+$/i;
@@ -873,7 +697,7 @@ var PowerPointParser = class {
873
697
  const imageFullPath = `ppt/${imagePath.replace(/^(\.\.\/)+/, "")}`;
874
698
  const imageBuffer = imageBuffers[imageFullPath];
875
699
  if (imageBuffer) {
876
- const imageDescription = await this.convertImageToText(imageBuffer, extractingOptions);
700
+ const imageDescription = await this.convertImageToText(imageBuffer);
877
701
  if (imageDescription) {
878
702
  results.push(`[Image]: ${imageDescription}`);
879
703
  }
@@ -898,8 +722,8 @@ var PowerPointParser = class {
898
722
  const rels = parseString(relsXml).getElementsByTagName("Relationship");
899
723
  return Array.from(rels).filter((rel) => rel.getAttribute("Type")?.includes("/image") && rel.getAttribute("Target")).map((rel) => rel.getAttribute("Target"));
900
724
  }
901
- async convertImageToText(imageBuffer, extractingOptions) {
902
- return this.anyExtractor.parseFile(imageBuffer, null, extractingOptions);
725
+ async convertImageToText(imageBuffer) {
726
+ return this.anyExtractor.parseFile(imageBuffer, null);
903
727
  }
904
728
  };
905
729
 
@@ -919,7 +743,7 @@ var WordParser = class {
919
743
  this.mimes = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"];
920
744
  this.anyExtractor = anyExtractor;
921
745
  }
922
- async apply(file, extractingOptions) {
746
+ async apply(file) {
923
747
  const mainRegex = /word\/document[\d+]?.xml/;
924
748
  const footnotesRegex = /word\/footnotes[\d+]?.xml/;
925
749
  const endnotesRegex = /word\/endnotes[\d+]?.xml/;
@@ -949,21 +773,10 @@ var WordParser = class {
949
773
  const mainText = await this.extractTextAndImages(
950
774
  mainDoc.content.toString(),
951
775
  embedMap,
952
- mediaFiles,
953
- extractingOptions
776
+ mediaFiles
954
777
  );
955
- const footnotesText = footnotesDoc ? await this.extractTextAndImages(
956
- footnotesDoc.content.toString(),
957
- embedMap,
958
- mediaFiles,
959
- extractingOptions
960
- ) : "";
961
- const endnotesText = endnotesDoc ? await this.extractTextAndImages(
962
- endnotesDoc.content.toString(),
963
- embedMap,
964
- mediaFiles,
965
- extractingOptions
966
- ) : "";
778
+ const footnotesText = footnotesDoc ? await this.extractTextAndImages(footnotesDoc.content.toString(), embedMap, mediaFiles) : "";
779
+ const endnotesText = endnotesDoc ? await this.extractTextAndImages(endnotesDoc.content.toString(), embedMap, mediaFiles) : "";
967
780
  return [
968
781
  mainText,
969
782
  footnotesText ? "\n--- Footnotes ---\n" + footnotesText : "",
@@ -988,7 +801,7 @@ var WordParser = class {
988
801
  }
989
802
  return map;
990
803
  }
991
- async extractTextAndImages(xmlContent, embedMap, mediaFiles, extractingOptions) {
804
+ async extractTextAndImages(xmlContent, embedMap, mediaFiles) {
992
805
  const doc = parseString(xmlContent);
993
806
  const paragraphs = Array.from(doc.getElementsByTagName("w:p"));
994
807
  const parts = [];
@@ -1004,7 +817,7 @@ var WordParser = class {
1004
817
  const imageFile = mediaFiles[embedMap[embedId]];
1005
818
  if (imageFile) {
1006
819
  const imageBuffer = imageFile.content;
1007
- const imageDescription = await this.convertImageToText(imageBuffer, extractingOptions);
820
+ const imageDescription = await this.convertImageToText(imageBuffer);
1008
821
  paragraphText += `
1009
822
  [Image: ${imageDescription}]`;
1010
823
  }
@@ -1016,8 +829,8 @@ var WordParser = class {
1016
829
  }
1017
830
  return parts.join("\n");
1018
831
  }
1019
- async convertImageToText(imageBuffer, extractingOptions) {
1020
- return await this.anyExtractor.parseFile(imageBuffer, null, extractingOptions);
832
+ async convertImageToText(imageBuffer) {
833
+ return await this.anyExtractor.parseFile(imageBuffer, null);
1021
834
  }
1022
835
  };
1023
836
 
@@ -1026,7 +839,6 @@ var getAnyExtractor = (config) => {
1026
839
  const anyExtractor = new AnyExtractor(config);
1027
840
  const parsers = [
1028
841
  new ExcelParser(anyExtractor),
1029
- new ImageParser(),
1030
842
  new OpenOfficeParser(),
1031
843
  new PDFParser(),
1032
844
  new PowerPointParser(anyExtractor),