pdf-plus 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/index.d.mts +5 -4
- package/dist/index.d.ts +5 -4
- package/dist/index.js +29 -27
- package/dist/index.mjs +29 -27
- package/package.json +5 -5
package/README.md
CHANGED
|
@@ -398,7 +398,7 @@ const result = await extractPdfContent("document.pdf", {
|
|
|
398
398
|
|
|
399
399
|
### Getting Help
|
|
400
400
|
|
|
401
|
-
- Check the [Issues](https://github.com/kauandotnet/
|
|
401
|
+
- Check the [Issues](https://github.com/kauandotnet/pdf-plus/issues) page
|
|
402
402
|
- Review [examples](./examples/) for common use cases
|
|
403
403
|
- Enable verbose logging for debugging: `{ verbose: true }`
|
|
404
404
|
|
package/dist/index.d.mts
CHANGED
|
@@ -63,6 +63,7 @@ interface ExtractionResult {
|
|
|
63
63
|
pages: PageInfo[];
|
|
64
64
|
images: ImageItem[];
|
|
65
65
|
textItems: TextItem[];
|
|
66
|
+
text: string;
|
|
66
67
|
textWithRefs: string;
|
|
67
68
|
cleanText: string;
|
|
68
69
|
summary?: DocumentSummary;
|
|
@@ -605,7 +606,7 @@ declare function validateFilePath(filePath: string, extensions?: string[]): Vali
|
|
|
605
606
|
*
|
|
606
607
|
* @example
|
|
607
608
|
* ```typescript
|
|
608
|
-
* import { extractPdfContent } from '
|
|
609
|
+
* import { extractPdfContent } from 'pdf-plus';
|
|
609
610
|
*
|
|
610
611
|
* const result = await extractPdfContent('document.pdf', {
|
|
611
612
|
* extractText: true,
|
|
@@ -626,7 +627,7 @@ declare function extractPdfContent(pdfPath: string, options?: ExtractionOptions)
|
|
|
626
627
|
*
|
|
627
628
|
* @example
|
|
628
629
|
* ```typescript
|
|
629
|
-
* import { extractText } from '
|
|
630
|
+
* import { extractText } from 'pdf-plus';
|
|
630
631
|
*
|
|
631
632
|
* const text = await extractText('document.pdf');
|
|
632
633
|
* console.log(`Extracted ${text.length} characters`);
|
|
@@ -642,7 +643,7 @@ declare function extractText(pdfPath: string, options?: Partial<ExtractionOption
|
|
|
642
643
|
*
|
|
643
644
|
* @example
|
|
644
645
|
* ```typescript
|
|
645
|
-
* import { extractImages } from '
|
|
646
|
+
* import { extractImages } from 'pdf-plus';
|
|
646
647
|
*
|
|
647
648
|
* const images = await extractImages('document.pdf', {
|
|
648
649
|
* extractImageFiles: true,
|
|
@@ -663,7 +664,7 @@ declare function extractImages(pdfPath: string, options?: Partial<ExtractionOpti
|
|
|
663
664
|
*
|
|
664
665
|
* @example
|
|
665
666
|
* ```typescript
|
|
666
|
-
* import { extractImageFiles } from '
|
|
667
|
+
* import { extractImageFiles } from 'pdf-plus';
|
|
667
668
|
*
|
|
668
669
|
* const filePaths = await extractImageFiles('document.pdf', './images', {
|
|
669
670
|
* verbose: true
|
package/dist/index.d.ts
CHANGED
|
@@ -63,6 +63,7 @@ interface ExtractionResult {
|
|
|
63
63
|
pages: PageInfo[];
|
|
64
64
|
images: ImageItem[];
|
|
65
65
|
textItems: TextItem[];
|
|
66
|
+
text: string;
|
|
66
67
|
textWithRefs: string;
|
|
67
68
|
cleanText: string;
|
|
68
69
|
summary?: DocumentSummary;
|
|
@@ -605,7 +606,7 @@ declare function validateFilePath(filePath: string, extensions?: string[]): Vali
|
|
|
605
606
|
*
|
|
606
607
|
* @example
|
|
607
608
|
* ```typescript
|
|
608
|
-
* import { extractPdfContent } from '
|
|
609
|
+
* import { extractPdfContent } from 'pdf-plus';
|
|
609
610
|
*
|
|
610
611
|
* const result = await extractPdfContent('document.pdf', {
|
|
611
612
|
* extractText: true,
|
|
@@ -626,7 +627,7 @@ declare function extractPdfContent(pdfPath: string, options?: ExtractionOptions)
|
|
|
626
627
|
*
|
|
627
628
|
* @example
|
|
628
629
|
* ```typescript
|
|
629
|
-
* import { extractText } from '
|
|
630
|
+
* import { extractText } from 'pdf-plus';
|
|
630
631
|
*
|
|
631
632
|
* const text = await extractText('document.pdf');
|
|
632
633
|
* console.log(`Extracted ${text.length} characters`);
|
|
@@ -642,7 +643,7 @@ declare function extractText(pdfPath: string, options?: Partial<ExtractionOption
|
|
|
642
643
|
*
|
|
643
644
|
* @example
|
|
644
645
|
* ```typescript
|
|
645
|
-
* import { extractImages } from '
|
|
646
|
+
* import { extractImages } from 'pdf-plus';
|
|
646
647
|
*
|
|
647
648
|
* const images = await extractImages('document.pdf', {
|
|
648
649
|
* extractImageFiles: true,
|
|
@@ -663,7 +664,7 @@ declare function extractImages(pdfPath: string, options?: Partial<ExtractionOpti
|
|
|
663
664
|
*
|
|
664
665
|
* @example
|
|
665
666
|
* ```typescript
|
|
666
|
-
* import { extractImageFiles } from '
|
|
667
|
+
* import { extractImageFiles } from 'pdf-plus';
|
|
667
668
|
*
|
|
668
669
|
* const filePaths = await extractImageFiles('document.pdf', './images', {
|
|
669
670
|
* verbose: true
|
package/dist/index.js
CHANGED
|
@@ -1,40 +1,42 @@
|
|
|
1
|
-
'use strict';Object.defineProperty(exports,'__esModule',{value:true});var
|
|
2
|
-
`);for(let a of r){let s=a.match(/^\s*(\d+)\s+(\d+)\s+\w+\s+(\d+)\s+(\d+)\s+\w+\s+\d+\s+\d+\s+(\w+)/);if(s){let o=parseInt(s[1],10),n=parseInt(s[2],10),l=parseInt(s[3],10),i=parseInt(s[4],10),c=s[5]?.toUpperCase()||"PNG";e.push({page:o,index:n,width:l,height:i,format:c});}}return e}};});var te={};Y(te,{ImageEngineFactory:()=>X});var X,re=O(()=>{Q();ee();X=class p{static engines=new Map;static async getEngine(t){if(t==="auto"&&(t=await p.selectBestEngine()),p.engines.has(t))return p.engines.get(t);let e;switch(t){case "pdf-lib":e=new B;break;case "poppler":e=new A;break;default:throw new Error(`Unknown image extraction engine: ${t}`)}if(!await e.isAvailable())throw new Error(`Image extraction engine '${t}' is not available on this system`);return p.engines.set(t,e),e}static async getAvailableEngines(){let t=[B,A],e=[];for(let r of t){let a=new r,s=await a.isAvailable();e.push({name:a.name,description:a.description,available:s,capabilities:a.getCapabilities()});}return e}static async selectBestEngine(){let t=await p.getAvailableEngines(),e=["pdf-lib","poppler"];for(let r of e)if(t.find(s=>s.name===r)?.available)return r;throw new Error("No image extraction engines are available on this system")}static clearCache(){p.engines.clear();}static getRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"},{useCase:"Vector image extraction",engine:"poppler",reason:"Poppler can extract vector graphics as raster images"}]}};});var ae={};Y(ae,{ImageExtractor:()=>exports.ImageExtractor});exports.ImageExtractor=void 0;var N=O(()=>{exports.ImageExtractor=class{async extract(t,e={}){let r={verbose:false,extractImageFiles:false,imageEngine:"auto",...e};r.verbose,r.extractImageFiles&&r.imageOutputDir&&(y__namespace.default.existsSync(r.imageOutputDir)||y__namespace.default.mkdirSync(r.imageOutputDir,{recursive:true}));try{let{ImageEngineFactory:a}=await Promise.resolve().then(()=>(re(),te)),s=await a.getEngine(r.imageEngine);r.verbose;let o=await s.extractImages(t,r);if(!o.success)throw new Error(o.error||"Engine extraction failed");return {success:!0,images:o.images||[],metadata:{totalImages:o.images?.length||0,engine:s.name}}}catch{r.verbose;try{return await this.extractWithPdfLib(t,r)}catch(s){return r.verbose,{success:false,images:[],error:s instanceof Error?s.message:String(s)}}}}static async getAvailableEngines(){return [{name:"pdf-lib",description:"PDF-lib based extraction with full format support",available:true,capabilities:{formats:["jpg","jpeg","png","jp2","tiff"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:false}},{name:"poppler",description:"Poppler-based extraction using pdfimages command",available:false,capabilities:{formats:["jpg","jpeg","png","tiff","ppm","pbm"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:true}}]}static getEngineRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing (coming soon)"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"}]}async extractWithPdfLib(t,e={}){try{let{PDFDocument:r,PDFName:a}=await import('pdf-lib'),s=y__namespace.default.readFileSync(t),o=await r.load(s,{ignoreEncryption:!0}),n=o.getPageCount(),l=[],i=1;e.verbose,e.extractImageFiles&&e.imageOutputDir&&(y__namespace.default.existsSync(e.imageOutputDir)||y__namespace.default.mkdirSync(e.imageOutputDir,{recursive:!0}));for(let c=0;c<n;c++){let u=c+1;try{let g=o.getPage(c).node.Resources();if(!g){e.verbose;continue}let m=g.get(a.of("XObject"));if(!m){e.verbose;continue}let b=m.dict;e.verbose;for(let[d,x]of b)try{let h=o.context.lookup(x),P=h.dict.get(a.of("Subtype"));if(!P||P.toString()!=="/Image")continue;let I=await this.extractImageFromPdfObject(h,u,i,e);I&&(l.push(I),i++);}catch{e.verbose;}}catch{e.verbose;}}return e.verbose,{images:l,totalPages:n,totalImages:l.length}}catch(r){throw e.verbose,r}}async extractImageFromPdfObject(t,e,r,a){try{let{PDFName:s}=await import('pdf-lib'),o=t.dict.get(s.of("Width")),n=t.dict.get(s.of("Height")),l=t.dict.get(s.of("Filter")),i=t.dict.get(s.of("ColorSpace")),c=t.dict.get(s.of("BitsPerComponent")),u=o&&typeof o.value=="number"?o.value:100,f=n&&typeof n.value=="number"?n.value:100,g=c&&typeof c.value=="number"?c.value:8;a.verbose;let m=await this.extractImageData(t,l,u,f,i,g,a);if(!m.success||!m.imageData)return a.verbose,null;let b=m.imageData,d=m.mimeType||"image/jpeg",x=m.extension||"jpg",h=`img_p${e}_${r}.${x}`,P="",I=b.length;return a.extractImageFiles&&a.imageOutputDir&&(P=D__default.default.join(a.imageOutputDir,h),y__namespace.default.writeFileSync(P,b),a.verbose),{id:`img_${r}`,name:h,page:e,position:{x:0,y:0,width:u,height:f},width:u,height:f,format:d==="image/jpeg"?"JPEG":d==="image/png"?"PNG":"unknown",filePath:P}}catch{return a.verbose,null}}async extractImageData(t,e,r,a,s,o,n){try{let l=await import('zlib'),i,c="image/jpeg",u="jpg";if(e){let f=e.toString();if(n.verbose,f.includes("DCTDecode")&&f.includes("FlateDecode")){n.verbose;try{let g=t.contents;i=l.inflateSync(Buffer.from(g)),c="image/jpeg",u="jpg",n.verbose;}catch(g){return n.verbose,{success:!1,error:`Zlib decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else if(f.includes("DCTDecode"))n.verbose,i=Buffer.from(t.contents),c="image/jpeg",u="jpg";else if(f.includes("FlateDecode")){n.verbose;try{let g=t.contents,m=l.inflateSync(Buffer.from(g));n.verbose;let b=this.detectImageFormat(m);if(b.valid)i=m,c=b.mimeType,u=b.extension,n.verbose;else {let d=await this.createPngFromPdfMetadata(m,r,a,s,o,n);if(d.success&&d.pngData)i=d.pngData,c="image/png",u="png",n.verbose;else return n.verbose,{success:!1,error:`PNG creation failed: ${d.error}`}}}catch(g){return n.verbose,{success:!1,error:`FlateDecode decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else if(f.includes("JPXDecode")){n.verbose;try{i=Buffer.from(t.contents),c="image/jp2",u="jp2",n.verbose;}catch(g){return n.verbose,{success:!1,error:`JPXDecode extraction failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else {n.verbose;try{let g=await t.asUint8Array();i=Buffer.from(g);let m=this.detectImageFormat(i);m.valid&&(c=m.mimeType,u=m.extension);}catch(g){return n.verbose,{success:!1,error:`Generic decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}}else {n.verbose;try{let f=await t.asUint8Array();i=Buffer.from(f);let g=this.detectImageFormat(i);g.valid&&(c=g.mimeType,u=g.extension);}catch(f){return n.verbose,{success:!1,error:`Raw data extraction failed: ${f instanceof Error?f.message:"Unknown error"}`}}}return !i||i.length<100?{success:!1,error:`Image data too small: ${i?.length||0} bytes`}:{success:!0,imageData:i,mimeType:c,extension:u}}catch(l){return n.verbose,{success:false,error:l instanceof Error?l.message:"Unknown error"}}}detectImageFormat(t){return !t||t.length<10?{valid:false}:t[0]===255&&t[1]===216?{valid:true,mimeType:"image/jpeg",extension:"jpg"}:t[0]===137&&t[1]===80&&t[2]===78&&t[3]===71?{valid:true,mimeType:"image/png",extension:"png"}:t[0]===71&&t[1]===73&&t[2]===70?{valid:true,mimeType:"image/gif",extension:"gif"}:t[0]===73&&t[1]===73||t[0]===77&&t[1]===77?{valid:true,mimeType:"image/tiff",extension:"tiff"}:t.length>=12&&t[0]===0&&t[1]===0&&t[2]===0&&t[3]===12&&t[4]===106&&t[5]===80&&t[6]===32&&t[7]===32?{valid:true,mimeType:"image/jp2",extension:"jp2"}:{valid:false}}async createPngFromPdfMetadata(t,e,r,a,s,o){try{let{PNG:n}=await import('pngjs'),l=a?.toString()||"",i=3,c=2;l.includes("DeviceGray")||l.includes("Gray")?(i=1,c=0):l.includes("DeviceRGB")||l.includes("RGB")?(i=3,c=2):(l.includes("DeviceCMYK")||l.includes("CMYK"))&&(i=4,c=2);let u=e*r*i*(s/8),f=t.length;if(o.verbose,Math.abs(f-u)>f*.1)return {success:!1,error:`Data size mismatch: expected ${u}, got ${f} bytes`};let g=new n({width:e,height:r,colorType:c===0?0:6,bitDepth:8}),m;if(i===1){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=t[d]||0,h=d*4;m[h]=x,m[h+1]=x,m[h+2]=x,m[h+3]=255;}}else if(i===3){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=d*3,h=d*4;m[h]=t[x]||0,m[h+1]=t[x+1]||0,m[h+2]=t[x+2]||0,m[h+3]=255;}}else if(i===4){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=d*4,h=(t[x]||0)/255,P=(t[x+1]||0)/255,I=(t[x+2]||0)/255,E=(t[x+3]||0)/255,v=d*4;m[v]=Math.round(255*(1-h)*(1-E)),m[v+1]=Math.round(255*(1-P)*(1-E)),m[v+2]=Math.round(255*(1-I)*(1-E)),m[v+3]=255;}}else return {success:!1,error:`Unsupported color space with ${i} components`};g.data=m;let b=n.sync.write(g);return o.verbose,{success:!0,pngData:b}}catch(n){return {success:false,error:`PNG creation error: ${n instanceof Error?n.message:"Unknown error"}`}}}};});function k(p){let t=[];if(p.pdfPath?typeof p.pdfPath!="string"?t.push({field:"pdfPath",message:"PDF path must be a string",value:p.pdfPath}):y__namespace.default.existsSync(p.pdfPath)?p.pdfPath.toLowerCase().endsWith(".pdf")||t.push({field:"pdfPath",message:"File must have .pdf extension",value:p.pdfPath}):t.push({field:"pdfPath",message:"PDF file does not exist",value:p.pdfPath}):t.push({field:"pdfPath",message:"PDF path is required",value:p.pdfPath}),p.outputDir&&typeof p.outputDir!="string"&&t.push({field:"outputDir",message:"Output directory must be a string",value:p.outputDir}),p.options){let{options:e}=p;e.extractText!==void 0&&typeof e.extractText!="boolean"&&t.push({field:"options.extractText",message:"extractText must be a boolean",value:e.extractText}),e.extractImages!==void 0&&typeof e.extractImages!="boolean"&&t.push({field:"options.extractImages",message:"extractImages must be a boolean",value:e.extractImages}),e.extractImageFiles!==void 0&&typeof e.extractImageFiles!="boolean"&&t.push({field:"options.extractImageFiles",message:"extractImageFiles must be a boolean",value:e.extractImageFiles}),e.useImagePaths!==void 0&&typeof e.useImagePaths!="boolean"&&t.push({field:"options.useImagePaths",message:"useImagePaths must be a boolean",value:e.useImagePaths}),e.imageOutputDir&&typeof e.imageOutputDir!="string"&&t.push({field:"options.imageOutputDir",message:"imageOutputDir must be a string",value:e.imageOutputDir}),e.imageRefFormat&&typeof e.imageRefFormat!="string"&&t.push({field:"options.imageRefFormat",message:"imageRefFormat must be a string",value:e.imageRefFormat}),e.baseName&&typeof e.baseName!="string"&&t.push({field:"options.baseName",message:"baseName must be a string",value:e.baseName}),e.verbose!==void 0&&typeof e.verbose!="boolean"&&t.push({field:"options.verbose",message:"verbose must be a boolean",value:e.verbose}),e.memoryLimit&&typeof e.memoryLimit!="string"?t.push({field:"options.memoryLimit",message:"memoryLimit must be a string",value:e.memoryLimit}):e.memoryLimit&&!ce(e.memoryLimit)&&t.push({field:"options.memoryLimit",message:'memoryLimit must be in format like "512MB", "1GB", etc.',value:e.memoryLimit}),e.batchSize!==void 0&&(typeof e.batchSize!="number"?t.push({field:"options.batchSize",message:"batchSize must be a number",value:e.batchSize}):(e.batchSize<1||e.batchSize>100)&&t.push({field:"options.batchSize",message:"batchSize must be between 1 and 100",value:e.batchSize})),e.progressCallback&&typeof e.progressCallback!="function"&&t.push({field:"options.progressCallback",message:"progressCallback must be a function",value:typeof e.progressCallback}),e.extractText===false&&e.extractImages===false&&t.push({field:"options",message:"At least one of extractText or extractImages must be true",value:{extractText:e.extractText,extractImages:e.extractImages}}),e.useImagePaths===true&&e.extractImageFiles!==true&&t.push({field:"options",message:"useImagePaths requires extractImageFiles to be true",value:{useImagePaths:e.useImagePaths,extractImageFiles:e.extractImageFiles}});}return t}function ce(p){return /^\d+(\.\d+)?(MB|GB|KB)$/i.test(p)}function V(p){let t=[],e=["{id}","{name}","{page}","{index}","{path}"];e.some(o=>p.includes(o))||t.push({field:"imageRefFormat",message:`Format must contain at least one valid placeholder: ${e.join(", ")}`,value:p});let a=/\{([^}]+)\}/g,s=p.match(a);if(s)for(let o of s)e.includes(o)||t.push({field:"imageRefFormat",message:`Invalid placeholder: ${o}. Valid placeholders are: ${e.join(", ")}`,value:p});return t}function J(p,t=[".pdf"]){let e=[];if(!p)return e.push({field:"filePath",message:"File path is required",value:p}),e;if(typeof p!="string")return e.push({field:"filePath",message:"File path must be a string",value:p}),e;if(!y__namespace.default.existsSync(p))return e.push({field:"filePath",message:"File does not exist",value:p}),e;let r=D__default.default.extname(p).toLowerCase();return t.length>0&&!t.includes(r)&&e.push({field:"filePath",message:`File must have one of these extensions: ${t.join(", ")}`,value:p}),e}var z=class{pdfLibDoc=null;pdfLibPages=[];textData=[];async processPDF(t){let e=y__namespace.readFileSync(t),[r,a]=await Promise.all([this.processPDFLib(e),this.processPDFParse(e)]);this.textData=this.combineResults(r,a);let s=this.textData.map(o=>o.text).join(`
|
|
3
|
-
`).trim();return {totalPages:this.textData.length,pages:this.textData,fullText:s}}async processPDFLib(t){return this.pdfLibDoc=await pdfLib.PDFDocument.load(t,{ignoreEncryption:true}),this.pdfLibPages=this.pdfLibDoc.getPages(),this.pdfLibPages.map((e,r)=>{let{width:a,height:s}=e.getSize();return {pageNumber:r+1,width:a,height:s,rotation:e.getRotation(),mediaBox:e.getMediaBox()}})}async processPDFParse(t){let e=
|
|
4
|
-
`,c=
|
|
5
|
-
`);if(i.trim()){let
|
|
6
|
-
`);
|
|
1
|
+
'use strict';Object.defineProperty(exports,'__esModule',{value:true});var P=require('fs'),D=require('path'),se=require('pdf-parse'),pdfLib=require('pdf-lib'),ue=require('crypto');function _interopDefault(e){return e&&e.__esModule?e:{default:e}}function _interopNamespace(e){if(e&&e.__esModule)return e;var n=Object.create(null);if(e){Object.keys(e).forEach(function(k){if(k!=='default'){var d=Object.getOwnPropertyDescriptor(e,k);Object.defineProperty(n,k,d.get?d:{enumerable:true,get:function(){return e[k]}});}})}n.default=e;return Object.freeze(n)}var P__namespace=/*#__PURE__*/_interopNamespace(P);var D__default=/*#__PURE__*/_interopDefault(D);var se__default=/*#__PURE__*/_interopDefault(se);var ue__default=/*#__PURE__*/_interopDefault(ue);var ce=Object.defineProperty;var O=(p,t)=>()=>(p&&(t=p(p=0)),t);var Y=(p,t)=>{for(var e in t)ce(p,e,{get:t[e],enumerable:true});};var T,H=O(()=>{T=class{};});var B,Q=O(()=>{H();B=class extends T{name="pdf-lib";description="PDF-lib based extraction with full format support";async isAvailable(){try{return await import('pdf-lib'),!0}catch{return false}}getCapabilities(){return {formats:["jpg","jpeg","png","jp2","tiff"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:false}}async extractImages(t,e){try{let{PDFDocument:r,PDFName:a}=await import('pdf-lib');if(!P__namespace.default.existsSync(t))return {success:!1,error:`PDF file not found: ${t}`};let s=P__namespace.default.readFileSync(t),n=await r.load(s),o=n.getPages(),g=[],i=1;e.verbose;for(let l=0;l<o.length;l++){let m=o[l],u=l+1,c=m?.node.Resources;if(!c)continue;let b=(typeof c=="function"?c():c)?.get?.(a.of("XObject"));if(!b)continue;let d=b.entries?.()||[],h=0;e.verbose;for(let[,x]of d){let y=n.context.lookup(x);if(!y||y.dict?.get?.(a.of("Subtype"))?.toString()!=="/Image")continue;h++;let E=await this.extractImageFromPdfObject(y,u,i,e);E&&g.push(E),i++;}}return e.verbose,{success:!0,images:g}}catch(r){return {success:false,error:`PDF-lib extraction failed: ${r instanceof Error?r.message:"Unknown error"}`}}}async extractImageFromPdfObject(t,e,r,a){try{let{PDFName:s}=await import('pdf-lib'),n=t.dict.get(s.of("Width")),o=t.dict.get(s.of("Height")),g=t.dict.get(s.of("Filter")),i=t.dict.get(s.of("ColorSpace")),l=t.dict.get(s.of("BitsPerComponent")),m=n&&typeof n.value=="number"?n.value:100,u=o&&typeof o.value=="number"?o.value:100,c=l&&typeof l.value=="number"?l.value:8;a.verbose;let f=await this.extractImageData(t,g,m,u,i,c,a);if(!f.success||!f.imageData)return a.verbose,null;let b=f.extension||"bin",d=`img_p${e}_${r}.${b}`,h,x=f.imageData.length;if(a.extractImageFiles&&a.imageOutputDir){let y=D__default.default.join(a.imageOutputDir,"images");P__namespace.default.existsSync(y)||P__namespace.default.mkdirSync(y,{recursive:!0}),h=D__default.default.join(y,d),P__namespace.default.writeFileSync(h,f.imageData),a.verbose;}return {id:`img_${r}`,filename:`images/${d}`,filepath:h||"",page:e,width:m,height:u,format:this.getFormatFromMimeType(f.mimeType||""),mimeType:f.mimeType||"",size:x,position:{x:0,y:0,width:m,height:u}}}catch{return a.verbose,null}}async extractImageData(t,e,r,a,s,n,o){try{let g=await import('zlib'),i,l="image/jpeg",m="jpg";if(e){let u=e.toString();if(o.verbose,u.includes("DCTDecode")&&u.includes("FlateDecode")){o.verbose;try{let c=t.contents;i=g.inflateSync(Buffer.from(c)),l="image/jpeg",m="jpg",o.verbose;}catch(c){return o.verbose,{success:!1,error:`Zlib decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else if(u.includes("DCTDecode"))o.verbose,i=Buffer.from(t.contents),l="image/jpeg",m="jpg";else if(u.includes("FlateDecode")){o.verbose;try{let c=t.contents,f=g.inflateSync(Buffer.from(c));o.verbose;let b=this.detectImageFormat(f);if(b.valid)i=f,l=b.mimeType,m=b.extension,o.verbose;else {let d=await this.createPngFromPdfMetadata(f,r,a,s,n,o);if(d.success&&d.pngData)i=d.pngData,l="image/png",m="png",o.verbose;else return o.verbose,{success:!1,error:`PNG creation failed: ${d.error}`}}}catch(c){return o.verbose,{success:!1,error:`FlateDecode decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else if(u.includes("JPXDecode")){o.verbose;try{i=Buffer.from(t.contents),l="image/jp2",m="jp2",o.verbose;}catch(c){return o.verbose,{success:!1,error:`JPXDecode extraction failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else {o.verbose;try{let c=await t.asUint8Array();i=Buffer.from(c);let f=this.detectImageFormat(i);f.valid&&(l=f.mimeType,m=f.extension);}catch(c){return o.verbose,{success:!1,error:`Generic decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}}else {o.verbose;try{let u=await t.asUint8Array();i=Buffer.from(u);let c=this.detectImageFormat(i);c.valid&&(l=c.mimeType,m=c.extension);}catch(u){return o.verbose,{success:!1,error:`Raw data extraction failed: ${u instanceof Error?u.message:"Unknown error"}`}}}return {success:!0,imageData:i,mimeType:l,extension:m}}catch(g){return {success:false,error:`Image data extraction failed: ${g instanceof Error?g.message:"Unknown error"}`}}}detectImageFormat(t){return !t||t.length<10?{valid:false}:t[0]===255&&t[1]===216?{valid:true,mimeType:"image/jpeg",extension:"jpg"}:t[0]===137&&t[1]===80&&t[2]===78&&t[3]===71?{valid:true,mimeType:"image/png",extension:"png"}:t[0]===71&&t[1]===73&&t[2]===70?{valid:true,mimeType:"image/gif",extension:"gif"}:t[0]===73&&t[1]===73||t[0]===77&&t[1]===77?{valid:true,mimeType:"image/tiff",extension:"tiff"}:t.length>=12&&t[0]===0&&t[1]===0&&t[2]===0&&t[3]===12&&t[4]===106&&t[5]===80&&t[6]===32&&t[7]===32?{valid:true,mimeType:"image/jp2",extension:"jp2"}:{valid:false}}async createPngFromPdfMetadata(t,e,r,a,s,n){try{let{PNG:o}=await import('pngjs'),g=a?.toString()||"",i=3,l=2;g.includes("DeviceGray")||g.includes("Gray")?(i=1,l=0):g.includes("DeviceRGB")||g.includes("RGB")?(i=3,l=2):(g.includes("DeviceCMYK")||g.includes("CMYK"))&&(i=4,l=2);let m=e*r*i*(s/8),u=t.length;if(n.verbose,Math.abs(u-m)>u*.1)return {success:!1,error:`Data size mismatch: expected ${m}, got ${u} bytes`};let c=new o({width:e,height:r,colorType:l===0?0:6,bitDepth:8}),f;if(i===1){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=t[d]||0,x=d*4;f[x]=h,f[x+1]=h,f[x+2]=h,f[x+3]=255;}}else if(i===3){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=d*3,x=d*4;f[x]=t[h]||0,f[x+1]=t[h+1]||0,f[x+2]=t[h+2]||0,f[x+3]=255;}}else if(i===4){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=d*4,x=(t[h]||0)/255,y=(t[h+1]||0)/255,w=(t[h+2]||0)/255,E=(t[h+3]||0)/255,I=d*4;f[I]=Math.round(255*(1-x)*(1-E)),f[I+1]=Math.round(255*(1-y)*(1-E)),f[I+2]=Math.round(255*(1-w)*(1-E)),f[I+3]=255;}}else return {success:!1,error:`Unsupported color space with ${i} components`};c.data=f;let b=o.sync.write(c);return n.verbose,{success:!0,pngData:b}}catch(o){return {success:false,error:`PNG creation error: ${o instanceof Error?o.message:"Unknown error"}`}}}getFormatFromMimeType(t){switch(t){case "image/jpeg":return "JPEG";case "image/png":return "PNG";case "image/jp2":return "JPEG 2000";case "image/gif":return "GIF";case "image/tiff":return "TIFF";default:return "unknown"}}};});var A,ee=O(()=>{H();A=class extends T{name="poppler";description="Poppler-based extraction using pdfimages command";async isAvailable(){try{let{Poppler:t}=await import('node-poppler');return new t,!0}catch{return false}}getCapabilities(){return {formats:["png"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:true}}async extractImages(t,e){try{let{Poppler:r}=await import('node-poppler');if(!P__namespace.default.existsSync(t))return {success:!1,error:`PDF file not found: ${t}`};let a=new r,s=[],n=D__default.default.join(process.cwd(),"temp-poppler-images");P__namespace.default.existsSync(n)||P__namespace.default.mkdirSync(n,{recursive:!0});try{e.verbose;let o=D__default.default.join(n,"img"),g={firstPageToConvert:1,lastPageToConvert:-1,pngFile:!0};e.verbose,await a.pdfImages(t,o,g),e.verbose;let i={list:!0};e.verbose;let l=await a.pdfImages(t,void 0,i),m=this.parseImageList(l);e.verbose;let u=P__namespace.default.readdirSync(n).filter(c=>c.startsWith("img-")&&c.endsWith(".png"));e.verbose;for(let c=0;c<u.length;c++){let f=u[c];if(!f)continue;let b=D__default.default.join(n,f);if(!P__namespace.default.existsSync(b))continue;let d=P__namespace.default.statSync(b);P__namespace.default.readFileSync(b);let h=f.match(/img-(\d+)\.png/),x=h?parseInt(h[1],10)+1:c+1,y=m[c]||{page:1,index:x,width:0,height:0,format:"PNG"},w=y.page,E=`img_p${w}_${x}.png`,I;if(e.extractImageFiles&&e.imageOutputDir){let L=D__default.default.join(e.imageOutputDir,"images");P__namespace.default.existsSync(L)||P__namespace.default.mkdirSync(L,{recursive:!0}),I=D__default.default.join(L,E),P__namespace.default.copyFileSync(b,I),e.verbose;}let ie={id:`img_${x}`,filename:`images/${E}`,filepath:I||"",page:w,width:y.width,height:y.height,format:"PNG",mimeType:"image/png",size:d.size,position:{x:0,y:0,width:y.width,height:y.height}};s.push(ie);}return e.verbose,{success:!0,images:s}}finally{P__namespace.default.existsSync(n)&&P__namespace.default.rmSync(n,{recursive:!0,force:!0});}}catch(r){return {success:false,error:`Poppler extraction failed: ${r instanceof Error?r.message:"Unknown error"}`}}}parseImageList(t){let e=[],r=t.split(`
|
|
2
|
+
`);for(let a of r){let s=a.match(/^\s*(\d+)\s+(\d+)\s+\w+\s+(\d+)\s+(\d+)\s+\w+\s+\d+\s+\d+\s+(\w+)/);if(s){let n=parseInt(s[1],10),o=parseInt(s[2],10),g=parseInt(s[3],10),i=parseInt(s[4],10),l=s[5]?.toUpperCase()||"PNG";e.push({page:n,index:o,width:g,height:i,format:l});}}return e}};});var te={};Y(te,{ImageEngineFactory:()=>X});var X,re=O(()=>{Q();ee();X=class p{static engines=new Map;static async getEngine(t){if(t==="auto"&&(t=await p.selectBestEngine()),p.engines.has(t))return p.engines.get(t);let e;switch(t){case "pdf-lib":e=new B;break;case "poppler":e=new A;break;default:throw new Error(`Unknown image extraction engine: ${t}`)}if(!await e.isAvailable())throw new Error(`Image extraction engine '${t}' is not available on this system`);return p.engines.set(t,e),e}static async getAvailableEngines(){let t=[B,A],e=[];for(let r of t){let a=new r,s=await a.isAvailable();e.push({name:a.name,description:a.description,available:s,capabilities:a.getCapabilities()});}return e}static async selectBestEngine(){let t=await p.getAvailableEngines(),e=["pdf-lib","poppler"];for(let r of e)if(t.find(s=>s.name===r)?.available)return r;throw new Error("No image extraction engines are available on this system")}static clearCache(){p.engines.clear();}static getRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"},{useCase:"Vector image extraction",engine:"poppler",reason:"Poppler can extract vector graphics as raster images"}]}};});var ae={};Y(ae,{ImageExtractor:()=>exports.ImageExtractor});exports.ImageExtractor=void 0;var N=O(()=>{exports.ImageExtractor=class{async extract(t,e={}){let r={verbose:false,extractImageFiles:false,imageEngine:"auto",...e};r.verbose,r.extractImageFiles&&r.imageOutputDir&&(P__namespace.default.existsSync(r.imageOutputDir)||P__namespace.default.mkdirSync(r.imageOutputDir,{recursive:true}));try{let{ImageEngineFactory:a}=await Promise.resolve().then(()=>(re(),te)),s=await a.getEngine(r.imageEngine);r.verbose;let n=await s.extractImages(t,r);if(!n.success)throw new Error(n.error||"Engine extraction failed");return {success:!0,images:n.images||[],metadata:{totalImages:n.images?.length||0,engine:s.name}}}catch{r.verbose;try{return await this.extractWithPdfLib(t,r)}catch(s){return r.verbose,{success:false,images:[],error:s instanceof Error?s.message:String(s)}}}}static async getAvailableEngines(){return [{name:"pdf-lib",description:"PDF-lib based extraction with full format support",available:true,capabilities:{formats:["jpg","jpeg","png","jp2","tiff"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:false}},{name:"poppler",description:"Poppler-based extraction using pdfimages command",available:false,capabilities:{formats:["jpg","jpeg","png","tiff","ppm","pbm"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:true}}]}static getEngineRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing (coming soon)"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"}]}async extractWithPdfLib(t,e={}){try{let{PDFDocument:r,PDFName:a}=await import('pdf-lib'),s=P__namespace.default.readFileSync(t),n=await r.load(s,{ignoreEncryption:!0}),o=n.getPageCount(),g=[],i=1;e.verbose,e.extractImageFiles&&e.imageOutputDir&&(P__namespace.default.existsSync(e.imageOutputDir)||P__namespace.default.mkdirSync(e.imageOutputDir,{recursive:!0}));for(let l=0;l<o;l++){let m=l+1;try{let c=n.getPage(l).node.Resources();if(!c){e.verbose;continue}let f=c.get(a.of("XObject"));if(!f){e.verbose;continue}let b=f.dict;e.verbose;for(let[d,h]of b)try{let x=n.context.lookup(h),y=x.dict.get(a.of("Subtype"));if(!y||y.toString()!=="/Image")continue;let w=await this.extractImageFromPdfObject(x,m,i,e);w&&(g.push(w),i++);}catch{e.verbose;}}catch{e.verbose;}}return e.verbose,{images:g,totalPages:o,totalImages:g.length}}catch(r){throw e.verbose,r}}async extractImageFromPdfObject(t,e,r,a){try{let{PDFName:s}=await import('pdf-lib'),n=t.dict.get(s.of("Width")),o=t.dict.get(s.of("Height")),g=t.dict.get(s.of("Filter")),i=t.dict.get(s.of("ColorSpace")),l=t.dict.get(s.of("BitsPerComponent")),m=n&&typeof n.value=="number"?n.value:100,u=o&&typeof o.value=="number"?o.value:100,c=l&&typeof l.value=="number"?l.value:8;a.verbose;let f=await this.extractImageData(t,g,m,u,i,c,a);if(!f.success||!f.imageData)return a.verbose,null;let b=f.imageData,d=f.mimeType||"image/jpeg",h=f.extension||"jpg",x=`img_p${e}_${r}.${h}`,y="",w=b.length;return a.extractImageFiles&&a.imageOutputDir&&(y=D__default.default.join(a.imageOutputDir,x),P__namespace.default.writeFileSync(y,b),a.verbose),{id:`img_${r}`,name:x,page:e,position:{x:0,y:0,width:m,height:u},width:m,height:u,format:d==="image/jpeg"?"JPEG":d==="image/png"?"PNG":"unknown",filePath:y}}catch{return a.verbose,null}}async extractImageData(t,e,r,a,s,n,o){try{let g=await import('zlib'),i,l="image/jpeg",m="jpg";if(e){let u=e.toString();if(o.verbose,u.includes("DCTDecode")&&u.includes("FlateDecode")){o.verbose;try{let c=t.contents;i=g.inflateSync(Buffer.from(c)),l="image/jpeg",m="jpg",o.verbose;}catch(c){return o.verbose,{success:!1,error:`Zlib decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else if(u.includes("DCTDecode"))o.verbose,i=Buffer.from(t.contents),l="image/jpeg",m="jpg";else if(u.includes("FlateDecode")){o.verbose;try{let c=t.contents,f=g.inflateSync(Buffer.from(c));o.verbose;let b=this.detectImageFormat(f);if(b.valid)i=f,l=b.mimeType,m=b.extension,o.verbose;else {let d=await this.createPngFromPdfMetadata(f,r,a,s,n,o);if(d.success&&d.pngData)i=d.pngData,l="image/png",m="png",o.verbose;else return o.verbose,{success:!1,error:`PNG creation failed: ${d.error}`}}}catch(c){return o.verbose,{success:!1,error:`FlateDecode decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else if(u.includes("JPXDecode")){o.verbose;try{i=Buffer.from(t.contents),l="image/jp2",m="jp2",o.verbose;}catch(c){return o.verbose,{success:!1,error:`JPXDecode extraction failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else {o.verbose;try{let c=await t.asUint8Array();i=Buffer.from(c);let f=this.detectImageFormat(i);f.valid&&(l=f.mimeType,m=f.extension);}catch(c){return o.verbose,{success:!1,error:`Generic decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}}else {o.verbose;try{let u=await t.asUint8Array();i=Buffer.from(u);let c=this.detectImageFormat(i);c.valid&&(l=c.mimeType,m=c.extension);}catch(u){return o.verbose,{success:!1,error:`Raw data extraction failed: ${u instanceof Error?u.message:"Unknown error"}`}}}return !i||i.length<100?{success:!1,error:`Image data too small: ${i?.length||0} bytes`}:{success:!0,imageData:i,mimeType:l,extension:m}}catch(g){return o.verbose,{success:false,error:g instanceof Error?g.message:"Unknown error"}}}detectImageFormat(t){return !t||t.length<10?{valid:false}:t[0]===255&&t[1]===216?{valid:true,mimeType:"image/jpeg",extension:"jpg"}:t[0]===137&&t[1]===80&&t[2]===78&&t[3]===71?{valid:true,mimeType:"image/png",extension:"png"}:t[0]===71&&t[1]===73&&t[2]===70?{valid:true,mimeType:"image/gif",extension:"gif"}:t[0]===73&&t[1]===73||t[0]===77&&t[1]===77?{valid:true,mimeType:"image/tiff",extension:"tiff"}:t.length>=12&&t[0]===0&&t[1]===0&&t[2]===0&&t[3]===12&&t[4]===106&&t[5]===80&&t[6]===32&&t[7]===32?{valid:true,mimeType:"image/jp2",extension:"jp2"}:{valid:false}}async createPngFromPdfMetadata(t,e,r,a,s,n){try{let{PNG:o}=await import('pngjs'),g=a?.toString()||"",i=3,l=2;g.includes("DeviceGray")||g.includes("Gray")?(i=1,l=0):g.includes("DeviceRGB")||g.includes("RGB")?(i=3,l=2):(g.includes("DeviceCMYK")||g.includes("CMYK"))&&(i=4,l=2);let m=e*r*i*(s/8),u=t.length;if(n.verbose,Math.abs(u-m)>u*.1)return {success:!1,error:`Data size mismatch: expected ${m}, got ${u} bytes`};let c=new o({width:e,height:r,colorType:l===0?0:6,bitDepth:8}),f;if(i===1){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=t[d]||0,x=d*4;f[x]=h,f[x+1]=h,f[x+2]=h,f[x+3]=255;}}else if(i===3){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=d*3,x=d*4;f[x]=t[h]||0,f[x+1]=t[h+1]||0,f[x+2]=t[h+2]||0,f[x+3]=255;}}else if(i===4){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=d*4,x=(t[h]||0)/255,y=(t[h+1]||0)/255,w=(t[h+2]||0)/255,E=(t[h+3]||0)/255,I=d*4;f[I]=Math.round(255*(1-x)*(1-E)),f[I+1]=Math.round(255*(1-y)*(1-E)),f[I+2]=Math.round(255*(1-w)*(1-E)),f[I+3]=255;}}else return {success:!1,error:`Unsupported color space with ${i} components`};c.data=f;let b=o.sync.write(c);return n.verbose,{success:!0,pngData:b}}catch(o){return {success:false,error:`PNG creation error: ${o instanceof Error?o.message:"Unknown error"}`}}}};});function k(p){let t=[];if(p.pdfPath?typeof p.pdfPath!="string"?t.push({field:"pdfPath",message:"PDF path must be a string",value:p.pdfPath}):P__namespace.default.existsSync(p.pdfPath)?p.pdfPath.toLowerCase().endsWith(".pdf")||t.push({field:"pdfPath",message:"File must have .pdf extension",value:p.pdfPath}):t.push({field:"pdfPath",message:"PDF file does not exist",value:p.pdfPath}):t.push({field:"pdfPath",message:"PDF path is required",value:p.pdfPath}),p.outputDir&&typeof p.outputDir!="string"&&t.push({field:"outputDir",message:"Output directory must be a string",value:p.outputDir}),p.options){let{options:e}=p;e.extractText!==void 0&&typeof e.extractText!="boolean"&&t.push({field:"options.extractText",message:"extractText must be a boolean",value:e.extractText}),e.extractImages!==void 0&&typeof e.extractImages!="boolean"&&t.push({field:"options.extractImages",message:"extractImages must be a boolean",value:e.extractImages}),e.extractImageFiles!==void 0&&typeof e.extractImageFiles!="boolean"&&t.push({field:"options.extractImageFiles",message:"extractImageFiles must be a boolean",value:e.extractImageFiles}),e.useImagePaths!==void 0&&typeof e.useImagePaths!="boolean"&&t.push({field:"options.useImagePaths",message:"useImagePaths must be a boolean",value:e.useImagePaths}),e.imageOutputDir&&typeof e.imageOutputDir!="string"&&t.push({field:"options.imageOutputDir",message:"imageOutputDir must be a string",value:e.imageOutputDir}),e.imageRefFormat&&typeof e.imageRefFormat!="string"&&t.push({field:"options.imageRefFormat",message:"imageRefFormat must be a string",value:e.imageRefFormat}),e.baseName&&typeof e.baseName!="string"&&t.push({field:"options.baseName",message:"baseName must be a string",value:e.baseName}),e.verbose!==void 0&&typeof e.verbose!="boolean"&&t.push({field:"options.verbose",message:"verbose must be a boolean",value:e.verbose}),e.memoryLimit&&typeof e.memoryLimit!="string"?t.push({field:"options.memoryLimit",message:"memoryLimit must be a string",value:e.memoryLimit}):e.memoryLimit&&!le(e.memoryLimit)&&t.push({field:"options.memoryLimit",message:'memoryLimit must be in format like "512MB", "1GB", etc.',value:e.memoryLimit}),e.batchSize!==void 0&&(typeof e.batchSize!="number"?t.push({field:"options.batchSize",message:"batchSize must be a number",value:e.batchSize}):(e.batchSize<1||e.batchSize>100)&&t.push({field:"options.batchSize",message:"batchSize must be between 1 and 100",value:e.batchSize})),e.progressCallback&&typeof e.progressCallback!="function"&&t.push({field:"options.progressCallback",message:"progressCallback must be a function",value:typeof e.progressCallback}),e.extractText===false&&e.extractImages===false&&t.push({field:"options",message:"At least one of extractText or extractImages must be true",value:{extractText:e.extractText,extractImages:e.extractImages}}),e.useImagePaths===true&&e.extractImageFiles!==true&&t.push({field:"options",message:"useImagePaths requires extractImageFiles to be true",value:{useImagePaths:e.useImagePaths,extractImageFiles:e.extractImageFiles}});}return t}function le(p){return /^\d+(\.\d+)?(MB|GB|KB)$/i.test(p)}function V(p){let t=[],e=["{id}","{name}","{page}","{index}","{path}"];e.some(n=>p.includes(n))||t.push({field:"imageRefFormat",message:`Format must contain at least one valid placeholder: ${e.join(", ")}`,value:p});let a=/\{([^}]+)\}/g,s=p.match(a);if(s)for(let n of s)e.includes(n)||t.push({field:"imageRefFormat",message:`Invalid placeholder: ${n}. Valid placeholders are: ${e.join(", ")}`,value:p});return t}function J(p,t=[".pdf"]){let e=[];if(!p)return e.push({field:"filePath",message:"File path is required",value:p}),e;if(typeof p!="string")return e.push({field:"filePath",message:"File path must be a string",value:p}),e;if(!P__namespace.default.existsSync(p))return e.push({field:"filePath",message:"File does not exist",value:p}),e;let r=D__default.default.extname(p).toLowerCase();return t.length>0&&!t.includes(r)&&e.push({field:"filePath",message:`File must have one of these extensions: ${t.join(", ")}`,value:p}),e}var z=class{pdfLibDoc=null;pdfLibPages=[];textData=[];async processPDF(t){let e=P__namespace.readFileSync(t),[r,a]=await Promise.all([this.processPDFLib(e),this.processPDFParse(e)]);this.textData=this.combineResults(r,a);let s=this.textData.map(n=>n.text).join(`
|
|
3
|
+
`).trim();return {totalPages:this.textData.length,pages:this.textData,fullText:s}}async processPDFLib(t){return this.pdfLibDoc=await pdfLib.PDFDocument.load(t,{ignoreEncryption:true}),this.pdfLibPages=this.pdfLibDoc.getPages(),this.pdfLibPages.map((e,r)=>{let{width:a,height:s}=e.getSize();return {pageNumber:r+1,width:a,height:s,rotation:e.getRotation(),mediaBox:e.getMediaBox()}})}async processPDFParse(t){let e=[];return await se__default.default(t,{pagerender:async a=>{try{let s=await a.getTextContent(),n=a.getViewport({scale:1}),o=s.items.filter(u=>typeof u.str=="string");o.sort((u,c)=>{let f=c.transform[5]-u.transform[5];return Math.abs(f)>2?f:u.transform[4]-c.transform[4]});let g="",i=null,l="";for(let u of o){let c=u.transform[5];i===null?(i=c,l=u.str):Math.abs(c-i)>2?(g+=`${l}
|
|
4
|
+
`,i=c,l=u.str):l+=` ${u.str}`;}l&&(g+=l),g=g.trim();let m={pageNumber:a.pageIndex+1,text:g,textItems:s.items,pdfParseWidth:n.width,pdfParseHeight:n.height};return e.push(m),g}catch{return e.push({pageNumber:a.pageIndex+1,text:"",textItems:[],pdfParseWidth:0,pdfParseHeight:0}),""}}}),e.sort((a,s)=>a.pageNumber-s.pageNumber)}combineResults(t,e){return t.map(r=>{let a=e.find(n=>n.pageNumber===r.pageNumber),s=a?.text||"";return {pageNumber:r.pageNumber,text:s,width:r.width,height:r.height,rotation:r.rotation,mediaBox:r.mediaBox,textItems:a?.textItems||[],wordCount:this.countWords(s),characterCount:s.length}})}async extractWithPageMarkers(t,e="--- PAGE {page} ---",r={}){let a=await this.processPDF(t),s=[];if(r.includeImageRefs)try{let{ImageExtractor:o}=await Promise.resolve().then(()=>(N(),ae));s=(await new o().extract(t,{extractImageFiles:!1,verbose:!1,imageEngine:r.imageEngine||"auto"})).images||[];}catch{}let n="";return a.pages.forEach(o=>{let g=e.replace("{page}",o.pageNumber.toString()),i=o.text;if(r.includeImageRefs&&s.length>0){let l=s.filter(m=>m.page===o.pageNumber);if(l.length>0){let m=l.map(u=>(r.imageRefFormat||"[IMG:{id}] {name}").replace("{id}",`img_${u.id}`).replace("{name}",u.filename||`img_p${u.page}_${u.id}.jpg`)).join(`
|
|
5
|
+
`);if(i.trim()){let u=i.split(`
|
|
6
|
+
`);u.length>1?(u.splice(1,0,m),i=u.join(`
|
|
7
7
|
`)):i=`${i}
|
|
8
|
-
${
|
|
8
|
+
${m}`;}else i=m;}}i.trim()?n+=`${g}
|
|
9
9
|
|
|
10
10
|
${i}
|
|
11
|
-
`:
|
|
11
|
+
`:n+=`${g}
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
`;}),{text:
|
|
15
|
-
|
|
16
|
-
`)
|
|
17
|
-
`),
|
|
18
|
-
`).
|
|
14
|
+
`;}),{text:n.trim(),cleanText:a.fullText,numPages:a.totalPages,pages:a.pages}}getPage(t){return this.textData[t-1]||null}async getDetailedPageInfo(t,e){this.textData.length||await this.processPDF(t);let r=this.getPage(e);if(!r)return null;let a=(r.textItems||[]).map(s=>({text:s.str||"",x:s.transform?.[4]||0,y:s.transform?.[5]||0,width:s.width||0,height:s.height||0,fontName:s.fontName,fontSize:s.transform?.[0]||12}));return {pageNumber:e,text:r.text,textItems:a,dimensions:{width:r.width,height:r.height}}}countWords(t){return !t||t.trim()===""?0:t.split(/\s+/).filter(e=>e.length>0).length}async processSinglePage(t,e){try{let r=P__namespace.readFileSync(t),a=await pdfLib.PDFDocument.load(r,{ignoreEncryption:!0});if(e<1||e>a.getPageCount())return null;let n=a.getPages()[e-1];if(!n)return null;let{width:o,height:g}=n.getSize(),i=await pdfLib.PDFDocument.create(),[l]=await i.copyPages(a,[e-1]);i.addPage(l);let m=await i.save(),u=[],c={pagerender:async h=>{try{let x=await h.getTextContent();return u=x.items,x.items.map(y=>y.str||"").join(" ")}catch{return ""}}},f=Buffer.from(m),d=(await se__default.default(f,c)).text.replace(/\s+/g," ").trim();return {pageNumber:e,text:d,width:o,height:g,rotation:n.getRotation().angle,mediaBox:[n.getMediaBox().x,n.getMediaBox().y,n.getMediaBox().width,n.getMediaBox().height],textItems:u,wordCount:this.countWords(d),characterCount:d.length}}catch{return null}}};var $=class{async extract(t){try{let e=P__namespace.default.readFileSync(t),r=[],s=await se__default.default(e,{pagerender:async o=>{try{let i=(await o.getTextContent()).items.map(l=>l.str).join(" ");return r[o.pageNumber-1]=i,i}catch{return r[o.pageNumber-1]="",""}}});return {text:r.filter(o=>o&&o.length>0).join(`
|
|
15
|
+
|
|
16
|
+
`),numPages:s.numpages,info:s.info,metadata:s.metadata,version:s.version}}catch(e){throw new Error(`Failed to extract text from PDF: ${e instanceof Error?e.message:"Unknown error"}`)}}async extractWithPages(t){try{let e=P__namespace.default.readFileSync(t),a=await se__default.default(e,{pagerender:s=>s.getTextContent().then(n=>n.items.map(o=>o.str).join(" "))});return {text:a.text,numPages:a.numpages,info:a.info,metadata:a.metadata,version:a.version,pages:a.text?this.splitTextIntoPages(a.text,a.numpages):[]}}catch(e){throw new Error(`Failed to extract text with pages: ${e instanceof Error?e.message:"Unknown error"}`)}}splitTextIntoPages(t,e){let r=t.split(`
|
|
17
|
+
`),a=Math.ceil(r.length/e),s=[];for(let n=0;n<e;n++){let o=n*a,g=Math.min(o+a,r.length),i=r.slice(o,g).join(`
|
|
18
|
+
`);s.push(i);}return s}async extractTextItems(t,e={}){try{let r=await this.extract(t),a=r.text,s=r.numpages||1,n=a.split(`
|
|
19
|
+
`),o=[],g=1,i=Math.ceil(n.length/s);return n.forEach((l,m)=>{if(l.trim()){g=Math.ceil((m+1)/i);let u="text";l.length<50&&l.trim().match(/^[A-Z\s]+$/)?u="heading":l.length>100?u="paragraph":l.length<30&&(u="caption");let c=12;u==="heading"?c=16:u==="caption"&&(c=10);let f={id:`text_${m+1}`,content:l.trim(),position:{x:0,y:m%i*15,width:l.length*8,height:c},font:{name:"Unknown",size:c,style:u==="heading"?"bold":"normal"},page:g,type:u,fontSize:c,color:"#000000"};o.push(f);}}),e.verbose,o}catch(r){throw new Error(`Failed to extract text items: ${r instanceof Error?r.message:"Unknown error"}`)}}async extractStatistics(t){let e=await this.extract(t),r=e.text,a=r.length,s=r.split(/\s+/).filter(l=>l.length>0).length,n=r.split(`
|
|
20
|
+
`).length,o=e.numPages,g=Math.round(s/o),i=Math.ceil(s/200);return {characterCount:a,wordCount:s,lineCount:n,pageCount:o,averageWordsPerPage:g,readingTime:i}}async extractWithFontInfo(t){return this.extract(t)}cleanText(t){return t.replace(/\s+/g," ").replace(/\n\s*\n/g,`
|
|
19
21
|
`).trim()}async extractPageRange(t,e,r){let a=await this.extractWithPages(t);if(e<1||r>a.numPages||e>r)throw new Error(`Invalid page range: ${e}-${r}. Document has ${a.numPages} pages.`);return a.pages.slice(e-1,r).join(`
|
|
20
22
|
|
|
21
|
-
`)}async searchText(t,e,r=false){let a=await this.extractWithPages(t),s=r?"g":"gi",
|
|
22
|
-
`);
|
|
23
|
-
`);i.push(`Page ${
|
|
24
|
-
`),
|
|
25
|
-
`);
|
|
26
|
-
${
|
|
27
|
-
`;}
|
|
28
|
-
`);}return
|
|
23
|
+
`)}async searchText(t,e,r=false){let a=await this.extractWithPages(t),s=r?"g":"gi",n=new RegExp(e,s),o=0,g=[],i=[];return a.pages.forEach((l,m)=>{let u=l.match(n);if(u){o+=u.length,g.push(m+1);let c=l.split(`
|
|
24
|
+
`);c.forEach((f,b)=>{if(n.test(f)){let d=Math.max(0,b-1),h=Math.min(c.length,b+2),x=c.slice(d,h).join(`
|
|
25
|
+
`);i.push(`Page ${m+1}: ${x}`);}});}}),{found:o>0,occurrences:o,pages:g,context:i}}async extractWithPageMarkers(t,e="--- PAGE {page} ---",r={}){try{let a=new z,s={includeImageRefs:r.includeImageRefs??!0,imageRefFormat:r.imageRefFormat||"[IMG:{id}] {name}"};r.imageEngine&&(s.imageEngine=r.imageEngine);let n=await a.extractWithPageMarkers(t,e,s),o=n.pages.map(g=>({pageNumber:g.pageNumber+(r.pageOffset||0),text:{content:g.text,rawText:g.text,wordCount:g.wordCount,characterCount:g.characterCount},images:[],imageCount:0}));return {text:n.text,pages:o}}catch(a){throw new Error(`Failed to extract text with page markers: ${a instanceof Error?a.message:"Unknown error"}`)}}async extractWithAccuratePages(t){let r=await new z().processPDF(t),a=r.pages.map(s=>({pageNumber:s.pageNumber,text:{content:s.text,rawText:s.text,wordCount:s.wordCount,characterCount:s.characterCount},images:[],imageCount:0}));return {fullText:r.fullText,pages:a,totalPages:r.totalPages}}};N();var S=class{generateTextWithImageRefs(t,e,r,a){if(!t||e.length===0)return t||"";let s=t.split(`
|
|
26
|
+
`),n=Math.ceil(s.length/a),o="";for(let g=1;g<=a;g++){let i=(g-1)*n,l=Math.min(i+n,s.length),m=s.slice(i,l).join(`
|
|
27
|
+
`);m.trim()&&(o+=m);let u=e.filter(c=>c.page===g);for(let c of u){let f=this.formatImageReference(c,r,e.indexOf(c)+1);o+=`
|
|
28
|
+
${f}
|
|
29
|
+
`;}g<a&&m.trim()&&(o+=`
|
|
30
|
+
`);}return o.trim()}generateImageOnlyRefs(t,e){return t.map((r,a)=>this.formatImageReference(r,e,a+1)).join(`
|
|
29
31
|
`)}formatImageReference(t,e,r){let a={id:t.id,name:t.name||t.id,page:t.page,index:r,path:t.filePath||t.id};return this.replacePlaceholders(e,a)}replacePlaceholders(t,e){return t.replace(/\{id\}/g,e.id).replace(/\{name\}/g,e.name||e.id).replace(/\{page\}/g,e.page.toString()).replace(/\{index\}/g,e.index.toString()).replace(/\{path\}/g,e.path||e.id)}extractPlaceholders(t){let e=/\{([^}]+)\}/g,r=[],a=null;for(a=e.exec(t);a!==null;)a[1]&&r.push(a[1]),a=e.exec(t);return [...new Set(r)]}isValidFormat(t){let e=["id","name","page","index","path"];return this.extractPlaceholders(t).every(a=>e.includes(a))}getDefaultFormat(t=false){return t?"[IMAGE:{path}]":"[IMAGE:{id}]"}cleanTextFromImageRefs(t,e){let r=e.replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/\\?\{id\\?\}/g,"[^\\s\\]]+").replace(/\\?\{name\\?\}/g,"[^\\s\\]]+").replace(/\\?\{page\\?\}/g,"\\d+").replace(/\\?\{index\\?\}/g,"\\d+").replace(/\\?\{path\\?\}/g,"[^\\s\\]]+"),a=new RegExp(r,"g");return t.replace(a,"").replace(/\n\s*\n/g,`
|
|
30
|
-
`).trim()}countImageReferences(t,e){let r=e.replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/\\?\{id\\?\}/g,"[^\\s\\]]+").replace(/\\?\{name\\?\}/g,"[^\\s\\]]+").replace(/\\?\{page\\?\}/g,"\\d+").replace(/\\?\{index\\?\}/g,"\\d+").replace(/\\?\{path\\?\}/g,"[^\\s\\]]+"),a=new RegExp(r,"g"),s=t.match(a);return s?s.length:0}generateSummary(t,e,r,a,s){let
|
|
32
|
+
`).trim()}countImageReferences(t,e){let r=e.replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/\\?\{id\\?\}/g,"[^\\s\\]]+").replace(/\\?\{name\\?\}/g,"[^\\s\\]]+").replace(/\\?\{page\\?\}/g,"\\d+").replace(/\\?\{index\\?\}/g,"\\d+").replace(/\\?\{path\\?\}/g,"[^\\s\\]]+"),a=new RegExp(r,"g"),s=t.match(a);return s?s.length:0}generateSummary(t,e,r,a,s){let n=(r/t).toFixed(2),o=["\u{1F4C4} Document Summary",` Pages: ${t}`,` Text items: ${e}`,` Images: ${r} (avg ${n} per page)`,` Text length: ${a.toLocaleString()} characters`];return s&&o.push(` Processing time: ${s}ms`),o.join(`
|
|
31
33
|
`)}formatFileSize(t){let e=["B","KB","MB","GB"],r=t,a=0;for(;r>=1024&&a<e.length-1;)r/=1024,a++;return `${r.toFixed(1)} ${e[a]}`}formatDuration(t){if(t<1e3)return `${t}ms`;let e=Math.floor(t/1e3);if(e<60)return `${e}s`;let r=Math.floor(e/60),a=e%60;return `${r}m ${a}s`}};var U=class{extractRawText(t){let e=t;return e=e.replace(/--- PAGE \d+ ---\s*/g,""),e=e.replace(/🎨 ART BASEL PAGE \d+ 🎨\s*/g,""),e=e.replace(/PAGE \d+\s*/g,""),e=e.replace(/\[IMG:\w+\]\s*\w*\s*/g,""),e=e.replace(/\[IMG-\w+\]\s*[^[\n]*\s*/g,""),e=e.replace(/📷\s*[^-\n]*-\s*Page\s*\d+\s*-\s*Image\s*#\d+\s*/g,""),e=e.replace(/🎨\s*Art\s*Basel\s*Image\s*\d+\s*\(Page\s*\d+\)\s*/g,""),e=e.replace(/\n\s*\n\s*\n/g,`
|
|
32
34
|
|
|
33
|
-
`),e=e.replace(/^\s+|\s+$/g,""),e=e.replace(/[ \t]+/g," "),e}generateStructuredData(t,e,r,a,s){let
|
|
34
|
-
`),a=Math.ceil(r.length/e),s=[];for(let
|
|
35
|
-
`);s.push(i);}return s}createPageDataArray(t,e,r){let a=[];for(let s=0;s<r;s++){let
|
|
36
|
-
`),
|
|
35
|
+
`),e=e.replace(/^\s+|\s+$/g,""),e=e.replace(/[ \t]+/g," "),e}generateStructuredData(t,e,r,a,s){let n=this.splitTextIntoPages(e,a),o=this.createPageDataArray(n,r,a);return {metadata:{filename:t,extractedAt:new Date().toISOString(),totalPages:a,totalTextLength:e.length,totalImages:r.length,extractionOptions:s},pages:o}}splitTextIntoPages(t,e){if(e<=1)return [t];let r=/(?:--- PAGE \d+ ---|🎨 ART BASEL PAGE \d+ 🎨|PAGE \d+)/g,a=t.match(r);return a&&a.length>0?this.splitByPageMarkers(t,r):this.splitByEstimatedLength(t,e)}splitByPageMarkers(t,e){let r=t.split(e),a=[];for(let s=1;s<r.length;s++){let n=r[s];n&&a.push(n.trim());}return a.length===0&&a.push(t),a}splitByEstimatedLength(t,e){let r=t.split(`
|
|
36
|
+
`),a=Math.ceil(r.length/e),s=[];for(let n=0;n<e;n++){let o=n*a,g=Math.min((n+1)*a,r.length),i=r.slice(o,g).join(`
|
|
37
|
+
`);s.push(i);}return s}createPageDataArray(t,e,r){let a=[];for(let s=0;s<r;s++){let n=s+1,o=t[s]||"",g=this.getImagesForPage(e,n),i=this.extractRawText(o);a.push({pageNumber:n,text:{content:o,rawText:i,wordCount:this.countWords(i),characterCount:i.length},images:g,imageCount:g.length});}return a}getImagesForPage(t,e){return t.filter(r=>r.page===e).map(r=>{let a={id:r.id,name:r.name||`image_${r.id}`,position:r.position,format:r.format||"unknown"};if("filename"in r){let s=r.filename;s!==void 0&&(a.filename=s);}if("path"in r){let s=r.path;s!==void 0&&(a.path=s);}if("size"in r){let s=r.size;s!==void 0&&(a.size=s);}return a})}countWords(t){return t.trim()?t.trim().split(/\s+/).length:0}generateJSONString(t,e=2){return JSON.stringify(t,null,e)}generateSummary(t){let e=t.pages.reduce((n,o)=>n+o.text.wordCount,0),r=t.pages.reduce((n,o)=>n+o.text.characterCount,0),a=t.pages.filter(n=>n.text.content.trim().length>0).length,s=t.pages.filter(n=>n.imageCount>0).length;return {totalWords:e,totalCharacters:r,averageWordsPerPage:Math.round(e/t.pages.length),averageImagesPerPage:Math.round(t.metadata.totalImages/t.pages.length*10)/10,pagesWithText:a,pagesWithImages:s}}};var W=class{cacheDir;constructor(t="./tmp/pdf-cache"){this.cacheDir=t,this.ensureCacheDir();}generateCacheKey(t){let e=D__default.default.resolve(t),r=P__namespace.default.statSync(e),a=`${e}:${r.mtime.getTime()}:${r.size}`;return ue__default.default.createHash("md5").update(a).digest("hex")}getCacheDir(t){let e=this.generateCacheKey(t);return D__default.default.join(this.cacheDir,e)}ensureCacheDir(){P__namespace.default.existsSync(this.cacheDir)||P__namespace.default.mkdirSync(this.cacheDir,{recursive:true});}isCached(t){try{let e=this.getCacheDir(t),r=D__default.default.join(e,"cache-info.json");return P__namespace.default.existsSync(r)}catch{return false}}getCacheInfo(t){try{let e=this.getCacheDir(t),r=D__default.default.join(e,"cache-info.json");return P__namespace.default.existsSync(r)?JSON.parse(P__namespace.default.readFileSync(r,"utf-8")):null}catch{return null}}createCache(t,e){let r=this.getCacheDir(t);P__namespace.default.existsSync(r)||P__namespace.default.mkdirSync(r,{recursive:true});let a=P__namespace.default.statSync(t),s={pdfPath:D__default.default.resolve(t),lastModified:a.mtime.getTime(),totalPages:e,cacheDir:r,created:new Date().toISOString()},n=D__default.default.join(r,"cache-info.json");return P__namespace.default.writeFileSync(n,JSON.stringify(s,null,2)),r}cachePageResult(t,e,r){try{let a=this.getCacheDir(t),s=D__default.default.join(a,`page-${e}.json`);P__namespace.default.writeFileSync(s,JSON.stringify(r,null,2));}catch{}}getCachedPageResult(t,e){try{let r=this.getCacheDir(t),a=D__default.default.join(r,`page-${e}.json`);return P__namespace.default.existsSync(a)?JSON.parse(P__namespace.default.readFileSync(a,"utf-8")):null}catch{return null}}getAllCachedPages(t){try{let e=this.getCacheDir(t),r=[];if(!P__namespace.default.existsSync(e))return r;let s=P__namespace.default.readdirSync(e).filter(n=>n.startsWith("page-")&&n.endsWith(".json"));for(let n of s)try{let o=D__default.default.join(e,n),g=JSON.parse(P__namespace.default.readFileSync(o,"utf-8"));r.push(g);}catch{}return r.sort((n,o)=>n.pageNumber-o.pageNumber),r}catch{return []}}clearCache(t){try{let e=this.getCacheDir(t);P__namespace.default.existsSync(e)&&P__namespace.default.rmSync(e,{recursive:!0,force:!0});}catch{}}clearAllCache(){try{P__namespace.default.existsSync(this.cacheDir)&&P__namespace.default.rmSync(this.cacheDir,{recursive:!0,force:!0}),this.ensureCacheDir();}catch{}}getCacheStats(){try{let t=0,e=0,r=0;if(P__namespace.default.existsSync(this.cacheDir)){let a=P__namespace.default.readdirSync(this.cacheDir);t=a.length;for(let s of a){let n=D__default.default.join(this.cacheDir,s);if(P__namespace.default.statSync(n).isDirectory()){let o=P__namespace.default.readdirSync(n),g=o.filter(i=>i.startsWith("page-")&&i.endsWith(".json"));e+=g.length;for(let i of o){let l=D__default.default.join(n,i);r+=P__namespace.default.statSync(l).size;}}}}return {totalCachedPdfs:t,totalCachedPages:e,totalCacheSize:r,cacheDir:this.cacheDir}}catch{return {totalCachedPdfs:0,totalCachedPages:0,totalCacheSize:0,cacheDir:this.cacheDir}}}};var j=class{textExtractor;imageExtractor;formatProcessor;structuredDataGenerator;cacheManager;constructor(t){this.textExtractor=new $,this.imageExtractor=new exports.ImageExtractor,this.formatProcessor=new S,this.structuredDataGenerator=new U,this.cacheManager=new W(t);}async extract(t,e={}){let r={pdfPath:t,outputDir:e.imageOutputDir||"./extracted-images",options:{extractText:true,extractImages:true,extractImageFiles:false,useImagePaths:false,imageRefFormat:"[IMAGE:{id}]",verbose:false,...e}},a=this.validateConfiguration(r);if(a.length>0)throw this.createValidationError("Invalid configuration",a);try{if(!P__namespace.default.existsSync(t))throw new Error(`PDF file not found: ${t}`);let s=Date.now();this.reportProgress(r.options,{currentPage:0,totalPages:0,phase:"processing"});let n=null,o=null;if(r.options.extractText&&(r.options.verbose,n=await this.textExtractor.extract(t),r.options.includePageMarkers||r.options.includeImageRefs)){let m=r.options.pageMarkerFormat||"--- PAGE {page} ---",c={pageOffset:r.options.pageOffset||0,includeImageRefs:r.options.includeImageRefs??!1,imageRefFormat:r.options.imageRefFormat??"[IMG:{id}] {name}"};r.options.imageEngine&&(c.imageEngine=r.options.imageEngine),o=await this.textExtractor.extractWithPageMarkers(t,m,c);}let g=[];r.options.extractTextItems&&r.options.extractText&&(r.options.verbose,g=await this.textExtractor.extractTextItems(t,r.options));let i=null;r.options.extractImages&&(r.options.verbose,i=await this.imageExtractor.extract(t,r.options));let l=await this.processResults(t,n,o,i,g,r.options,s);return this.reportProgress(r.options,{currentPage:l.document.pages,totalPages:l.document.pages,phase:"complete"}),l}catch(s){throw r.options.verbose,this.createExtractionError("PDF content extraction failed",s)}}async extractText(t,e={}){return (await this.extract(t,{...e,extractText:true,extractImages:false})).cleanText}async extractImages(t,e={}){return (await this.extract(t,{...e,extractText:false,extractImages:true})).images}async extractImageFiles(t,e="./extracted-images",r={}){return (await this.extract(t,{...r,extractImageFiles:true,imageOutputDir:e,useImagePaths:true})).images.filter(s=>s.filePath).map(s=>s.filePath)}validateConfiguration(t){return k(t)}async processResults(t,e,r,a,s,n,o){let g=D__default.default.basename(t),l=this.extractRawText(e?.text||""),m={document:{filename:g,pages:a?.totalPages||e?.numPages||0,textLength:e?.text?.length||0,extractedAt:new Date().toISOString(),metadata:e?.info||{},options:n},pages:[],images:a?.images||[],textItems:s,text:l,textWithRefs:"",cleanText:l};if(n.extractText&&n.extractImages&&e&&a)if(r?.text&&n.includeImageRefs)m.textWithRefs=r.text;else if(n.includeImageRefs){let u=r?.text||e.text;m.textWithRefs=this.formatProcessor.generateTextWithImageRefs(u,a.images,n.imageRefFormat||"[IMAGE:{id}]",m.document.pages);}else m.textWithRefs=r?.text||e.text;else n.extractText&&e?m.textWithRefs=r?.text||e.text:n.extractImages&&a&&(m.textWithRefs=this.formatProcessor.generateImageOnlyRefs(a.images,n.imageRefFormat||"[IMAGE:{id}]"));if(m.summary={totalPages:m.document.pages,totalTextItems:0,totalImages:m.images.length,totalTextLength:m.document.textLength,averageImagesPerPage:(m.images.length/m.document.pages).toFixed(2),pagesWithImages:new Set(m.images.map(u=>u.page)).size},n.generateStructuredData){let u=m.textWithRefs||m.cleanText;m.structuredData=this.structuredDataGenerator.generateStructuredData(g,u,m.images,m.document.pages,n),n.verbose;}return n.verbose,m}async getText(t,e,r={}){return (await this.getPage(t,e,{...r,extractText:true,extractImages:false})).text}async getImages(t,e,r={}){return (await this.getPage(t,e,{...r,extractText:false,extractImages:true})).images}async getTextItems(t,e,r={}){return (await this.getPage(t,e,{...r,extractText:true,extractTextItems:true})).textItems}async getRawText(t,e,r={}){return (await this.getPage(t,e,{...r,extractText:true,extractImages:false})).rawText}async getPage(t,e,r={}){if(r.useCache!==false){let m=this.cacheManager.getCachedPageResult(t,e);if(m)return r.verbose,m}let a={...r,specificPages:[e]},s=await this.extract(t,a),n=this.extractPageText(s.textWithRefs||s.cleanText,e),o=s.images.filter(m=>m.page===e),g=s.textItems?.filter(m=>m.page===e)||[],i=this.extractRawText(n),l={pageNumber:e,text:n,rawText:i,textItems:g,images:o,metadata:{wordCount:this.countWords(i),characterCount:i.length,imageCount:o.length}};return r.useCache!==false&&this.cacheManager.cachePageResult(t,e,l),l}extractPageText(t,e){let r=/(?:--- PAGE (\d+) ---|🎨 ART BASEL PAGE (\d+) 🎨|PAGE (\d+))/g,a=t.split(r);if(a.length>1){for(let i=1;i<a.length;i+=4)if(parseInt(a[i]||a[i+1]||a[i+2]||"0",10)===e)return a[i+3]||""}let s=t.split(`
|
|
38
|
+
`),n=Math.ceil(s.length/e),o=(e-1)*n,g=Math.min(e*n,s.length);return s.slice(o,g).join(`
|
|
37
39
|
`)}countWords(t){return t.trim()?t.trim().split(/\s+/).length:0}extractRawText(t){let e=t;return e=e.replace(/--- PAGE \d+ ---\s*/g,""),e=e.replace(/🎨 ART BASEL PAGE \d+ 🎨\s*/g,""),e=e.replace(/PAGE \d+\s*/g,""),e=e.replace(/\[IMG:\w+\]\s*\w*\s*/g,""),e=e.replace(/\[IMG-\w+\]\s*[^[\n]*\s*/g,""),e=e.replace(/📷\s*[^-\n]*-\s*Page\s*\d+\s*-\s*Image\s*#\d+\s*/g,""),e=e.replace(/🎨\s*Art\s*Basel\s*Image\s*\d+\s*\(Page\s*\d+\)\s*/g,""),e=e.replace(/\n\s*\n\s*\n/g,`
|
|
38
40
|
|
|
39
|
-
`),e=e.replace(/^\s+|\s+$/g,""),e=e.replace(/[ \t]+/g," "),e}clearCache(t){this.cacheManager.clearCache(t);}getCacheStats(){return this.cacheManager.getCacheStats()}reportProgress(t,e){t.progressCallback&&t.progressCallback(e);}createValidationError(t,e){let r=new Error(t);return r.code="VALIDATION_ERROR",r.validationErrors=e,r}createExtractionError(t,e){let r=new Error(t);return r.code="EXTRACTION_ERROR",r.originalError=e,r}},C=new j;N();N();async function
|
|
41
|
+
`),e=e.replace(/^\s+|\s+$/g,""),e=e.replace(/[ \t]+/g," "),e}clearCache(t){this.cacheManager.clearCache(t);}getCacheStats(){return this.cacheManager.getCacheStats()}reportProgress(t,e){t.progressCallback&&t.progressCallback(e);}createValidationError(t,e){let r=new Error(t);return r.code="VALIDATION_ERROR",r.validationErrors=e,r}createExtractionError(t,e){let r=new Error(t);return r.code="EXTRACTION_ERROR",r.originalError=e,r}},C=new j;N();N();async function de(p,t={}){return C.extract(p,t)}async function xe(p,t={}){return C.extractText(p,t)}async function he(p,t={}){return C.extractImages(p,t)}async function be(p,t="./extracted-images",e={}){return C.extractImageFiles(p,t,e)}var ye="1.0.0",mt={PDFExtractor:j,pdfExtractor:C,TextExtractor:$,ImageExtractor:exports.ImageExtractor,FormatProcessor:S,extractPdfContent:de,extractText:xe,extractImages:he,extractImageFiles:be,validateConfig:k,validateImageRefFormat:V,validateFilePath:J,version:ye};exports.FormatProcessor=S;exports.PDFExtractor=j;exports.TextExtractor=$;exports.default=mt;exports.extractImageFiles=be;exports.extractImages=he;exports.extractPdfContent=de;exports.extractText=xe;exports.pdfExtractor=C;exports.validateConfig=k;exports.validateFilePath=J;exports.validateImageRefFormat=V;exports.version=ye;//# sourceMappingURL=index.js.map
|
|
40
42
|
//# sourceMappingURL=index.js.map
|
package/dist/index.mjs
CHANGED
|
@@ -1,40 +1,42 @@
|
|
|
1
|
-
import*as
|
|
2
|
-
`);for(let a of r){let s=a.match(/^\s*(\d+)\s+(\d+)\s+\w+\s+(\d+)\s+(\d+)\s+\w+\s+\d+\s+\d+\s+(\w+)/);if(s){let o=parseInt(s[1],10),n=parseInt(s[2],10),l=parseInt(s[3],10),i=parseInt(s[4],10),c=s[5]?.toUpperCase()||"PNG";e.push({page:o,index:n,width:l,height:i,format:c});}}return e}};});var te={};Y(te,{ImageEngineFactory:()=>X});var X,re=O(()=>{Q();ee();X=class p{static engines=new Map;static async getEngine(t){if(t==="auto"&&(t=await p.selectBestEngine()),p.engines.has(t))return p.engines.get(t);let e;switch(t){case "pdf-lib":e=new B;break;case "poppler":e=new A;break;default:throw new Error(`Unknown image extraction engine: ${t}`)}if(!await e.isAvailable())throw new Error(`Image extraction engine '${t}' is not available on this system`);return p.engines.set(t,e),e}static async getAvailableEngines(){let t=[B,A],e=[];for(let r of t){let a=new r,s=await a.isAvailable();e.push({name:a.name,description:a.description,available:s,capabilities:a.getCapabilities()});}return e}static async selectBestEngine(){let t=await p.getAvailableEngines(),e=["pdf-lib","poppler"];for(let r of e)if(t.find(s=>s.name===r)?.available)return r;throw new Error("No image extraction engines are available on this system")}static clearCache(){p.engines.clear();}static getRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"},{useCase:"Vector image extraction",engine:"poppler",reason:"Poppler can extract vector graphics as raster images"}]}};});var ae={};Y(ae,{ImageExtractor:()=>F});var F,N=O(()=>{F=class{async extract(t,e={}){let r={verbose:false,extractImageFiles:false,imageEngine:"auto",...e};r.verbose,r.extractImageFiles&&r.imageOutputDir&&(y__default.existsSync(r.imageOutputDir)||y__default.mkdirSync(r.imageOutputDir,{recursive:true}));try{let{ImageEngineFactory:a}=await Promise.resolve().then(()=>(re(),te)),s=await a.getEngine(r.imageEngine);r.verbose;let o=await s.extractImages(t,r);if(!o.success)throw new Error(o.error||"Engine extraction failed");return {success:!0,images:o.images||[],metadata:{totalImages:o.images?.length||0,engine:s.name}}}catch{r.verbose;try{return await this.extractWithPdfLib(t,r)}catch(s){return r.verbose,{success:false,images:[],error:s instanceof Error?s.message:String(s)}}}}static async getAvailableEngines(){return [{name:"pdf-lib",description:"PDF-lib based extraction with full format support",available:true,capabilities:{formats:["jpg","jpeg","png","jp2","tiff"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:false}},{name:"poppler",description:"Poppler-based extraction using pdfimages command",available:false,capabilities:{formats:["jpg","jpeg","png","tiff","ppm","pbm"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:true}}]}static getEngineRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing (coming soon)"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"}]}async extractWithPdfLib(t,e={}){try{let{PDFDocument:r,PDFName:a}=await import('pdf-lib'),s=y__default.readFileSync(t),o=await r.load(s,{ignoreEncryption:!0}),n=o.getPageCount(),l=[],i=1;e.verbose,e.extractImageFiles&&e.imageOutputDir&&(y__default.existsSync(e.imageOutputDir)||y__default.mkdirSync(e.imageOutputDir,{recursive:!0}));for(let c=0;c<n;c++){let u=c+1;try{let g=o.getPage(c).node.Resources();if(!g){e.verbose;continue}let m=g.get(a.of("XObject"));if(!m){e.verbose;continue}let b=m.dict;e.verbose;for(let[d,x]of b)try{let h=o.context.lookup(x),P=h.dict.get(a.of("Subtype"));if(!P||P.toString()!=="/Image")continue;let I=await this.extractImageFromPdfObject(h,u,i,e);I&&(l.push(I),i++);}catch{e.verbose;}}catch{e.verbose;}}return e.verbose,{images:l,totalPages:n,totalImages:l.length}}catch(r){throw e.verbose,r}}async extractImageFromPdfObject(t,e,r,a){try{let{PDFName:s}=await import('pdf-lib'),o=t.dict.get(s.of("Width")),n=t.dict.get(s.of("Height")),l=t.dict.get(s.of("Filter")),i=t.dict.get(s.of("ColorSpace")),c=t.dict.get(s.of("BitsPerComponent")),u=o&&typeof o.value=="number"?o.value:100,f=n&&typeof n.value=="number"?n.value:100,g=c&&typeof c.value=="number"?c.value:8;a.verbose;let m=await this.extractImageData(t,l,u,f,i,g,a);if(!m.success||!m.imageData)return a.verbose,null;let b=m.imageData,d=m.mimeType||"image/jpeg",x=m.extension||"jpg",h=`img_p${e}_${r}.${x}`,P="",I=b.length;return a.extractImageFiles&&a.imageOutputDir&&(P=D.join(a.imageOutputDir,h),y__default.writeFileSync(P,b),a.verbose),{id:`img_${r}`,name:h,page:e,position:{x:0,y:0,width:u,height:f},width:u,height:f,format:d==="image/jpeg"?"JPEG":d==="image/png"?"PNG":"unknown",filePath:P}}catch{return a.verbose,null}}async extractImageData(t,e,r,a,s,o,n){try{let l=await import('zlib'),i,c="image/jpeg",u="jpg";if(e){let f=e.toString();if(n.verbose,f.includes("DCTDecode")&&f.includes("FlateDecode")){n.verbose;try{let g=t.contents;i=l.inflateSync(Buffer.from(g)),c="image/jpeg",u="jpg",n.verbose;}catch(g){return n.verbose,{success:!1,error:`Zlib decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else if(f.includes("DCTDecode"))n.verbose,i=Buffer.from(t.contents),c="image/jpeg",u="jpg";else if(f.includes("FlateDecode")){n.verbose;try{let g=t.contents,m=l.inflateSync(Buffer.from(g));n.verbose;let b=this.detectImageFormat(m);if(b.valid)i=m,c=b.mimeType,u=b.extension,n.verbose;else {let d=await this.createPngFromPdfMetadata(m,r,a,s,o,n);if(d.success&&d.pngData)i=d.pngData,c="image/png",u="png",n.verbose;else return n.verbose,{success:!1,error:`PNG creation failed: ${d.error}`}}}catch(g){return n.verbose,{success:!1,error:`FlateDecode decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else if(f.includes("JPXDecode")){n.verbose;try{i=Buffer.from(t.contents),c="image/jp2",u="jp2",n.verbose;}catch(g){return n.verbose,{success:!1,error:`JPXDecode extraction failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else {n.verbose;try{let g=await t.asUint8Array();i=Buffer.from(g);let m=this.detectImageFormat(i);m.valid&&(c=m.mimeType,u=m.extension);}catch(g){return n.verbose,{success:!1,error:`Generic decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}}else {n.verbose;try{let f=await t.asUint8Array();i=Buffer.from(f);let g=this.detectImageFormat(i);g.valid&&(c=g.mimeType,u=g.extension);}catch(f){return n.verbose,{success:!1,error:`Raw data extraction failed: ${f instanceof Error?f.message:"Unknown error"}`}}}return !i||i.length<100?{success:!1,error:`Image data too small: ${i?.length||0} bytes`}:{success:!0,imageData:i,mimeType:c,extension:u}}catch(l){return n.verbose,{success:false,error:l instanceof Error?l.message:"Unknown error"}}}detectImageFormat(t){return !t||t.length<10?{valid:false}:t[0]===255&&t[1]===216?{valid:true,mimeType:"image/jpeg",extension:"jpg"}:t[0]===137&&t[1]===80&&t[2]===78&&t[3]===71?{valid:true,mimeType:"image/png",extension:"png"}:t[0]===71&&t[1]===73&&t[2]===70?{valid:true,mimeType:"image/gif",extension:"gif"}:t[0]===73&&t[1]===73||t[0]===77&&t[1]===77?{valid:true,mimeType:"image/tiff",extension:"tiff"}:t.length>=12&&t[0]===0&&t[1]===0&&t[2]===0&&t[3]===12&&t[4]===106&&t[5]===80&&t[6]===32&&t[7]===32?{valid:true,mimeType:"image/jp2",extension:"jp2"}:{valid:false}}async createPngFromPdfMetadata(t,e,r,a,s,o){try{let{PNG:n}=await import('pngjs'),l=a?.toString()||"",i=3,c=2;l.includes("DeviceGray")||l.includes("Gray")?(i=1,c=0):l.includes("DeviceRGB")||l.includes("RGB")?(i=3,c=2):(l.includes("DeviceCMYK")||l.includes("CMYK"))&&(i=4,c=2);let u=e*r*i*(s/8),f=t.length;if(o.verbose,Math.abs(f-u)>f*.1)return {success:!1,error:`Data size mismatch: expected ${u}, got ${f} bytes`};let g=new n({width:e,height:r,colorType:c===0?0:6,bitDepth:8}),m;if(i===1){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=t[d]||0,h=d*4;m[h]=x,m[h+1]=x,m[h+2]=x,m[h+3]=255;}}else if(i===3){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=d*3,h=d*4;m[h]=t[x]||0,m[h+1]=t[x+1]||0,m[h+2]=t[x+2]||0,m[h+3]=255;}}else if(i===4){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=d*4,h=(t[x]||0)/255,P=(t[x+1]||0)/255,I=(t[x+2]||0)/255,E=(t[x+3]||0)/255,v=d*4;m[v]=Math.round(255*(1-h)*(1-E)),m[v+1]=Math.round(255*(1-P)*(1-E)),m[v+2]=Math.round(255*(1-I)*(1-E)),m[v+3]=255;}}else return {success:!1,error:`Unsupported color space with ${i} components`};g.data=m;let b=n.sync.write(g);return o.verbose,{success:!0,pngData:b}}catch(n){return {success:false,error:`PNG creation error: ${n instanceof Error?n.message:"Unknown error"}`}}}};});function k(p){let t=[];if(p.pdfPath?typeof p.pdfPath!="string"?t.push({field:"pdfPath",message:"PDF path must be a string",value:p.pdfPath}):y__default.existsSync(p.pdfPath)?p.pdfPath.toLowerCase().endsWith(".pdf")||t.push({field:"pdfPath",message:"File must have .pdf extension",value:p.pdfPath}):t.push({field:"pdfPath",message:"PDF file does not exist",value:p.pdfPath}):t.push({field:"pdfPath",message:"PDF path is required",value:p.pdfPath}),p.outputDir&&typeof p.outputDir!="string"&&t.push({field:"outputDir",message:"Output directory must be a string",value:p.outputDir}),p.options){let{options:e}=p;e.extractText!==void 0&&typeof e.extractText!="boolean"&&t.push({field:"options.extractText",message:"extractText must be a boolean",value:e.extractText}),e.extractImages!==void 0&&typeof e.extractImages!="boolean"&&t.push({field:"options.extractImages",message:"extractImages must be a boolean",value:e.extractImages}),e.extractImageFiles!==void 0&&typeof e.extractImageFiles!="boolean"&&t.push({field:"options.extractImageFiles",message:"extractImageFiles must be a boolean",value:e.extractImageFiles}),e.useImagePaths!==void 0&&typeof e.useImagePaths!="boolean"&&t.push({field:"options.useImagePaths",message:"useImagePaths must be a boolean",value:e.useImagePaths}),e.imageOutputDir&&typeof e.imageOutputDir!="string"&&t.push({field:"options.imageOutputDir",message:"imageOutputDir must be a string",value:e.imageOutputDir}),e.imageRefFormat&&typeof e.imageRefFormat!="string"&&t.push({field:"options.imageRefFormat",message:"imageRefFormat must be a string",value:e.imageRefFormat}),e.baseName&&typeof e.baseName!="string"&&t.push({field:"options.baseName",message:"baseName must be a string",value:e.baseName}),e.verbose!==void 0&&typeof e.verbose!="boolean"&&t.push({field:"options.verbose",message:"verbose must be a boolean",value:e.verbose}),e.memoryLimit&&typeof e.memoryLimit!="string"?t.push({field:"options.memoryLimit",message:"memoryLimit must be a string",value:e.memoryLimit}):e.memoryLimit&&!ce(e.memoryLimit)&&t.push({field:"options.memoryLimit",message:'memoryLimit must be in format like "512MB", "1GB", etc.',value:e.memoryLimit}),e.batchSize!==void 0&&(typeof e.batchSize!="number"?t.push({field:"options.batchSize",message:"batchSize must be a number",value:e.batchSize}):(e.batchSize<1||e.batchSize>100)&&t.push({field:"options.batchSize",message:"batchSize must be between 1 and 100",value:e.batchSize})),e.progressCallback&&typeof e.progressCallback!="function"&&t.push({field:"options.progressCallback",message:"progressCallback must be a function",value:typeof e.progressCallback}),e.extractText===false&&e.extractImages===false&&t.push({field:"options",message:"At least one of extractText or extractImages must be true",value:{extractText:e.extractText,extractImages:e.extractImages}}),e.useImagePaths===true&&e.extractImageFiles!==true&&t.push({field:"options",message:"useImagePaths requires extractImageFiles to be true",value:{useImagePaths:e.useImagePaths,extractImageFiles:e.extractImageFiles}});}return t}function ce(p){return /^\d+(\.\d+)?(MB|GB|KB)$/i.test(p)}function V(p){let t=[],e=["{id}","{name}","{page}","{index}","{path}"];e.some(o=>p.includes(o))||t.push({field:"imageRefFormat",message:`Format must contain at least one valid placeholder: ${e.join(", ")}`,value:p});let a=/\{([^}]+)\}/g,s=p.match(a);if(s)for(let o of s)e.includes(o)||t.push({field:"imageRefFormat",message:`Invalid placeholder: ${o}. Valid placeholders are: ${e.join(", ")}`,value:p});return t}function J(p,t=[".pdf"]){let e=[];if(!p)return e.push({field:"filePath",message:"File path is required",value:p}),e;if(typeof p!="string")return e.push({field:"filePath",message:"File path must be a string",value:p}),e;if(!y__default.existsSync(p))return e.push({field:"filePath",message:"File does not exist",value:p}),e;let r=D.extname(p).toLowerCase();return t.length>0&&!t.includes(r)&&e.push({field:"filePath",message:`File must have one of these extensions: ${t.join(", ")}`,value:p}),e}var z=class{pdfLibDoc=null;pdfLibPages=[];textData=[];async processPDF(t){let e=y.readFileSync(t),[r,a]=await Promise.all([this.processPDFLib(e),this.processPDFParse(e)]);this.textData=this.combineResults(r,a);let s=this.textData.map(o=>o.text).join(`
|
|
3
|
-
`).trim();return {totalPages:this.textData.length,pages:this.textData,fullText:s}}async processPDFLib(t){return this.pdfLibDoc=await PDFDocument.load(t,{ignoreEncryption:true}),this.pdfLibPages=this.pdfLibDoc.getPages(),this.pdfLibPages.map((e,r)=>{let{width:a,height:s}=e.getSize();return {pageNumber:r+1,width:a,height:s,rotation:e.getRotation(),mediaBox:e.getMediaBox()}})}async processPDFParse(t){let e=
|
|
4
|
-
`,c=
|
|
5
|
-
`);if(i.trim()){let
|
|
6
|
-
`);
|
|
1
|
+
import*as P from'fs';import P__default from'fs';import D from'path';import se from'pdf-parse';import {PDFDocument}from'pdf-lib';import ue from'crypto';var ce=Object.defineProperty;var O=(p,t)=>()=>(p&&(t=p(p=0)),t);var Y=(p,t)=>{for(var e in t)ce(p,e,{get:t[e],enumerable:true});};var T,H=O(()=>{T=class{};});var B,Q=O(()=>{H();B=class extends T{name="pdf-lib";description="PDF-lib based extraction with full format support";async isAvailable(){try{return await import('pdf-lib'),!0}catch{return false}}getCapabilities(){return {formats:["jpg","jpeg","png","jp2","tiff"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:false}}async extractImages(t,e){try{let{PDFDocument:r,PDFName:a}=await import('pdf-lib');if(!P__default.existsSync(t))return {success:!1,error:`PDF file not found: ${t}`};let s=P__default.readFileSync(t),n=await r.load(s),o=n.getPages(),g=[],i=1;e.verbose;for(let l=0;l<o.length;l++){let m=o[l],u=l+1,c=m?.node.Resources;if(!c)continue;let b=(typeof c=="function"?c():c)?.get?.(a.of("XObject"));if(!b)continue;let d=b.entries?.()||[],h=0;e.verbose;for(let[,x]of d){let y=n.context.lookup(x);if(!y||y.dict?.get?.(a.of("Subtype"))?.toString()!=="/Image")continue;h++;let E=await this.extractImageFromPdfObject(y,u,i,e);E&&g.push(E),i++;}}return e.verbose,{success:!0,images:g}}catch(r){return {success:false,error:`PDF-lib extraction failed: ${r instanceof Error?r.message:"Unknown error"}`}}}async extractImageFromPdfObject(t,e,r,a){try{let{PDFName:s}=await import('pdf-lib'),n=t.dict.get(s.of("Width")),o=t.dict.get(s.of("Height")),g=t.dict.get(s.of("Filter")),i=t.dict.get(s.of("ColorSpace")),l=t.dict.get(s.of("BitsPerComponent")),m=n&&typeof n.value=="number"?n.value:100,u=o&&typeof o.value=="number"?o.value:100,c=l&&typeof l.value=="number"?l.value:8;a.verbose;let f=await this.extractImageData(t,g,m,u,i,c,a);if(!f.success||!f.imageData)return a.verbose,null;let b=f.extension||"bin",d=`img_p${e}_${r}.${b}`,h,x=f.imageData.length;if(a.extractImageFiles&&a.imageOutputDir){let y=D.join(a.imageOutputDir,"images");P__default.existsSync(y)||P__default.mkdirSync(y,{recursive:!0}),h=D.join(y,d),P__default.writeFileSync(h,f.imageData),a.verbose;}return {id:`img_${r}`,filename:`images/${d}`,filepath:h||"",page:e,width:m,height:u,format:this.getFormatFromMimeType(f.mimeType||""),mimeType:f.mimeType||"",size:x,position:{x:0,y:0,width:m,height:u}}}catch{return a.verbose,null}}async extractImageData(t,e,r,a,s,n,o){try{let g=await import('zlib'),i,l="image/jpeg",m="jpg";if(e){let u=e.toString();if(o.verbose,u.includes("DCTDecode")&&u.includes("FlateDecode")){o.verbose;try{let c=t.contents;i=g.inflateSync(Buffer.from(c)),l="image/jpeg",m="jpg",o.verbose;}catch(c){return o.verbose,{success:!1,error:`Zlib decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else if(u.includes("DCTDecode"))o.verbose,i=Buffer.from(t.contents),l="image/jpeg",m="jpg";else if(u.includes("FlateDecode")){o.verbose;try{let c=t.contents,f=g.inflateSync(Buffer.from(c));o.verbose;let b=this.detectImageFormat(f);if(b.valid)i=f,l=b.mimeType,m=b.extension,o.verbose;else {let d=await this.createPngFromPdfMetadata(f,r,a,s,n,o);if(d.success&&d.pngData)i=d.pngData,l="image/png",m="png",o.verbose;else return o.verbose,{success:!1,error:`PNG creation failed: ${d.error}`}}}catch(c){return o.verbose,{success:!1,error:`FlateDecode decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else if(u.includes("JPXDecode")){o.verbose;try{i=Buffer.from(t.contents),l="image/jp2",m="jp2",o.verbose;}catch(c){return o.verbose,{success:!1,error:`JPXDecode extraction failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else {o.verbose;try{let c=await t.asUint8Array();i=Buffer.from(c);let f=this.detectImageFormat(i);f.valid&&(l=f.mimeType,m=f.extension);}catch(c){return o.verbose,{success:!1,error:`Generic decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}}else {o.verbose;try{let u=await t.asUint8Array();i=Buffer.from(u);let c=this.detectImageFormat(i);c.valid&&(l=c.mimeType,m=c.extension);}catch(u){return o.verbose,{success:!1,error:`Raw data extraction failed: ${u instanceof Error?u.message:"Unknown error"}`}}}return {success:!0,imageData:i,mimeType:l,extension:m}}catch(g){return {success:false,error:`Image data extraction failed: ${g instanceof Error?g.message:"Unknown error"}`}}}detectImageFormat(t){return !t||t.length<10?{valid:false}:t[0]===255&&t[1]===216?{valid:true,mimeType:"image/jpeg",extension:"jpg"}:t[0]===137&&t[1]===80&&t[2]===78&&t[3]===71?{valid:true,mimeType:"image/png",extension:"png"}:t[0]===71&&t[1]===73&&t[2]===70?{valid:true,mimeType:"image/gif",extension:"gif"}:t[0]===73&&t[1]===73||t[0]===77&&t[1]===77?{valid:true,mimeType:"image/tiff",extension:"tiff"}:t.length>=12&&t[0]===0&&t[1]===0&&t[2]===0&&t[3]===12&&t[4]===106&&t[5]===80&&t[6]===32&&t[7]===32?{valid:true,mimeType:"image/jp2",extension:"jp2"}:{valid:false}}async createPngFromPdfMetadata(t,e,r,a,s,n){try{let{PNG:o}=await import('pngjs'),g=a?.toString()||"",i=3,l=2;g.includes("DeviceGray")||g.includes("Gray")?(i=1,l=0):g.includes("DeviceRGB")||g.includes("RGB")?(i=3,l=2):(g.includes("DeviceCMYK")||g.includes("CMYK"))&&(i=4,l=2);let m=e*r*i*(s/8),u=t.length;if(n.verbose,Math.abs(u-m)>u*.1)return {success:!1,error:`Data size mismatch: expected ${m}, got ${u} bytes`};let c=new o({width:e,height:r,colorType:l===0?0:6,bitDepth:8}),f;if(i===1){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=t[d]||0,x=d*4;f[x]=h,f[x+1]=h,f[x+2]=h,f[x+3]=255;}}else if(i===3){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=d*3,x=d*4;f[x]=t[h]||0,f[x+1]=t[h+1]||0,f[x+2]=t[h+2]||0,f[x+3]=255;}}else if(i===4){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=d*4,x=(t[h]||0)/255,y=(t[h+1]||0)/255,w=(t[h+2]||0)/255,E=(t[h+3]||0)/255,I=d*4;f[I]=Math.round(255*(1-x)*(1-E)),f[I+1]=Math.round(255*(1-y)*(1-E)),f[I+2]=Math.round(255*(1-w)*(1-E)),f[I+3]=255;}}else return {success:!1,error:`Unsupported color space with ${i} components`};c.data=f;let b=o.sync.write(c);return n.verbose,{success:!0,pngData:b}}catch(o){return {success:false,error:`PNG creation error: ${o instanceof Error?o.message:"Unknown error"}`}}}getFormatFromMimeType(t){switch(t){case "image/jpeg":return "JPEG";case "image/png":return "PNG";case "image/jp2":return "JPEG 2000";case "image/gif":return "GIF";case "image/tiff":return "TIFF";default:return "unknown"}}};});var A,ee=O(()=>{H();A=class extends T{name="poppler";description="Poppler-based extraction using pdfimages command";async isAvailable(){try{let{Poppler:t}=await import('node-poppler');return new t,!0}catch{return false}}getCapabilities(){return {formats:["png"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:true}}async extractImages(t,e){try{let{Poppler:r}=await import('node-poppler');if(!P__default.existsSync(t))return {success:!1,error:`PDF file not found: ${t}`};let a=new r,s=[],n=D.join(process.cwd(),"temp-poppler-images");P__default.existsSync(n)||P__default.mkdirSync(n,{recursive:!0});try{e.verbose;let o=D.join(n,"img"),g={firstPageToConvert:1,lastPageToConvert:-1,pngFile:!0};e.verbose,await a.pdfImages(t,o,g),e.verbose;let i={list:!0};e.verbose;let l=await a.pdfImages(t,void 0,i),m=this.parseImageList(l);e.verbose;let u=P__default.readdirSync(n).filter(c=>c.startsWith("img-")&&c.endsWith(".png"));e.verbose;for(let c=0;c<u.length;c++){let f=u[c];if(!f)continue;let b=D.join(n,f);if(!P__default.existsSync(b))continue;let d=P__default.statSync(b);P__default.readFileSync(b);let h=f.match(/img-(\d+)\.png/),x=h?parseInt(h[1],10)+1:c+1,y=m[c]||{page:1,index:x,width:0,height:0,format:"PNG"},w=y.page,E=`img_p${w}_${x}.png`,I;if(e.extractImageFiles&&e.imageOutputDir){let L=D.join(e.imageOutputDir,"images");P__default.existsSync(L)||P__default.mkdirSync(L,{recursive:!0}),I=D.join(L,E),P__default.copyFileSync(b,I),e.verbose;}let ie={id:`img_${x}`,filename:`images/${E}`,filepath:I||"",page:w,width:y.width,height:y.height,format:"PNG",mimeType:"image/png",size:d.size,position:{x:0,y:0,width:y.width,height:y.height}};s.push(ie);}return e.verbose,{success:!0,images:s}}finally{P__default.existsSync(n)&&P__default.rmSync(n,{recursive:!0,force:!0});}}catch(r){return {success:false,error:`Poppler extraction failed: ${r instanceof Error?r.message:"Unknown error"}`}}}parseImageList(t){let e=[],r=t.split(`
|
|
2
|
+
`);for(let a of r){let s=a.match(/^\s*(\d+)\s+(\d+)\s+\w+\s+(\d+)\s+(\d+)\s+\w+\s+\d+\s+\d+\s+(\w+)/);if(s){let n=parseInt(s[1],10),o=parseInt(s[2],10),g=parseInt(s[3],10),i=parseInt(s[4],10),l=s[5]?.toUpperCase()||"PNG";e.push({page:n,index:o,width:g,height:i,format:l});}}return e}};});var te={};Y(te,{ImageEngineFactory:()=>X});var X,re=O(()=>{Q();ee();X=class p{static engines=new Map;static async getEngine(t){if(t==="auto"&&(t=await p.selectBestEngine()),p.engines.has(t))return p.engines.get(t);let e;switch(t){case "pdf-lib":e=new B;break;case "poppler":e=new A;break;default:throw new Error(`Unknown image extraction engine: ${t}`)}if(!await e.isAvailable())throw new Error(`Image extraction engine '${t}' is not available on this system`);return p.engines.set(t,e),e}static async getAvailableEngines(){let t=[B,A],e=[];for(let r of t){let a=new r,s=await a.isAvailable();e.push({name:a.name,description:a.description,available:s,capabilities:a.getCapabilities()});}return e}static async selectBestEngine(){let t=await p.getAvailableEngines(),e=["pdf-lib","poppler"];for(let r of e)if(t.find(s=>s.name===r)?.available)return r;throw new Error("No image extraction engines are available on this system")}static clearCache(){p.engines.clear();}static getRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"},{useCase:"Vector image extraction",engine:"poppler",reason:"Poppler can extract vector graphics as raster images"}]}};});var ae={};Y(ae,{ImageExtractor:()=>F});var F,N=O(()=>{F=class{async extract(t,e={}){let r={verbose:false,extractImageFiles:false,imageEngine:"auto",...e};r.verbose,r.extractImageFiles&&r.imageOutputDir&&(P__default.existsSync(r.imageOutputDir)||P__default.mkdirSync(r.imageOutputDir,{recursive:true}));try{let{ImageEngineFactory:a}=await Promise.resolve().then(()=>(re(),te)),s=await a.getEngine(r.imageEngine);r.verbose;let n=await s.extractImages(t,r);if(!n.success)throw new Error(n.error||"Engine extraction failed");return {success:!0,images:n.images||[],metadata:{totalImages:n.images?.length||0,engine:s.name}}}catch{r.verbose;try{return await this.extractWithPdfLib(t,r)}catch(s){return r.verbose,{success:false,images:[],error:s instanceof Error?s.message:String(s)}}}}static async getAvailableEngines(){return [{name:"pdf-lib",description:"PDF-lib based extraction with full format support",available:true,capabilities:{formats:["jpg","jpeg","png","jp2","tiff"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:false}},{name:"poppler",description:"Poppler-based extraction using pdfimages command",available:false,capabilities:{formats:["jpg","jpeg","png","tiff","ppm","pbm"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:true}}]}static getEngineRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing (coming soon)"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"}]}async extractWithPdfLib(t,e={}){try{let{PDFDocument:r,PDFName:a}=await import('pdf-lib'),s=P__default.readFileSync(t),n=await r.load(s,{ignoreEncryption:!0}),o=n.getPageCount(),g=[],i=1;e.verbose,e.extractImageFiles&&e.imageOutputDir&&(P__default.existsSync(e.imageOutputDir)||P__default.mkdirSync(e.imageOutputDir,{recursive:!0}));for(let l=0;l<o;l++){let m=l+1;try{let c=n.getPage(l).node.Resources();if(!c){e.verbose;continue}let f=c.get(a.of("XObject"));if(!f){e.verbose;continue}let b=f.dict;e.verbose;for(let[d,h]of b)try{let x=n.context.lookup(h),y=x.dict.get(a.of("Subtype"));if(!y||y.toString()!=="/Image")continue;let w=await this.extractImageFromPdfObject(x,m,i,e);w&&(g.push(w),i++);}catch{e.verbose;}}catch{e.verbose;}}return e.verbose,{images:g,totalPages:o,totalImages:g.length}}catch(r){throw e.verbose,r}}async extractImageFromPdfObject(t,e,r,a){try{let{PDFName:s}=await import('pdf-lib'),n=t.dict.get(s.of("Width")),o=t.dict.get(s.of("Height")),g=t.dict.get(s.of("Filter")),i=t.dict.get(s.of("ColorSpace")),l=t.dict.get(s.of("BitsPerComponent")),m=n&&typeof n.value=="number"?n.value:100,u=o&&typeof o.value=="number"?o.value:100,c=l&&typeof l.value=="number"?l.value:8;a.verbose;let f=await this.extractImageData(t,g,m,u,i,c,a);if(!f.success||!f.imageData)return a.verbose,null;let b=f.imageData,d=f.mimeType||"image/jpeg",h=f.extension||"jpg",x=`img_p${e}_${r}.${h}`,y="",w=b.length;return a.extractImageFiles&&a.imageOutputDir&&(y=D.join(a.imageOutputDir,x),P__default.writeFileSync(y,b),a.verbose),{id:`img_${r}`,name:x,page:e,position:{x:0,y:0,width:m,height:u},width:m,height:u,format:d==="image/jpeg"?"JPEG":d==="image/png"?"PNG":"unknown",filePath:y}}catch{return a.verbose,null}}async extractImageData(t,e,r,a,s,n,o){try{let g=await import('zlib'),i,l="image/jpeg",m="jpg";if(e){let u=e.toString();if(o.verbose,u.includes("DCTDecode")&&u.includes("FlateDecode")){o.verbose;try{let c=t.contents;i=g.inflateSync(Buffer.from(c)),l="image/jpeg",m="jpg",o.verbose;}catch(c){return o.verbose,{success:!1,error:`Zlib decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else if(u.includes("DCTDecode"))o.verbose,i=Buffer.from(t.contents),l="image/jpeg",m="jpg";else if(u.includes("FlateDecode")){o.verbose;try{let c=t.contents,f=g.inflateSync(Buffer.from(c));o.verbose;let b=this.detectImageFormat(f);if(b.valid)i=f,l=b.mimeType,m=b.extension,o.verbose;else {let d=await this.createPngFromPdfMetadata(f,r,a,s,n,o);if(d.success&&d.pngData)i=d.pngData,l="image/png",m="png",o.verbose;else return o.verbose,{success:!1,error:`PNG creation failed: ${d.error}`}}}catch(c){return o.verbose,{success:!1,error:`FlateDecode decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else if(u.includes("JPXDecode")){o.verbose;try{i=Buffer.from(t.contents),l="image/jp2",m="jp2",o.verbose;}catch(c){return o.verbose,{success:!1,error:`JPXDecode extraction failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else {o.verbose;try{let c=await t.asUint8Array();i=Buffer.from(c);let f=this.detectImageFormat(i);f.valid&&(l=f.mimeType,m=f.extension);}catch(c){return o.verbose,{success:!1,error:`Generic decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}}else {o.verbose;try{let u=await t.asUint8Array();i=Buffer.from(u);let c=this.detectImageFormat(i);c.valid&&(l=c.mimeType,m=c.extension);}catch(u){return o.verbose,{success:!1,error:`Raw data extraction failed: ${u instanceof Error?u.message:"Unknown error"}`}}}return !i||i.length<100?{success:!1,error:`Image data too small: ${i?.length||0} bytes`}:{success:!0,imageData:i,mimeType:l,extension:m}}catch(g){return o.verbose,{success:false,error:g instanceof Error?g.message:"Unknown error"}}}detectImageFormat(t){return !t||t.length<10?{valid:false}:t[0]===255&&t[1]===216?{valid:true,mimeType:"image/jpeg",extension:"jpg"}:t[0]===137&&t[1]===80&&t[2]===78&&t[3]===71?{valid:true,mimeType:"image/png",extension:"png"}:t[0]===71&&t[1]===73&&t[2]===70?{valid:true,mimeType:"image/gif",extension:"gif"}:t[0]===73&&t[1]===73||t[0]===77&&t[1]===77?{valid:true,mimeType:"image/tiff",extension:"tiff"}:t.length>=12&&t[0]===0&&t[1]===0&&t[2]===0&&t[3]===12&&t[4]===106&&t[5]===80&&t[6]===32&&t[7]===32?{valid:true,mimeType:"image/jp2",extension:"jp2"}:{valid:false}}async createPngFromPdfMetadata(t,e,r,a,s,n){try{let{PNG:o}=await import('pngjs'),g=a?.toString()||"",i=3,l=2;g.includes("DeviceGray")||g.includes("Gray")?(i=1,l=0):g.includes("DeviceRGB")||g.includes("RGB")?(i=3,l=2):(g.includes("DeviceCMYK")||g.includes("CMYK"))&&(i=4,l=2);let m=e*r*i*(s/8),u=t.length;if(n.verbose,Math.abs(u-m)>u*.1)return {success:!1,error:`Data size mismatch: expected ${m}, got ${u} bytes`};let c=new o({width:e,height:r,colorType:l===0?0:6,bitDepth:8}),f;if(i===1){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=t[d]||0,x=d*4;f[x]=h,f[x+1]=h,f[x+2]=h,f[x+3]=255;}}else if(i===3){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=d*3,x=d*4;f[x]=t[h]||0,f[x+1]=t[h+1]||0,f[x+2]=t[h+2]||0,f[x+3]=255;}}else if(i===4){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=d*4,x=(t[h]||0)/255,y=(t[h+1]||0)/255,w=(t[h+2]||0)/255,E=(t[h+3]||0)/255,I=d*4;f[I]=Math.round(255*(1-x)*(1-E)),f[I+1]=Math.round(255*(1-y)*(1-E)),f[I+2]=Math.round(255*(1-w)*(1-E)),f[I+3]=255;}}else return {success:!1,error:`Unsupported color space with ${i} components`};c.data=f;let b=o.sync.write(c);return n.verbose,{success:!0,pngData:b}}catch(o){return {success:false,error:`PNG creation error: ${o instanceof Error?o.message:"Unknown error"}`}}}};});function k(p){let t=[];if(p.pdfPath?typeof p.pdfPath!="string"?t.push({field:"pdfPath",message:"PDF path must be a string",value:p.pdfPath}):P__default.existsSync(p.pdfPath)?p.pdfPath.toLowerCase().endsWith(".pdf")||t.push({field:"pdfPath",message:"File must have .pdf extension",value:p.pdfPath}):t.push({field:"pdfPath",message:"PDF file does not exist",value:p.pdfPath}):t.push({field:"pdfPath",message:"PDF path is required",value:p.pdfPath}),p.outputDir&&typeof p.outputDir!="string"&&t.push({field:"outputDir",message:"Output directory must be a string",value:p.outputDir}),p.options){let{options:e}=p;e.extractText!==void 0&&typeof e.extractText!="boolean"&&t.push({field:"options.extractText",message:"extractText must be a boolean",value:e.extractText}),e.extractImages!==void 0&&typeof e.extractImages!="boolean"&&t.push({field:"options.extractImages",message:"extractImages must be a boolean",value:e.extractImages}),e.extractImageFiles!==void 0&&typeof e.extractImageFiles!="boolean"&&t.push({field:"options.extractImageFiles",message:"extractImageFiles must be a boolean",value:e.extractImageFiles}),e.useImagePaths!==void 0&&typeof e.useImagePaths!="boolean"&&t.push({field:"options.useImagePaths",message:"useImagePaths must be a boolean",value:e.useImagePaths}),e.imageOutputDir&&typeof e.imageOutputDir!="string"&&t.push({field:"options.imageOutputDir",message:"imageOutputDir must be a string",value:e.imageOutputDir}),e.imageRefFormat&&typeof e.imageRefFormat!="string"&&t.push({field:"options.imageRefFormat",message:"imageRefFormat must be a string",value:e.imageRefFormat}),e.baseName&&typeof e.baseName!="string"&&t.push({field:"options.baseName",message:"baseName must be a string",value:e.baseName}),e.verbose!==void 0&&typeof e.verbose!="boolean"&&t.push({field:"options.verbose",message:"verbose must be a boolean",value:e.verbose}),e.memoryLimit&&typeof e.memoryLimit!="string"?t.push({field:"options.memoryLimit",message:"memoryLimit must be a string",value:e.memoryLimit}):e.memoryLimit&&!le(e.memoryLimit)&&t.push({field:"options.memoryLimit",message:'memoryLimit must be in format like "512MB", "1GB", etc.',value:e.memoryLimit}),e.batchSize!==void 0&&(typeof e.batchSize!="number"?t.push({field:"options.batchSize",message:"batchSize must be a number",value:e.batchSize}):(e.batchSize<1||e.batchSize>100)&&t.push({field:"options.batchSize",message:"batchSize must be between 1 and 100",value:e.batchSize})),e.progressCallback&&typeof e.progressCallback!="function"&&t.push({field:"options.progressCallback",message:"progressCallback must be a function",value:typeof e.progressCallback}),e.extractText===false&&e.extractImages===false&&t.push({field:"options",message:"At least one of extractText or extractImages must be true",value:{extractText:e.extractText,extractImages:e.extractImages}}),e.useImagePaths===true&&e.extractImageFiles!==true&&t.push({field:"options",message:"useImagePaths requires extractImageFiles to be true",value:{useImagePaths:e.useImagePaths,extractImageFiles:e.extractImageFiles}});}return t}function le(p){return /^\d+(\.\d+)?(MB|GB|KB)$/i.test(p)}function V(p){let t=[],e=["{id}","{name}","{page}","{index}","{path}"];e.some(n=>p.includes(n))||t.push({field:"imageRefFormat",message:`Format must contain at least one valid placeholder: ${e.join(", ")}`,value:p});let a=/\{([^}]+)\}/g,s=p.match(a);if(s)for(let n of s)e.includes(n)||t.push({field:"imageRefFormat",message:`Invalid placeholder: ${n}. Valid placeholders are: ${e.join(", ")}`,value:p});return t}function J(p,t=[".pdf"]){let e=[];if(!p)return e.push({field:"filePath",message:"File path is required",value:p}),e;if(typeof p!="string")return e.push({field:"filePath",message:"File path must be a string",value:p}),e;if(!P__default.existsSync(p))return e.push({field:"filePath",message:"File does not exist",value:p}),e;let r=D.extname(p).toLowerCase();return t.length>0&&!t.includes(r)&&e.push({field:"filePath",message:`File must have one of these extensions: ${t.join(", ")}`,value:p}),e}var z=class{pdfLibDoc=null;pdfLibPages=[];textData=[];async processPDF(t){let e=P.readFileSync(t),[r,a]=await Promise.all([this.processPDFLib(e),this.processPDFParse(e)]);this.textData=this.combineResults(r,a);let s=this.textData.map(n=>n.text).join(`
|
|
3
|
+
`).trim();return {totalPages:this.textData.length,pages:this.textData,fullText:s}}async processPDFLib(t){return this.pdfLibDoc=await PDFDocument.load(t,{ignoreEncryption:true}),this.pdfLibPages=this.pdfLibDoc.getPages(),this.pdfLibPages.map((e,r)=>{let{width:a,height:s}=e.getSize();return {pageNumber:r+1,width:a,height:s,rotation:e.getRotation(),mediaBox:e.getMediaBox()}})}async processPDFParse(t){let e=[];return await se(t,{pagerender:async a=>{try{let s=await a.getTextContent(),n=a.getViewport({scale:1}),o=s.items.filter(u=>typeof u.str=="string");o.sort((u,c)=>{let f=c.transform[5]-u.transform[5];return Math.abs(f)>2?f:u.transform[4]-c.transform[4]});let g="",i=null,l="";for(let u of o){let c=u.transform[5];i===null?(i=c,l=u.str):Math.abs(c-i)>2?(g+=`${l}
|
|
4
|
+
`,i=c,l=u.str):l+=` ${u.str}`;}l&&(g+=l),g=g.trim();let m={pageNumber:a.pageIndex+1,text:g,textItems:s.items,pdfParseWidth:n.width,pdfParseHeight:n.height};return e.push(m),g}catch{return e.push({pageNumber:a.pageIndex+1,text:"",textItems:[],pdfParseWidth:0,pdfParseHeight:0}),""}}}),e.sort((a,s)=>a.pageNumber-s.pageNumber)}combineResults(t,e){return t.map(r=>{let a=e.find(n=>n.pageNumber===r.pageNumber),s=a?.text||"";return {pageNumber:r.pageNumber,text:s,width:r.width,height:r.height,rotation:r.rotation,mediaBox:r.mediaBox,textItems:a?.textItems||[],wordCount:this.countWords(s),characterCount:s.length}})}async extractWithPageMarkers(t,e="--- PAGE {page} ---",r={}){let a=await this.processPDF(t),s=[];if(r.includeImageRefs)try{let{ImageExtractor:o}=await Promise.resolve().then(()=>(N(),ae));s=(await new o().extract(t,{extractImageFiles:!1,verbose:!1,imageEngine:r.imageEngine||"auto"})).images||[];}catch{}let n="";return a.pages.forEach(o=>{let g=e.replace("{page}",o.pageNumber.toString()),i=o.text;if(r.includeImageRefs&&s.length>0){let l=s.filter(m=>m.page===o.pageNumber);if(l.length>0){let m=l.map(u=>(r.imageRefFormat||"[IMG:{id}] {name}").replace("{id}",`img_${u.id}`).replace("{name}",u.filename||`img_p${u.page}_${u.id}.jpg`)).join(`
|
|
5
|
+
`);if(i.trim()){let u=i.split(`
|
|
6
|
+
`);u.length>1?(u.splice(1,0,m),i=u.join(`
|
|
7
7
|
`)):i=`${i}
|
|
8
|
-
${
|
|
8
|
+
${m}`;}else i=m;}}i.trim()?n+=`${g}
|
|
9
9
|
|
|
10
10
|
${i}
|
|
11
|
-
`:
|
|
11
|
+
`:n+=`${g}
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
`;}),{text:
|
|
15
|
-
|
|
16
|
-
`)
|
|
17
|
-
`),
|
|
18
|
-
`).
|
|
14
|
+
`;}),{text:n.trim(),cleanText:a.fullText,numPages:a.totalPages,pages:a.pages}}getPage(t){return this.textData[t-1]||null}async getDetailedPageInfo(t,e){this.textData.length||await this.processPDF(t);let r=this.getPage(e);if(!r)return null;let a=(r.textItems||[]).map(s=>({text:s.str||"",x:s.transform?.[4]||0,y:s.transform?.[5]||0,width:s.width||0,height:s.height||0,fontName:s.fontName,fontSize:s.transform?.[0]||12}));return {pageNumber:e,text:r.text,textItems:a,dimensions:{width:r.width,height:r.height}}}countWords(t){return !t||t.trim()===""?0:t.split(/\s+/).filter(e=>e.length>0).length}async processSinglePage(t,e){try{let r=P.readFileSync(t),a=await PDFDocument.load(r,{ignoreEncryption:!0});if(e<1||e>a.getPageCount())return null;let n=a.getPages()[e-1];if(!n)return null;let{width:o,height:g}=n.getSize(),i=await PDFDocument.create(),[l]=await i.copyPages(a,[e-1]);i.addPage(l);let m=await i.save(),u=[],c={pagerender:async h=>{try{let x=await h.getTextContent();return u=x.items,x.items.map(y=>y.str||"").join(" ")}catch{return ""}}},f=Buffer.from(m),d=(await se(f,c)).text.replace(/\s+/g," ").trim();return {pageNumber:e,text:d,width:o,height:g,rotation:n.getRotation().angle,mediaBox:[n.getMediaBox().x,n.getMediaBox().y,n.getMediaBox().width,n.getMediaBox().height],textItems:u,wordCount:this.countWords(d),characterCount:d.length}}catch{return null}}};var $=class{async extract(t){try{let e=P__default.readFileSync(t),r=[],s=await se(e,{pagerender:async o=>{try{let i=(await o.getTextContent()).items.map(l=>l.str).join(" ");return r[o.pageNumber-1]=i,i}catch{return r[o.pageNumber-1]="",""}}});return {text:r.filter(o=>o&&o.length>0).join(`
|
|
15
|
+
|
|
16
|
+
`),numPages:s.numpages,info:s.info,metadata:s.metadata,version:s.version}}catch(e){throw new Error(`Failed to extract text from PDF: ${e instanceof Error?e.message:"Unknown error"}`)}}async extractWithPages(t){try{let e=P__default.readFileSync(t),a=await se(e,{pagerender:s=>s.getTextContent().then(n=>n.items.map(o=>o.str).join(" "))});return {text:a.text,numPages:a.numpages,info:a.info,metadata:a.metadata,version:a.version,pages:a.text?this.splitTextIntoPages(a.text,a.numpages):[]}}catch(e){throw new Error(`Failed to extract text with pages: ${e instanceof Error?e.message:"Unknown error"}`)}}splitTextIntoPages(t,e){let r=t.split(`
|
|
17
|
+
`),a=Math.ceil(r.length/e),s=[];for(let n=0;n<e;n++){let o=n*a,g=Math.min(o+a,r.length),i=r.slice(o,g).join(`
|
|
18
|
+
`);s.push(i);}return s}async extractTextItems(t,e={}){try{let r=await this.extract(t),a=r.text,s=r.numpages||1,n=a.split(`
|
|
19
|
+
`),o=[],g=1,i=Math.ceil(n.length/s);return n.forEach((l,m)=>{if(l.trim()){g=Math.ceil((m+1)/i);let u="text";l.length<50&&l.trim().match(/^[A-Z\s]+$/)?u="heading":l.length>100?u="paragraph":l.length<30&&(u="caption");let c=12;u==="heading"?c=16:u==="caption"&&(c=10);let f={id:`text_${m+1}`,content:l.trim(),position:{x:0,y:m%i*15,width:l.length*8,height:c},font:{name:"Unknown",size:c,style:u==="heading"?"bold":"normal"},page:g,type:u,fontSize:c,color:"#000000"};o.push(f);}}),e.verbose,o}catch(r){throw new Error(`Failed to extract text items: ${r instanceof Error?r.message:"Unknown error"}`)}}async extractStatistics(t){let e=await this.extract(t),r=e.text,a=r.length,s=r.split(/\s+/).filter(l=>l.length>0).length,n=r.split(`
|
|
20
|
+
`).length,o=e.numPages,g=Math.round(s/o),i=Math.ceil(s/200);return {characterCount:a,wordCount:s,lineCount:n,pageCount:o,averageWordsPerPage:g,readingTime:i}}async extractWithFontInfo(t){return this.extract(t)}cleanText(t){return t.replace(/\s+/g," ").replace(/\n\s*\n/g,`
|
|
19
21
|
`).trim()}async extractPageRange(t,e,r){let a=await this.extractWithPages(t);if(e<1||r>a.numPages||e>r)throw new Error(`Invalid page range: ${e}-${r}. Document has ${a.numPages} pages.`);return a.pages.slice(e-1,r).join(`
|
|
20
22
|
|
|
21
|
-
`)}async searchText(t,e,r=false){let a=await this.extractWithPages(t),s=r?"g":"gi",
|
|
22
|
-
`);
|
|
23
|
-
`);i.push(`Page ${
|
|
24
|
-
`),
|
|
25
|
-
`);
|
|
26
|
-
${
|
|
27
|
-
`;}
|
|
28
|
-
`);}return
|
|
23
|
+
`)}async searchText(t,e,r=false){let a=await this.extractWithPages(t),s=r?"g":"gi",n=new RegExp(e,s),o=0,g=[],i=[];return a.pages.forEach((l,m)=>{let u=l.match(n);if(u){o+=u.length,g.push(m+1);let c=l.split(`
|
|
24
|
+
`);c.forEach((f,b)=>{if(n.test(f)){let d=Math.max(0,b-1),h=Math.min(c.length,b+2),x=c.slice(d,h).join(`
|
|
25
|
+
`);i.push(`Page ${m+1}: ${x}`);}});}}),{found:o>0,occurrences:o,pages:g,context:i}}async extractWithPageMarkers(t,e="--- PAGE {page} ---",r={}){try{let a=new z,s={includeImageRefs:r.includeImageRefs??!0,imageRefFormat:r.imageRefFormat||"[IMG:{id}] {name}"};r.imageEngine&&(s.imageEngine=r.imageEngine);let n=await a.extractWithPageMarkers(t,e,s),o=n.pages.map(g=>({pageNumber:g.pageNumber+(r.pageOffset||0),text:{content:g.text,rawText:g.text,wordCount:g.wordCount,characterCount:g.characterCount},images:[],imageCount:0}));return {text:n.text,pages:o}}catch(a){throw new Error(`Failed to extract text with page markers: ${a instanceof Error?a.message:"Unknown error"}`)}}async extractWithAccuratePages(t){let r=await new z().processPDF(t),a=r.pages.map(s=>({pageNumber:s.pageNumber,text:{content:s.text,rawText:s.text,wordCount:s.wordCount,characterCount:s.characterCount},images:[],imageCount:0}));return {fullText:r.fullText,pages:a,totalPages:r.totalPages}}};N();var S=class{generateTextWithImageRefs(t,e,r,a){if(!t||e.length===0)return t||"";let s=t.split(`
|
|
26
|
+
`),n=Math.ceil(s.length/a),o="";for(let g=1;g<=a;g++){let i=(g-1)*n,l=Math.min(i+n,s.length),m=s.slice(i,l).join(`
|
|
27
|
+
`);m.trim()&&(o+=m);let u=e.filter(c=>c.page===g);for(let c of u){let f=this.formatImageReference(c,r,e.indexOf(c)+1);o+=`
|
|
28
|
+
${f}
|
|
29
|
+
`;}g<a&&m.trim()&&(o+=`
|
|
30
|
+
`);}return o.trim()}generateImageOnlyRefs(t,e){return t.map((r,a)=>this.formatImageReference(r,e,a+1)).join(`
|
|
29
31
|
`)}formatImageReference(t,e,r){let a={id:t.id,name:t.name||t.id,page:t.page,index:r,path:t.filePath||t.id};return this.replacePlaceholders(e,a)}replacePlaceholders(t,e){return t.replace(/\{id\}/g,e.id).replace(/\{name\}/g,e.name||e.id).replace(/\{page\}/g,e.page.toString()).replace(/\{index\}/g,e.index.toString()).replace(/\{path\}/g,e.path||e.id)}extractPlaceholders(t){let e=/\{([^}]+)\}/g,r=[],a=null;for(a=e.exec(t);a!==null;)a[1]&&r.push(a[1]),a=e.exec(t);return [...new Set(r)]}isValidFormat(t){let e=["id","name","page","index","path"];return this.extractPlaceholders(t).every(a=>e.includes(a))}getDefaultFormat(t=false){return t?"[IMAGE:{path}]":"[IMAGE:{id}]"}cleanTextFromImageRefs(t,e){let r=e.replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/\\?\{id\\?\}/g,"[^\\s\\]]+").replace(/\\?\{name\\?\}/g,"[^\\s\\]]+").replace(/\\?\{page\\?\}/g,"\\d+").replace(/\\?\{index\\?\}/g,"\\d+").replace(/\\?\{path\\?\}/g,"[^\\s\\]]+"),a=new RegExp(r,"g");return t.replace(a,"").replace(/\n\s*\n/g,`
|
|
30
|
-
`).trim()}countImageReferences(t,e){let r=e.replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/\\?\{id\\?\}/g,"[^\\s\\]]+").replace(/\\?\{name\\?\}/g,"[^\\s\\]]+").replace(/\\?\{page\\?\}/g,"\\d+").replace(/\\?\{index\\?\}/g,"\\d+").replace(/\\?\{path\\?\}/g,"[^\\s\\]]+"),a=new RegExp(r,"g"),s=t.match(a);return s?s.length:0}generateSummary(t,e,r,a,s){let
|
|
32
|
+
`).trim()}countImageReferences(t,e){let r=e.replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/\\?\{id\\?\}/g,"[^\\s\\]]+").replace(/\\?\{name\\?\}/g,"[^\\s\\]]+").replace(/\\?\{page\\?\}/g,"\\d+").replace(/\\?\{index\\?\}/g,"\\d+").replace(/\\?\{path\\?\}/g,"[^\\s\\]]+"),a=new RegExp(r,"g"),s=t.match(a);return s?s.length:0}generateSummary(t,e,r,a,s){let n=(r/t).toFixed(2),o=["\u{1F4C4} Document Summary",` Pages: ${t}`,` Text items: ${e}`,` Images: ${r} (avg ${n} per page)`,` Text length: ${a.toLocaleString()} characters`];return s&&o.push(` Processing time: ${s}ms`),o.join(`
|
|
31
33
|
`)}formatFileSize(t){let e=["B","KB","MB","GB"],r=t,a=0;for(;r>=1024&&a<e.length-1;)r/=1024,a++;return `${r.toFixed(1)} ${e[a]}`}formatDuration(t){if(t<1e3)return `${t}ms`;let e=Math.floor(t/1e3);if(e<60)return `${e}s`;let r=Math.floor(e/60),a=e%60;return `${r}m ${a}s`}};var U=class{extractRawText(t){let e=t;return e=e.replace(/--- PAGE \d+ ---\s*/g,""),e=e.replace(/🎨 ART BASEL PAGE \d+ 🎨\s*/g,""),e=e.replace(/PAGE \d+\s*/g,""),e=e.replace(/\[IMG:\w+\]\s*\w*\s*/g,""),e=e.replace(/\[IMG-\w+\]\s*[^[\n]*\s*/g,""),e=e.replace(/📷\s*[^-\n]*-\s*Page\s*\d+\s*-\s*Image\s*#\d+\s*/g,""),e=e.replace(/🎨\s*Art\s*Basel\s*Image\s*\d+\s*\(Page\s*\d+\)\s*/g,""),e=e.replace(/\n\s*\n\s*\n/g,`
|
|
32
34
|
|
|
33
|
-
`),e=e.replace(/^\s+|\s+$/g,""),e=e.replace(/[ \t]+/g," "),e}generateStructuredData(t,e,r,a,s){let
|
|
34
|
-
`),a=Math.ceil(r.length/e),s=[];for(let
|
|
35
|
-
`);s.push(i);}return s}createPageDataArray(t,e,r){let a=[];for(let s=0;s<r;s++){let
|
|
36
|
-
`),
|
|
35
|
+
`),e=e.replace(/^\s+|\s+$/g,""),e=e.replace(/[ \t]+/g," "),e}generateStructuredData(t,e,r,a,s){let n=this.splitTextIntoPages(e,a),o=this.createPageDataArray(n,r,a);return {metadata:{filename:t,extractedAt:new Date().toISOString(),totalPages:a,totalTextLength:e.length,totalImages:r.length,extractionOptions:s},pages:o}}splitTextIntoPages(t,e){if(e<=1)return [t];let r=/(?:--- PAGE \d+ ---|🎨 ART BASEL PAGE \d+ 🎨|PAGE \d+)/g,a=t.match(r);return a&&a.length>0?this.splitByPageMarkers(t,r):this.splitByEstimatedLength(t,e)}splitByPageMarkers(t,e){let r=t.split(e),a=[];for(let s=1;s<r.length;s++){let n=r[s];n&&a.push(n.trim());}return a.length===0&&a.push(t),a}splitByEstimatedLength(t,e){let r=t.split(`
|
|
36
|
+
`),a=Math.ceil(r.length/e),s=[];for(let n=0;n<e;n++){let o=n*a,g=Math.min((n+1)*a,r.length),i=r.slice(o,g).join(`
|
|
37
|
+
`);s.push(i);}return s}createPageDataArray(t,e,r){let a=[];for(let s=0;s<r;s++){let n=s+1,o=t[s]||"",g=this.getImagesForPage(e,n),i=this.extractRawText(o);a.push({pageNumber:n,text:{content:o,rawText:i,wordCount:this.countWords(i),characterCount:i.length},images:g,imageCount:g.length});}return a}getImagesForPage(t,e){return t.filter(r=>r.page===e).map(r=>{let a={id:r.id,name:r.name||`image_${r.id}`,position:r.position,format:r.format||"unknown"};if("filename"in r){let s=r.filename;s!==void 0&&(a.filename=s);}if("path"in r){let s=r.path;s!==void 0&&(a.path=s);}if("size"in r){let s=r.size;s!==void 0&&(a.size=s);}return a})}countWords(t){return t.trim()?t.trim().split(/\s+/).length:0}generateJSONString(t,e=2){return JSON.stringify(t,null,e)}generateSummary(t){let e=t.pages.reduce((n,o)=>n+o.text.wordCount,0),r=t.pages.reduce((n,o)=>n+o.text.characterCount,0),a=t.pages.filter(n=>n.text.content.trim().length>0).length,s=t.pages.filter(n=>n.imageCount>0).length;return {totalWords:e,totalCharacters:r,averageWordsPerPage:Math.round(e/t.pages.length),averageImagesPerPage:Math.round(t.metadata.totalImages/t.pages.length*10)/10,pagesWithText:a,pagesWithImages:s}}};var W=class{cacheDir;constructor(t="./tmp/pdf-cache"){this.cacheDir=t,this.ensureCacheDir();}generateCacheKey(t){let e=D.resolve(t),r=P__default.statSync(e),a=`${e}:${r.mtime.getTime()}:${r.size}`;return ue.createHash("md5").update(a).digest("hex")}getCacheDir(t){let e=this.generateCacheKey(t);return D.join(this.cacheDir,e)}ensureCacheDir(){P__default.existsSync(this.cacheDir)||P__default.mkdirSync(this.cacheDir,{recursive:true});}isCached(t){try{let e=this.getCacheDir(t),r=D.join(e,"cache-info.json");return P__default.existsSync(r)}catch{return false}}getCacheInfo(t){try{let e=this.getCacheDir(t),r=D.join(e,"cache-info.json");return P__default.existsSync(r)?JSON.parse(P__default.readFileSync(r,"utf-8")):null}catch{return null}}createCache(t,e){let r=this.getCacheDir(t);P__default.existsSync(r)||P__default.mkdirSync(r,{recursive:true});let a=P__default.statSync(t),s={pdfPath:D.resolve(t),lastModified:a.mtime.getTime(),totalPages:e,cacheDir:r,created:new Date().toISOString()},n=D.join(r,"cache-info.json");return P__default.writeFileSync(n,JSON.stringify(s,null,2)),r}cachePageResult(t,e,r){try{let a=this.getCacheDir(t),s=D.join(a,`page-${e}.json`);P__default.writeFileSync(s,JSON.stringify(r,null,2));}catch{}}getCachedPageResult(t,e){try{let r=this.getCacheDir(t),a=D.join(r,`page-${e}.json`);return P__default.existsSync(a)?JSON.parse(P__default.readFileSync(a,"utf-8")):null}catch{return null}}getAllCachedPages(t){try{let e=this.getCacheDir(t),r=[];if(!P__default.existsSync(e))return r;let s=P__default.readdirSync(e).filter(n=>n.startsWith("page-")&&n.endsWith(".json"));for(let n of s)try{let o=D.join(e,n),g=JSON.parse(P__default.readFileSync(o,"utf-8"));r.push(g);}catch{}return r.sort((n,o)=>n.pageNumber-o.pageNumber),r}catch{return []}}clearCache(t){try{let e=this.getCacheDir(t);P__default.existsSync(e)&&P__default.rmSync(e,{recursive:!0,force:!0});}catch{}}clearAllCache(){try{P__default.existsSync(this.cacheDir)&&P__default.rmSync(this.cacheDir,{recursive:!0,force:!0}),this.ensureCacheDir();}catch{}}getCacheStats(){try{let t=0,e=0,r=0;if(P__default.existsSync(this.cacheDir)){let a=P__default.readdirSync(this.cacheDir);t=a.length;for(let s of a){let n=D.join(this.cacheDir,s);if(P__default.statSync(n).isDirectory()){let o=P__default.readdirSync(n),g=o.filter(i=>i.startsWith("page-")&&i.endsWith(".json"));e+=g.length;for(let i of o){let l=D.join(n,i);r+=P__default.statSync(l).size;}}}}return {totalCachedPdfs:t,totalCachedPages:e,totalCacheSize:r,cacheDir:this.cacheDir}}catch{return {totalCachedPdfs:0,totalCachedPages:0,totalCacheSize:0,cacheDir:this.cacheDir}}}};var j=class{textExtractor;imageExtractor;formatProcessor;structuredDataGenerator;cacheManager;constructor(t){this.textExtractor=new $,this.imageExtractor=new F,this.formatProcessor=new S,this.structuredDataGenerator=new U,this.cacheManager=new W(t);}async extract(t,e={}){let r={pdfPath:t,outputDir:e.imageOutputDir||"./extracted-images",options:{extractText:true,extractImages:true,extractImageFiles:false,useImagePaths:false,imageRefFormat:"[IMAGE:{id}]",verbose:false,...e}},a=this.validateConfiguration(r);if(a.length>0)throw this.createValidationError("Invalid configuration",a);try{if(!P__default.existsSync(t))throw new Error(`PDF file not found: ${t}`);let s=Date.now();this.reportProgress(r.options,{currentPage:0,totalPages:0,phase:"processing"});let n=null,o=null;if(r.options.extractText&&(r.options.verbose,n=await this.textExtractor.extract(t),r.options.includePageMarkers||r.options.includeImageRefs)){let m=r.options.pageMarkerFormat||"--- PAGE {page} ---",c={pageOffset:r.options.pageOffset||0,includeImageRefs:r.options.includeImageRefs??!1,imageRefFormat:r.options.imageRefFormat??"[IMG:{id}] {name}"};r.options.imageEngine&&(c.imageEngine=r.options.imageEngine),o=await this.textExtractor.extractWithPageMarkers(t,m,c);}let g=[];r.options.extractTextItems&&r.options.extractText&&(r.options.verbose,g=await this.textExtractor.extractTextItems(t,r.options));let i=null;r.options.extractImages&&(r.options.verbose,i=await this.imageExtractor.extract(t,r.options));let l=await this.processResults(t,n,o,i,g,r.options,s);return this.reportProgress(r.options,{currentPage:l.document.pages,totalPages:l.document.pages,phase:"complete"}),l}catch(s){throw r.options.verbose,this.createExtractionError("PDF content extraction failed",s)}}async extractText(t,e={}){return (await this.extract(t,{...e,extractText:true,extractImages:false})).cleanText}async extractImages(t,e={}){return (await this.extract(t,{...e,extractText:false,extractImages:true})).images}async extractImageFiles(t,e="./extracted-images",r={}){return (await this.extract(t,{...r,extractImageFiles:true,imageOutputDir:e,useImagePaths:true})).images.filter(s=>s.filePath).map(s=>s.filePath)}validateConfiguration(t){return k(t)}async processResults(t,e,r,a,s,n,o){let g=D.basename(t),l=this.extractRawText(e?.text||""),m={document:{filename:g,pages:a?.totalPages||e?.numPages||0,textLength:e?.text?.length||0,extractedAt:new Date().toISOString(),metadata:e?.info||{},options:n},pages:[],images:a?.images||[],textItems:s,text:l,textWithRefs:"",cleanText:l};if(n.extractText&&n.extractImages&&e&&a)if(r?.text&&n.includeImageRefs)m.textWithRefs=r.text;else if(n.includeImageRefs){let u=r?.text||e.text;m.textWithRefs=this.formatProcessor.generateTextWithImageRefs(u,a.images,n.imageRefFormat||"[IMAGE:{id}]",m.document.pages);}else m.textWithRefs=r?.text||e.text;else n.extractText&&e?m.textWithRefs=r?.text||e.text:n.extractImages&&a&&(m.textWithRefs=this.formatProcessor.generateImageOnlyRefs(a.images,n.imageRefFormat||"[IMAGE:{id}]"));if(m.summary={totalPages:m.document.pages,totalTextItems:0,totalImages:m.images.length,totalTextLength:m.document.textLength,averageImagesPerPage:(m.images.length/m.document.pages).toFixed(2),pagesWithImages:new Set(m.images.map(u=>u.page)).size},n.generateStructuredData){let u=m.textWithRefs||m.cleanText;m.structuredData=this.structuredDataGenerator.generateStructuredData(g,u,m.images,m.document.pages,n),n.verbose;}return n.verbose,m}async getText(t,e,r={}){return (await this.getPage(t,e,{...r,extractText:true,extractImages:false})).text}async getImages(t,e,r={}){return (await this.getPage(t,e,{...r,extractText:false,extractImages:true})).images}async getTextItems(t,e,r={}){return (await this.getPage(t,e,{...r,extractText:true,extractTextItems:true})).textItems}async getRawText(t,e,r={}){return (await this.getPage(t,e,{...r,extractText:true,extractImages:false})).rawText}async getPage(t,e,r={}){if(r.useCache!==false){let m=this.cacheManager.getCachedPageResult(t,e);if(m)return r.verbose,m}let a={...r,specificPages:[e]},s=await this.extract(t,a),n=this.extractPageText(s.textWithRefs||s.cleanText,e),o=s.images.filter(m=>m.page===e),g=s.textItems?.filter(m=>m.page===e)||[],i=this.extractRawText(n),l={pageNumber:e,text:n,rawText:i,textItems:g,images:o,metadata:{wordCount:this.countWords(i),characterCount:i.length,imageCount:o.length}};return r.useCache!==false&&this.cacheManager.cachePageResult(t,e,l),l}extractPageText(t,e){let r=/(?:--- PAGE (\d+) ---|🎨 ART BASEL PAGE (\d+) 🎨|PAGE (\d+))/g,a=t.split(r);if(a.length>1){for(let i=1;i<a.length;i+=4)if(parseInt(a[i]||a[i+1]||a[i+2]||"0",10)===e)return a[i+3]||""}let s=t.split(`
|
|
38
|
+
`),n=Math.ceil(s.length/e),o=(e-1)*n,g=Math.min(e*n,s.length);return s.slice(o,g).join(`
|
|
37
39
|
`)}countWords(t){return t.trim()?t.trim().split(/\s+/).length:0}extractRawText(t){let e=t;return e=e.replace(/--- PAGE \d+ ---\s*/g,""),e=e.replace(/🎨 ART BASEL PAGE \d+ 🎨\s*/g,""),e=e.replace(/PAGE \d+\s*/g,""),e=e.replace(/\[IMG:\w+\]\s*\w*\s*/g,""),e=e.replace(/\[IMG-\w+\]\s*[^[\n]*\s*/g,""),e=e.replace(/📷\s*[^-\n]*-\s*Page\s*\d+\s*-\s*Image\s*#\d+\s*/g,""),e=e.replace(/🎨\s*Art\s*Basel\s*Image\s*\d+\s*\(Page\s*\d+\)\s*/g,""),e=e.replace(/\n\s*\n\s*\n/g,`
|
|
38
40
|
|
|
39
|
-
`),e=e.replace(/^\s+|\s+$/g,""),e=e.replace(/[ \t]+/g," "),e}clearCache(t){this.cacheManager.clearCache(t);}getCacheStats(){return this.cacheManager.getCacheStats()}reportProgress(t,e){t.progressCallback&&t.progressCallback(e);}createValidationError(t,e){let r=new Error(t);return r.code="VALIDATION_ERROR",r.validationErrors=e,r}createExtractionError(t,e){let r=new Error(t);return r.code="EXTRACTION_ERROR",r.originalError=e,r}},C=new j;N();N();async function
|
|
41
|
+
`),e=e.replace(/^\s+|\s+$/g,""),e=e.replace(/[ \t]+/g," "),e}clearCache(t){this.cacheManager.clearCache(t);}getCacheStats(){return this.cacheManager.getCacheStats()}reportProgress(t,e){t.progressCallback&&t.progressCallback(e);}createValidationError(t,e){let r=new Error(t);return r.code="VALIDATION_ERROR",r.validationErrors=e,r}createExtractionError(t,e){let r=new Error(t);return r.code="EXTRACTION_ERROR",r.originalError=e,r}},C=new j;N();N();async function de(p,t={}){return C.extract(p,t)}async function xe(p,t={}){return C.extractText(p,t)}async function he(p,t={}){return C.extractImages(p,t)}async function be(p,t="./extracted-images",e={}){return C.extractImageFiles(p,t,e)}var ye="1.0.0",mt={PDFExtractor:j,pdfExtractor:C,TextExtractor:$,ImageExtractor:F,FormatProcessor:S,extractPdfContent:de,extractText:xe,extractImages:he,extractImageFiles:be,validateConfig:k,validateImageRefFormat:V,validateFilePath:J,version:ye};export{S as FormatProcessor,F as ImageExtractor,j as PDFExtractor,$ as TextExtractor,mt as default,be as extractImageFiles,he as extractImages,de as extractPdfContent,xe as extractText,C as pdfExtractor,k as validateConfig,J as validateFilePath,V as validateImageRefFormat,ye as version};//# sourceMappingURL=index.mjs.map
|
|
40
42
|
//# sourceMappingURL=index.mjs.map
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pdf-plus",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.2",
|
|
4
4
|
"description": "A comprehensive PDF content extraction library with support for text, images, and structured data",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/index.mjs",
|
|
@@ -35,19 +35,19 @@
|
|
|
35
35
|
"license": "MIT",
|
|
36
36
|
"repository": {
|
|
37
37
|
"type": "git",
|
|
38
|
-
"url": "https://github.com/kauandotnet/
|
|
38
|
+
"url": "https://github.com/kauandotnet/pdf-plus.git"
|
|
39
39
|
},
|
|
40
40
|
"bugs": {
|
|
41
|
-
"url": "https://github.com/kauandotnet/
|
|
41
|
+
"url": "https://github.com/kauandotnet/pdf-plus/issues"
|
|
42
42
|
},
|
|
43
|
-
"homepage": "https://github.com/kauandotnet/
|
|
43
|
+
"homepage": "https://github.com/kauandotnet/pdf-plus#readme",
|
|
44
44
|
"engines": {
|
|
45
45
|
"node": ">=18.0.0"
|
|
46
46
|
},
|
|
47
47
|
"dependencies": {
|
|
48
48
|
"node-poppler": "^8.0.4",
|
|
49
49
|
"pdf-lib": "^1.17.1",
|
|
50
|
-
"pdf-parse": "
|
|
50
|
+
"pdf-parse": "github:iamh2o/pdf-parse#1.1.3",
|
|
51
51
|
"pdfjs-dist": "^5.4.149",
|
|
52
52
|
"pngjs": "^7.0.0"
|
|
53
53
|
},
|