pdf-plus 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,40 @@
1
+ 'use strict';Object.defineProperty(exports,'__esModule',{value:true});var y=require('fs'),D=require('path'),pdfLib=require('pdf-lib'),le=require('crypto');function _interopDefault(e){return e&&e.__esModule?e:{default:e}}function _interopNamespace(e){if(e&&e.__esModule)return e;var n=Object.create(null);if(e){Object.keys(e).forEach(function(k){if(k!=='default'){var d=Object.getOwnPropertyDescriptor(e,k);Object.defineProperty(n,k,d.get?d:{enumerable:true,get:function(){return e[k]}});}})}n.default=e;return Object.freeze(n)}var y__namespace=/*#__PURE__*/_interopNamespace(y);var D__default=/*#__PURE__*/_interopDefault(D);var le__default=/*#__PURE__*/_interopDefault(le);var oe=Object.defineProperty;var O=(p,t)=>()=>(p&&(t=p(p=0)),t);var Y=(p,t)=>{for(var e in t)oe(p,e,{get:t[e],enumerable:true});};var T,H=O(()=>{T=class{};});var B,Q=O(()=>{H();B=class extends T{name="pdf-lib";description="PDF-lib based extraction with full format support";async isAvailable(){try{return await import('pdf-lib'),!0}catch{return false}}getCapabilities(){return {formats:["jpg","jpeg","png","jp2","tiff"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:false}}async extractImages(t,e){try{let{PDFDocument:r,PDFName:a}=await import('pdf-lib');if(!y__namespace.default.existsSync(t))return {success:!1,error:`PDF file not found: ${t}`};let s=y__namespace.default.readFileSync(t),o=await r.load(s),n=o.getPages(),l=[],i=1;e.verbose;for(let c=0;c<n.length;c++){let u=n[c],f=c+1,g=u?.node.Resources;if(!g)continue;let b=(typeof g=="function"?g():g)?.get?.(a.of("XObject"));if(!b)continue;let d=b.entries?.()||[],x=0;e.verbose;for(let[,h]of d){let P=o.context.lookup(h);if(!P||P.dict?.get?.(a.of("Subtype"))?.toString()!=="/Image")continue;x++;let E=await this.extractImageFromPdfObject(P,f,i,e);E&&l.push(E),i++;}}return e.verbose,{success:!0,images:l}}catch(r){return {success:false,error:`PDF-lib extraction failed: ${r instanceof Error?r.message:"Unknown error"}`}}}async 
extractImageFromPdfObject(t,e,r,a){try{let{PDFName:s}=await import('pdf-lib'),o=t.dict.get(s.of("Width")),n=t.dict.get(s.of("Height")),l=t.dict.get(s.of("Filter")),i=t.dict.get(s.of("ColorSpace")),c=t.dict.get(s.of("BitsPerComponent")),u=o&&typeof o.value=="number"?o.value:100,f=n&&typeof n.value=="number"?n.value:100,g=c&&typeof c.value=="number"?c.value:8;a.verbose;let m=await this.extractImageData(t,l,u,f,i,g,a);if(!m.success||!m.imageData)return a.verbose,null;let b=m.extension||"bin",d=`img_p${e}_${r}.${b}`,x,h=m.imageData.length;if(a.extractImageFiles&&a.imageOutputDir){let P=D__default.default.join(a.imageOutputDir,"images");y__namespace.default.existsSync(P)||y__namespace.default.mkdirSync(P,{recursive:!0}),x=D__default.default.join(P,d),y__namespace.default.writeFileSync(x,m.imageData),a.verbose;}return {id:`img_${r}`,filename:`images/${d}`,filepath:x||"",page:e,width:u,height:f,format:this.getFormatFromMimeType(m.mimeType||""),mimeType:m.mimeType||"",size:h,position:{x:0,y:0,width:u,height:f}}}catch{return a.verbose,null}}async extractImageData(t,e,r,a,s,o,n){try{let l=await import('zlib'),i,c="image/jpeg",u="jpg";if(e){let f=e.toString();if(n.verbose,f.includes("DCTDecode")&&f.includes("FlateDecode")){n.verbose;try{let g=t.contents;i=l.inflateSync(Buffer.from(g)),c="image/jpeg",u="jpg",n.verbose;}catch(g){return n.verbose,{success:!1,error:`Zlib decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else if(f.includes("DCTDecode"))n.verbose,i=Buffer.from(t.contents),c="image/jpeg",u="jpg";else if(f.includes("FlateDecode")){n.verbose;try{let g=t.contents,m=l.inflateSync(Buffer.from(g));n.verbose;let b=this.detectImageFormat(m);if(b.valid)i=m,c=b.mimeType,u=b.extension,n.verbose;else {let d=await this.createPngFromPdfMetadata(m,r,a,s,o,n);if(d.success&&d.pngData)i=d.pngData,c="image/png",u="png",n.verbose;else return n.verbose,{success:!1,error:`PNG creation failed: ${d.error}`}}}catch(g){return n.verbose,{success:!1,error:`FlateDecode 
decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else if(f.includes("JPXDecode")){n.verbose;try{i=Buffer.from(t.contents),c="image/jp2",u="jp2",n.verbose;}catch(g){return n.verbose,{success:!1,error:`JPXDecode extraction failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else {n.verbose;try{let g=await t.asUint8Array();i=Buffer.from(g);let m=this.detectImageFormat(i);m.valid&&(c=m.mimeType,u=m.extension);}catch(g){return n.verbose,{success:!1,error:`Generic decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}}else {n.verbose;try{let f=await t.asUint8Array();i=Buffer.from(f);let g=this.detectImageFormat(i);g.valid&&(c=g.mimeType,u=g.extension);}catch(f){return n.verbose,{success:!1,error:`Raw data extraction failed: ${f instanceof Error?f.message:"Unknown error"}`}}}return {success:!0,imageData:i,mimeType:c,extension:u}}catch(l){return {success:false,error:`Image data extraction failed: ${l instanceof Error?l.message:"Unknown error"}`}}}detectImageFormat(t){return !t||t.length<10?{valid:false}:t[0]===255&&t[1]===216?{valid:true,mimeType:"image/jpeg",extension:"jpg"}:t[0]===137&&t[1]===80&&t[2]===78&&t[3]===71?{valid:true,mimeType:"image/png",extension:"png"}:t[0]===71&&t[1]===73&&t[2]===70?{valid:true,mimeType:"image/gif",extension:"gif"}:t[0]===73&&t[1]===73||t[0]===77&&t[1]===77?{valid:true,mimeType:"image/tiff",extension:"tiff"}:t.length>=12&&t[0]===0&&t[1]===0&&t[2]===0&&t[3]===12&&t[4]===106&&t[5]===80&&t[6]===32&&t[7]===32?{valid:true,mimeType:"image/jp2",extension:"jp2"}:{valid:false}}async createPngFromPdfMetadata(t,e,r,a,s,o){try{let{PNG:n}=await import('pngjs'),l=a?.toString()||"",i=3,c=2;l.includes("DeviceGray")||l.includes("Gray")?(i=1,c=0):l.includes("DeviceRGB")||l.includes("RGB")?(i=3,c=2):(l.includes("DeviceCMYK")||l.includes("CMYK"))&&(i=4,c=2);let u=e*r*i*(s/8),f=t.length;if(o.verbose,Math.abs(f-u)>f*.1)return {success:!1,error:`Data size mismatch: expected ${u}, got ${f} bytes`};let g=new 
n({width:e,height:r,colorType:c===0?0:6,bitDepth:8}),m;if(i===1){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=t[d]||0,h=d*4;m[h]=x,m[h+1]=x,m[h+2]=x,m[h+3]=255;}}else if(i===3){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=d*3,h=d*4;m[h]=t[x]||0,m[h+1]=t[x+1]||0,m[h+2]=t[x+2]||0,m[h+3]=255;}}else if(i===4){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=d*4,h=(t[x]||0)/255,P=(t[x+1]||0)/255,I=(t[x+2]||0)/255,E=(t[x+3]||0)/255,v=d*4;m[v]=Math.round(255*(1-h)*(1-E)),m[v+1]=Math.round(255*(1-P)*(1-E)),m[v+2]=Math.round(255*(1-I)*(1-E)),m[v+3]=255;}}else return {success:!1,error:`Unsupported color space with ${i} components`};g.data=m;let b=n.sync.write(g);return o.verbose,{success:!0,pngData:b}}catch(n){return {success:false,error:`PNG creation error: ${n instanceof Error?n.message:"Unknown error"}`}}}getFormatFromMimeType(t){switch(t){case "image/jpeg":return "JPEG";case "image/png":return "PNG";case "image/jp2":return "JPEG 2000";case "image/gif":return "GIF";case "image/tiff":return "TIFF";default:return "unknown"}}};});var A,ee=O(()=>{H();A=class extends T{name="poppler";description="Poppler-based extraction using pdfimages command";async isAvailable(){try{let{Poppler:t}=await import('node-poppler');return new t,!0}catch{return false}}getCapabilities(){return {formats:["png"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:true}}async extractImages(t,e){try{let{Poppler:r}=await import('node-poppler');if(!y__namespace.default.existsSync(t))return {success:!1,error:`PDF file not found: ${t}`};let a=new r,s=[],o=D__default.default.join(process.cwd(),"temp-poppler-images");y__namespace.default.existsSync(o)||y__namespace.default.mkdirSync(o,{recursive:!0});try{e.verbose;let n=D__default.default.join(o,"img"),l={firstPageToConvert:1,lastPageToConvert:-1,pngFile:!0};e.verbose,await a.pdfImages(t,n,l),e.verbose;let i={list:!0};e.verbose;let c=await a.pdfImages(t,void 0,i),u=this.parseImageList(c);e.verbose;let 
f=y__namespace.default.readdirSync(o).filter(g=>g.startsWith("img-")&&g.endsWith(".png"));e.verbose;for(let g=0;g<f.length;g++){let m=f[g];if(!m)continue;let b=D__default.default.join(o,m);if(!y__namespace.default.existsSync(b))continue;let d=y__namespace.default.statSync(b);y__namespace.default.readFileSync(b);let x=m.match(/img-(\d+)\.png/),h=x?parseInt(x[1],10)+1:g+1,P=u[g]||{page:1,index:h,width:0,height:0,format:"PNG"},I=P.page,E=`img_p${I}_${h}.png`,v;if(e.extractImageFiles&&e.imageOutputDir){let L=D__default.default.join(e.imageOutputDir,"images");y__namespace.default.existsSync(L)||y__namespace.default.mkdirSync(L,{recursive:!0}),v=D__default.default.join(L,E),y__namespace.default.copyFileSync(b,v),e.verbose;}let ne={id:`img_${h}`,filename:`images/${E}`,filepath:v||"",page:I,width:P.width,height:P.height,format:"PNG",mimeType:"image/png",size:d.size,position:{x:0,y:0,width:P.width,height:P.height}};s.push(ne);}return e.verbose,{success:!0,images:s}}finally{y__namespace.default.existsSync(o)&&y__namespace.default.rmSync(o,{recursive:!0,force:!0});}}catch(r){return {success:false,error:`Poppler extraction failed: ${r instanceof Error?r.message:"Unknown error"}`}}}parseImageList(t){let e=[],r=t.split(`
2
+ `);for(let a of r){let s=a.match(/^\s*(\d+)\s+(\d+)\s+\w+\s+(\d+)\s+(\d+)\s+\w+\s+\d+\s+\d+\s+(\w+)/);if(s){let o=parseInt(s[1],10),n=parseInt(s[2],10),l=parseInt(s[3],10),i=parseInt(s[4],10),c=s[5]?.toUpperCase()||"PNG";e.push({page:o,index:n,width:l,height:i,format:c});}}return e}};});var te={};Y(te,{ImageEngineFactory:()=>X});var X,re=O(()=>{Q();ee();X=class p{static engines=new Map;static async getEngine(t){if(t==="auto"&&(t=await p.selectBestEngine()),p.engines.has(t))return p.engines.get(t);let e;switch(t){case "pdf-lib":e=new B;break;case "poppler":e=new A;break;default:throw new Error(`Unknown image extraction engine: ${t}`)}if(!await e.isAvailable())throw new Error(`Image extraction engine '${t}' is not available on this system`);return p.engines.set(t,e),e}static async getAvailableEngines(){let t=[B,A],e=[];for(let r of t){let a=new r,s=await a.isAvailable();e.push({name:a.name,description:a.description,available:s,capabilities:a.getCapabilities()});}return e}static async selectBestEngine(){let t=await p.getAvailableEngines(),e=["pdf-lib","poppler"];for(let r of e)if(t.find(s=>s.name===r)?.available)return r;throw new Error("No image extraction engines are available on this system")}static clearCache(){p.engines.clear();}static getRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"},{useCase:"Vector image extraction",engine:"poppler",reason:"Poppler can extract vector graphics as raster images"}]}};});var ae={};Y(ae,{ImageExtractor:()=>exports.ImageExtractor});exports.ImageExtractor=void 0;var N=O(()=>{exports.ImageExtractor=class{async 
extract(t,e={}){let r={verbose:false,extractImageFiles:false,imageEngine:"auto",...e};r.verbose,r.extractImageFiles&&r.imageOutputDir&&(y__namespace.default.existsSync(r.imageOutputDir)||y__namespace.default.mkdirSync(r.imageOutputDir,{recursive:true}));try{let{ImageEngineFactory:a}=await Promise.resolve().then(()=>(re(),te)),s=await a.getEngine(r.imageEngine);r.verbose;let o=await s.extractImages(t,r);if(!o.success)throw new Error(o.error||"Engine extraction failed");return {success:!0,images:o.images||[],metadata:{totalImages:o.images?.length||0,engine:s.name}}}catch{r.verbose;try{return await this.extractWithPdfLib(t,r)}catch(s){return r.verbose,{success:false,images:[],error:s instanceof Error?s.message:String(s)}}}}static async getAvailableEngines(){return [{name:"pdf-lib",description:"PDF-lib based extraction with full format support",available:true,capabilities:{formats:["jpg","jpeg","png","jp2","tiff"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:false}},{name:"poppler",description:"Poppler-based extraction using pdfimages command",available:false,capabilities:{formats:["jpg","jpeg","png","tiff","ppm","pbm"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:true}}]}static getEngineRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing (coming soon)"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"}]}async extractWithPdfLib(t,e={}){try{let{PDFDocument:r,PDFName:a}=await import('pdf-lib'),s=y__namespace.default.readFileSync(t),o=await 
r.load(s,{ignoreEncryption:!0}),n=o.getPageCount(),l=[],i=1;e.verbose,e.extractImageFiles&&e.imageOutputDir&&(y__namespace.default.existsSync(e.imageOutputDir)||y__namespace.default.mkdirSync(e.imageOutputDir,{recursive:!0}));for(let c=0;c<n;c++){let u=c+1;try{let g=o.getPage(c).node.Resources();if(!g){e.verbose;continue}let m=g.get(a.of("XObject"));if(!m){e.verbose;continue}let b=m.dict;e.verbose;for(let[d,x]of b)try{let h=o.context.lookup(x),P=h.dict.get(a.of("Subtype"));if(!P||P.toString()!=="/Image")continue;let I=await this.extractImageFromPdfObject(h,u,i,e);I&&(l.push(I),i++);}catch{e.verbose;}}catch{e.verbose;}}return e.verbose,{images:l,totalPages:n,totalImages:l.length}}catch(r){throw e.verbose,r}}async extractImageFromPdfObject(t,e,r,a){try{let{PDFName:s}=await import('pdf-lib'),o=t.dict.get(s.of("Width")),n=t.dict.get(s.of("Height")),l=t.dict.get(s.of("Filter")),i=t.dict.get(s.of("ColorSpace")),c=t.dict.get(s.of("BitsPerComponent")),u=o&&typeof o.value=="number"?o.value:100,f=n&&typeof n.value=="number"?n.value:100,g=c&&typeof c.value=="number"?c.value:8;a.verbose;let m=await this.extractImageData(t,l,u,f,i,g,a);if(!m.success||!m.imageData)return a.verbose,null;let b=m.imageData,d=m.mimeType||"image/jpeg",x=m.extension||"jpg",h=`img_p${e}_${r}.${x}`,P="",I=b.length;return a.extractImageFiles&&a.imageOutputDir&&(P=D__default.default.join(a.imageOutputDir,h),y__namespace.default.writeFileSync(P,b),a.verbose),{id:`img_${r}`,name:h,page:e,position:{x:0,y:0,width:u,height:f},width:u,height:f,format:d==="image/jpeg"?"JPEG":d==="image/png"?"PNG":"unknown",filePath:P}}catch{return a.verbose,null}}async extractImageData(t,e,r,a,s,o,n){try{let l=await import('zlib'),i,c="image/jpeg",u="jpg";if(e){let f=e.toString();if(n.verbose,f.includes("DCTDecode")&&f.includes("FlateDecode")){n.verbose;try{let g=t.contents;i=l.inflateSync(Buffer.from(g)),c="image/jpeg",u="jpg",n.verbose;}catch(g){return n.verbose,{success:!1,error:`Zlib decompression failed: ${g instanceof 
Error?g.message:"Unknown error"}`}}}else if(f.includes("DCTDecode"))n.verbose,i=Buffer.from(t.contents),c="image/jpeg",u="jpg";else if(f.includes("FlateDecode")){n.verbose;try{let g=t.contents,m=l.inflateSync(Buffer.from(g));n.verbose;let b=this.detectImageFormat(m);if(b.valid)i=m,c=b.mimeType,u=b.extension,n.verbose;else {let d=await this.createPngFromPdfMetadata(m,r,a,s,o,n);if(d.success&&d.pngData)i=d.pngData,c="image/png",u="png",n.verbose;else return n.verbose,{success:!1,error:`PNG creation failed: ${d.error}`}}}catch(g){return n.verbose,{success:!1,error:`FlateDecode decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else if(f.includes("JPXDecode")){n.verbose;try{i=Buffer.from(t.contents),c="image/jp2",u="jp2",n.verbose;}catch(g){return n.verbose,{success:!1,error:`JPXDecode extraction failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else {n.verbose;try{let g=await t.asUint8Array();i=Buffer.from(g);let m=this.detectImageFormat(i);m.valid&&(c=m.mimeType,u=m.extension);}catch(g){return n.verbose,{success:!1,error:`Generic decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}}else {n.verbose;try{let f=await t.asUint8Array();i=Buffer.from(f);let g=this.detectImageFormat(i);g.valid&&(c=g.mimeType,u=g.extension);}catch(f){return n.verbose,{success:!1,error:`Raw data extraction failed: ${f instanceof Error?f.message:"Unknown error"}`}}}return !i||i.length<100?{success:!1,error:`Image data too small: ${i?.length||0} bytes`}:{success:!0,imageData:i,mimeType:c,extension:u}}catch(l){return n.verbose,{success:false,error:l instanceof Error?l.message:"Unknown error"}}}detectImageFormat(t){return 
!t||t.length<10?{valid:false}:t[0]===255&&t[1]===216?{valid:true,mimeType:"image/jpeg",extension:"jpg"}:t[0]===137&&t[1]===80&&t[2]===78&&t[3]===71?{valid:true,mimeType:"image/png",extension:"png"}:t[0]===71&&t[1]===73&&t[2]===70?{valid:true,mimeType:"image/gif",extension:"gif"}:t[0]===73&&t[1]===73||t[0]===77&&t[1]===77?{valid:true,mimeType:"image/tiff",extension:"tiff"}:t.length>=12&&t[0]===0&&t[1]===0&&t[2]===0&&t[3]===12&&t[4]===106&&t[5]===80&&t[6]===32&&t[7]===32?{valid:true,mimeType:"image/jp2",extension:"jp2"}:{valid:false}}async createPngFromPdfMetadata(t,e,r,a,s,o){try{let{PNG:n}=await import('pngjs'),l=a?.toString()||"",i=3,c=2;l.includes("DeviceGray")||l.includes("Gray")?(i=1,c=0):l.includes("DeviceRGB")||l.includes("RGB")?(i=3,c=2):(l.includes("DeviceCMYK")||l.includes("CMYK"))&&(i=4,c=2);let u=e*r*i*(s/8),f=t.length;if(o.verbose,Math.abs(f-u)>f*.1)return {success:!1,error:`Data size mismatch: expected ${u}, got ${f} bytes`};let g=new n({width:e,height:r,colorType:c===0?0:6,bitDepth:8}),m;if(i===1){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=t[d]||0,h=d*4;m[h]=x,m[h+1]=x,m[h+2]=x,m[h+3]=255;}}else if(i===3){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=d*3,h=d*4;m[h]=t[x]||0,m[h+1]=t[x+1]||0,m[h+2]=t[x+2]||0,m[h+3]=255;}}else if(i===4){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=d*4,h=(t[x]||0)/255,P=(t[x+1]||0)/255,I=(t[x+2]||0)/255,E=(t[x+3]||0)/255,v=d*4;m[v]=Math.round(255*(1-h)*(1-E)),m[v+1]=Math.round(255*(1-P)*(1-E)),m[v+2]=Math.round(255*(1-I)*(1-E)),m[v+3]=255;}}else return {success:!1,error:`Unsupported color space with ${i} components`};g.data=m;let b=n.sync.write(g);return o.verbose,{success:!0,pngData:b}}catch(n){return {success:false,error:`PNG creation error: ${n instanceof Error?n.message:"Unknown error"}`}}}};});function k(p){let t=[];if(p.pdfPath?typeof p.pdfPath!="string"?t.push({field:"pdfPath",message:"PDF path must be a 
string",value:p.pdfPath}):y__namespace.default.existsSync(p.pdfPath)?p.pdfPath.toLowerCase().endsWith(".pdf")||t.push({field:"pdfPath",message:"File must have .pdf extension",value:p.pdfPath}):t.push({field:"pdfPath",message:"PDF file does not exist",value:p.pdfPath}):t.push({field:"pdfPath",message:"PDF path is required",value:p.pdfPath}),p.outputDir&&typeof p.outputDir!="string"&&t.push({field:"outputDir",message:"Output directory must be a string",value:p.outputDir}),p.options){let{options:e}=p;e.extractText!==void 0&&typeof e.extractText!="boolean"&&t.push({field:"options.extractText",message:"extractText must be a boolean",value:e.extractText}),e.extractImages!==void 0&&typeof e.extractImages!="boolean"&&t.push({field:"options.extractImages",message:"extractImages must be a boolean",value:e.extractImages}),e.extractImageFiles!==void 0&&typeof e.extractImageFiles!="boolean"&&t.push({field:"options.extractImageFiles",message:"extractImageFiles must be a boolean",value:e.extractImageFiles}),e.useImagePaths!==void 0&&typeof e.useImagePaths!="boolean"&&t.push({field:"options.useImagePaths",message:"useImagePaths must be a boolean",value:e.useImagePaths}),e.imageOutputDir&&typeof e.imageOutputDir!="string"&&t.push({field:"options.imageOutputDir",message:"imageOutputDir must be a string",value:e.imageOutputDir}),e.imageRefFormat&&typeof e.imageRefFormat!="string"&&t.push({field:"options.imageRefFormat",message:"imageRefFormat must be a string",value:e.imageRefFormat}),e.baseName&&typeof e.baseName!="string"&&t.push({field:"options.baseName",message:"baseName must be a string",value:e.baseName}),e.verbose!==void 0&&typeof e.verbose!="boolean"&&t.push({field:"options.verbose",message:"verbose must be a boolean",value:e.verbose}),e.memoryLimit&&typeof e.memoryLimit!="string"?t.push({field:"options.memoryLimit",message:"memoryLimit must be a string",value:e.memoryLimit}):e.memoryLimit&&!ce(e.memoryLimit)&&t.push({field:"options.memoryLimit",message:'memoryLimit must be 
in format like "512MB", "1GB", etc.',value:e.memoryLimit}),e.batchSize!==void 0&&(typeof e.batchSize!="number"?t.push({field:"options.batchSize",message:"batchSize must be a number",value:e.batchSize}):(e.batchSize<1||e.batchSize>100)&&t.push({field:"options.batchSize",message:"batchSize must be between 1 and 100",value:e.batchSize})),e.progressCallback&&typeof e.progressCallback!="function"&&t.push({field:"options.progressCallback",message:"progressCallback must be a function",value:typeof e.progressCallback}),e.extractText===false&&e.extractImages===false&&t.push({field:"options",message:"At least one of extractText or extractImages must be true",value:{extractText:e.extractText,extractImages:e.extractImages}}),e.useImagePaths===true&&e.extractImageFiles!==true&&t.push({field:"options",message:"useImagePaths requires extractImageFiles to be true",value:{useImagePaths:e.useImagePaths,extractImageFiles:e.extractImageFiles}});}return t}function ce(p){return /^\d+(\.\d+)?(MB|GB|KB)$/i.test(p)}function V(p){let t=[],e=["{id}","{name}","{page}","{index}","{path}"];e.some(o=>p.includes(o))||t.push({field:"imageRefFormat",message:`Format must contain at least one valid placeholder: ${e.join(", ")}`,value:p});let a=/\{([^}]+)\}/g,s=p.match(a);if(s)for(let o of s)e.includes(o)||t.push({field:"imageRefFormat",message:`Invalid placeholder: ${o}. 
Valid placeholders are: ${e.join(", ")}`,value:p});return t}function J(p,t=[".pdf"]){let e=[];if(!p)return e.push({field:"filePath",message:"File path is required",value:p}),e;if(typeof p!="string")return e.push({field:"filePath",message:"File path must be a string",value:p}),e;if(!y__namespace.default.existsSync(p))return e.push({field:"filePath",message:"File does not exist",value:p}),e;let r=D__default.default.extname(p).toLowerCase();return t.length>0&&!t.includes(r)&&e.push({field:"filePath",message:`File must have one of these extensions: ${t.join(", ")}`,value:p}),e}var z=class{pdfLibDoc=null;pdfLibPages=[];textData=[];async processPDF(t){let e=y__namespace.readFileSync(t),[r,a]=await Promise.all([this.processPDFLib(e),this.processPDFParse(e)]);this.textData=this.combineResults(r,a);let s=this.textData.map(o=>o.text).join(`
3
+ `).trim();return {totalPages:this.textData.length,pages:this.textData,fullText:s}}async processPDFLib(t){return this.pdfLibDoc=await pdfLib.PDFDocument.load(t,{ignoreEncryption:true}),this.pdfLibPages=this.pdfLibDoc.getPages(),this.pdfLibPages.map((e,r)=>{let{width:a,height:s}=e.getSize();return {pageNumber:r+1,width:a,height:s,rotation:e.getRotation(),mediaBox:e.getMediaBox()}})}async processPDFParse(t){let e=(await import('pdf-parse')).default,r=[];return await e(t,{pagerender:async s=>{try{let o=await s.getTextContent(),n=s.getViewport({scale:1}),l=o.items.filter(g=>typeof g.str=="string");l.sort((g,m)=>{let b=m.transform[5]-g.transform[5];return Math.abs(b)>2?b:g.transform[4]-m.transform[4]});let i="",c=null,u="";for(let g of l){let m=g.transform[5];c===null?(c=m,u=g.str):Math.abs(m-c)>2?(i+=`${u}
4
+ `,c=m,u=g.str):u+=` ${g.str}`;}u&&(i+=u),i=i.trim();let f={pageNumber:s.pageIndex+1,text:i,textItems:o.items,pdfParseWidth:n.width,pdfParseHeight:n.height};return r.push(f),i}catch{return r.push({pageNumber:s.pageIndex+1,text:"",textItems:[],pdfParseWidth:0,pdfParseHeight:0}),""}}}),r.sort((s,o)=>s.pageNumber-o.pageNumber)}combineResults(t,e){return t.map(r=>{let a=e.find(o=>o.pageNumber===r.pageNumber),s=a?.text||"";return {pageNumber:r.pageNumber,text:s,width:r.width,height:r.height,rotation:r.rotation,mediaBox:r.mediaBox,textItems:a?.textItems||[],wordCount:this.countWords(s),characterCount:s.length}})}async extractWithPageMarkers(t,e="--- PAGE {page} ---",r={}){let a=await this.processPDF(t),s=[];if(r.includeImageRefs)try{let{ImageExtractor:n}=await Promise.resolve().then(()=>(N(),ae));s=(await new n().extract(t,{extractImageFiles:!1,verbose:!1,imageEngine:r.imageEngine||"auto"})).images||[];}catch{}let o="";return a.pages.forEach(n=>{let l=e.replace("{page}",n.pageNumber.toString()),i=n.text;if(r.includeImageRefs&&s.length>0){let c=s.filter(u=>u.page===n.pageNumber);if(c.length>0){let u=c.map(f=>(r.imageRefFormat||"[IMG:{id}] {name}").replace("{id}",`img_${f.id}`).replace("{name}",f.filename||`img_p${f.page}_${f.id}.jpg`)).join(`
5
+ `);if(i.trim()){let f=i.split(`
6
+ `);f.length>1?(f.splice(1,0,u),i=f.join(`
7
+ `)):i=`${i}
8
+ ${u}`;}else i=u;}}i.trim()?o+=`${l}
9
+
10
+ ${i}
11
+ `:o+=`${l}
12
+
13
+
14
+ `;}),{text:o.trim(),cleanText:a.fullText,numPages:a.totalPages,pages:a.pages}}getPage(t){return this.textData[t-1]||null}async getDetailedPageInfo(t,e){this.textData.length||await this.processPDF(t);let r=this.getPage(e);if(!r)return null;let a=(r.textItems||[]).map(s=>({text:s.str||"",x:s.transform?.[4]||0,y:s.transform?.[5]||0,width:s.width||0,height:s.height||0,fontName:s.fontName,fontSize:s.transform?.[0]||12}));return {pageNumber:e,text:r.text,textItems:a,dimensions:{width:r.width,height:r.height}}}countWords(t){return !t||t.trim()===""?0:t.split(/\s+/).filter(e=>e.length>0).length}async processSinglePage(t,e){try{let r=(await import('pdf-parse')).default,a=y__namespace.readFileSync(t),s=await pdfLib.PDFDocument.load(a,{ignoreEncryption:!0});if(e<1||e>s.getPageCount())return null;let n=s.getPages()[e-1];if(!n)return null;let{width:l,height:i}=n.getSize(),c=await pdfLib.PDFDocument.create(),[u]=await c.copyPages(s,[e-1]);c.addPage(u);let f=await c.save(),g=[],m={pagerender:async h=>{try{let P=await h.getTextContent();return g=P.items,P.items.map(I=>I.str||"").join(" ")}catch{return ""}}},b=Buffer.from(f),x=(await r(b,m)).text.replace(/\s+/g," ").trim();return {pageNumber:e,text:x,width:l,height:i,rotation:n.getRotation().angle,mediaBox:[n.getMediaBox().x,n.getMediaBox().y,n.getMediaBox().width,n.getMediaBox().height],textItems:g,wordCount:this.countWords(x),characterCount:x.length}}catch{return null}}};var $=class{async extract(t){try{let e=(await import('pdf-parse')).default,r=y__namespace.default.readFileSync(t),a=await e(r);return {text:a.text,numPages:a.numpages,info:a.info,metadata:a.metadata,version:a.version}}catch(e){throw new Error(`Failed to extract text from PDF: ${e instanceof Error?e.message:"Unknown error"}`)}}async extractWithPages(t){try{let e=(await import('pdf-parse')).default,r=y__namespace.default.readFileSync(t),s=await e(r,{pagerender:o=>o.getTextContent().then(n=>n.items.map(l=>l.str).join(" "))});return 
{text:s.text,numPages:s.numpages,info:s.info,metadata:s.metadata,version:s.version,pages:s.text?this.splitTextIntoPages(s.text,s.numpages):[]}}catch(e){throw new Error(`Failed to extract text with pages: ${e instanceof Error?e.message:"Unknown error"}`)}}splitTextIntoPages(t,e){let r=t.split(`
15
+ `),a=Math.ceil(r.length/e),s=[];for(let o=0;o<e;o++){let n=o*a,l=Math.min(n+a,r.length),i=r.slice(n,l).join(`
16
+ `);s.push(i);}return s}/* extractTextItems(t,e): builds one heuristic text item per non-blank line of the text returned by extract(t); each item's page is estimated from its line position and the document page count. The page count is read from numPages, the key extract() actually returns, so items are spread across pages instead of all being assigned page 1. Throws Error("Failed to extract text items: ...") on failure. */async extractTextItems(t,e={}){try{let r=await this.extract(t),a=r.text,s=r.numPages||1,o=a.split(`
17
+ `),n=[],l=1,i=Math.ceil(o.length/s);return o.forEach((c,u)=>{if(c.trim()){l=Math.ceil((u+1)/i);let f="text";c.length<50&&c.trim().match(/^[A-Z\s]+$/)?f="heading":c.length>100?f="paragraph":c.length<30&&(f="caption");let g=12;f==="heading"?g=16:f==="caption"&&(g=10);let m={id:`text_${u+1}`,content:c.trim(),position:{x:0,y:u%i*15,width:c.length*8,height:g},font:{name:"Unknown",size:g,style:f==="heading"?"bold":"normal"},page:l,type:f,fontSize:g,color:"#000000"};n.push(m);}}),e.verbose,n}catch(r){throw new Error(`Failed to extract text items: ${r instanceof Error?r.message:"Unknown error"}`)}}async extractStatistics(t){let e=await this.extract(t),r=e.text,a=r.length,s=r.split(/\s+/).filter(c=>c.length>0).length,o=r.split(`
18
+ `).length,n=e.numPages,l=Math.round(s/n),i=Math.ceil(s/200);return {characterCount:a,wordCount:s,lineCount:o,pageCount:n,averageWordsPerPage:l,readingTime:i}}async extractWithFontInfo(t){return this.extract(t)}cleanText(t){return t.replace(/\s+/g," ").replace(/\n\s*\n/g,`
19
+ `).trim()}async extractPageRange(t,e,r){let a=await this.extractWithPages(t);if(e<1||r>a.numPages||e>r)throw new Error(`Invalid page range: ${e}-${r}. Document has ${a.numPages} pages.`);return a.pages.slice(e-1,r).join(`
20
+
21
+ `)}/* searchText(t,e,r=false): e is compiled with new RegExp (flags "g" when r is true, otherwise "gi") and matched against each page returned by extractWithPages(t). Resolves to {found,occurrences,pages,context}; context holds, for every matching line, that line joined with its previous and next line, prefixed by the 1-based page number. */async searchText(t,e,r=false){let a=await this.extractWithPages(t),s=r?"g":"gi",o=new RegExp(e,s),n=0,l=[],i=[];return a.pages.forEach((c,u)=>{let f=c.match(o);if(f){n+=f.length,l.push(u+1);let g=c.split(`
22
+ `);g.forEach((m,b)=>{/* the regex is global, so test() resumes from lastIndex; reset it for every line, otherwise a matching line that follows another matching line can be skipped */if(o.lastIndex=0,o.test(m)){let d=Math.max(0,b-1),x=Math.min(g.length,b+2),h=g.slice(d,x).join(`
23
/**
 * Format processor: renders image reference strings from a placeholder
 * template (`{id}`, `{name}`, `{page}`, `{index}`, `{path}`), weaves them into
 * extracted text, and provides a few human-readable formatting helpers.
 */
var S = class {
  /**
   * Splits `text` into `totalPages` equally sized runs of lines and appends,
   * after each run, a reference line for every image whose `page` equals that
   * page number.
   *
   * @param {string} text - Extracted document text.
   * @param {Array<object>} images - Image descriptors (read: `id`, `name`, `page`, `filePath`).
   * @param {string} format - Reference template with `{...}` placeholders.
   * @param {number} totalPages - Number of pages to distribute the lines over.
   * @returns {string} Text with image references, trimmed.
   */
  generateTextWithImageRefs(text, images, format, totalPages) {
    if (!text || images.length === 0) return text || "";

    // A zero/NaN page count previously skipped the loop entirely and returned
    // "", silently discarding the text. Treat it as a single page instead.
    const pageCount = totalPages > 0 ? totalPages : 1;
    const lines = text.split("\n");
    const linesPerPage = Math.ceil(lines.length / pageCount);

    // Index images once (1-based position of first occurrence, grouped by
    // page) instead of filter + indexOf for every page.
    const firstPosition = new Map();
    const imagesByPage = new Map();
    images.forEach((image, position) => {
      if (!firstPosition.has(image)) firstPosition.set(image, position + 1);
      const bucket = imagesByPage.get(image.page);
      if (bucket) bucket.push(image);
      else imagesByPage.set(image.page, [image]);
    });

    let output = "";
    for (let page = 1; page <= pageCount; page++) {
      const start = (page - 1) * linesPerPage;
      const end = Math.min(start + linesPerPage, lines.length);
      const pageText = lines.slice(start, end).join("\n");
      const hasText = pageText.trim() !== "";

      if (hasText) output += pageText;
      for (const image of imagesByPage.get(page) ?? []) {
        const reference = this.formatImageReference(image, format, firstPosition.get(image));
        output += `\n${reference}\n`;
      }
      if (page < pageCount && hasText) output += "\n";
    }
    return output.trim();
  }

  /**
   * Renders one reference per image (index is the 1-based array position),
   * joined by newlines.
   *
   * @param {Array<object>} images
   * @param {string} format
   * @returns {string}
   */
  generateImageOnlyRefs(images, format) {
    return images
      .map((image, position) => this.formatImageReference(image, format, position + 1))
      .join("\n");
  }

  /**
   * Builds the placeholder values for one image and renders the template.
   * `name` falls back to `id`; `path` is `filePath`, falling back to `id`.
   *
   * @param {object} image
   * @param {string} format
   * @param {number} index - 1-based index reported through `{index}`.
   * @returns {string}
   */
  formatImageReference(image, format, index) {
    const values = {
      id: image.id,
      name: image.name || image.id,
      page: image.page,
      index,
      path: image.filePath || image.id,
    };
    return this.replacePlaceholders(format, values);
  }

  /**
   * Substitutes `{id}`, `{name}`, `{page}`, `{index}` and `{path}` in `format`.
   *
   * Uses a single pass with a replacer function so that values are inserted
   * literally: `$&`, `$1`, ... inside a name or path are not interpreted as
   * replacement patterns, and placeholder-looking text inside a value is not
   * expanded a second time.
   *
   * @param {string} format
   * @param {{id: *, name: *, page: *, index: *, path: *}} values
   * @returns {string}
   */
  replacePlaceholders(format, values) {
    const resolved = {
      id: values.id,
      name: values.name || values.id,
      page: values.page,
      index: values.index,
      path: values.path || values.id,
    };
    return format.replace(/\{(id|name|page|index|path)\}/g, (_match, key) => String(resolved[key]));
  }

  /**
   * Lists the distinct placeholder names (`{name}` -> `"name"`) used in `format`,
   * in order of first appearance.
   *
   * @param {string} format
   * @returns {string[]}
   */
  extractPlaceholders(format) {
    const names = Array.from(format.matchAll(/\{([^}]+)\}/g), (match) => match[1]);
    return [...new Set(names)];
  }

  /**
   * @param {string} format
   * @returns {boolean} True when every placeholder in `format` is one of
   *   id, name, page, index, path.
   */
  isValidFormat(format) {
    const allowed = ["id", "name", "page", "index", "path"];
    return this.extractPlaceholders(format).every((name) => allowed.includes(name));
  }

  /**
   * @param {boolean} [usePaths=false]
   * @returns {string} `"[IMAGE:{path}]"` when `usePaths` is truthy, else `"[IMAGE:{id}]"`.
   */
  getDefaultFormat(usePaths = false) {
    return usePaths ? "[IMAGE:{path}]" : "[IMAGE:{id}]";
  }

  /**
   * Turns a reference template into a global RegExp matching rendered
   * references: the template is regex-escaped, then each placeholder is
   * replaced by a character class (`\d+` for page/index, a run of characters
   * other than whitespace and `]` for id/name/path).
   *
   * @param {string} format
   * @returns {RegExp}
   */
  buildReferencePattern(format) {
    const anyToken = "[^\\s\\]]+";
    const source = format
      .replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
      .replace(/\\?\{id\\?\}/g, anyToken)
      .replace(/\\?\{name\\?\}/g, anyToken)
      .replace(/\\?\{page\\?\}/g, "\\d+")
      .replace(/\\?\{index\\?\}/g, "\\d+")
      .replace(/\\?\{path\\?\}/g, anyToken);
    return new RegExp(source, "g");
  }

  /**
   * Removes every rendered reference matching `format` from `text`, collapses
   * blank-line runs to a single newline and trims the result.
   *
   * @param {string} text
   * @param {string} format
   * @returns {string}
   */
  cleanTextFromImageRefs(text, format) {
    return text
      .replace(this.buildReferencePattern(format), "")
      .replace(/\n\s*\n/g, "\n")
      .trim();
  }

  /**
   * Counts rendered references matching `format` in `text`.
   *
   * @param {string} text
   * @param {string} format
   * @returns {number}
   */
  countImageReferences(text, format) {
    // An empty template compiles to a pattern that matches at every position.
    if (!format) return 0;
    const matches = text.match(this.buildReferencePattern(format));
    return matches ? matches.length : 0;
  }

  /**
   * Builds a multi-line, human-readable document summary.
   *
   * @param {number} pages
   * @param {number} textItems
   * @param {number} images
   * @param {number} textLength
   * @param {number} [processingTime] - Milliseconds; the line is omitted when not supplied.
   * @returns {string}
   */
  generateSummary(pages, textItems, images, textLength, processingTime) {
    // Avoid printing "NaN"/"Infinity" for a document with no pages.
    const average = pages > 0 ? (images / pages).toFixed(2) : "0.00";
    const lines = [
      "\u{1F4C4} Document Summary",
      ` Pages: ${pages}`,
      ` Text items: ${textItems}`,
      ` Images: ${images} (avg ${average} per page)`,
      ` Text length: ${textLength.toLocaleString()} characters`,
    ];
    // A measured duration of 0 ms is still a duration; only skip when absent.
    if (processingTime !== undefined && processingTime !== null) {
      lines.push(` Processing time: ${processingTime}ms`);
    }
    return lines.join("\n");
  }

  /**
   * Formats a byte count with one decimal using B/KB/MB/GB (1024-based).
   *
   * @param {number} bytes
   * @returns {string}
   */
  formatFileSize(bytes) {
    const units = ["B", "KB", "MB", "GB"];
    let value = bytes;
    let unit = 0;
    while (value >= 1024 && unit < units.length - 1) {
      value /= 1024;
      unit++;
    }
    return `${value.toFixed(1)} ${units[unit]}`;
  }

  /**
   * Formats milliseconds as `"<n>ms"`, `"<n>s"` or `"<m>m <s>s"`.
   *
   * @param {number} ms
   * @returns {string}
   */
  formatDuration(ms) {
    if (ms < 1e3) return `${ms}ms`;
    const seconds = Math.floor(ms / 1e3);
    if (seconds < 60) return `${seconds}s`;
    return `${Math.floor(seconds / 60)}m ${seconds % 60}s`;
  }
};
/**
 * Structured data generator: splits extracted text into per-page chunks,
 * attaches the images of each page and produces a JSON-serialisable document
 * description plus summary statistics.
 */
var U = class {
  /**
   * Strips page markers and image reference markers from `text` and
   * normalises whitespace (3+ line breaks collapse to a blank line, leading
   * and trailing whitespace is removed, space/tab runs become one space).
   *
   * @param {string} text
   * @returns {string}
   */
  extractRawText(text) {
    // Order matters: the specific marker shapes are removed before the bare
    // "PAGE n" form, which would otherwise leave their decorations behind.
    const markerPatterns = [
      /--- PAGE \d+ ---\s*/g,
      /🎨 ART BASEL PAGE \d+ 🎨\s*/g,
      /PAGE \d+\s*/g,
      /\[IMG:\w+\]\s*\w*\s*/g,
      /\[IMG-\w+\]\s*[^[\n]*\s*/g,
      /📷\s*[^-\n]*-\s*Page\s*\d+\s*-\s*Image\s*#\d+\s*/g,
      /🎨\s*Art\s*Basel\s*Image\s*\d+\s*\(Page\s*\d+\)\s*/g,
    ];
    let cleaned = text;
    for (const pattern of markerPatterns) {
      cleaned = cleaned.replace(pattern, "");
    }
    cleaned = cleaned.replace(/\n\s*\n\s*\n/g, "\n\n");
    cleaned = cleaned.replace(/^\s+|\s+$/g, "");
    cleaned = cleaned.replace(/[ \t]+/g, " ");
    return cleaned;
  }

  /**
   * Builds the structured document object.
   *
   * @param {string} filename
   * @param {string} text - Full document text (may contain page markers).
   * @param {Array<object>} images - Image descriptors with a `page` field.
   * @param {number} totalPages
   * @param {object} options - Stored verbatim under `metadata.extractionOptions`.
   * @returns {{metadata: object, pages: Array<object>}}
   */
  generateStructuredData(filename, text, images, totalPages, options) {
    const pageTexts = this.splitTextIntoPages(text, totalPages);
    return {
      metadata: {
        filename,
        extractedAt: new Date().toISOString(),
        totalPages,
        totalTextLength: text.length,
        totalImages: images.length,
        extractionOptions: options,
      },
      pages: this.createPageDataArray(pageTexts, images, totalPages),
    };
  }

  /**
   * Splits by page markers when any are present, otherwise by an even share
   * of lines per page. Documents with at most one page are returned whole.
   *
   * @param {string} text
   * @param {number} totalPages
   * @returns {string[]}
   */
  splitTextIntoPages(text, totalPages) {
    if (totalPages <= 1) return [text];
    const markerPattern = /(?:--- PAGE \d+ ---|🎨 ART BASEL PAGE \d+ 🎨|PAGE \d+)/g;
    const markers = text.match(markerPattern);
    return markers && markers.length > 0
      ? this.splitByPageMarkers(text, markerPattern)
      : this.splitByEstimatedLength(text, totalPages);
  }

  /**
   * Splits `text` on `markerPattern`; the chunk before the first marker is
   * discarded and every following chunk is one page (trimmed).
   *
   * Empty chunks are kept: dropping them shifted every later page one slot
   * forward, so page N+1's text ended up attributed to page N.
   *
   * @param {string} text
   * @param {RegExp} markerPattern
   * @returns {string[]} One entry per marker; `[text]` when no marker matched.
   */
  splitByPageMarkers(text, markerPattern) {
    const pages = text
      .split(markerPattern)
      .slice(1)
      .map((chunk) => (chunk || "").trim());
    return pages.length === 0 ? [text] : pages;
  }

  /**
   * Distributes the lines of `text` evenly over `totalPages` chunks.
   *
   * @param {string} text
   * @param {number} totalPages
   * @returns {string[]}
   */
  splitByEstimatedLength(text, totalPages) {
    const lines = text.split("\n");
    const linesPerPage = Math.ceil(lines.length / totalPages);
    const pages = [];
    for (let page = 0; page < totalPages; page++) {
      const start = page * linesPerPage;
      const end = Math.min((page + 1) * linesPerPage, lines.length);
      pages.push(lines.slice(start, end).join("\n"));
    }
    return pages;
  }

  /**
   * Creates one page record per page number 1..totalPages; pages without a
   * text chunk get empty content.
   *
   * @param {string[]} pageTexts
   * @param {Array<object>} images
   * @param {number} totalPages
   * @returns {Array<object>}
   */
  createPageDataArray(pageTexts, images, totalPages) {
    const pages = [];
    for (let index = 0; index < totalPages; index++) {
      const pageNumber = index + 1;
      const content = pageTexts[index] || "";
      const rawText = this.extractRawText(content);
      const pageImages = this.getImagesForPage(images, pageNumber);
      pages.push({
        pageNumber,
        text: {
          content,
          rawText,
          wordCount: this.countWords(rawText),
          characterCount: rawText.length,
        },
        images: pageImages,
        imageCount: pageImages.length,
      });
    }
    return pages;
  }

  /**
   * Projects the images of one page onto a compact record. `filename`, `path`
   * and `size` are copied only when present on the source and not undefined.
   *
   * @param {Array<object>} images
   * @param {number} pageNumber
   * @returns {Array<object>}
   */
  getImagesForPage(images, pageNumber) {
    return images
      .filter((image) => image.page === pageNumber)
      .map((image) => {
        const record = {
          id: image.id,
          name: image.name || `image_${image.id}`,
          position: image.position,
          format: image.format || "unknown",
        };
        for (const key of ["filename", "path", "size"]) {
          if (key in image && image[key] !== undefined) record[key] = image[key];
        }
        return record;
      });
  }

  /**
   * @param {string} text
   * @returns {number} Count of whitespace-separated words; 0 for blank text.
   */
  countWords(text) {
    const trimmed = text.trim();
    return trimmed ? trimmed.split(/\s+/).length : 0;
  }

  /**
   * @param {*} data
   * @param {number} [indent=2]
   * @returns {string} `JSON.stringify(data, null, indent)`.
   */
  generateJSONString(data, indent = 2) {
    return JSON.stringify(data, null, indent);
  }

  /**
   * Aggregates word/character/image statistics over a structured document.
   * Averages are 0 (not NaN) when the document has no pages.
   *
   * @param {{metadata: {totalImages: number}, pages: Array<object>}} data
   * @returns {object}
   */
  generateSummary(data) {
    const { pages } = data;
    const pageCount = pages.length;
    const totalWords = pages.reduce((sum, page) => sum + page.text.wordCount, 0);
    const totalCharacters = pages.reduce((sum, page) => sum + page.text.characterCount, 0);
    return {
      totalWords,
      totalCharacters,
      averageWordsPerPage: pageCount > 0 ? Math.round(totalWords / pageCount) : 0,
      averageImagesPerPage:
        pageCount > 0 ? Math.round((data.metadata.totalImages / pageCount) * 10) / 10 : 0,
      pagesWithText: pages.filter((page) => page.text.content.trim().length > 0).length,
      pagesWithImages: pages.filter((page) => page.imageCount > 0).length,
    };
  }
};
/**
 * File-system cache for per-page extraction results.
 *
 * Layout: `<cacheDir>/<key>/cache-info.json` and `<cacheDir>/<key>/page-<n>.json`,
 * where `<key>` is an MD5 hex digest of the PDF's resolved path, mtime and
 * size (used as a cache identifier only, not for security).
 */
var W = class {
  cacheDir;

  /**
   * @param {string} [cacheDir="./tmp/pdf-cache"] - Root directory; created if missing.
   */
  constructor(cacheDir = "./tmp/pdf-cache") {
    this.cacheDir = cacheDir;
    this.ensureCacheDir();
  }

  /**
   * @param {string} pdfPath
   * @returns {string} Hex digest derived from resolved path, mtime and size.
   * @throws When the file cannot be stat'ed.
   */
  generateCacheKey(pdfPath) {
    const resolved = D__default.default.resolve(pdfPath);
    const stats = y__namespace.default.statSync(resolved);
    const fingerprint = `${resolved}:${stats.mtime.getTime()}:${stats.size}`;
    return le__default.default.createHash("md5").update(fingerprint).digest("hex");
  }

  /**
   * @param {string} pdfPath
   * @returns {string} Directory holding the cache entries of this PDF (not created here).
   */
  getCacheDir(pdfPath) {
    return D__default.default.join(this.cacheDir, this.generateCacheKey(pdfPath));
  }

  /** Creates the cache root directory when it does not exist. */
  ensureCacheDir() {
    if (!y__namespace.default.existsSync(this.cacheDir)) {
      y__namespace.default.mkdirSync(this.cacheDir, { recursive: true });
    }
  }

  /**
   * Reads and parses a JSON file.
   *
   * @param {string} file
   * @returns {*} Parsed content, or null when the file does not exist.
   * @throws On read or parse errors.
   */
  readJsonIfPresent(file) {
    if (!y__namespace.default.existsSync(file)) return null;
    return JSON.parse(y__namespace.default.readFileSync(file, "utf-8"));
  }

  /**
   * @param {string} pdfPath
   * @returns {boolean} True when `cache-info.json` exists for this PDF; false on any error.
   */
  isCached(pdfPath) {
    try {
      const infoFile = D__default.default.join(this.getCacheDir(pdfPath), "cache-info.json");
      return y__namespace.default.existsSync(infoFile);
    } catch {
      return false;
    }
  }

  /**
   * @param {string} pdfPath
   * @returns {object|null} Parsed `cache-info.json`, or null when missing or unreadable.
   */
  getCacheInfo(pdfPath) {
    try {
      return this.readJsonIfPresent(D__default.default.join(this.getCacheDir(pdfPath), "cache-info.json"));
    } catch {
      return null;
    }
  }

  /**
   * Creates the per-PDF cache directory and writes `cache-info.json`.
   *
   * @param {string} pdfPath
   * @param {number} totalPages
   * @returns {string} The per-PDF cache directory.
   * @throws On file-system errors.
   */
  createCache(pdfPath, totalPages) {
    const dir = this.getCacheDir(pdfPath);
    y__namespace.default.mkdirSync(dir, { recursive: true });
    const stats = y__namespace.default.statSync(pdfPath);
    const info = {
      pdfPath: D__default.default.resolve(pdfPath),
      lastModified: stats.mtime.getTime(),
      totalPages,
      cacheDir: dir,
      created: new Date().toISOString(),
    };
    y__namespace.default.writeFileSync(
      D__default.default.join(dir, "cache-info.json"),
      JSON.stringify(info, null, 2),
    );
    return dir;
  }

  /**
   * Stores one page result as `page-<n>.json`. Best effort: failures are
   * ignored so that caching problems never break extraction.
   *
   * @param {string} pdfPath
   * @param {number} pageNumber
   * @param {*} result - JSON-serialisable page result.
   */
  cachePageResult(pdfPath, pageNumber, result) {
    try {
      const dir = this.getCacheDir(pdfPath);
      // The per-PDF directory only existed after createCache(); without this
      // the write failed (and was swallowed), so nothing was ever cached.
      y__namespace.default.mkdirSync(dir, { recursive: true });
      y__namespace.default.writeFileSync(
        D__default.default.join(dir, `page-${pageNumber}.json`),
        JSON.stringify(result, null, 2),
      );
    } catch {
      // Intentionally ignored: the cache is an optimisation only.
    }
  }

  /**
   * @param {string} pdfPath
   * @param {number} pageNumber
   * @returns {*} Cached page result, or null when missing or unreadable.
   */
  getCachedPageResult(pdfPath, pageNumber) {
    try {
      return this.readJsonIfPresent(
        D__default.default.join(this.getCacheDir(pdfPath), `page-${pageNumber}.json`),
      );
    } catch {
      return null;
    }
  }

  /**
   * @param {string} pdfPath
   * @returns {Array<object>} All readable cached pages sorted by `pageNumber`;
   *   unreadable entries are skipped, `[]` on any other error.
   */
  getAllCachedPages(pdfPath) {
    try {
      const dir = this.getCacheDir(pdfPath);
      if (!y__namespace.default.existsSync(dir)) return [];
      const pages = [];
      for (const entry of y__namespace.default.readdirSync(dir)) {
        if (!entry.startsWith("page-") || !entry.endsWith(".json")) continue;
        try {
          pages.push(JSON.parse(y__namespace.default.readFileSync(D__default.default.join(dir, entry), "utf-8")));
        } catch {
          // Skip a corrupt page file and keep the rest.
        }
      }
      pages.sort((left, right) => left.pageNumber - right.pageNumber);
      return pages;
    } catch {
      return [];
    }
  }

  /**
   * Removes the cache directory of one PDF; errors are ignored.
   *
   * @param {string} pdfPath
   */
  clearCache(pdfPath) {
    try {
      const dir = this.getCacheDir(pdfPath);
      if (y__namespace.default.existsSync(dir)) {
        y__namespace.default.rmSync(dir, { recursive: true, force: true });
      }
    } catch {
      // Best effort.
    }
  }

  /** Removes the whole cache root and recreates it empty; errors are ignored. */
  clearAllCache() {
    try {
      if (y__namespace.default.existsSync(this.cacheDir)) {
        y__namespace.default.rmSync(this.cacheDir, { recursive: true, force: true });
      }
      this.ensureCacheDir();
    } catch {
      // Best effort.
    }
  }

  /**
   * Walks the cache root one level deep.
   *
   * @returns {{totalCachedPdfs: number, totalCachedPages: number, totalCacheSize: number, cacheDir: string}}
   *   `totalCachedPdfs` counts sub-directories only (stray files in the root
   *   are not PDFs); `totalCacheSize` sums the sizes of all files inside
   *   them. All zeros on error.
   */
  getCacheStats() {
    const stats = { totalCachedPdfs: 0, totalCachedPages: 0, totalCacheSize: 0, cacheDir: this.cacheDir };
    try {
      if (!y__namespace.default.existsSync(this.cacheDir)) return stats;
      for (const entry of y__namespace.default.readdirSync(this.cacheDir)) {
        const entryPath = D__default.default.join(this.cacheDir, entry);
        if (!y__namespace.default.statSync(entryPath).isDirectory()) continue;
        stats.totalCachedPdfs++;
        for (const file of y__namespace.default.readdirSync(entryPath)) {
          if (file.startsWith("page-") && file.endsWith(".json")) stats.totalCachedPages++;
          stats.totalCacheSize += y__namespace.default.statSync(D__default.default.join(entryPath, file)).size;
        }
      }
      return stats;
    } catch {
      return { totalCachedPdfs: 0, totalCachedPages: 0, totalCacheSize: 0, cacheDir: this.cacheDir };
    }
  }
};
/**
 * PDF extractor facade: runs text and image extraction through the
 * collaborating extractor objects, merges their output into one result and
 * offers cached per-page accessors.
 */
var j = class {
  textExtractor;
  imageExtractor;
  formatProcessor;
  structuredDataGenerator;
  cacheManager;

  /**
   * @param {string} [cacheDir] - Forwarded to the cache manager.
   */
  constructor(cacheDir) {
    this.textExtractor = new $();
    this.imageExtractor = new exports.ImageExtractor();
    this.formatProcessor = new S();
    this.structuredDataGenerator = new U();
    this.cacheManager = new W(cacheDir);
  }

  /**
   * Extracts text and/or images from a PDF.
   *
   * @param {string} pdfPath
   * @param {object} [options] - Merged over the defaults
   *   `{extractText: true, extractImages: true, extractImageFiles: false,
   *   useImagePaths: false, imageRefFormat: "[IMAGE:{id}]", verbose: false}`.
   * @returns {Promise<object>} The merged result built by `processResults`.
   * @throws {Error} With `code: "VALIDATION_ERROR"` when validation reports
   *   problems, or `code: "EXTRACTION_ERROR"` for any failure during extraction
   *   (including a missing file).
   */
  async extract(pdfPath, options = {}) {
    const config = {
      pdfPath,
      outputDir: options.imageOutputDir || "./extracted-images",
      options: {
        extractText: true,
        extractImages: true,
        extractImageFiles: false,
        useImagePaths: false,
        imageRefFormat: "[IMAGE:{id}]",
        verbose: false,
        ...options,
      },
    };

    const problems = this.validateConfiguration(config);
    if (problems.length > 0) {
      throw this.createValidationError("Invalid configuration", problems);
    }

    const settings = config.options;
    try {
      if (!y__namespace.default.existsSync(pdfPath)) {
        throw new Error(`PDF file not found: ${pdfPath}`);
      }
      const startedAt = Date.now();
      this.reportProgress(settings, { currentPage: 0, totalPages: 0, phase: "processing" });

      let textResult = null;
      let markedResult = null;
      if (settings.extractText) {
        textResult = await this.textExtractor.extract(pdfPath);
        if (settings.includePageMarkers) {
          const markerFormat = settings.pageMarkerFormat || "--- PAGE {page} ---";
          const markerOptions = {
            pageOffset: settings.pageOffset || 0,
            includeImageRefs: settings.includeImageRefs ?? false,
            imageRefFormat: settings.imageRefFormat ?? "[IMG:{id}] {name}",
          };
          if (settings.imageEngine) markerOptions.imageEngine = settings.imageEngine;
          markedResult = await this.textExtractor.extractWithPageMarkers(pdfPath, markerFormat, markerOptions);
        }
      }

      let textItems = [];
      if (settings.extractTextItems && settings.extractText) {
        textItems = await this.textExtractor.extractTextItems(pdfPath, settings);
      }

      let imageResult = null;
      if (settings.extractImages) {
        imageResult = await this.imageExtractor.extract(pdfPath, settings);
      }

      const result = await this.processResults(
        pdfPath,
        textResult,
        markedResult,
        imageResult,
        textItems,
        settings,
        startedAt,
      );
      this.reportProgress(settings, {
        currentPage: result.document.pages,
        totalPages: result.document.pages,
        phase: "complete",
      });
      return result;
    } catch (error) {
      throw this.createExtractionError("PDF content extraction failed", error);
    }
  }

  /**
   * @param {string} pdfPath
   * @param {object} [options]
   * @returns {Promise<string>} Marker-free text of the whole document.
   */
  async extractText(pdfPath, options = {}) {
    const result = await this.extract(pdfPath, { ...options, extractText: true, extractImages: false });
    return result.cleanText;
  }

  /**
   * @param {string} pdfPath
   * @param {object} [options]
   * @returns {Promise<Array<object>>} Image descriptors.
   */
  async extractImages(pdfPath, options = {}) {
    const result = await this.extract(pdfPath, { ...options, extractText: false, extractImages: true });
    return result.images;
  }

  /**
   * Extracts with `extractImageFiles` enabled and returns the `filePath` of
   * every image that has one.
   *
   * @param {string} pdfPath
   * @param {string} [outputDir="./extracted-images"]
   * @param {object} [options]
   * @returns {Promise<string[]>}
   */
  async extractImageFiles(pdfPath, outputDir = "./extracted-images", options = {}) {
    const result = await this.extract(pdfPath, {
      ...options,
      extractImageFiles: true,
      imageOutputDir: outputDir,
      useImagePaths: true,
    });
    return result.images.filter((image) => image.filePath).map((image) => image.filePath);
  }

  /**
   * @param {object} config
   * @returns {Array} Validation problems reported by the module-level validator `k`.
   */
  validateConfiguration(config) {
    return k(config);
  }

  /**
   * Merges extractor outputs into the public result object.
   *
   * @param {string} pdfPath
   * @param {object|null} textResult - Reads `text`, `numPages`, `info`.
   * @param {object|null} markedResult - Reads `text` (page-marked variant).
   * @param {object|null} imageResult - Reads `images`, `totalPages`.
   * @param {Array<object>} textItems
   * @param {object} options - Effective extraction options.
   * @param {number} startedAt - Start timestamp (currently not used in the result).
   * @returns {Promise<object>}
   */
  async processResults(pdfPath, textResult, markedResult, imageResult, textItems, options, startedAt) {
    const filename = D__default.default.basename(pdfPath);
    const defaultRefFormat = "[IMAGE:{id}]";
    const result = {
      document: {
        filename,
        pages: imageResult?.totalPages || textResult?.numPages || 0,
        textLength: textResult?.text?.length || 0,
        extractedAt: new Date().toISOString(),
        metadata: textResult?.info || {},
        options,
      },
      pages: [],
      images: imageResult?.images || [],
      textItems,
      textWithRefs: "",
      cleanText: this.extractRawText(textResult?.text || ""),
    };

    const haveText = Boolean(options.extractText && textResult);
    const haveImages = Boolean(options.extractImages && imageResult);
    if (haveText && haveImages) {
      if (markedResult?.text && options.includeImageRefs) {
        // The page-marked text already carries its own image references.
        result.textWithRefs = markedResult.text;
      } else {
        result.textWithRefs = this.formatProcessor.generateTextWithImageRefs(
          markedResult?.text || textResult.text,
          imageResult.images,
          options.imageRefFormat || defaultRefFormat,
          result.document.pages,
        );
      }
    } else if (haveText) {
      result.textWithRefs = markedResult?.text || textResult.text;
    } else if (haveImages) {
      result.textWithRefs = this.formatProcessor.generateImageOnlyRefs(
        imageResult.images,
        options.imageRefFormat || defaultRefFormat,
      );
    }

    const pageCount = result.document.pages;
    result.summary = {
      totalPages: pageCount,
      // Previously hard-coded to 0 even when text items were extracted.
      totalTextItems: Array.isArray(textItems) ? textItems.length : 0,
      totalImages: result.images.length,
      totalTextLength: result.document.textLength,
      // Avoid "NaN"/"Infinity" when the page count is unknown (0).
      averageImagesPerPage: pageCount > 0 ? (result.images.length / pageCount).toFixed(2) : "0.00",
      pagesWithImages: new Set(result.images.map((image) => image.page)).size,
    };

    if (options.generateStructuredData) {
      result.structuredData = this.structuredDataGenerator.generateStructuredData(
        filename,
        result.textWithRefs || result.cleanText,
        result.images,
        pageCount,
        options,
      );
    }
    return result;
  }

  /** @returns {Promise<string>} Text (with references/markers) of one page. */
  async getText(pdfPath, pageNumber, options = {}) {
    const page = await this.getPage(pdfPath, pageNumber, { ...options, extractText: true, extractImages: false });
    return page.text;
  }

  /** @returns {Promise<Array<object>>} Images of one page. */
  async getImages(pdfPath, pageNumber, options = {}) {
    const page = await this.getPage(pdfPath, pageNumber, { ...options, extractText: false, extractImages: true });
    return page.images;
  }

  /** @returns {Promise<Array<object>>} Text items of one page. */
  async getTextItems(pdfPath, pageNumber, options = {}) {
    const page = await this.getPage(pdfPath, pageNumber, { ...options, extractText: true, extractTextItems: true });
    return page.textItems;
  }

  /** @returns {Promise<string>} Marker-free text of one page. */
  async getRawText(pdfPath, pageNumber, options = {}) {
    const page = await this.getPage(pdfPath, pageNumber, { ...options, extractText: true, extractImages: false });
    return page.rawText;
  }

  /**
   * Describes which content kinds an options object asks for, using the same
   * defaults as `extract` (text and images on, text items off).
   *
   * @param {object} options
   * @returns {{text: boolean, images: boolean, textItems: boolean}}
   */
  describeRequestedContent(options) {
    const merged = { extractText: true, extractImages: true, extractTextItems: false, ...options };
    const text = Boolean(merged.extractText);
    return {
      text,
      images: Boolean(merged.extractImages),
      // extract() only collects text items when text extraction is on as well.
      textItems: text && Boolean(merged.extractTextItems),
    };
  }

  /**
   * @param {object} cached - Cached page result.
   * @param {{text: boolean, images: boolean, textItems: boolean}} requested
   * @returns {boolean} True when the cached entry records every requested
   *   content kind as extracted. Entries without that record are treated as
   *   not covering anything.
   */
  cacheCovers(cached, requested) {
    const extracted = cached?.metadata?.extracted;
    if (!extracted) return false;
    return Object.keys(requested).every((kind) => !requested[kind] || extracted[kind] === true);
  }

  /**
   * Returns the data of a single page, served from the cache when possible.
   *
   * The cache is keyed by PDF and page only, so each entry records which
   * content kinds it was built with (`metadata.extracted`). A cached entry is
   * reused only if it covers what this call asks for; otherwise e.g.
   * `getImages()` followed by `getText()` would return the image-only entry.
   *
   * @param {string} pdfPath
   * @param {number} pageNumber - 1-based page number.
   * @param {object} [options] - Extraction options; `useCache: false` bypasses the cache.
   * @returns {Promise<object>} `{pageNumber, text, rawText, textItems, images, metadata}`.
   */
  async getPage(pdfPath, pageNumber, options = {}) {
    const useCache = options.useCache !== false;
    const requested = this.describeRequestedContent(options);

    if (useCache) {
      const cached = this.cacheManager.getCachedPageResult(pdfPath, pageNumber);
      if (cached && this.cacheCovers(cached, requested)) return cached;
    }

    const result = await this.extract(pdfPath, { ...options, specificPages: [pageNumber] });
    const text = this.extractPageText(
      result.textWithRefs || result.cleanText,
      pageNumber,
      result.document?.pages,
    );
    const rawText = this.extractRawText(text);
    const images = result.images.filter((image) => image.page === pageNumber);
    const textItems = result.textItems?.filter((item) => item.page === pageNumber) || [];

    const page = {
      pageNumber,
      text,
      rawText,
      textItems,
      images,
      metadata: {
        wordCount: this.countWords(rawText),
        characterCount: rawText.length,
        imageCount: images.length,
        extracted: requested,
      },
    };
    if (useCache) this.cacheManager.cachePageResult(pdfPath, pageNumber, page);
    return page;
  }

  /**
   * Extracts the text of one page from a full-document string.
   *
   * If the text contains page markers, the chunk following the marker with
   * the requested number is returned. Otherwise the lines are divided evenly
   * over `totalPages` and the requested share is returned.
   *
   * @param {string} text
   * @param {number} pageNumber - 1-based page number.
   * @param {number} [totalPages=pageNumber] - Document page count for the
   *   line-based fallback. The fallback previously always divided by the page
   *   number itself, so page 1 returned the whole document. Omitting the
   *   argument keeps that legacy estimate.
   * @returns {string}
   */
  extractPageText(text, pageNumber, totalPages = pageNumber) {
    const markerPattern = /(?:--- PAGE (\d+) ---|🎨 ART BASEL PAGE (\d+) 🎨|PAGE (\d+))/g;
    // split() with three capture groups yields [before, g1, g2, g3, chunk, g1, ...].
    const parts = text.split(markerPattern);
    if (parts.length > 1) {
      for (let index = 1; index < parts.length; index += 4) {
        const markerNumber = parts[index] || parts[index + 1] || parts[index + 2] || "0";
        if (parseInt(markerNumber, 10) === pageNumber) return parts[index + 3] || "";
      }
    }

    const pageCount = totalPages > 0 ? totalPages : pageNumber;
    const lines = text.split("\n");
    const linesPerPage = Math.ceil(lines.length / pageCount);
    const start = (pageNumber - 1) * linesPerPage;
    const end = Math.min(pageNumber * linesPerPage, lines.length);
    return lines.slice(start, end).join("\n");
  }

  /**
   * @param {string} text
   * @returns {number} Count of whitespace-separated words; 0 for blank text.
   */
  countWords(text) {
    const trimmed = text.trim();
    return trimmed ? trimmed.split(/\s+/).length : 0;
  }

  /**
   * Strips page markers and image reference markers from `text` and
   * normalises whitespace.
   *
   * @param {string} text
   * @returns {string}
   */
  extractRawText(text) {
    // Specific marker shapes first; the bare "PAGE n" form comes after them.
    const markerPatterns = [
      /--- PAGE \d+ ---\s*/g,
      /🎨 ART BASEL PAGE \d+ 🎨\s*/g,
      /PAGE \d+\s*/g,
      /\[IMG:\w+\]\s*\w*\s*/g,
      /\[IMG-\w+\]\s*[^[\n]*\s*/g,
      /📷\s*[^-\n]*-\s*Page\s*\d+\s*-\s*Image\s*#\d+\s*/g,
      /🎨\s*Art\s*Basel\s*Image\s*\d+\s*\(Page\s*\d+\)\s*/g,
    ];
    let cleaned = text;
    for (const pattern of markerPatterns) {
      cleaned = cleaned.replace(pattern, "");
    }
    cleaned = cleaned.replace(/\n\s*\n\s*\n/g, "\n\n");
    cleaned = cleaned.replace(/^\s+|\s+$/g, "");
    cleaned = cleaned.replace(/[ \t]+/g, " ");
    return cleaned;
  }

  /**
   * Drops the cached pages of one PDF.
   *
   * @param {string} pdfPath
   */
  clearCache(pdfPath) {
    this.cacheManager.clearCache(pdfPath);
  }

  /** @returns {object} Statistics reported by the cache manager. */
  getCacheStats() {
    return this.cacheManager.getCacheStats();
  }

  /**
   * Invokes `options.progressCallback(progress)` when a callback is configured.
   *
   * @param {object} options
   * @param {object} progress
   */
  reportProgress(options, progress) {
    if (options.progressCallback) options.progressCallback(progress);
  }

  /**
   * @param {string} message
   * @param {Array} validationErrors
   * @returns {Error} Error with `code: "VALIDATION_ERROR"` and `validationErrors`.
   */
  createValidationError(message, validationErrors) {
    const error = new Error(message);
    error.code = "VALIDATION_ERROR";
    error.validationErrors = validationErrors;
    return error;
  }

  /**
   * @param {string} message
   * @param {*} originalError
   * @returns {Error} Error with `code: "EXTRACTION_ERROR"`, `originalError`,
   *   and the standard `cause` set to the same value.
   */
  createExtractionError(message, originalError) {
    const error = new Error(message, { cause: originalError });
    error.code = "EXTRACTION_ERROR";
    error.originalError = originalError;
    return error;
  }
};
40
+ //# sourceMappingURL=index.js.map
package/dist/index.mjs ADDED
@@ -0,0 +1,40 @@
1
+ import*as y from'fs';import y__default from'fs';import D from'path';import {PDFDocument}from'pdf-lib';import le from'crypto';var oe=Object.defineProperty;var O=(p,t)=>()=>(p&&(t=p(p=0)),t);var Y=(p,t)=>{for(var e in t)oe(p,e,{get:t[e],enumerable:true});};var T,H=O(()=>{T=class{};});var B,Q=O(()=>{H();B=class extends T{name="pdf-lib";description="PDF-lib based extraction with full format support";async isAvailable(){try{return await import('pdf-lib'),!0}catch{return false}}getCapabilities(){return {formats:["jpg","jpeg","png","jp2","tiff"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:false}}async extractImages(t,e){try{let{PDFDocument:r,PDFName:a}=await import('pdf-lib');if(!y__default.existsSync(t))return {success:!1,error:`PDF file not found: ${t}`};let s=y__default.readFileSync(t),o=await r.load(s),n=o.getPages(),l=[],i=1;e.verbose;for(let c=0;c<n.length;c++){let u=n[c],f=c+1,g=u?.node.Resources;if(!g)continue;let b=(typeof g=="function"?g():g)?.get?.(a.of("XObject"));if(!b)continue;let d=b.entries?.()||[],x=0;e.verbose;for(let[,h]of d){let P=o.context.lookup(h);if(!P||P.dict?.get?.(a.of("Subtype"))?.toString()!=="/Image")continue;x++;let E=await this.extractImageFromPdfObject(P,f,i,e);E&&l.push(E),i++;}}return e.verbose,{success:!0,images:l}}catch(r){return {success:false,error:`PDF-lib extraction failed: ${r instanceof Error?r.message:"Unknown error"}`}}}async extractImageFromPdfObject(t,e,r,a){try{let{PDFName:s}=await import('pdf-lib'),o=t.dict.get(s.of("Width")),n=t.dict.get(s.of("Height")),l=t.dict.get(s.of("Filter")),i=t.dict.get(s.of("ColorSpace")),c=t.dict.get(s.of("BitsPerComponent")),u=o&&typeof o.value=="number"?o.value:100,f=n&&typeof n.value=="number"?n.value:100,g=c&&typeof c.value=="number"?c.value:8;a.verbose;let m=await this.extractImageData(t,l,u,f,i,g,a);if(!m.success||!m.imageData)return a.verbose,null;let 
b=m.extension||"bin",d=`img_p${e}_${r}.${b}`,x,h=m.imageData.length;if(a.extractImageFiles&&a.imageOutputDir){let P=D.join(a.imageOutputDir,"images");y__default.existsSync(P)||y__default.mkdirSync(P,{recursive:!0}),x=D.join(P,d),y__default.writeFileSync(x,m.imageData),a.verbose;}return {id:`img_${r}`,filename:`images/${d}`,filepath:x||"",page:e,width:u,height:f,format:this.getFormatFromMimeType(m.mimeType||""),mimeType:m.mimeType||"",size:h,position:{x:0,y:0,width:u,height:f}}}catch{return a.verbose,null}}async extractImageData(t,e,r,a,s,o,n){try{let l=await import('zlib'),i,c="image/jpeg",u="jpg";if(e){let f=e.toString();if(n.verbose,f.includes("DCTDecode")&&f.includes("FlateDecode")){n.verbose;try{let g=t.contents;i=l.inflateSync(Buffer.from(g)),c="image/jpeg",u="jpg",n.verbose;}catch(g){return n.verbose,{success:!1,error:`Zlib decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else if(f.includes("DCTDecode"))n.verbose,i=Buffer.from(t.contents),c="image/jpeg",u="jpg";else if(f.includes("FlateDecode")){n.verbose;try{let g=t.contents,m=l.inflateSync(Buffer.from(g));n.verbose;let b=this.detectImageFormat(m);if(b.valid)i=m,c=b.mimeType,u=b.extension,n.verbose;else {let d=await this.createPngFromPdfMetadata(m,r,a,s,o,n);if(d.success&&d.pngData)i=d.pngData,c="image/png",u="png",n.verbose;else return n.verbose,{success:!1,error:`PNG creation failed: ${d.error}`}}}catch(g){return n.verbose,{success:!1,error:`FlateDecode decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else if(f.includes("JPXDecode")){n.verbose;try{i=Buffer.from(t.contents),c="image/jp2",u="jp2",n.verbose;}catch(g){return n.verbose,{success:!1,error:`JPXDecode extraction failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else {n.verbose;try{let g=await t.asUint8Array();i=Buffer.from(g);let m=this.detectImageFormat(i);m.valid&&(c=m.mimeType,u=m.extension);}catch(g){return n.verbose,{success:!1,error:`Generic decompression failed: ${g instanceof 
Error?g.message:"Unknown error"}`}}}}else {n.verbose;try{let f=await t.asUint8Array();i=Buffer.from(f);let g=this.detectImageFormat(i);g.valid&&(c=g.mimeType,u=g.extension);}catch(f){return n.verbose,{success:!1,error:`Raw data extraction failed: ${f instanceof Error?f.message:"Unknown error"}`}}}return {success:!0,imageData:i,mimeType:c,extension:u}}catch(l){return {success:false,error:`Image data extraction failed: ${l instanceof Error?l.message:"Unknown error"}`}}}detectImageFormat(t){return !t||t.length<10?{valid:false}:t[0]===255&&t[1]===216?{valid:true,mimeType:"image/jpeg",extension:"jpg"}:t[0]===137&&t[1]===80&&t[2]===78&&t[3]===71?{valid:true,mimeType:"image/png",extension:"png"}:t[0]===71&&t[1]===73&&t[2]===70?{valid:true,mimeType:"image/gif",extension:"gif"}:t[0]===73&&t[1]===73||t[0]===77&&t[1]===77?{valid:true,mimeType:"image/tiff",extension:"tiff"}:t.length>=12&&t[0]===0&&t[1]===0&&t[2]===0&&t[3]===12&&t[4]===106&&t[5]===80&&t[6]===32&&t[7]===32?{valid:true,mimeType:"image/jp2",extension:"jp2"}:{valid:false}}async createPngFromPdfMetadata(t,e,r,a,s,o){try{let{PNG:n}=await import('pngjs'),l=a?.toString()||"",i=3,c=2;l.includes("DeviceGray")||l.includes("Gray")?(i=1,c=0):l.includes("DeviceRGB")||l.includes("RGB")?(i=3,c=2):(l.includes("DeviceCMYK")||l.includes("CMYK"))&&(i=4,c=2);let u=e*r*i*(s/8),f=t.length;if(o.verbose,Math.abs(f-u)>f*.1)return {success:!1,error:`Data size mismatch: expected ${u}, got ${f} bytes`};let g=new n({width:e,height:r,colorType:c===0?0:6,bitDepth:8}),m;if(i===1){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=t[d]||0,h=d*4;m[h]=x,m[h+1]=x,m[h+2]=x,m[h+3]=255;}}else if(i===3){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=d*3,h=d*4;m[h]=t[x]||0,m[h+1]=t[x+1]||0,m[h+2]=t[x+2]||0,m[h+3]=255;}}else if(i===4){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let 
x=d*4,h=(t[x]||0)/255,P=(t[x+1]||0)/255,I=(t[x+2]||0)/255,E=(t[x+3]||0)/255,v=d*4;m[v]=Math.round(255*(1-h)*(1-E)),m[v+1]=Math.round(255*(1-P)*(1-E)),m[v+2]=Math.round(255*(1-I)*(1-E)),m[v+3]=255;}}else return {success:!1,error:`Unsupported color space with ${i} components`};g.data=m;let b=n.sync.write(g);return o.verbose,{success:!0,pngData:b}}catch(n){return {success:false,error:`PNG creation error: ${n instanceof Error?n.message:"Unknown error"}`}}}getFormatFromMimeType(t){switch(t){case "image/jpeg":return "JPEG";case "image/png":return "PNG";case "image/jp2":return "JPEG 2000";case "image/gif":return "GIF";case "image/tiff":return "TIFF";default:return "unknown"}}};});var A,ee=O(()=>{H();A=class extends T{name="poppler";description="Poppler-based extraction using pdfimages command";async isAvailable(){try{let{Poppler:t}=await import('node-poppler');return new t,!0}catch{return false}}getCapabilities(){return {formats:["png"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:true}}async extractImages(t,e){try{let{Poppler:r}=await import('node-poppler');if(!y__default.existsSync(t))return {success:!1,error:`PDF file not found: ${t}`};let a=new r,s=[],o=D.join(process.cwd(),"temp-poppler-images");y__default.existsSync(o)||y__default.mkdirSync(o,{recursive:!0});try{e.verbose;let n=D.join(o,"img"),l={firstPageToConvert:1,lastPageToConvert:-1,pngFile:!0};e.verbose,await a.pdfImages(t,n,l),e.verbose;let i={list:!0};e.verbose;let c=await a.pdfImages(t,void 0,i),u=this.parseImageList(c);e.verbose;let f=y__default.readdirSync(o).filter(g=>g.startsWith("img-")&&g.endsWith(".png"));e.verbose;for(let g=0;g<f.length;g++){let m=f[g];if(!m)continue;let b=D.join(o,m);if(!y__default.existsSync(b))continue;let d=y__default.statSync(b);y__default.readFileSync(b);let x=m.match(/img-(\d+)\.png/),h=x?parseInt(x[1],10)+1:g+1,P=u[g]||{page:1,index:h,width:0,height:0,format:"PNG"},I=P.page,E=`img_p${I}_${h}.png`,v;if(e.extractImageFiles&&e.imageOutputDir){let 
L=D.join(e.imageOutputDir,"images");y__default.existsSync(L)||y__default.mkdirSync(L,{recursive:!0}),v=D.join(L,E),y__default.copyFileSync(b,v),e.verbose;}let ne={id:`img_${h}`,filename:`images/${E}`,filepath:v||"",page:I,width:P.width,height:P.height,format:"PNG",mimeType:"image/png",size:d.size,position:{x:0,y:0,width:P.width,height:P.height}};s.push(ne);}return e.verbose,{success:!0,images:s}}finally{y__default.existsSync(o)&&y__default.rmSync(o,{recursive:!0,force:!0});}}catch(r){return {success:false,error:`Poppler extraction failed: ${r instanceof Error?r.message:"Unknown error"}`}}}parseImageList(t){let e=[],r=t.split(`
2
+ `);for(let a of r){let s=a.match(/^\s*(\d+)\s+(\d+)\s+\w+\s+(\d+)\s+(\d+)\s+\w+\s+\d+\s+\d+\s+(\w+)/);if(s){let o=parseInt(s[1],10),n=parseInt(s[2],10),l=parseInt(s[3],10),i=parseInt(s[4],10),c=s[5]?.toUpperCase()||"PNG";e.push({page:o,index:n,width:l,height:i,format:c});}}return e}};});var te={};Y(te,{ImageEngineFactory:()=>X});var X,re=O(()=>{Q();ee();X=class p{static engines=new Map;static async getEngine(t){if(t==="auto"&&(t=await p.selectBestEngine()),p.engines.has(t))return p.engines.get(t);let e;switch(t){case "pdf-lib":e=new B;break;case "poppler":e=new A;break;default:throw new Error(`Unknown image extraction engine: ${t}`)}if(!await e.isAvailable())throw new Error(`Image extraction engine '${t}' is not available on this system`);return p.engines.set(t,e),e}static async getAvailableEngines(){let t=[B,A],e=[];for(let r of t){let a=new r,s=await a.isAvailable();e.push({name:a.name,description:a.description,available:s,capabilities:a.getCapabilities()});}return e}static async selectBestEngine(){let t=await p.getAvailableEngines(),e=["pdf-lib","poppler"];for(let r of e)if(t.find(s=>s.name===r)?.available)return r;throw new Error("No image extraction engines are available on this system")}static clearCache(){p.engines.clear();}static getRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"},{useCase:"Vector image extraction",engine:"poppler",reason:"Poppler can extract vector graphics as raster images"}]}};});var ae={};Y(ae,{ImageExtractor:()=>F});var F,N=O(()=>{F=class{async extract(t,e={}){let 
r={verbose:false,extractImageFiles:false,imageEngine:"auto",...e};r.verbose,r.extractImageFiles&&r.imageOutputDir&&(y__default.existsSync(r.imageOutputDir)||y__default.mkdirSync(r.imageOutputDir,{recursive:true}));try{let{ImageEngineFactory:a}=await Promise.resolve().then(()=>(re(),te)),s=await a.getEngine(r.imageEngine);r.verbose;let o=await s.extractImages(t,r);if(!o.success)throw new Error(o.error||"Engine extraction failed");return {success:!0,images:o.images||[],metadata:{totalImages:o.images?.length||0,engine:s.name}}}catch{r.verbose;try{return await this.extractWithPdfLib(t,r)}catch(s){return r.verbose,{success:false,images:[],error:s instanceof Error?s.message:String(s)}}}}static async getAvailableEngines(){return [{name:"pdf-lib",description:"PDF-lib based extraction with full format support",available:true,capabilities:{formats:["jpg","jpeg","png","jp2","tiff"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:false}},{name:"poppler",description:"Poppler-based extraction using pdfimages command",available:false,capabilities:{formats:["jpg","jpeg","png","tiff","ppm","pbm"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:true}}]}static getEngineRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing (coming soon)"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"}]}async extractWithPdfLib(t,e={}){try{let{PDFDocument:r,PDFName:a}=await import('pdf-lib'),s=y__default.readFileSync(t),o=await 
r.load(s,{ignoreEncryption:!0}),n=o.getPageCount(),l=[],i=1;e.verbose,e.extractImageFiles&&e.imageOutputDir&&(y__default.existsSync(e.imageOutputDir)||y__default.mkdirSync(e.imageOutputDir,{recursive:!0}));for(let c=0;c<n;c++){let u=c+1;try{let g=o.getPage(c).node.Resources();if(!g){e.verbose;continue}let m=g.get(a.of("XObject"));if(!m){e.verbose;continue}let b=m.dict;e.verbose;for(let[d,x]of b)try{let h=o.context.lookup(x),P=h.dict.get(a.of("Subtype"));if(!P||P.toString()!=="/Image")continue;let I=await this.extractImageFromPdfObject(h,u,i,e);I&&(l.push(I),i++);}catch{e.verbose;}}catch{e.verbose;}}return e.verbose,{images:l,totalPages:n,totalImages:l.length}}catch(r){throw e.verbose,r}}async extractImageFromPdfObject(t,e,r,a){try{let{PDFName:s}=await import('pdf-lib'),o=t.dict.get(s.of("Width")),n=t.dict.get(s.of("Height")),l=t.dict.get(s.of("Filter")),i=t.dict.get(s.of("ColorSpace")),c=t.dict.get(s.of("BitsPerComponent")),u=o&&typeof o.value=="number"?o.value:100,f=n&&typeof n.value=="number"?n.value:100,g=c&&typeof c.value=="number"?c.value:8;a.verbose;let m=await this.extractImageData(t,l,u,f,i,g,a);if(!m.success||!m.imageData)return a.verbose,null;let b=m.imageData,d=m.mimeType||"image/jpeg",x=m.extension||"jpg",h=`img_p${e}_${r}.${x}`,P="",I=b.length;return a.extractImageFiles&&a.imageOutputDir&&(P=D.join(a.imageOutputDir,h),y__default.writeFileSync(P,b),a.verbose),{id:`img_${r}`,name:h,page:e,position:{x:0,y:0,width:u,height:f},width:u,height:f,format:d==="image/jpeg"?"JPEG":d==="image/png"?"PNG":"unknown",filePath:P}}catch{return a.verbose,null}}async extractImageData(t,e,r,a,s,o,n){try{let l=await import('zlib'),i,c="image/jpeg",u="jpg";if(e){let f=e.toString();if(n.verbose,f.includes("DCTDecode")&&f.includes("FlateDecode")){n.verbose;try{let g=t.contents;i=l.inflateSync(Buffer.from(g)),c="image/jpeg",u="jpg",n.verbose;}catch(g){return n.verbose,{success:!1,error:`Zlib decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else 
if(f.includes("DCTDecode"))n.verbose,i=Buffer.from(t.contents),c="image/jpeg",u="jpg";else if(f.includes("FlateDecode")){n.verbose;try{let g=t.contents,m=l.inflateSync(Buffer.from(g));n.verbose;let b=this.detectImageFormat(m);if(b.valid)i=m,c=b.mimeType,u=b.extension,n.verbose;else {let d=await this.createPngFromPdfMetadata(m,r,a,s,o,n);if(d.success&&d.pngData)i=d.pngData,c="image/png",u="png",n.verbose;else return n.verbose,{success:!1,error:`PNG creation failed: ${d.error}`}}}catch(g){return n.verbose,{success:!1,error:`FlateDecode decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else if(f.includes("JPXDecode")){n.verbose;try{i=Buffer.from(t.contents),c="image/jp2",u="jp2",n.verbose;}catch(g){return n.verbose,{success:!1,error:`JPXDecode extraction failed: ${g instanceof Error?g.message:"Unknown error"}`}}}else {n.verbose;try{let g=await t.asUint8Array();i=Buffer.from(g);let m=this.detectImageFormat(i);m.valid&&(c=m.mimeType,u=m.extension);}catch(g){return n.verbose,{success:!1,error:`Generic decompression failed: ${g instanceof Error?g.message:"Unknown error"}`}}}}else {n.verbose;try{let f=await t.asUint8Array();i=Buffer.from(f);let g=this.detectImageFormat(i);g.valid&&(c=g.mimeType,u=g.extension);}catch(f){return n.verbose,{success:!1,error:`Raw data extraction failed: ${f instanceof Error?f.message:"Unknown error"}`}}}return !i||i.length<100?{success:!1,error:`Image data too small: ${i?.length||0} bytes`}:{success:!0,imageData:i,mimeType:c,extension:u}}catch(l){return n.verbose,{success:false,error:l instanceof Error?l.message:"Unknown error"}}}detectImageFormat(t){return 
!t||t.length<10?{valid:false}:t[0]===255&&t[1]===216?{valid:true,mimeType:"image/jpeg",extension:"jpg"}:t[0]===137&&t[1]===80&&t[2]===78&&t[3]===71?{valid:true,mimeType:"image/png",extension:"png"}:t[0]===71&&t[1]===73&&t[2]===70?{valid:true,mimeType:"image/gif",extension:"gif"}:t[0]===73&&t[1]===73||t[0]===77&&t[1]===77?{valid:true,mimeType:"image/tiff",extension:"tiff"}:t.length>=12&&t[0]===0&&t[1]===0&&t[2]===0&&t[3]===12&&t[4]===106&&t[5]===80&&t[6]===32&&t[7]===32?{valid:true,mimeType:"image/jp2",extension:"jp2"}:{valid:false}}async createPngFromPdfMetadata(t,e,r,a,s,o){try{let{PNG:n}=await import('pngjs'),l=a?.toString()||"",i=3,c=2;l.includes("DeviceGray")||l.includes("Gray")?(i=1,c=0):l.includes("DeviceRGB")||l.includes("RGB")?(i=3,c=2):(l.includes("DeviceCMYK")||l.includes("CMYK"))&&(i=4,c=2);let u=e*r*i*(s/8),f=t.length;if(o.verbose,Math.abs(f-u)>f*.1)return {success:!1,error:`Data size mismatch: expected ${u}, got ${f} bytes`};let g=new n({width:e,height:r,colorType:c===0?0:6,bitDepth:8}),m;if(i===1){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=t[d]||0,h=d*4;m[h]=x,m[h+1]=x,m[h+2]=x,m[h+3]=255;}}else if(i===3){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=d*3,h=d*4;m[h]=t[x]||0,m[h+1]=t[x+1]||0,m[h+2]=t[x+2]||0,m[h+3]=255;}}else if(i===4){m=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let x=d*4,h=(t[x]||0)/255,P=(t[x+1]||0)/255,I=(t[x+2]||0)/255,E=(t[x+3]||0)/255,v=d*4;m[v]=Math.round(255*(1-h)*(1-E)),m[v+1]=Math.round(255*(1-P)*(1-E)),m[v+2]=Math.round(255*(1-I)*(1-E)),m[v+3]=255;}}else return {success:!1,error:`Unsupported color space with ${i} components`};g.data=m;let b=n.sync.write(g);return o.verbose,{success:!0,pngData:b}}catch(n){return {success:false,error:`PNG creation error: ${n instanceof Error?n.message:"Unknown error"}`}}}};});function k(p){let t=[];if(p.pdfPath?typeof p.pdfPath!="string"?t.push({field:"pdfPath",message:"PDF path must be a 
string",value:p.pdfPath}):y__default.existsSync(p.pdfPath)?p.pdfPath.toLowerCase().endsWith(".pdf")||t.push({field:"pdfPath",message:"File must have .pdf extension",value:p.pdfPath}):t.push({field:"pdfPath",message:"PDF file does not exist",value:p.pdfPath}):t.push({field:"pdfPath",message:"PDF path is required",value:p.pdfPath}),p.outputDir&&typeof p.outputDir!="string"&&t.push({field:"outputDir",message:"Output directory must be a string",value:p.outputDir}),p.options){let{options:e}=p;e.extractText!==void 0&&typeof e.extractText!="boolean"&&t.push({field:"options.extractText",message:"extractText must be a boolean",value:e.extractText}),e.extractImages!==void 0&&typeof e.extractImages!="boolean"&&t.push({field:"options.extractImages",message:"extractImages must be a boolean",value:e.extractImages}),e.extractImageFiles!==void 0&&typeof e.extractImageFiles!="boolean"&&t.push({field:"options.extractImageFiles",message:"extractImageFiles must be a boolean",value:e.extractImageFiles}),e.useImagePaths!==void 0&&typeof e.useImagePaths!="boolean"&&t.push({field:"options.useImagePaths",message:"useImagePaths must be a boolean",value:e.useImagePaths}),e.imageOutputDir&&typeof e.imageOutputDir!="string"&&t.push({field:"options.imageOutputDir",message:"imageOutputDir must be a string",value:e.imageOutputDir}),e.imageRefFormat&&typeof e.imageRefFormat!="string"&&t.push({field:"options.imageRefFormat",message:"imageRefFormat must be a string",value:e.imageRefFormat}),e.baseName&&typeof e.baseName!="string"&&t.push({field:"options.baseName",message:"baseName must be a string",value:e.baseName}),e.verbose!==void 0&&typeof e.verbose!="boolean"&&t.push({field:"options.verbose",message:"verbose must be a boolean",value:e.verbose}),e.memoryLimit&&typeof e.memoryLimit!="string"?t.push({field:"options.memoryLimit",message:"memoryLimit must be a string",value:e.memoryLimit}):e.memoryLimit&&!ce(e.memoryLimit)&&t.push({field:"options.memoryLimit",message:'memoryLimit must be in format 
like "512MB", "1GB", etc.',value:e.memoryLimit}),e.batchSize!==void 0&&(typeof e.batchSize!="number"?t.push({field:"options.batchSize",message:"batchSize must be a number",value:e.batchSize}):(e.batchSize<1||e.batchSize>100)&&t.push({field:"options.batchSize",message:"batchSize must be between 1 and 100",value:e.batchSize})),e.progressCallback&&typeof e.progressCallback!="function"&&t.push({field:"options.progressCallback",message:"progressCallback must be a function",value:typeof e.progressCallback}),e.extractText===false&&e.extractImages===false&&t.push({field:"options",message:"At least one of extractText or extractImages must be true",value:{extractText:e.extractText,extractImages:e.extractImages}}),e.useImagePaths===true&&e.extractImageFiles!==true&&t.push({field:"options",message:"useImagePaths requires extractImageFiles to be true",value:{useImagePaths:e.useImagePaths,extractImageFiles:e.extractImageFiles}});}return t}function ce(p){return /^\d+(\.\d+)?(MB|GB|KB)$/i.test(p)}function V(p){let t=[],e=["{id}","{name}","{page}","{index}","{path}"];e.some(o=>p.includes(o))||t.push({field:"imageRefFormat",message:`Format must contain at least one valid placeholder: ${e.join(", ")}`,value:p});let a=/\{([^}]+)\}/g,s=p.match(a);if(s)for(let o of s)e.includes(o)||t.push({field:"imageRefFormat",message:`Invalid placeholder: ${o}. 
Valid placeholders are: ${e.join(", ")}`,value:p});return t}function J(p,t=[".pdf"]){let e=[];if(!p)return e.push({field:"filePath",message:"File path is required",value:p}),e;if(typeof p!="string")return e.push({field:"filePath",message:"File path must be a string",value:p}),e;if(!y__default.existsSync(p))return e.push({field:"filePath",message:"File does not exist",value:p}),e;let r=D.extname(p).toLowerCase();return t.length>0&&!t.includes(r)&&e.push({field:"filePath",message:`File must have one of these extensions: ${t.join(", ")}`,value:p}),e}var z=class{pdfLibDoc=null;pdfLibPages=[];textData=[];async processPDF(t){let e=y.readFileSync(t),[r,a]=await Promise.all([this.processPDFLib(e),this.processPDFParse(e)]);this.textData=this.combineResults(r,a);let s=this.textData.map(o=>o.text).join(`
3
+ `).trim();return {totalPages:this.textData.length,pages:this.textData,fullText:s}}async processPDFLib(t){return this.pdfLibDoc=await PDFDocument.load(t,{ignoreEncryption:true}),this.pdfLibPages=this.pdfLibDoc.getPages(),this.pdfLibPages.map((e,r)=>{let{width:a,height:s}=e.getSize();return {pageNumber:r+1,width:a,height:s,rotation:e.getRotation(),mediaBox:e.getMediaBox()}})}async processPDFParse(t){let e=(await import('pdf-parse')).default,r=[];return await e(t,{pagerender:async s=>{try{let o=await s.getTextContent(),n=s.getViewport({scale:1}),l=o.items.filter(g=>typeof g.str=="string");l.sort((g,m)=>{let b=m.transform[5]-g.transform[5];return Math.abs(b)>2?b:g.transform[4]-m.transform[4]});let i="",c=null,u="";for(let g of l){let m=g.transform[5];c===null?(c=m,u=g.str):Math.abs(m-c)>2?(i+=`${u}
4
+ `,c=m,u=g.str):u+=` ${g.str}`;}u&&(i+=u),i=i.trim();let f={pageNumber:s.pageIndex+1,text:i,textItems:o.items,pdfParseWidth:n.width,pdfParseHeight:n.height};return r.push(f),i}catch{return r.push({pageNumber:s.pageIndex+1,text:"",textItems:[],pdfParseWidth:0,pdfParseHeight:0}),""}}}),r.sort((s,o)=>s.pageNumber-o.pageNumber)}combineResults(t,e){return t.map(r=>{let a=e.find(o=>o.pageNumber===r.pageNumber),s=a?.text||"";return {pageNumber:r.pageNumber,text:s,width:r.width,height:r.height,rotation:r.rotation,mediaBox:r.mediaBox,textItems:a?.textItems||[],wordCount:this.countWords(s),characterCount:s.length}})}async extractWithPageMarkers(t,e="--- PAGE {page} ---",r={}){let a=await this.processPDF(t),s=[];if(r.includeImageRefs)try{let{ImageExtractor:n}=await Promise.resolve().then(()=>(N(),ae));s=(await new n().extract(t,{extractImageFiles:!1,verbose:!1,imageEngine:r.imageEngine||"auto"})).images||[];}catch{}let o="";return a.pages.forEach(n=>{let l=e.replace("{page}",n.pageNumber.toString()),i=n.text;if(r.includeImageRefs&&s.length>0){let c=s.filter(u=>u.page===n.pageNumber);if(c.length>0){let u=c.map(f=>(r.imageRefFormat||"[IMG:{id}] {name}").replace("{id}",`img_${f.id}`).replace("{name}",f.filename||`img_p${f.page}_${f.id}.jpg`)).join(`
5
+ `);if(i.trim()){let f=i.split(`
6
+ `);f.length>1?(f.splice(1,0,u),i=f.join(`
7
+ `)):i=`${i}
8
+ ${u}`;}else i=u;}}i.trim()?o+=`${l}
9
+
10
+ ${i}
11
+ `:o+=`${l}
12
+
13
+
14
+ `;}),{text:o.trim(),cleanText:a.fullText,numPages:a.totalPages,pages:a.pages}}getPage(t){return this.textData[t-1]||null}async getDetailedPageInfo(t,e){this.textData.length||await this.processPDF(t);let r=this.getPage(e);if(!r)return null;let a=(r.textItems||[]).map(s=>({text:s.str||"",x:s.transform?.[4]||0,y:s.transform?.[5]||0,width:s.width||0,height:s.height||0,fontName:s.fontName,fontSize:s.transform?.[0]||12}));return {pageNumber:e,text:r.text,textItems:a,dimensions:{width:r.width,height:r.height}}}countWords(t){return !t||t.trim()===""?0:t.split(/\s+/).filter(e=>e.length>0).length}async processSinglePage(t,e){try{let r=(await import('pdf-parse')).default,a=y.readFileSync(t),s=await PDFDocument.load(a,{ignoreEncryption:!0});if(e<1||e>s.getPageCount())return null;let n=s.getPages()[e-1];if(!n)return null;let{width:l,height:i}=n.getSize(),c=await PDFDocument.create(),[u]=await c.copyPages(s,[e-1]);c.addPage(u);let f=await c.save(),g=[],m={pagerender:async h=>{try{let P=await h.getTextContent();return g=P.items,P.items.map(I=>I.str||"").join(" ")}catch{return ""}}},b=Buffer.from(f),x=(await r(b,m)).text.replace(/\s+/g," ").trim();return {pageNumber:e,text:x,width:l,height:i,rotation:n.getRotation().angle,mediaBox:[n.getMediaBox().x,n.getMediaBox().y,n.getMediaBox().width,n.getMediaBox().height],textItems:g,wordCount:this.countWords(x),characterCount:x.length}}catch{return null}}};var $=class{async extract(t){try{let e=(await import('pdf-parse')).default,r=y__default.readFileSync(t),a=await e(r);return {text:a.text,numPages:a.numpages,info:a.info,metadata:a.metadata,version:a.version}}catch(e){throw new Error(`Failed to extract text from PDF: ${e instanceof Error?e.message:"Unknown error"}`)}}async extractWithPages(t){try{let e=(await import('pdf-parse')).default,r=y__default.readFileSync(t),s=await e(r,{pagerender:o=>o.getTextContent().then(n=>n.items.map(l=>l.str).join(" "))});return 
{text:s.text,numPages:s.numpages,info:s.info,metadata:s.metadata,version:s.version,pages:s.text?this.splitTextIntoPages(s.text,s.numpages):[]}}catch(e){throw new Error(`Failed to extract text with pages: ${e instanceof Error?e.message:"Unknown error"}`)}}splitTextIntoPages(t,e){let r=t.split(`
15
+ `),a=Math.ceil(r.length/e),s=[];for(let o=0;o<e;o++){let n=o*a,l=Math.min(n+a,r.length),i=r.slice(n,l).join(`
16
+ `);s.push(i);}return s}/** Builds one heuristic text item per non-blank line of the extracted text (estimated page, type, font size). Fix: extract() returns the page count as numPages; the old lowercase numpages lookup was always undefined, so the page estimate fell back to a single page and every item was assigned page 1. */async extractTextItems(t,e={}){try{let r=await this.extract(t),a=r.text,s=r.numPages||1,o=a.split(`
17
+ `),n=[],l=1,i=Math.ceil(o.length/s);return o.forEach((c,u)=>{if(c.trim()){l=Math.ceil((u+1)/i);let f="text";c.length<50&&c.trim().match(/^[A-Z\s]+$/)?f="heading":c.length>100?f="paragraph":c.length<30&&(f="caption");let g=12;f==="heading"?g=16:f==="caption"&&(g=10);let m={id:`text_${u+1}`,content:c.trim(),position:{x:0,y:u%i*15,width:c.length*8,height:g},font:{name:"Unknown",size:g,style:f==="heading"?"bold":"normal"},page:l,type:f,fontSize:g,color:"#000000"};n.push(m);}}),e.verbose,n}catch(r){throw new Error(`Failed to extract text items: ${r instanceof Error?r.message:"Unknown error"}`)}}async extractStatistics(t){let e=await this.extract(t),r=e.text,a=r.length,s=r.split(/\s+/).filter(c=>c.length>0).length,o=r.split(`
18
+ `).length,n=e.numPages,/* guard: a zero or missing page count would otherwise make the average NaN or Infinity */l=n>0?Math.round(s/n):0,i=Math.ceil(s/200);return {characterCount:a,wordCount:s,lineCount:o,pageCount:n,averageWordsPerPage:l,readingTime:i}}async extractWithFontInfo(t){return this.extract(t)}/* NOTE(review): the first replace below collapses every whitespace run (newlines included) into a single space, so the blank-line replace that follows can never match - confirm the intended order before relying on paragraph breaks. */cleanText(t){return t.replace(/\s+/g," ").replace(/\n\s*\n/g,`
19
+ `).trim()}async extractPageRange(t,e,r){let a=await this.extractWithPages(t);if(e<1||r>a.numPages||e>r)throw new Error(`Invalid page range: ${e}-${r}. Document has ${a.numPages} pages.`);return a.pages.slice(e-1,r).join(`
20
+
21
+ `)}async searchText(t,e,r=false){let a=await this.extractWithPages(t),s=r?"g":"gi",o=new RegExp(e,s),n=0,l=[],i=[];return a.pages.forEach((c,u)=>{let f=c.match(o);if(f){n+=f.length,l.push(u+1);let g=c.split(`
22
+ `);g.forEach((m,b)=>{/* o is built with the g flag, so o.test() would resume from the lastIndex left by the previous hit and skip matches on following lines; String#search always scans from index 0 and leaves lastIndex untouched. */if(m.search(o)!==-1){let d=Math.max(0,b-1),x=Math.min(g.length,b+2),h=g.slice(d,x).join(`
23
+ `);i.push(`Page ${u+1}: ${h}`);}});}}),{found:n>0,occurrences:n,pages:l,context:i}}async extractWithPageMarkers(t,e="--- PAGE {page} ---",r={}){try{let a=new z,s={includeImageRefs:r.includeImageRefs??!0,imageRefFormat:r.imageRefFormat||"[IMG:{id}] {name}"};r.imageEngine&&(s.imageEngine=r.imageEngine);let o=await a.extractWithPageMarkers(t,e,s),n=o.pages.map(l=>({pageNumber:l.pageNumber+(r.pageOffset||0),text:{content:l.text,rawText:l.text,wordCount:l.wordCount,characterCount:l.characterCount},images:[],imageCount:0}));return {text:o.text,pages:n}}catch(a){throw new Error(`Failed to extract text with page markers: ${a instanceof Error?a.message:"Unknown error"}`)}}async extractWithAccuratePages(t){let r=await new z().processPDF(t),a=r.pages.map(s=>({pageNumber:s.pageNumber,text:{content:s.text,rawText:s.text,wordCount:s.wordCount,characterCount:s.characterCount},images:[],imageCount:0}));return {fullText:r.fullText,pages:a,totalPages:r.totalPages}}};N();var S=class{generateTextWithImageRefs(t,e,r,a){if(!t||e.length===0)return t||"";let s=t.split(`
24
+ `),o=Math.ceil(s.length/a),n="";for(let l=1;l<=a;l++){let i=(l-1)*o,c=Math.min(i+o,s.length),u=s.slice(i,c).join(`
25
+ `);u.trim()&&(n+=u);let f=e.filter(g=>g.page===l);for(let g of f){let m=this.formatImageReference(g,r,e.indexOf(g)+1);n+=`
26
+ ${m}
27
+ `;}l<a&&u.trim()&&(n+=`
28
+ `);}return n.trim()}generateImageOnlyRefs(t,e){return t.map((r,a)=>this.formatImageReference(r,e,a+1)).join(`
29
+ `)}formatImageReference(t,e,r){let a={id:t.id,name:t.name||t.id,page:t.page,index:r,path:t.filePath||t.id};return this.replacePlaceholders(e,a)}replacePlaceholders(t,e){return t.replace(/\{id\}/g,e.id).replace(/\{name\}/g,e.name||e.id).replace(/\{page\}/g,e.page.toString()).replace(/\{index\}/g,e.index.toString()).replace(/\{path\}/g,e.path||e.id)}extractPlaceholders(t){let e=/\{([^}]+)\}/g,r=[],a=null;for(a=e.exec(t);a!==null;)a[1]&&r.push(a[1]),a=e.exec(t);return [...new Set(r)]}isValidFormat(t){let e=["id","name","page","index","path"];return this.extractPlaceholders(t).every(a=>e.includes(a))}getDefaultFormat(t=false){return t?"[IMAGE:{path}]":"[IMAGE:{id}]"}cleanTextFromImageRefs(t,e){let r=e.replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/\\?\{id\\?\}/g,"[^\\s\\]]+").replace(/\\?\{name\\?\}/g,"[^\\s\\]]+").replace(/\\?\{page\\?\}/g,"\\d+").replace(/\\?\{index\\?\}/g,"\\d+").replace(/\\?\{path\\?\}/g,"[^\\s\\]]+"),a=new RegExp(r,"g");return t.replace(a,"").replace(/\n\s*\n/g,`
30
+ `).trim()}countImageReferences(t,e){let r=e.replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/\\?\{id\\?\}/g,"[^\\s\\]]+").replace(/\\?\{name\\?\}/g,"[^\\s\\]]+").replace(/\\?\{page\\?\}/g,"\\d+").replace(/\\?\{index\\?\}/g,"\\d+").replace(/\\?\{path\\?\}/g,"[^\\s\\]]+"),a=new RegExp(r,"g"),s=t.match(a);return s?s.length:0}generateSummary(t,e,r,a,s){let o=(r/t).toFixed(2),n=["\u{1F4C4} Document Summary",` Pages: ${t}`,` Text items: ${e}`,` Images: ${r} (avg ${o} per page)`,` Text length: ${a.toLocaleString()} characters`];return s&&n.push(` Processing time: ${s}ms`),n.join(`
31
+ `)}formatFileSize(t){let e=["B","KB","MB","GB"],r=t,a=0;for(;r>=1024&&a<e.length-1;)r/=1024,a++;return `${r.toFixed(1)} ${e[a]}`}formatDuration(t){if(t<1e3)return `${t}ms`;let e=Math.floor(t/1e3);if(e<60)return `${e}s`;let r=Math.floor(e/60),a=e%60;return `${r}m ${a}s`}};var U=class{extractRawText(t){let e=t;return e=e.replace(/--- PAGE \d+ ---\s*/g,""),e=e.replace(/🎨 ART BASEL PAGE \d+ 🎨\s*/g,""),e=e.replace(/PAGE \d+\s*/g,""),e=e.replace(/\[IMG:\w+\]\s*\w*\s*/g,""),e=e.replace(/\[IMG-\w+\]\s*[^[\n]*\s*/g,""),e=e.replace(/📷\s*[^-\n]*-\s*Page\s*\d+\s*-\s*Image\s*#\d+\s*/g,""),e=e.replace(/🎨\s*Art\s*Basel\s*Image\s*\d+\s*\(Page\s*\d+\)\s*/g,""),e=e.replace(/\n\s*\n\s*\n/g,`
32
+
33
+ `),e=e.replace(/^\s+|\s+$/g,""),e=e.replace(/[ \t]+/g," "),e}generateStructuredData(t,e,r,a,s){let o=this.splitTextIntoPages(e,a),n=this.createPageDataArray(o,r,a);return {metadata:{filename:t,extractedAt:new Date().toISOString(),totalPages:a,totalTextLength:e.length,totalImages:r.length,extractionOptions:s},pages:n}}splitTextIntoPages(t,e){if(e<=1)return [t];let r=/(?:--- PAGE \d+ ---|🎨 ART BASEL PAGE \d+ 🎨|PAGE \d+)/g,a=t.match(r);return a&&a.length>0?this.splitByPageMarkers(t,r):this.splitByEstimatedLength(t,e)}splitByPageMarkers(t,e){let r=t.split(e),a=[];for(let s=1;s<r.length;s++){let o=r[s];o&&a.push(o.trim());}return a.length===0&&a.push(t),a}splitByEstimatedLength(t,e){let r=t.split(`
34
+ `),a=Math.ceil(r.length/e),s=[];for(let o=0;o<e;o++){let n=o*a,l=Math.min((o+1)*a,r.length),i=r.slice(n,l).join(`
35
+ `);s.push(i);}return s}createPageDataArray(t,e,r){let a=[];for(let s=0;s<r;s++){let o=s+1,n=t[s]||"",l=this.getImagesForPage(e,o),i=this.extractRawText(n);a.push({pageNumber:o,text:{content:n,rawText:i,wordCount:this.countWords(i),characterCount:i.length},images:l,imageCount:l.length});}return a}getImagesForPage(t,e){return t.filter(r=>r.page===e).map(r=>{let a={id:r.id,name:r.name||`image_${r.id}`,position:r.position,format:r.format||"unknown"};if("filename"in r){let s=r.filename;s!==void 0&&(a.filename=s);}if("path"in r){let s=r.path;s!==void 0&&(a.path=s);}if("size"in r){let s=r.size;s!==void 0&&(a.size=s);}return a})}countWords(t){return t.trim()?t.trim().split(/\s+/).length:0}generateJSONString(t,e=2){return JSON.stringify(t,null,e)}generateSummary(t){let e=t.pages.reduce((o,n)=>o+n.text.wordCount,0),r=t.pages.reduce((o,n)=>o+n.text.characterCount,0),a=t.pages.filter(o=>o.text.content.trim().length>0).length,s=t.pages.filter(o=>o.imageCount>0).length;return {totalWords:e,totalCharacters:r,averageWordsPerPage:Math.round(e/t.pages.length),averageImagesPerPage:Math.round(t.metadata.totalImages/t.pages.length*10)/10,pagesWithText:a,pagesWithImages:s}}};var W=class{cacheDir;constructor(t="./tmp/pdf-cache"){this.cacheDir=t,this.ensureCacheDir();}generateCacheKey(t){let e=D.resolve(t),r=y__default.statSync(e),a=`${e}:${r.mtime.getTime()}:${r.size}`;return le.createHash("md5").update(a).digest("hex")}getCacheDir(t){let e=this.generateCacheKey(t);return D.join(this.cacheDir,e)}ensureCacheDir(){y__default.existsSync(this.cacheDir)||y__default.mkdirSync(this.cacheDir,{recursive:true});}isCached(t){try{let e=this.getCacheDir(t),r=D.join(e,"cache-info.json");return y__default.existsSync(r)}catch{return false}}getCacheInfo(t){try{let e=this.getCacheDir(t),r=D.join(e,"cache-info.json");return y__default.existsSync(r)?JSON.parse(y__default.readFileSync(r,"utf-8")):null}catch{return null}}createCache(t,e){let 
r=this.getCacheDir(t);y__default.existsSync(r)||y__default.mkdirSync(r,{recursive:true});let a=y__default.statSync(t),s={pdfPath:D.resolve(t),lastModified:a.mtime.getTime(),totalPages:e,cacheDir:r,created:new Date().toISOString()},o=D.join(r,"cache-info.json");return y__default.writeFileSync(o,JSON.stringify(s,null,2)),r}cachePageResult(t,e,r){try{let a=this.getCacheDir(t),s=D.join(a,`page-${e}.json`);y__default.writeFileSync(s,JSON.stringify(r,null,2));}catch{}}getCachedPageResult(t,e){try{let r=this.getCacheDir(t),a=D.join(r,`page-${e}.json`);return y__default.existsSync(a)?JSON.parse(y__default.readFileSync(a,"utf-8")):null}catch{return null}}getAllCachedPages(t){try{let e=this.getCacheDir(t),r=[];if(!y__default.existsSync(e))return r;let s=y__default.readdirSync(e).filter(o=>o.startsWith("page-")&&o.endsWith(".json"));for(let o of s)try{let n=D.join(e,o),l=JSON.parse(y__default.readFileSync(n,"utf-8"));r.push(l);}catch{}return r.sort((o,n)=>o.pageNumber-n.pageNumber),r}catch{return []}}clearCache(t){try{let e=this.getCacheDir(t);y__default.existsSync(e)&&y__default.rmSync(e,{recursive:!0,force:!0});}catch{}}clearAllCache(){try{y__default.existsSync(this.cacheDir)&&y__default.rmSync(this.cacheDir,{recursive:!0,force:!0}),this.ensureCacheDir();}catch{}}getCacheStats(){try{let t=0,e=0,r=0;if(y__default.existsSync(this.cacheDir)){let a=y__default.readdirSync(this.cacheDir);t=a.length;for(let s of a){let o=D.join(this.cacheDir,s);if(y__default.statSync(o).isDirectory()){let n=y__default.readdirSync(o),l=n.filter(i=>i.startsWith("page-")&&i.endsWith(".json"));e+=l.length;for(let i of n){let c=D.join(o,i);r+=y__default.statSync(c).size;}}}}return {totalCachedPdfs:t,totalCachedPages:e,totalCacheSize:r,cacheDir:this.cacheDir}}catch{return {totalCachedPdfs:0,totalCachedPages:0,totalCacheSize:0,cacheDir:this.cacheDir}}}};var j=class{textExtractor;imageExtractor;formatProcessor;structuredDataGenerator;cacheManager;constructor(t){this.textExtractor=new 
$,this.imageExtractor=new F,this.formatProcessor=new S,this.structuredDataGenerator=new U,this.cacheManager=new W(t);}async extract(t,e={}){let r={pdfPath:t,outputDir:e.imageOutputDir||"./extracted-images",options:{extractText:true,extractImages:true,extractImageFiles:false,useImagePaths:false,imageRefFormat:"[IMAGE:{id}]",verbose:false,...e}},a=this.validateConfiguration(r);if(a.length>0)throw this.createValidationError("Invalid configuration",a);try{if(!y__default.existsSync(t))throw new Error(`PDF file not found: ${t}`);let s=Date.now();this.reportProgress(r.options,{currentPage:0,totalPages:0,phase:"processing"});let o=null,n=null;if(r.options.extractText&&(r.options.verbose,o=await this.textExtractor.extract(t),r.options.includePageMarkers)){let u=r.options.pageMarkerFormat||"--- PAGE {page} ---",g={pageOffset:r.options.pageOffset||0,includeImageRefs:r.options.includeImageRefs??!1,imageRefFormat:r.options.imageRefFormat??"[IMG:{id}] {name}"};r.options.imageEngine&&(g.imageEngine=r.options.imageEngine),n=await this.textExtractor.extractWithPageMarkers(t,u,g);}let l=[];r.options.extractTextItems&&r.options.extractText&&(r.options.verbose,l=await this.textExtractor.extractTextItems(t,r.options));let i=null;r.options.extractImages&&(r.options.verbose,i=await this.imageExtractor.extract(t,r.options));let c=await this.processResults(t,o,n,i,l,r.options,s);return this.reportProgress(r.options,{currentPage:c.document.pages,totalPages:c.document.pages,phase:"complete"}),c}catch(s){throw r.options.verbose,this.createExtractionError("PDF content extraction failed",s)}}async extractText(t,e={}){return (await this.extract(t,{...e,extractText:true,extractImages:false})).cleanText}async extractImages(t,e={}){return (await this.extract(t,{...e,extractText:false,extractImages:true})).images}async extractImageFiles(t,e="./extracted-images",r={}){return (await 
this.extract(t,{...r,extractImageFiles:true,imageOutputDir:e,useImagePaths:true})).images.filter(s=>s.filePath).map(s=>s.filePath)}validateConfiguration(t){return k(t)}async processResults(t,e,r,a,s,o,n){let l=D.basename(t),c={document:{filename:l,pages:a?.totalPages||e?.numPages||0,textLength:e?.text?.length||0,extractedAt:new Date().toISOString(),metadata:e?.info||{},options:o},pages:[],images:a?.images||[],textItems:s,textWithRefs:"",cleanText:this.extractRawText(e?.text||"")};if(o.extractText&&o.extractImages&&e&&a)if(r?.text&&o.includeImageRefs)c.textWithRefs=r.text;else {let u=r?.text||e.text;c.textWithRefs=this.formatProcessor.generateTextWithImageRefs(u,a.images,o.imageRefFormat||"[IMAGE:{id}]",c.document.pages);}else o.extractText&&e?c.textWithRefs=r?.text||e.text:o.extractImages&&a&&(c.textWithRefs=this.formatProcessor.generateImageOnlyRefs(a.images,o.imageRefFormat||"[IMAGE:{id}]"));if(c.summary={totalPages:c.document.pages,totalTextItems:0,totalImages:c.images.length,totalTextLength:c.document.textLength,averageImagesPerPage:(c.images.length/c.document.pages).toFixed(2),pagesWithImages:new Set(c.images.map(u=>u.page)).size},o.generateStructuredData){let u=c.textWithRefs||c.cleanText;c.structuredData=this.structuredDataGenerator.generateStructuredData(l,u,c.images,c.document.pages,o),o.verbose;}return o.verbose,c}async getText(t,e,r={}){return (await this.getPage(t,e,{...r,extractText:true,extractImages:false})).text}async getImages(t,e,r={}){return (await this.getPage(t,e,{...r,extractText:false,extractImages:true})).images}async getTextItems(t,e,r={}){return (await this.getPage(t,e,{...r,extractText:true,extractTextItems:true})).textItems}async getRawText(t,e,r={}){return (await this.getPage(t,e,{...r,extractText:true,extractImages:false})).rawText}async getPage(t,e,r={}){if(r.useCache!==false){let u=this.cacheManager.getCachedPageResult(t,e);if(u)return r.verbose,u}let a={...r,specificPages:[e]},s=await 
this.extract(t,a),o=this.extractPageText(s.textWithRefs||s.cleanText,e),n=s.images.filter(u=>u.page===e),l=s.textItems?.filter(u=>u.page===e)||[],i=this.extractRawText(o),c={pageNumber:e,text:o,rawText:i,textItems:l,images:n,metadata:{wordCount:this.countWords(i),characterCount:i.length,imageCount:n.length}};return r.useCache!==false&&this.cacheManager.cachePageResult(t,e,c),c}extractPageText(t,e){let r=/(?:--- PAGE (\d+) ---|🎨 ART BASEL PAGE (\d+) 🎨|PAGE (\d+))/g,a=t.split(r);if(a.length>1){for(let i=1;i<a.length;i+=4)if(parseInt(a[i]||a[i+1]||a[i+2]||"0",10)===e)return a[i+3]||""}let s=t.split(`
36
+ `),o=Math.ceil(s.length/e),n=(e-1)*o,l=Math.min(e*o,s.length);return s.slice(n,l).join(`
37
+ `)}countWords(t){return t.trim()?t.trim().split(/\s+/).length:0}extractRawText(t){let e=t;return e=e.replace(/--- PAGE \d+ ---\s*/g,""),e=e.replace(/🎨 ART BASEL PAGE \d+ 🎨\s*/g,""),e=e.replace(/PAGE \d+\s*/g,""),e=e.replace(/\[IMG:\w+\]\s*\w*\s*/g,""),e=e.replace(/\[IMG-\w+\]\s*[^[\n]*\s*/g,""),e=e.replace(/📷\s*[^-\n]*-\s*Page\s*\d+\s*-\s*Image\s*#\d+\s*/g,""),e=e.replace(/🎨\s*Art\s*Basel\s*Image\s*\d+\s*\(Page\s*\d+\)\s*/g,""),e=e.replace(/\n\s*\n\s*\n/g,`
38
+
39
+ `),e=e.replace(/^\s+|\s+$/g,""),e=e.replace(/[ \t]+/g," "),e}clearCache(t){this.cacheManager.clearCache(t);}getCacheStats(){return this.cacheManager.getCacheStats()}reportProgress(t,e){t.progressCallback&&t.progressCallback(e);}createValidationError(t,e){let r=new Error(t);return r.code="VALIDATION_ERROR",r.validationErrors=e,r}createExtractionError(t,e){let r=new Error(t);return r.code="EXTRACTION_ERROR",r.originalError=e,r}},C=new j;N();N();async function fe(p,t={}){return C.extract(p,t)}async function pe(p,t={}){return C.extractText(p,t)}async function de(p,t={}){return C.extractImages(p,t)}async function xe(p,t="./extracted-images",e={}){return C.extractImageFiles(p,t,e)}var he="1.0.0",it={PDFExtractor:j,pdfExtractor:C,TextExtractor:$,ImageExtractor:F,FormatProcessor:S,extractPdfContent:fe,extractText:pe,extractImages:de,extractImageFiles:xe,validateConfig:k,validateImageRefFormat:V,validateFilePath:J,version:he};export{S as FormatProcessor,F as ImageExtractor,j as PDFExtractor,$ as TextExtractor,it as default,xe as extractImageFiles,de as extractImages,fe as extractPdfContent,pe as extractText,C as pdfExtractor,k as validateConfig,J as validateFilePath,V as validateImageRefFormat,he as version};//# sourceMappingURL=index.mjs.map
40
+ //# sourceMappingURL=index.mjs.map
package/package.json ADDED
@@ -0,0 +1,80 @@
1
+ {
2
+ "name": "pdf-plus",
3
+ "version": "1.0.0",
4
+ "description": "A comprehensive PDF content extraction library with support for text, images, and structured data",
5
+ "main": "dist/index.js",
6
+ "module": "dist/index.mjs",
7
+ "types": "dist/index.d.ts",
8
+ "exports": {
9
+ ".": {
10
+ "types": "./dist/index.d.ts",
11
+ "import": "./dist/index.mjs",
12
+ "require": "./dist/index.js"
13
+ }
14
+ },
15
+ "files": [
16
+ "dist/**/*.js",
17
+ "dist/**/*.mjs",
18
+ "dist/**/*.d.ts",
19
+ "dist/**/*.d.mts",
20
+ "README.md",
21
+ "LICENSE"
22
+ ],
23
+ "sideEffects": false,
24
+ "keywords": [
25
+ "pdf",
26
+ "extraction",
27
+ "text",
28
+ "images",
29
+ "document",
30
+ "parsing",
31
+ "content",
32
+ "typescript"
33
+ ],
34
+ "author": "Kauan Guesser <54258870+kauandotnet@users.noreply.github.com>",
35
+ "license": "MIT",
36
+ "repository": {
37
+ "type": "git",
38
+ "url": "https://github.com/kauandotnet/pdfnode.git"
39
+ },
40
+ "bugs": {
41
+ "url": "https://github.com/kauandotnet/pdfnode/issues"
42
+ },
43
+ "homepage": "https://github.com/kauandotnet/pdfnode#readme",
44
+ "engines": {
45
+ "node": ">=18.0.0"
46
+ },
47
+ "dependencies": {
48
+ "node-poppler": "^8.0.4",
49
+ "pdf-lib": "^1.17.1",
50
+ "pdf-parse": "^1.1.1",
51
+ "pdfjs-dist": "^5.4.149",
52
+ "pngjs": "^7.0.0"
53
+ },
54
+ "devDependencies": {
55
+ "@biomejs/biome": "^2.2.4",
56
+ "@types/node": "^24.5.2",
57
+ "@types/pdf-parse": "^1.1.5",
58
+ "@types/pngjs": "^6.0.5",
59
+ "rimraf": "^6.0.1",
60
+ "tsup": "^8.3.5",
61
+ "typedoc": "^0.28.13",
62
+ "typescript": "^5.9.2"
63
+ },
64
+ "scripts": {
65
+ "build": "tsup",
66
+ "build:prod": "NODE_ENV=production tsup",
67
+ "dev": "tsup --watch",
68
+ "lint": "biome lint src",
69
+ "lint:fix": "biome lint --write src",
70
+ "format": "biome format src",
71
+ "format:fix": "biome format --write src",
72
+ "check": "biome check src",
73
+ "check:fix": "biome check --write src",
74
+ "typecheck": "tsc --noEmit",
75
+ "test:unit": "echo 'No tests configured - skipping'",
76
+ "clean": "rimraf dist",
77
+ "docs:dev": "typedoc --watch",
78
+ "docs:build": "typedoc"
79
+ }
80
+ }