pdf-plus 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +303 -2
- package/dist/index.d.mts +973 -21
- package/dist/index.d.ts +973 -21
- package/dist/index.js +35 -36
- package/dist/index.mjs +35 -36
- package/dist/workers/image-decoder.worker.d.mts +2 -0
- package/dist/workers/image-decoder.worker.d.ts +2 -0
- package/dist/workers/image-decoder.worker.js +2 -0
- package/dist/workers/image-decoder.worker.mjs +2 -0
- package/dist/workers/jp2-converter.worker.d.mts +2 -0
- package/dist/workers/jp2-converter.worker.d.ts +2 -0
- package/dist/workers/jp2-converter.worker.js +2 -0
- package/dist/workers/jp2-converter.worker.mjs +2 -0
- package/package.json +22 -7
package/dist/index.mjs
CHANGED
|
@@ -1,42 +1,41 @@
|
|
|
1
|
-
import*as P from'fs';import P__default from'fs';import D from'path';import se from'pdf-parse';import {PDFDocument}from'pdf-lib';import ue from'crypto';var ce=Object.defineProperty;var O=(p,t)=>()=>(p&&(t=p(p=0)),t);var Y=(p,t)=>{for(var e in t)ce(p,e,{get:t[e],enumerable:true});};var T,H=O(()=>{T=class{};});var B,Q=O(()=>{H();B=class extends T{name="pdf-lib";description="PDF-lib based extraction with full format support";async isAvailable(){try{return await import('pdf-lib'),!0}catch{return false}}getCapabilities(){return {formats:["jpg","jpeg","png","jp2","tiff"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:false}}async extractImages(t,e){try{let{PDFDocument:r,PDFName:a}=await import('pdf-lib');if(!P__default.existsSync(t))return {success:!1,error:`PDF file not found: ${t}`};let s=P__default.readFileSync(t),n=await r.load(s),o=n.getPages(),g=[],i=1;e.verbose;for(let l=0;l<o.length;l++){let m=o[l],u=l+1,c=m?.node.Resources;if(!c)continue;let b=(typeof c=="function"?c():c)?.get?.(a.of("XObject"));if(!b)continue;let d=b.entries?.()||[],h=0;e.verbose;for(let[,x]of d){let y=n.context.lookup(x);if(!y||y.dict?.get?.(a.of("Subtype"))?.toString()!=="/Image")continue;h++;let E=await this.extractImageFromPdfObject(y,u,i,e);E&&g.push(E),i++;}}return e.verbose,{success:!0,images:g}}catch(r){return {success:false,error:`PDF-lib extraction failed: ${r instanceof Error?r.message:"Unknown error"}`}}}async extractImageFromPdfObject(t,e,r,a){try{let{PDFName:s}=await import('pdf-lib'),n=t.dict.get(s.of("Width")),o=t.dict.get(s.of("Height")),g=t.dict.get(s.of("Filter")),i=t.dict.get(s.of("ColorSpace")),l=t.dict.get(s.of("BitsPerComponent")),m=n&&typeof n.value=="number"?n.value:100,u=o&&typeof o.value=="number"?o.value:100,c=l&&typeof l.value=="number"?l.value:8;a.verbose;let f=await this.extractImageData(t,g,m,u,i,c,a);if(!f.success||!f.imageData)return a.verbose,null;let 
b=f.extension||"bin",d=`img_p${e}_${r}.${b}`,h,x=f.imageData.length;if(a.extractImageFiles&&a.imageOutputDir){let y=D.join(a.imageOutputDir,"images");P__default.existsSync(y)||P__default.mkdirSync(y,{recursive:!0}),h=D.join(y,d),P__default.writeFileSync(h,f.imageData),a.verbose;}return {id:`img_${r}`,filename:`images/${d}`,filepath:h||"",page:e,width:m,height:u,format:this.getFormatFromMimeType(f.mimeType||""),mimeType:f.mimeType||"",size:x,position:{x:0,y:0,width:m,height:u}}}catch{return a.verbose,null}}async extractImageData(t,e,r,a,s,n,o){try{let g=await import('zlib'),i,l="image/jpeg",m="jpg";if(e){let u=e.toString();if(o.verbose,u.includes("DCTDecode")&&u.includes("FlateDecode")){o.verbose;try{let c=t.contents;i=g.inflateSync(Buffer.from(c)),l="image/jpeg",m="jpg",o.verbose;}catch(c){return o.verbose,{success:!1,error:`Zlib decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else if(u.includes("DCTDecode"))o.verbose,i=Buffer.from(t.contents),l="image/jpeg",m="jpg";else if(u.includes("FlateDecode")){o.verbose;try{let c=t.contents,f=g.inflateSync(Buffer.from(c));o.verbose;let b=this.detectImageFormat(f);if(b.valid)i=f,l=b.mimeType,m=b.extension,o.verbose;else {let d=await this.createPngFromPdfMetadata(f,r,a,s,n,o);if(d.success&&d.pngData)i=d.pngData,l="image/png",m="png",o.verbose;else return o.verbose,{success:!1,error:`PNG creation failed: ${d.error}`}}}catch(c){return o.verbose,{success:!1,error:`FlateDecode decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else if(u.includes("JPXDecode")){o.verbose;try{i=Buffer.from(t.contents),l="image/jp2",m="jp2",o.verbose;}catch(c){return o.verbose,{success:!1,error:`JPXDecode extraction failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else {o.verbose;try{let c=await t.asUint8Array();i=Buffer.from(c);let f=this.detectImageFormat(i);f.valid&&(l=f.mimeType,m=f.extension);}catch(c){return o.verbose,{success:!1,error:`Generic decompression failed: ${c instanceof 
Error?c.message:"Unknown error"}`}}}}else {o.verbose;try{let u=await t.asUint8Array();i=Buffer.from(u);let c=this.detectImageFormat(i);c.valid&&(l=c.mimeType,m=c.extension);}catch(u){return o.verbose,{success:!1,error:`Raw data extraction failed: ${u instanceof Error?u.message:"Unknown error"}`}}}return {success:!0,imageData:i,mimeType:l,extension:m}}catch(g){return {success:false,error:`Image data extraction failed: ${g instanceof Error?g.message:"Unknown error"}`}}}detectImageFormat(t){return !t||t.length<10?{valid:false}:t[0]===255&&t[1]===216?{valid:true,mimeType:"image/jpeg",extension:"jpg"}:t[0]===137&&t[1]===80&&t[2]===78&&t[3]===71?{valid:true,mimeType:"image/png",extension:"png"}:t[0]===71&&t[1]===73&&t[2]===70?{valid:true,mimeType:"image/gif",extension:"gif"}:t[0]===73&&t[1]===73||t[0]===77&&t[1]===77?{valid:true,mimeType:"image/tiff",extension:"tiff"}:t.length>=12&&t[0]===0&&t[1]===0&&t[2]===0&&t[3]===12&&t[4]===106&&t[5]===80&&t[6]===32&&t[7]===32?{valid:true,mimeType:"image/jp2",extension:"jp2"}:{valid:false}}async createPngFromPdfMetadata(t,e,r,a,s,n){try{let{PNG:o}=await import('pngjs'),g=a?.toString()||"",i=3,l=2;g.includes("DeviceGray")||g.includes("Gray")?(i=1,l=0):g.includes("DeviceRGB")||g.includes("RGB")?(i=3,l=2):(g.includes("DeviceCMYK")||g.includes("CMYK"))&&(i=4,l=2);let m=e*r*i*(s/8),u=t.length;if(n.verbose,Math.abs(u-m)>u*.1)return {success:!1,error:`Data size mismatch: expected ${m}, got ${u} bytes`};let c=new o({width:e,height:r,colorType:l===0?0:6,bitDepth:8}),f;if(i===1){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=t[d]||0,x=d*4;f[x]=h,f[x+1]=h,f[x+2]=h,f[x+3]=255;}}else if(i===3){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=d*3,x=d*4;f[x]=t[h]||0,f[x+1]=t[h+1]||0,f[x+2]=t[h+2]||0,f[x+3]=255;}}else if(i===4){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let 
h=d*4,x=(t[h]||0)/255,y=(t[h+1]||0)/255,w=(t[h+2]||0)/255,E=(t[h+3]||0)/255,I=d*4;f[I]=Math.round(255*(1-x)*(1-E)),f[I+1]=Math.round(255*(1-y)*(1-E)),f[I+2]=Math.round(255*(1-w)*(1-E)),f[I+3]=255;}}else return {success:!1,error:`Unsupported color space with ${i} components`};c.data=f;let b=o.sync.write(c);return n.verbose,{success:!0,pngData:b}}catch(o){return {success:false,error:`PNG creation error: ${o instanceof Error?o.message:"Unknown error"}`}}}getFormatFromMimeType(t){switch(t){case "image/jpeg":return "JPEG";case "image/png":return "PNG";case "image/jp2":return "JPEG 2000";case "image/gif":return "GIF";case "image/tiff":return "TIFF";default:return "unknown"}}};});var A,ee=O(()=>{H();A=class extends T{name="poppler";description="Poppler-based extraction using pdfimages command";async isAvailable(){try{let{Poppler:t}=await import('node-poppler');return new t,!0}catch{return false}}getCapabilities(){return {formats:["png"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:true}}async extractImages(t,e){try{let{Poppler:r}=await import('node-poppler');if(!P__default.existsSync(t))return {success:!1,error:`PDF file not found: ${t}`};let a=new r,s=[],n=D.join(process.cwd(),"temp-poppler-images");P__default.existsSync(n)||P__default.mkdirSync(n,{recursive:!0});try{e.verbose;let o=D.join(n,"img"),g={firstPageToConvert:1,lastPageToConvert:-1,pngFile:!0};e.verbose,await a.pdfImages(t,o,g),e.verbose;let i={list:!0};e.verbose;let l=await a.pdfImages(t,void 0,i),m=this.parseImageList(l);e.verbose;let u=P__default.readdirSync(n).filter(c=>c.startsWith("img-")&&c.endsWith(".png"));e.verbose;for(let c=0;c<u.length;c++){let f=u[c];if(!f)continue;let b=D.join(n,f);if(!P__default.existsSync(b))continue;let d=P__default.statSync(b);P__default.readFileSync(b);let h=f.match(/img-(\d+)\.png/),x=h?parseInt(h[1],10)+1:c+1,y=m[c]||{page:1,index:x,width:0,height:0,format:"PNG"},w=y.page,E=`img_p${w}_${x}.png`,I;if(e.extractImageFiles&&e.imageOutputDir){let 
L=D.join(e.imageOutputDir,"images");P__default.existsSync(L)||P__default.mkdirSync(L,{recursive:!0}),I=D.join(L,E),P__default.copyFileSync(b,I),e.verbose;}let ie={id:`img_${x}`,filename:`images/${E}`,filepath:I||"",page:w,width:y.width,height:y.height,format:"PNG",mimeType:"image/png",size:d.size,position:{x:0,y:0,width:y.width,height:y.height}};s.push(ie);}return e.verbose,{success:!0,images:s}}finally{P__default.existsSync(n)&&P__default.rmSync(n,{recursive:!0,force:!0});}}catch(r){return {success:false,error:`Poppler extraction failed: ${r instanceof Error?r.message:"Unknown error"}`}}}parseImageList(t){let e=[],r=t.split(`
|
|
2
|
-
`);for(let a of r){let s=a.match(/^\s*(\d+)\s+(\d+)\s+\w+\s+(\d+)\s+(\d+)\s+\w+\s+\d+\s+\d+\s+(\w+)/);if(s){let n=parseInt(s[1],10),o=parseInt(s[2],10),g=parseInt(s[3],10),i=parseInt(s[4],10),l=s[5]?.toUpperCase()||"PNG";e.push({page:n,index:o,width:g,height:i,format:l});}}return e}};});var te={};Y(te,{ImageEngineFactory:()=>X});var X,re=O(()=>{Q();ee();X=class p{static engines=new Map;static async getEngine(t){if(t==="auto"&&(t=await p.selectBestEngine()),p.engines.has(t))return p.engines.get(t);let e;switch(t){case "pdf-lib":e=new B;break;case "poppler":e=new A;break;default:throw new Error(`Unknown image extraction engine: ${t}`)}if(!await e.isAvailable())throw new Error(`Image extraction engine '${t}' is not available on this system`);return p.engines.set(t,e),e}static async getAvailableEngines(){let t=[B,A],e=[];for(let r of t){let a=new r,s=await a.isAvailable();e.push({name:a.name,description:a.description,available:s,capabilities:a.getCapabilities()});}return e}static async selectBestEngine(){let t=await p.getAvailableEngines(),e=["pdf-lib","poppler"];for(let r of e)if(t.find(s=>s.name===r)?.available)return r;throw new Error("No image extraction engines are available on this system")}static clearCache(){p.engines.clear();}static getRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"},{useCase:"Vector image extraction",engine:"poppler",reason:"Poppler can extract vector graphics as raster images"}]}};});var ae={};Y(ae,{ImageExtractor:()=>F});var F,N=O(()=>{F=class{async extract(t,e={}){let 
r={verbose:false,extractImageFiles:false,imageEngine:"auto",...e};r.verbose,r.extractImageFiles&&r.imageOutputDir&&(P__default.existsSync(r.imageOutputDir)||P__default.mkdirSync(r.imageOutputDir,{recursive:true}));try{let{ImageEngineFactory:a}=await Promise.resolve().then(()=>(re(),te)),s=await a.getEngine(r.imageEngine);r.verbose;let n=await s.extractImages(t,r);if(!n.success)throw new Error(n.error||"Engine extraction failed");return {success:!0,images:n.images||[],metadata:{totalImages:n.images?.length||0,engine:s.name}}}catch{r.verbose;try{return await this.extractWithPdfLib(t,r)}catch(s){return r.verbose,{success:false,images:[],error:s instanceof Error?s.message:String(s)}}}}static async getAvailableEngines(){return [{name:"pdf-lib",description:"PDF-lib based extraction with full format support",available:true,capabilities:{formats:["jpg","jpeg","png","jp2","tiff"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:false}},{name:"poppler",description:"Poppler-based extraction using pdfimages command",available:false,capabilities:{formats:["jpg","jpeg","png","tiff","ppm","pbm"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:true}}]}static getEngineRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing (coming soon)"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"}]}async extractWithPdfLib(t,e={}){try{let{PDFDocument:r,PDFName:a}=await import('pdf-lib'),s=P__default.readFileSync(t),n=await 
r.load(s,{ignoreEncryption:!0}),o=n.getPageCount(),g=[],i=1;e.verbose,e.extractImageFiles&&e.imageOutputDir&&(P__default.existsSync(e.imageOutputDir)||P__default.mkdirSync(e.imageOutputDir,{recursive:!0}));for(let l=0;l<o;l++){let m=l+1;try{let c=n.getPage(l).node.Resources();if(!c){e.verbose;continue}let f=c.get(a.of("XObject"));if(!f){e.verbose;continue}let b=f.dict;e.verbose;for(let[d,h]of b)try{let x=n.context.lookup(h),y=x.dict.get(a.of("Subtype"));if(!y||y.toString()!=="/Image")continue;let w=await this.extractImageFromPdfObject(x,m,i,e);w&&(g.push(w),i++);}catch{e.verbose;}}catch{e.verbose;}}return e.verbose,{images:g,totalPages:o,totalImages:g.length}}catch(r){throw e.verbose,r}}async extractImageFromPdfObject(t,e,r,a){try{let{PDFName:s}=await import('pdf-lib'),n=t.dict.get(s.of("Width")),o=t.dict.get(s.of("Height")),g=t.dict.get(s.of("Filter")),i=t.dict.get(s.of("ColorSpace")),l=t.dict.get(s.of("BitsPerComponent")),m=n&&typeof n.value=="number"?n.value:100,u=o&&typeof o.value=="number"?o.value:100,c=l&&typeof l.value=="number"?l.value:8;a.verbose;let f=await this.extractImageData(t,g,m,u,i,c,a);if(!f.success||!f.imageData)return a.verbose,null;let b=f.imageData,d=f.mimeType||"image/jpeg",h=f.extension||"jpg",x=`img_p${e}_${r}.${h}`,y="",w=b.length;return a.extractImageFiles&&a.imageOutputDir&&(y=D.join(a.imageOutputDir,x),P__default.writeFileSync(y,b),a.verbose),{id:`img_${r}`,name:x,page:e,position:{x:0,y:0,width:m,height:u},width:m,height:u,format:d==="image/jpeg"?"JPEG":d==="image/png"?"PNG":"unknown",filePath:y}}catch{return a.verbose,null}}async extractImageData(t,e,r,a,s,n,o){try{let g=await import('zlib'),i,l="image/jpeg",m="jpg";if(e){let u=e.toString();if(o.verbose,u.includes("DCTDecode")&&u.includes("FlateDecode")){o.verbose;try{let c=t.contents;i=g.inflateSync(Buffer.from(c)),l="image/jpeg",m="jpg",o.verbose;}catch(c){return o.verbose,{success:!1,error:`Zlib decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else 
if(u.includes("DCTDecode"))o.verbose,i=Buffer.from(t.contents),l="image/jpeg",m="jpg";else if(u.includes("FlateDecode")){o.verbose;try{let c=t.contents,f=g.inflateSync(Buffer.from(c));o.verbose;let b=this.detectImageFormat(f);if(b.valid)i=f,l=b.mimeType,m=b.extension,o.verbose;else {let d=await this.createPngFromPdfMetadata(f,r,a,s,n,o);if(d.success&&d.pngData)i=d.pngData,l="image/png",m="png",o.verbose;else return o.verbose,{success:!1,error:`PNG creation failed: ${d.error}`}}}catch(c){return o.verbose,{success:!1,error:`FlateDecode decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else if(u.includes("JPXDecode")){o.verbose;try{i=Buffer.from(t.contents),l="image/jp2",m="jp2",o.verbose;}catch(c){return o.verbose,{success:!1,error:`JPXDecode extraction failed: ${c instanceof Error?c.message:"Unknown error"}`}}}else {o.verbose;try{let c=await t.asUint8Array();i=Buffer.from(c);let f=this.detectImageFormat(i);f.valid&&(l=f.mimeType,m=f.extension);}catch(c){return o.verbose,{success:!1,error:`Generic decompression failed: ${c instanceof Error?c.message:"Unknown error"}`}}}}else {o.verbose;try{let u=await t.asUint8Array();i=Buffer.from(u);let c=this.detectImageFormat(i);c.valid&&(l=c.mimeType,m=c.extension);}catch(u){return o.verbose,{success:!1,error:`Raw data extraction failed: ${u instanceof Error?u.message:"Unknown error"}`}}}return !i||i.length<100?{success:!1,error:`Image data too small: ${i?.length||0} bytes`}:{success:!0,imageData:i,mimeType:l,extension:m}}catch(g){return o.verbose,{success:false,error:g instanceof Error?g.message:"Unknown error"}}}detectImageFormat(t){return 
!t||t.length<10?{valid:false}:t[0]===255&&t[1]===216?{valid:true,mimeType:"image/jpeg",extension:"jpg"}:t[0]===137&&t[1]===80&&t[2]===78&&t[3]===71?{valid:true,mimeType:"image/png",extension:"png"}:t[0]===71&&t[1]===73&&t[2]===70?{valid:true,mimeType:"image/gif",extension:"gif"}:t[0]===73&&t[1]===73||t[0]===77&&t[1]===77?{valid:true,mimeType:"image/tiff",extension:"tiff"}:t.length>=12&&t[0]===0&&t[1]===0&&t[2]===0&&t[3]===12&&t[4]===106&&t[5]===80&&t[6]===32&&t[7]===32?{valid:true,mimeType:"image/jp2",extension:"jp2"}:{valid:false}}async createPngFromPdfMetadata(t,e,r,a,s,n){try{let{PNG:o}=await import('pngjs'),g=a?.toString()||"",i=3,l=2;g.includes("DeviceGray")||g.includes("Gray")?(i=1,l=0):g.includes("DeviceRGB")||g.includes("RGB")?(i=3,l=2):(g.includes("DeviceCMYK")||g.includes("CMYK"))&&(i=4,l=2);let m=e*r*i*(s/8),u=t.length;if(n.verbose,Math.abs(u-m)>u*.1)return {success:!1,error:`Data size mismatch: expected ${m}, got ${u} bytes`};let c=new o({width:e,height:r,colorType:l===0?0:6,bitDepth:8}),f;if(i===1){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=t[d]||0,x=d*4;f[x]=h,f[x+1]=h,f[x+2]=h,f[x+3]=255;}}else if(i===3){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=d*3,x=d*4;f[x]=t[h]||0,f[x+1]=t[h+1]||0,f[x+2]=t[h+2]||0,f[x+3]=255;}}else if(i===4){f=Buffer.alloc(e*r*4);for(let d=0;d<e*r;d++){let h=d*4,x=(t[h]||0)/255,y=(t[h+1]||0)/255,w=(t[h+2]||0)/255,E=(t[h+3]||0)/255,I=d*4;f[I]=Math.round(255*(1-x)*(1-E)),f[I+1]=Math.round(255*(1-y)*(1-E)),f[I+2]=Math.round(255*(1-w)*(1-E)),f[I+3]=255;}}else return {success:!1,error:`Unsupported color space with ${i} components`};c.data=f;let b=o.sync.write(c);return n.verbose,{success:!0,pngData:b}}catch(o){return {success:false,error:`PNG creation error: ${o instanceof Error?o.message:"Unknown error"}`}}}};});function k(p){let t=[];if(p.pdfPath?typeof p.pdfPath!="string"?t.push({field:"pdfPath",message:"PDF path must be a 
string",value:p.pdfPath}):P__default.existsSync(p.pdfPath)?p.pdfPath.toLowerCase().endsWith(".pdf")||t.push({field:"pdfPath",message:"File must have .pdf extension",value:p.pdfPath}):t.push({field:"pdfPath",message:"PDF file does not exist",value:p.pdfPath}):t.push({field:"pdfPath",message:"PDF path is required",value:p.pdfPath}),p.outputDir&&typeof p.outputDir!="string"&&t.push({field:"outputDir",message:"Output directory must be a string",value:p.outputDir}),p.options){let{options:e}=p;e.extractText!==void 0&&typeof e.extractText!="boolean"&&t.push({field:"options.extractText",message:"extractText must be a boolean",value:e.extractText}),e.extractImages!==void 0&&typeof e.extractImages!="boolean"&&t.push({field:"options.extractImages",message:"extractImages must be a boolean",value:e.extractImages}),e.extractImageFiles!==void 0&&typeof e.extractImageFiles!="boolean"&&t.push({field:"options.extractImageFiles",message:"extractImageFiles must be a boolean",value:e.extractImageFiles}),e.useImagePaths!==void 0&&typeof e.useImagePaths!="boolean"&&t.push({field:"options.useImagePaths",message:"useImagePaths must be a boolean",value:e.useImagePaths}),e.imageOutputDir&&typeof e.imageOutputDir!="string"&&t.push({field:"options.imageOutputDir",message:"imageOutputDir must be a string",value:e.imageOutputDir}),e.imageRefFormat&&typeof e.imageRefFormat!="string"&&t.push({field:"options.imageRefFormat",message:"imageRefFormat must be a string",value:e.imageRefFormat}),e.baseName&&typeof e.baseName!="string"&&t.push({field:"options.baseName",message:"baseName must be a string",value:e.baseName}),e.verbose!==void 0&&typeof e.verbose!="boolean"&&t.push({field:"options.verbose",message:"verbose must be a boolean",value:e.verbose}),e.memoryLimit&&typeof e.memoryLimit!="string"?t.push({field:"options.memoryLimit",message:"memoryLimit must be a string",value:e.memoryLimit}):e.memoryLimit&&!le(e.memoryLimit)&&t.push({field:"options.memoryLimit",message:'memoryLimit must be in format 
like "512MB", "1GB", etc.',value:e.memoryLimit}),e.batchSize!==void 0&&(typeof e.batchSize!="number"?t.push({field:"options.batchSize",message:"batchSize must be a number",value:e.batchSize}):(e.batchSize<1||e.batchSize>100)&&t.push({field:"options.batchSize",message:"batchSize must be between 1 and 100",value:e.batchSize})),e.progressCallback&&typeof e.progressCallback!="function"&&t.push({field:"options.progressCallback",message:"progressCallback must be a function",value:typeof e.progressCallback}),e.extractText===false&&e.extractImages===false&&t.push({field:"options",message:"At least one of extractText or extractImages must be true",value:{extractText:e.extractText,extractImages:e.extractImages}}),e.useImagePaths===true&&e.extractImageFiles!==true&&t.push({field:"options",message:"useImagePaths requires extractImageFiles to be true",value:{useImagePaths:e.useImagePaths,extractImageFiles:e.extractImageFiles}});}return t}function le(p){return /^\d+(\.\d+)?(MB|GB|KB)$/i.test(p)}function V(p){let t=[],e=["{id}","{name}","{page}","{index}","{path}"];e.some(n=>p.includes(n))||t.push({field:"imageRefFormat",message:`Format must contain at least one valid placeholder: ${e.join(", ")}`,value:p});let a=/\{([^}]+)\}/g,s=p.match(a);if(s)for(let n of s)e.includes(n)||t.push({field:"imageRefFormat",message:`Invalid placeholder: ${n}. 
Valid placeholders are: ${e.join(", ")}`,value:p});return t}function J(p,t=[".pdf"]){let e=[];if(!p)return e.push({field:"filePath",message:"File path is required",value:p}),e;if(typeof p!="string")return e.push({field:"filePath",message:"File path must be a string",value:p}),e;if(!P__default.existsSync(p))return e.push({field:"filePath",message:"File does not exist",value:p}),e;let r=D.extname(p).toLowerCase();return t.length>0&&!t.includes(r)&&e.push({field:"filePath",message:`File must have one of these extensions: ${t.join(", ")}`,value:p}),e}var z=class{pdfLibDoc=null;pdfLibPages=[];textData=[];async processPDF(t){let e=P.readFileSync(t),[r,a]=await Promise.all([this.processPDFLib(e),this.processPDFParse(e)]);this.textData=this.combineResults(r,a);let s=this.textData.map(n=>n.text).join(`
|
|
3
|
-
|
|
4
|
-
`,i=c,l=u.str):l+=` ${u.str}`;}l&&(g+=l),g=g.trim();let m={pageNumber:a.pageIndex+1,text:g,textItems:s.items,pdfParseWidth:n.width,pdfParseHeight:n.height};return e.push(m),g}catch{return e.push({pageNumber:a.pageIndex+1,text:"",textItems:[],pdfParseWidth:0,pdfParseHeight:0}),""}}}),e.sort((a,s)=>a.pageNumber-s.pageNumber)}combineResults(t,e){return t.map(r=>{let a=e.find(n=>n.pageNumber===r.pageNumber),s=a?.text||"";return {pageNumber:r.pageNumber,text:s,width:r.width,height:r.height,rotation:r.rotation,mediaBox:r.mediaBox,textItems:a?.textItems||[],wordCount:this.countWords(s),characterCount:s.length}})}async extractWithPageMarkers(t,e="--- PAGE {page} ---",r={}){let a=await this.processPDF(t),s=[];if(r.includeImageRefs)try{let{ImageExtractor:o}=await Promise.resolve().then(()=>(N(),ae));s=(await new o().extract(t,{extractImageFiles:!1,verbose:!1,imageEngine:r.imageEngine||"auto"})).images||[];}catch{}let n="";return a.pages.forEach(o=>{let g=e.replace("{page}",o.pageNumber.toString()),i=o.text;if(r.includeImageRefs&&s.length>0){let l=s.filter(m=>m.page===o.pageNumber);if(l.length>0){let m=l.map(u=>(r.imageRefFormat||"[IMG:{id}] {name}").replace("{id}",`img_${u.id}`).replace("{name}",u.filename||`img_p${u.page}_${u.id}.jpg`)).join(`
|
|
1
|
+
import {Worker}from'worker_threads';import Se from'os';import T from'path';import {fileURLToPath}from'url';import*as w from'fs';import w__default from'fs';import st from'jimp';import C from'fs/promises';import it from'image-size';import {createRequire}from'module';import*as F from'pdfjs-dist/legacy/build/pdf.mjs';import {PDFDocument}from'pdf-lib';import ft from'crypto';var He=Object.defineProperty;var S=(p,e)=>()=>(p&&(e=p(p=0)),e);var H=(p,e)=>{for(var t in e)He(p,t,{get:e[t],enumerable:true});};var ee,ke=S(()=>{ee=class{};});var A,Ee=S(()=>{A=class{static async executeWithLimit(e,t={}){let r=t.maxConcurrency||10,a=t.verbose||false;if(e.length===0)return [];if(e.length<=r)return Promise.all(e.map(i=>i()));let s=Math.ceil(e.length/r),o=Array.from({length:s},(i,g)=>g).map(async i=>{let g=i*r,m=e.slice(g,g+r),u=await Promise.all(m.map(l=>l()));return a&&g+r<e.length,u});return (await Promise.all(o)).flat()}static async executeWithLimitSettled(e,t={}){let r=t.maxConcurrency||10,a=t.verbose||false;if(e.length===0)return [];if(e.length<=r)return Promise.allSettled(e.map(i=>i()));let s=Math.ceil(e.length/r),o=Array.from({length:s},(i,g)=>g).map(async i=>{let m=i*r,u=e.slice(m,m+r),l=await Promise.allSettled(u.map(h=>h()));if(a){l.filter(d=>d.status==="fulfilled").length;l.filter(d=>d.status==="rejected").length;}return l});return (await Promise.all(o)).flat()}static async map(e,t,r={}){let a=e.map((s,n)=>()=>t(s,n));return this.executeWithLimit(a,r)}static async mapSettled(e,t,r={}){let a=e.map((s,n)=>()=>t(s,n));return this.executeWithLimitSettled(a,r)}static async filter(e,t,r={}){let a=await this.map(e,t,r);return e.filter((s,n)=>a[n])}static async processInChunks(e,t,r,a={}){let s=Math.ceil(e.length/t),o=Array.from({length:s},(c,i)=>{let g=i*t;return e.slice(g,g+t)}).map((c,i)=>()=>r(c,i));return this.executeWithLimit(o,a)}};});var et,fe,re,Te=S(()=>{et=fileURLToPath(import.meta.url),fe=T.dirname(et),re=class{workers=new 
Map;availableWorkers=[];taskQueue=[];workerInstances=new Map;options;stats={completedTasks:0,failedTasks:0,totalTaskDuration:0};monitorInterval;isTerminating=false;constructor(e={}){let t=Se.cpus().length;this.options={maxWorkerThreads:e.maxWorkerThreads??Math.max(1,t-1),minWorkerThreads:e.minWorkerThreads??1,autoScaleWorkers:e.autoScaleWorkers??true,memoryThreshold:e.memoryThreshold??.8,cpuThreshold:e.cpuThreshold??.9,workerTaskTimeout:e.workerTaskTimeout??3e4,workerIdleTimeout:e.workerIdleTimeout??6e4,workerMemoryLimit:e.workerMemoryLimit??512,verbose:e.verbose??false};}async initialize(){await this.initializeWorkers(),this.options.autoScaleWorkers&&this.startMonitoring();}async initializeWorkers(){let e=new Promise((a,s)=>setTimeout(()=>s(new Error("Worker initialization timeout after 10s")),1e4)),t=Array.from({length:this.options.minWorkerThreads},(a,s)=>s),r=Promise.all(t.map(()=>this.spawnWorker()));await Promise.race([r,e]);}async spawnWorker(){let e=`worker-${Date.now()}-${Math.random().toString(36).substr(2,9)}`,t={id:e,state:"idle",tasksCompleted:0,lastTaskTime:Date.now(),memoryUsage:0};return this.workers.set(e,t),this.availableWorkers.push(e),this.options.verbose,e}async getWorkerInstance(e,t){let r=`${e}-${t}`,a=this.workerInstances.get(r);if(a)return a;let s=this.getWorkerScriptPath(t);if(!(await import('fs')).existsSync(s))throw new Error(`Worker script not found: ${s}`);let o=new Worker(s,{resourceLimits:{maxOldGenerationSizeMb:this.options.workerMemoryLimit,maxYoungGenerationSizeMb:Math.floor(this.options.workerMemoryLimit/4)}});return this.workerInstances.set(r,o),o.on("error",c=>{this.options.verbose,this.handleWorkerError(e,c);}),o.on("exit",c=>{c!==0&&this.options.verbose,this.workerInstances.delete(r);}),o}getWorkerScriptPath(e){let t={decode:T.resolve(fe,"workers/image-decoder.worker.js"),convert:T.resolve(fe,"workers/jp2-converter.worker.js"),optimize:T.resolve(fe,"workers/image-optimizer.worker.js")};return t[e]||t.decode}async 
execute(e){return new Promise((t,r)=>{let a={task:e,resolve:t,reject:r,timestamp:Date.now()};this.taskQueue.push(a),this.processQueue();})}async processQueue(){for(;this.taskQueue.length>0&&this.availableWorkers.length>0;){let e=this.taskQueue.shift(),t=this.availableWorkers.shift();if(!e||!t)break;this.executeTask(t,e);}this.taskQueue.length>0&&this.availableWorkers.length===0&&this.workers.size<this.options.maxWorkerThreads&&(await this.scaleUp(),this.processQueue());}async executeTask(e,t){let r=this.workers.get(e);if(!r)return;r.state="busy";let a=Date.now();try{let s=await this.getWorkerInstance(e,t.task.type),n=setTimeout(()=>{t.reject(new Error(`Worker task ${t.task.taskId} timed out after ${this.options.workerTaskTimeout}ms`)),this.handleWorkerTimeout(e);},this.options.workerTaskTimeout),o=c=>{clearTimeout(n),s.off("message",o);let i=Date.now()-a;this.stats.completedTasks++,this.stats.totalTaskDuration+=i,r.tasksCompleted++,r.lastTaskTime=Date.now(),r.state="idle",this.availableWorkers.push(e),c.success?t.resolve(c):t.reject(new Error(c.error||"Worker task failed")),this.processQueue();};s.on("message",o),s.postMessage(t.task);}catch(s){clearTimeout(setTimeout(()=>{},this.options.workerTaskTimeout)),this.stats.failedTasks++,r.state="idle",this.availableWorkers.push(e),t.reject(s instanceof Error?s:new Error("Unknown worker error"));}}handleWorkerError(e,t){let r=this.workers.get(e);r&&(r.state="idle");}handleWorkerTimeout(e){this.options.verbose,this.terminateWorker(e);}async terminateWorker(e){let t=this.workers.get(e);if(!t)return;t.state="terminating";for(let[a,s]of this.workerInstances.entries())a.startsWith(e)&&(await s.terminate(),this.workerInstances.delete(a));this.workers.delete(e);let r=this.availableWorkers.indexOf(e);r>-1&&this.availableWorkers.splice(r,1),this.options.verbose;}async scaleUp(){if(this.workers.size>=this.options.maxWorkerThreads)return;if(this.getMemoryUsage()>this.options.memoryThreshold){this.options.verbose;return}await 
this.spawnWorker();}async scaleDown(){if(this.workers.size<=this.options.minWorkerThreads)return;let e=Array.from(this.workers.entries()).filter(([,t])=>t.state==="idle"&&Date.now()-t.lastTaskTime>this.options.workerIdleTimeout).map(([t])=>t);if(e.length>0){let t=e[0];await this.terminateWorker(t);}}startMonitoring(){this.monitorInterval=setInterval(()=>{this.monitorResources();},5e3);}async monitorResources(){if(this.isTerminating)return;this.getMemoryUsage()>this.options.memoryThreshold?await this.scaleDown():this.taskQueue.length>0?await this.scaleUp():await this.scaleDown();}getMemoryUsage(){let e=process.memoryUsage(),t=Se.totalmem();return e.heapUsed/t}getStats(){let e=Array.from(this.workers.values()).filter(t=>t.state==="busy").length;return {totalWorkers:this.workers.size,activeWorkers:e,idleWorkers:this.workers.size-e,queuedTasks:this.taskQueue.length,completedTasks:this.stats.completedTasks,failedTasks:this.stats.failedTasks,averageTaskDuration:this.stats.completedTasks>0?this.stats.totalTaskDuration/this.stats.completedTasks:0,memoryUsage:this.getMemoryUsage(),cpuUsage:0}}async terminate(){this.isTerminating=true,this.monitorInterval&&clearInterval(this.monitorInterval);let e=Array.from(this.workers.keys()).map(t=>this.terminateWorker(t));await Promise.all(e),this.options.verbose;}};});var K,$e=S(()=>{K=class{totalPixels;constructor(e,t){this.totalPixels=e*t;}static detectColorSpace(e){return e.includes("DeviceGray")||e.includes("Gray")?{componentsPerPixel:1,colorType:0}:e.includes("DeviceRGB")||e.includes("RGB")?{componentsPerPixel:3,colorType:2}:e.includes("DeviceCMYK")||e.includes("CMYK")?{componentsPerPixel:4,colorType:2}:{componentsPerPixel:3,colorType:2}}convertToRGBA(e,t){switch(t){case 1:return this.grayscaleToRGBA(e);case 3:return this.rgbToRGBA(e);case 4:return this.cmykToRGB(e);default:return null}}grayscaleToRGBA(e){let t=Buffer.allocUnsafe(this.totalPixels*4);for(let r=0;r<this.totalPixels;r++){let 
a=e[r]??0,s=r*4;t[s]=a,t[s+1]=a,t[s+2]=a,t[s+3]=255;}return t}rgbToRGBA(e){let t=Buffer.allocUnsafe(this.totalPixels*4);for(let r=0;r<this.totalPixels;r++){let a=r*3,s=r*4;t[s]=e[a]??0,t[s+1]=e[a+1]??0,t[s+2]=e[a+2]??0,t[s+3]=255;}return t}cmykToRGB(e){let t=Buffer.allocUnsafe(this.totalPixels*4);for(let r=0;r<this.totalPixels;r++){let a=r*4,s=(e[a]??0)/255,n=(e[a+1]??0)/255,o=(e[a+2]??0)/255,c=(e[a+3]??0)/255,i=r*4;t[i]=Math.round(255*(1-s)*(1-c)),t[i+1]=Math.round(255*(1-n)*(1-c)),t[i+2]=Math.round(255*(1-o)*(1-c)),t[i+3]=255;}return t}};});function tt(p,e,t){let r=p+e-t,a=Math.abs(r-p),s=Math.abs(r-e),n=Math.abs(r-t);return a<=s&&a<=n?p:s<=n?e:t}function rt(p,e,t=3,r=8){let a=Math.ceil(t*r/8),s=e*a,n=s+1;if(p.length%n!==0)throw new Error(`Data length doesn't match filter columns: ${p.length} % ${n} !== 0`);let o=p.length/n,c=Buffer.alloc(o*s),i=Buffer.alloc(s),g=Buffer.alloc(s),m=h=>h-a<0?0:g[h-a],u=h=>i[h],l=h=>h-a<0?0:i[h-a],f=0;for(let h=0;h<o;h++){let x=h*n,d=p[x];for(let b=0;b<s;b++){let y=p[x+1+b],v;switch(d){case 0:v=y;break;case 1:v=y+m(b)&255;break;case 2:v=y+u(b)&255;break;case 3:v=y+Math.floor((m(b)+u(b))/2)&255;break;case 4:v=y+tt(m(b),u(b),l(b))&255;break;default:throw new Error(`Unknown PNG filter type: ${d}`)}g[b]=v,c[f++]=v;}g.copy(i);}return c}function at(p,e,t=3,r=8){let a=Math.ceil(t*r/8),s=e*a,n=p.length/s,o=Buffer.alloc(p.length);for(let c=0;c<n;c++){let i=c*s;for(let g=0;g<a;g++)o[i+g]=p[i+g];for(let g=a;g<s;g++)o[i+g]=p[i+g]+o[i+g-a]&255;}return o}function De(p,e=1,t=1,r=3,a=8){if(e===1)return p;if(e===2)return at(p,t,r,a);if(e>=10&&e<=15)return rt(p,t,r,a);throw new Error(`Unsupported predictor type: ${e}`)}var Ce=S(()=>{});var Fe={};H(Fe,{getSharp:()=>he,isSharpAvailable:()=>pe});async function pe(){try{return await import('sharp'),!0}catch{return false}}async function he(){try{return (await import('sharp')).default}catch{return null}}var de=S(()=>{});var 
Oe={};H(Oe,{convertJp2ToJpg:()=>nt,convertJp2ToJpgSharp:()=>ze,convertJp2ToJpgWasm:()=>je});async function Re(){return xe||(xe=await(await import('@cornerstonejs/codec-openjpeg')).default({print:()=>{},printErr:()=>{}})),xe}async function je(p,e={}){let t=e.quality!==void 0?e.quality:100;e.verbose!==void 0?e.verbose:false;let a=e.deleteOriginal!==void 0?e.deleteOriginal:true;if(!w__default.existsSync(p))return {success:false,error:`File not found: ${p}`};try{let s=w__default.statSync(p).size,n=p.replace(/\.jp2$/i,".jpg"),o=w__default.readFileSync(p),c=await Re(),i=new c.J2KDecoder;i.getEncodedBuffer(o.length).set(o),i.decode();let m=i.getDecodedBuffer(),u=i.getFrameInfo();await new st({data:Buffer.from(m),width:u.width,height:u.height}).quality(t).writeAsync(n);let f=w__default.statSync(n).size;return a&&w__default.unlinkSync(p),{success:!0,newPath:n,originalSize:s,newSize:f}}catch(s){return {success:false,error:`Conversion failed: ${s.message}`}}}async function ze(p,e={}){let t=e.quality!==void 0?e.quality:100;e.verbose!==void 0?e.verbose:false;let a=e.deleteOriginal!==void 0?e.deleteOriginal:true;if(!w__default.existsSync(p))return {success:false,error:`File not found: ${p}`};try{let s=w__default.statSync(p).size,n=p.replace(/\.jp2$/i,".jpg"),o=w__default.readFileSync(p),c=await Re(),i=new c.J2KDecoder;i.getEncodedBuffer(o.length).set(o),i.decode();let m=i.getDecodedBuffer(),u=i.getFrameInfo(),l=await he();if(!l)throw new Error("Sharp module not available");let f=Buffer.from(m),h=u.componentCount;await l(f,{raw:{width:u.width,height:u.height,channels:h}}).jpeg({quality:t,chromaSubsampling:"4:4:4",mozjpeg:!0}).toFile(n);let d=w__default.statSync(n).size;return a&&w__default.unlinkSync(p),{success:!0,newPath:n,originalSize:s,newSize:d}}catch(s){return {success:false,error:`Conversion failed: ${s.message}`}}}async function nt(p,e={}){e.verbose!==void 0?e.verbose:false;return e.useSharp&&await pe()?ze(p,e):je(p,e)}var xe,We=S(()=>{de();xe=null;});var 
Be={};H(Be,{ImageOptimizer:()=>O});var O,be=S(()=>{O=class{static async optimizeFile(e,t={}){if(!w__default.existsSync(e))return {success:false,originalSize:0,optimizedSize:0,savedBytes:0,savedPercent:0,engine:"none",error:`File not found: ${e}`};let r=w__default.statSync(e).size;if(t.useSharp){let s=await this.optimizeWithSharp(e,t);if(s.success)return {...s,originalSize:r,savedBytes:r-s.optimizedSize,savedPercent:(r-s.optimizedSize)/r*100,engine:"sharp"};t.verbose;}let a=await this.optimizeWithJimp(e,t);return a.success?{...a,originalSize:r,savedBytes:r-a.optimizedSize,savedPercent:(r-a.optimizedSize)/r*100,engine:"jimp"}:{success:false,originalSize:r,optimizedSize:r,savedBytes:0,savedPercent:0,engine:"none",error:a.error||"Image optimization failed"}}static async optimizeWithSharp(e,t){try{let{getSharp:r,isSharpAvailable:a}=await Promise.resolve().then(()=>(de(),Fe));if(!a())return {success:!1,optimizedSize:0,error:"Sharp is not installed. Install it with: npm install sharp"};let s=await r(),n=T.extname(e).toLowerCase();if(n!==".jpg"&&n!==".jpeg"&&n!==".png")return {success:!1,optimizedSize:0,error:`Unsupported format for Sharp: ${n}`};let o=e+".tmp",c=t.quality||80;n===".jpg"||n===".jpeg"?await s(e).jpeg({quality:c,mozjpeg:!0}).toFile(o):n===".png"&&await s(e).png({quality:c,compressionLevel:9}).toFile(o);let i=w__default.statSync(o).size;return w__default.unlinkSync(e),w__default.renameSync(o,e),{success:!0,optimizedSize:i}}catch(r){return {success:false,optimizedSize:0,error:r instanceof Error?r.message:"Unknown error"}}}static async optimizeWithJimp(e,t){try{let r=T.extname(e).toLowerCase();if(r!==".jpg"&&r!==".jpeg"&&r!==".png")return {success:!1,optimizedSize:0,error:`Unsupported format for Jimp: ${r}`};let a=await st.read(e);r===".jpg"||r===".jpeg"?a.quality(t.quality||80):r===".png"&&a.deflateLevel(9);let s=e+".tmp";await a.writeAsync(s);let n=w__default.statSync(s).size;return 
w__default.unlinkSync(e),w__default.renameSync(s,e),{success:!0,optimizedSize:n}}catch(r){return t.verbose,{success:false,optimizedSize:0,error:r instanceof Error?r.message:"Unknown error"}}}static async convertJp2ToJpg(e,t={}){t.verbose;let{convertJp2ToJpg:r}=await Promise.resolve().then(()=>(We(),Oe));return r(e,{quality:t.quality,verbose:t.verbose,deleteOriginal:true,useSharp:t.useSharp})}};});var Ae={};H(Ae,{ImageOptimizer:()=>O});var ae=S(()=>{be();});var Y,Ge=S(()=>{ke();Ee();Te();$e();Ce();Y=class p extends ee{name="pdf-lib";description="PDF-lib based extraction with full format support";static pdfLibModule=null;static imageOptimizerModule=null;workerPool=null;async isAvailable(){try{return await this.getPdfLibModule(),!0}catch{return false}}async getPdfLibModule(){return p.pdfLibModule||(p.pdfLibModule=await import('pdf-lib')),p.pdfLibModule}async getImageOptimizerModule(){return p.imageOptimizerModule||(p.imageOptimizerModule=await Promise.resolve().then(()=>(ae(),Ae))),p.imageOptimizerModule}async initializeWorkerPool(e){if(!e.useWorkerThreads||this.workerPool)return;let t={};e.maxWorkerThreads!==void 0&&(t.maxWorkerThreads=e.maxWorkerThreads),e.minWorkerThreads!==void 0&&(t.minWorkerThreads=e.minWorkerThreads),e.autoScaleWorkers!==void 0&&(t.autoScaleWorkers=e.autoScaleWorkers),e.memoryThreshold!==void 0&&(t.memoryThreshold=e.memoryThreshold),e.cpuThreshold!==void 0&&(t.cpuThreshold=e.cpuThreshold),e.workerTaskTimeout!==void 0&&(t.workerTaskTimeout=e.workerTaskTimeout),e.workerIdleTimeout!==void 0&&(t.workerIdleTimeout=e.workerIdleTimeout),e.workerMemoryLimit!==void 0&&(t.workerMemoryLimit=e.workerMemoryLimit),e.verbose!==void 0&&(t.verbose=e.verbose);try{this.workerPool=new re(t),await this.workerPool.initialize();}catch{e.verbose,this.workerPool=null;}}async cleanupWorkerPool(){this.workerPool&&(await this.workerPool.terminate(),this.workerPool=null);}async 
convertJp2FileWithWorker(e,t,r,a){if(!(this.workerPool&&this.workerPool.getStats().totalWorkers>0)){let{ImageOptimizer:n}=await this.getImageOptimizerModule();return n.convertJp2ToJpg(e,{quality:t,verbose:r,useSharp:a})}try{let n=await C.readFile(e),o={type:"convert",taskId:`convert-${Date.now()}-${Math.random()}`,data:{buffer:n,options:{quality:t,useSharp:a}}},c=await this.workerPool.execute(o);if(!c.success||!c.data)throw new Error(c.error||"JP2 conversion failed");let i=e.replace(/\.jp2$/i,".jpg");return await C.writeFile(i,c.data),await C.unlink(e),{success:!0,newPath:i}}catch(n){return {success:false,error:n instanceof Error?n.message:"Unknown error"}}}async optimizeFileWithWorker(e,t){if(!(this.workerPool&&this.workerPool.getStats().totalWorkers>0)){let{ImageOptimizer:a}=await this.getImageOptimizerModule();return a.optimizeFile(e,t)}try{let a=await C.readFile(e),s=a.length,n=T.extname(e).toLowerCase().slice(1),o=n==="jpg"?"jpeg":n,c={type:"optimize",taskId:`optimize-${Date.now()}-${Math.random()}`,data:{buffer:a,options:{format:o,quality:t.quality||80,progressive:t.progressive!==!1,engine:t.engine||"auto"}}},i=await this.workerPool.execute(c);if(!i.success||!i.data)throw new Error(i.error||"Optimization failed");await C.writeFile(e,i.data);let g=i.data.length,u=(s-g)/s*100;return {success:!0,originalSize:s,optimizedSize:g,savedPercent:u,engine:"worker"}}catch(a){return {success:false,error:a instanceof Error?a.message:"Unknown error"}}}getCapabilities(){return {formats:["jpg","jpeg","png","jp2","tiff"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:false}}async extractImages(e,t){try{await this.initializeWorkerPool(t);let{PDFDocument:r,PDFName:a}=await this.getPdfLibModule();try{await C.access(e);}catch{return await this.cleanupWorkerPool(),{success:!1,error:`PDF file not found: ${e}`}}let s=await C.readFile(e);t.verbose;let n=await r.load(s,{ignoreEncryption:!0});t.verbose;let o=n.getPages();t.verbose;let 
c=t.parallelProcessing!==!1,i=t.maxConcurrentPages||10,g=t.maxConcurrentImages||20;t.verbose;let m=c?await this.extractImagesParallel(n,o,a,t,i,g):await this.extractImagesSequential(n,o,a,t);if(t.verbose,t.extractImageFiles&&t.imageOutputDir&&m.length>0){let l=m.filter(f=>f._imageData&&f.filepath);if(l.length>0){let f=T.join(t.imageOutputDir,"images");await C.mkdir(f,{recursive:!0}),t.verbose,await Promise.all(l.map(h=>C.writeFile(h.filepath,h._imageData))),l.forEach(h=>{delete h._imageData;});}}if(t.extractImageFiles&&t.preserveJp2!==!0&&m.length>0){let l=m.filter(f=>f.filepath&&f.filepath.toLowerCase().endsWith(".jp2"));if(t.verbose,l.length>0){t.verbose;let f=t.maxConcurrentConversions||5,h=t.imageQuality!==void 0?t.imageQuality:100;if(c)(await A.mapSettled(l,async d=>d.filepath&&w__default.existsSync(d.filepath)?this.convertJp2FileWithWorker(d.filepath,h,t.verbose||!1,t.useSharp):{success:!1,error:"File not found"},(()=>{let d={maxConcurrency:f};return t.verbose!==void 0&&(d.verbose=t.verbose),d})())).forEach((d,b)=>{if(d.status==="fulfilled"&&d.value.success&&d.value.newPath){let y=l[b];if(!y)return;y.filepath=d.value.newPath,y.filename=y.filename?.replace(/\.jp2$/i,".jpg"),y.format="jpg",y.mimeType="image/jpeg";}});else for(let x of l)if(x.filepath&&w__default.existsSync(x.filepath)){let d=await this.convertJp2FileWithWorker(x.filepath,h,t.verbose||!1);d.success&&d.newPath&&(x.filepath=d.newPath,x.filename=x.filename?.replace(/\.jp2$/i,".jpg"),x.format="jpg",x.mimeType="image/jpeg");}}}if(t.optimizeImages&&m.length>0){t.verbose;let l=t.maxConcurrentOptimizations||5;if(c){let f=await A.mapSettled(m,async h=>h.filepath&&w__default.existsSync(h.filepath)?this.optimizeFileWithWorker(h.filepath,{quality:t.imageQuality||80,verbose:!1,useSharp:t.useSharp}):{success:!1,error:"File not found"},{maxConcurrency:l,verbose:t.verbose});t.verbose&&f.forEach((h,x)=>{let d=m[x];h.status==="fulfilled"&&h.value.success||h.status==="fulfilled"&&h.value.success;});}else for(let f 
of m)if(f.filepath&&w__default.existsSync(f.filepath)){let h=await this.optimizeFileWithWorker(f.filepath,{quality:t.imageQuality||80,verbose:t.verbose,useSharp:t.useSharp});h.success&&t.verbose||!h.success&&t.verbose;}}return await this.cleanupWorkerPool(),{success:!0,images:m}}catch(r){return await this.cleanupWorkerPool(),{success:false,error:`PDF-lib extraction failed: ${r instanceof Error?r.message:"Unknown error"}`}}}async extractImagesParallel(e,t,r,a,s,n){let o=[];for(let m=0;m<t.length;m++){let l=t[m]?.node?.Resources?.();if(!l){o.push(0);continue}let f=l?.get?.(r.of("XObject"));if(!f){o.push(0);continue}let x=(f.entries?.()||[]).reduce((d,[,b])=>{let y=e.context.lookup(b);return y&&y.dict?.get?.(r.of("Subtype"))?.toString()==="/Image"?d+1:d},0);o.push(x);}let c=o.reduce((m,u)=>{let l=m.length===0?1:m[m.length-1]+o[m.length-1];return [...m,l]},[]),i=await A.mapSettled(t,async(m,u)=>{let l=u+1,f=c[u];return this.extractImagesFromPage(e,m,l,f,r,a,n)},{maxConcurrency:s,verbose:a.verbose}),g=[];return i.forEach((m,u)=>{m.status==="fulfilled"?g.push(...m.value):a.verbose;}),g}async extractImagesFromPage(e,t,r,a,s,n,o){let c=t?.node?.Resources?.();if(!c)return [];let i=c?.get?.(s.of("XObject"));if(!i)return [];let g=i.entries?.()||[];n.verbose;let m=await A.mapSettled(g,async([,l],f)=>{let h=e.context.lookup(l);if(!h||h.dict?.get?.(s.of("Subtype"))?.toString()!=="/Image")return null;let d=a+f;return this.extractImageFromPdfObject(h,r,d,n)},{maxConcurrency:o,verbose:false}),u=[];return m.forEach(l=>{l.status==="fulfilled"&&l.value&&u.push(l.value);}),u}async extractImagesSequential(e,t,r,a){let s=[],n=1;for(let o=0;o<t.length;o++){let c=t[o],i=o+1,g=c?.node?.Resources?.();if(!g)continue;let m=g?.get?.(r.of("XObject"));if(!m)continue;let u=m.entries?.()||[];a.verbose;for(let[,l]of u){let f=e.context.lookup(l);if(!f||f.dict?.get?.(r.of("Subtype"))?.toString()!=="/Image")continue;let x=await this.extractImageFromPdfObject(f,i,n,a);x&&s.push(x),n++;}}return s}async 
extractImageFromPdfObject(e,t,r,a){try{let{PDFName:s}=await this.getPdfLibModule(),n=e.dict.get(s.of("Width")),o=e.dict.get(s.of("Height")),c=e.dict.get(s.of("Filter")),i=e.dict.get(s.of("ColorSpace")),g=e.dict.get(s.of("BitsPerComponent")),m=e.dict.get(s.of("DecodeParms")),{widthVal:u,heightVal:l}=(()=>{let P=n?typeof n.asNumber=="function"?n.asNumber():n.value??100:100,k=o?typeof o.asNumber=="function"?o.asNumber():o.value??100:100;if(P===100&&k===100&&e.dict){let E=e.dict.entries(),j=Array.from(E).reduce((L,[_,J])=>_.toString()==="/Width"&&J?.asNumber?{...L,width:J.asNumber()}:_.toString()==="/Height"&&J?.asNumber?{...L,height:J.asNumber()}:L,{width:P,height:k});return {widthVal:j.width,heightVal:j.height}}return {widthVal:P,heightVal:k}})(),f=g&&typeof g.value=="number"?g.value:8;a.verbose;let h=await this.extractImageData(e,c,u,l,i,f,m,a);if(!h.success||!h.imageData)return a.verbose,null;let x=h.extension||"bin",d=`img_p${t}_${r}.${x}`,b=h.imageData.length,{finalWidth:y,finalHeight:v}=(()=>{if(a.verbose&&r<=3,u===100&&l===100&&h.imageData)try{let P=it(Buffer.from(h.imageData));if(P.width&&P.height)return a.verbose&&r<=3,{finalWidth:P.width,finalHeight:P.height}}catch{a.verbose&&r<=3;}return {finalWidth:u,finalHeight:l}})(),I=(()=>{if(a.extractImageFiles&&a.imageOutputDir){let P=T.join(a.imageOutputDir,"images"),k=T.join(P,d);return a.verbose,k}})();return {id:`img_${r}`,filename:`images/${d}`,filepath:I||"",page:t,width:y,height:v,format:this.getFormatFromMimeType(h.mimeType||""),mimeType:h.mimeType||"",size:b,position:{x:0,y:0,width:y,height:v},_imageData:h.imageData}}catch{return a.verbose,null}}async extractImageData(e,t,r,a,s,n,o,c){try{let i=await import('zlib'),g,m="image/jpeg",u="jpg";if(t){let l=t.toString();if(c.verbose,l.includes("DCTDecode")&&l.includes("FlateDecode")){c.verbose;try{let f=e.contents;g=i.inflateSync(Buffer.from(f)),m="image/jpeg",u="jpg",c.verbose;}catch(f){return c.verbose,{success:!1,error:`Zlib decompression failed: ${f instanceof 
Error?f.message:"Unknown error"}`}}}else if(l.includes("DCTDecode"))c.verbose,g=Buffer.from(e.contents),m="image/jpeg",u="jpg";else if(l.includes("FlateDecode")){c.verbose;try{let f=e.contents,h=i.inflateSync(Buffer.from(f));if(c.verbose,o){let d=o.get?o.get(await this.getPdfLibModule().then(k=>k.PDFName.of("Predictor"))):o.Predictor,b=o.get?o.get(await this.getPdfLibModule().then(k=>k.PDFName.of("Columns"))):o.Columns,y=o.get?o.get(await this.getPdfLibModule().then(k=>k.PDFName.of("Colors"))):o.Colors,v=d?.asNumber?d.asNumber():d?.value??d,I=b?.asNumber?b.asNumber():b?.value??b??r,P=y?.asNumber?y.asNumber():y?.value??y;if(v&&v>1){c.verbose;try{let k=P??this.getColorComponents(s);h=De(h,v,I,k,n),c.verbose;}catch{c.verbose;}}}let x=this.detectImageFormat(h);if(x.valid)g=h,m=x.mimeType,u=x.extension,c.verbose;else {let d=await this.createPngFromPdfMetadata(h,r,a,s,n,c);if(d.success&&d.pngData)g=d.pngData,m="image/png",u="png",c.verbose;else return c.verbose,{success:!1,error:`PNG creation failed: ${d.error}`}}}catch(f){return c.verbose,{success:!1,error:`FlateDecode decompression failed: ${f instanceof Error?f.message:"Unknown error"}`}}}else if(l.includes("JPXDecode")){c.verbose;try{g=Buffer.from(e.contents),m="image/jp2",u="jp2",c.verbose;}catch(f){return c.verbose,{success:!1,error:`JPXDecode extraction failed: ${f instanceof Error?f.message:"Unknown error"}`}}}else {c.verbose;try{let f=await e.asUint8Array();g=Buffer.from(f);let h=this.detectImageFormat(g);h.valid&&(m=h.mimeType,u=h.extension);}catch(f){return c.verbose,{success:!1,error:`Generic decompression failed: ${f instanceof Error?f.message:"Unknown error"}`}}}}else {c.verbose;try{let l=await e.asUint8Array();g=Buffer.from(l);let f=this.detectImageFormat(g);f.valid&&(m=f.mimeType,u=f.extension);}catch(l){return c.verbose,{success:!1,error:`Raw data extraction failed: ${l instanceof Error?l.message:"Unknown error"}`}}}return {success:!0,imageData:g,mimeType:m,extension:u}}catch(i){return 
{success:false,error:`Image data extraction failed: ${i instanceof Error?i.message:"Unknown error"}`}}}detectImageFormat(e){return !e||e.length<10?{valid:false}:e[0]===255&&e[1]===216?{valid:true,mimeType:"image/jpeg",extension:"jpg"}:e[0]===137&&e[1]===80&&e[2]===78&&e[3]===71?{valid:true,mimeType:"image/png",extension:"png"}:e[0]===71&&e[1]===73&&e[2]===70?{valid:true,mimeType:"image/gif",extension:"gif"}:e[0]===73&&e[1]===73||e[0]===77&&e[1]===77?{valid:true,mimeType:"image/tiff",extension:"tiff"}:e.length>=12&&e[0]===0&&e[1]===0&&e[2]===0&&e[3]===12&&e[4]===106&&e[5]===80&&e[6]===32&&e[7]===32?{valid:true,mimeType:"image/jp2",extension:"jp2"}:{valid:false}}async createPngFromPdfMetadata(e,t,r,a,s,n){try{let{PNG:o}=await import('pngjs'),c=a?.toString()||"",{componentsPerPixel:i,colorType:g}=K.detectColorSpace(c),m=t*r*i*(s/8),u=e.length;n.verbose;let l=i*(s/8),f=Math.floor(u/l),h=t*r,x=f/h;n.verbose;let d=t,b=r;if(Math.abs(x-1)>.1){let k=u/r,E=Math.floor(k/l);if(n.verbose,E>0&&E<1e5)d=E;else return {success:!1,error:`Cannot determine image dimensions: expected ${t}x${r}, data suggests ${E}x${r}`}}let y=new o({width:d,height:b,colorType:g===0?0:6,bitDepth:8}),I=new K(t,r).convertToRGBA(e,i);if(!I)return {success:!1,error:`Unsupported color space with ${i} components`};y.data=I;let P=o.sync.write(y);return n.verbose,{success:!0,pngData:P}}catch(o){return {success:false,error:`PNG creation error: ${o instanceof Error?o.message:"Unknown error"}`}}}getFormatFromMimeType(e){switch(e){case "image/jpeg":return "JPEG";case "image/png":return "PNG";case "image/jp2":return "JPEG 2000";case "image/gif":return "GIF";case "image/tiff":return "TIFF";default:return "unknown"}}getColorComponents(e){if(!e)return 3;let t=e.toString();return t.includes("Gray")?1:t.includes("RGB")?3:t.includes("CMYK")?4:t.includes("Indexed")?1:3}};});var Ue={};H(Ue,{ImageEngineFactory:()=>ye});var ye,Ne=S(()=>{Ge();ye=class p{static engine=null;static async getEngine(){if(p.engine)return 
p.engine;let e=new Y;if(!await e.isAvailable())throw new Error("PDF-lib engine is not available on this system. Please install pdf-lib: npm install pdf-lib");return p.engine=e,e}static async getAvailableEngines(){let e=new Y,t=await e.isAvailable();return [{name:e.name,description:e.description,available:t,capabilities:e.getCapabilities()}]}static clearCache(){p.engine=null;}static getRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"},{useCase:"Best performance",engine:"pdf-lib",reason:"Direct PDF buffer reading with no external dependencies"}]}};});function X(p){let e=[];if(p.pdfPath?typeof p.pdfPath!="string"?e.push({field:"pdfPath",message:"PDF path must be a string",value:p.pdfPath}):w__default.existsSync(p.pdfPath)?p.pdfPath.toLowerCase().endsWith(".pdf")||e.push({field:"pdfPath",message:"File must have .pdf extension",value:p.pdfPath}):e.push({field:"pdfPath",message:"PDF file does not exist",value:p.pdfPath}):e.push({field:"pdfPath",message:"PDF path is required",value:p.pdfPath}),p.outputDir&&typeof p.outputDir!="string"&&e.push({field:"outputDir",message:"Output directory must be a string",value:p.outputDir}),p.options){let{options:t}=p;t.extractText!==void 0&&typeof t.extractText!="boolean"&&e.push({field:"options.extractText",message:"extractText must be a boolean",value:t.extractText}),t.extractImages!==void 0&&typeof t.extractImages!="boolean"&&e.push({field:"options.extractImages",message:"extractImages must be a boolean",value:t.extractImages}),t.extractImageFiles!==void 0&&typeof t.extractImageFiles!="boolean"&&e.push({field:"options.extractImageFiles",message:"extractImageFiles must be a boolean",value:t.extractImageFiles}),t.useImagePaths!==void 0&&typeof 
t.useImagePaths!="boolean"&&e.push({field:"options.useImagePaths",message:"useImagePaths must be a boolean",value:t.useImagePaths}),t.imageOutputDir&&typeof t.imageOutputDir!="string"&&e.push({field:"options.imageOutputDir",message:"imageOutputDir must be a string",value:t.imageOutputDir}),t.imageRefFormat&&typeof t.imageRefFormat!="string"&&e.push({field:"options.imageRefFormat",message:"imageRefFormat must be a string",value:t.imageRefFormat}),t.baseName&&typeof t.baseName!="string"&&e.push({field:"options.baseName",message:"baseName must be a string",value:t.baseName}),t.verbose!==void 0&&typeof t.verbose!="boolean"&&e.push({field:"options.verbose",message:"verbose must be a boolean",value:t.verbose}),t.memoryLimit&&typeof t.memoryLimit!="string"?e.push({field:"options.memoryLimit",message:"memoryLimit must be a string",value:t.memoryLimit}):t.memoryLimit&&!Ke(t.memoryLimit)&&e.push({field:"options.memoryLimit",message:'memoryLimit must be in format like "512MB", "1GB", etc.',value:t.memoryLimit}),t.batchSize!==void 0&&(typeof t.batchSize!="number"?e.push({field:"options.batchSize",message:"batchSize must be a number",value:t.batchSize}):(t.batchSize<1||t.batchSize>100)&&e.push({field:"options.batchSize",message:"batchSize must be between 1 and 100",value:t.batchSize})),t.progressCallback&&typeof t.progressCallback!="function"&&e.push({field:"options.progressCallback",message:"progressCallback must be a function",value:typeof t.progressCallback}),t.extractText===false&&t.extractImages===false&&e.push({field:"options",message:"At least one of extractText or extractImages must be true",value:{extractText:t.extractText,extractImages:t.extractImages}}),t.useImagePaths===true&&t.extractImageFiles!==true&&e.push({field:"options",message:"useImagePaths requires extractImageFiles to be true",value:{useImagePaths:t.useImagePaths,extractImageFiles:t.extractImageFiles}});}return e}function Ke(p){return /^\d+(\.\d+)?(MB|GB|KB)$/i.test(p)}function me(p){let 
e=[],t=["{id}","{name}","{page}","{index}","{path}"];t.some(n=>p.includes(n))||e.push({field:"imageRefFormat",message:`Format must contain at least one valid placeholder: ${t.join(", ")}`,value:p});let a=/\{([^}]+)\}/g,s=p.match(a);if(s)for(let n of s)t.includes(n)||e.push({field:"imageRefFormat",message:`Invalid placeholder: ${n}. Valid placeholders are: ${t.join(", ")}`,value:p});return e}function ue(p,e=[".pdf"]){let t=[];if(!p)return t.push({field:"filePath",message:"File path is required",value:p}),t;if(typeof p!="string")return t.push({field:"filePath",message:"File path must be a string",value:p}),t;if(!w__default.existsSync(p))return t.push({field:"filePath",message:"File does not exist",value:p}),t;let r=T.extname(p).toLowerCase();return e.length>0&&!e.includes(r)&&t.push({field:"filePath",message:`File must have one of these extensions: ${e.join(", ")}`,value:p}),t}var D=class{async extract(e,t={}){let r={verbose:false,extractImageFiles:false,...t};r.verbose,r.extractImageFiles&&r.imageOutputDir&&(w__default.existsSync(r.imageOutputDir)||w__default.mkdirSync(r.imageOutputDir,{recursive:true}));try{let{ImageEngineFactory:a}=await Promise.resolve().then(()=>(Ne(),Ue)),s=await a.getEngine();r.verbose;let n=await s.extractImages(e,r);if(!n.success)throw new Error(n.error||"Engine extraction failed");return {success:!0,images:n.images||[],metadata:{totalImages:n.images?.length||0,engine:s.name}}}catch{r.verbose;try{return await this.extractWithPdfLib(e,r)}catch(s){return r.verbose,{success:false,images:[],error:s instanceof Error?s.message:String(s)}}}}static async getAvailableEngines(){return [{name:"pdf-lib",description:"PDF-lib based extraction with full format support",available:true,capabilities:{formats:["jpg","jpeg","png","jp2","tiff"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:false}},{name:"poppler",description:"Poppler-based extraction using pdfimages 
command",available:false,capabilities:{formats:["jpg","jpeg","png","tiff","ppm","pbm"],supportsMetadata:true,supportsEmbeddedImages:true,supportsVectorImages:true}}]}static getEngineRecommendations(){return [{useCase:"Maximum format support and metadata accuracy",engine:"pdf-lib",reason:"Supports all PDF image formats including JPEG 2000, PNG with proper metadata extraction"},{useCase:"Fast extraction with system tools",engine:"poppler",reason:"Uses optimized native poppler tools, good for batch processing (coming soon)"},{useCase:"Cross-platform compatibility",engine:"pdf-lib",reason:"Pure JavaScript implementation, works everywhere Node.js runs"}]}async extractWithPdfLib(e,t={}){try{let{PDFDocument:r,PDFName:a}=await import('pdf-lib'),s=w__default.readFileSync(e),n=await r.load(s,{ignoreEncryption:!0}),o=n.getPageCount(),c=[],i=1;t.verbose,t.extractImageFiles&&t.imageOutputDir&&(w__default.existsSync(t.imageOutputDir)||w__default.mkdirSync(t.imageOutputDir,{recursive:!0}));for(let g=0;g<o;g++){let m=g+1;try{let l=n.getPage(g).node.Resources();if(!l){t.verbose;continue}let f=l.get(a.of("XObject"));if(!f){t.verbose;continue}let h=f.dict;t.verbose;for(let[x,d]of h)try{let b=n.context.lookup(d),y=b.dict.get(a.of("Subtype"));if(!y||y.toString()!=="/Image")continue;let v=await this.extractImageFromPdfObject(b,m,i,t);v&&(c.push(v),i++);}catch{t.verbose;}}catch{t.verbose;}}if(t.verbose,!t.preserveJp2&&t.extractImageFiles){let g=c.filter(m=>m.filePath?.endsWith(".jp2")||m.filepath?.endsWith(".jp2"));if(g.length>0){t.verbose;let{ImageOptimizer:m}=await Promise.resolve().then(()=>(be(),Be));for(let u of g){let l=u.filePath||u.filepath;if(!l)continue;let f=await m.convertJp2ToJpg(l,{quality:100,verbose:t.verbose,useSharp:t.useSharp});f.success&&f.newPath&&(u.filePath=f.newPath,u.filepath=f.newPath,u.format="jpg");}if(t.verbose){let u=g.filter(l=>l.filePath?.endsWith(".jpg")||l.filepath?.endsWith(".jpg")).length;}}}return 
{images:c,totalPages:o,totalImages:c.length}}catch(r){throw t.verbose,r}}async extractImageFromPdfObject(e,t,r,a){try{let{PDFName:s}=await import('pdf-lib'),n=e.dict.get(s.of("Width")),o=e.dict.get(s.of("Height")),c=e.dict.get(s.of("Filter")),i=e.dict.get(s.of("ColorSpace")),g=e.dict.get(s.of("BitsPerComponent")),m=n&&typeof n.value=="number"?n.value:100,u=o&&typeof o.value=="number"?o.value:100,l=g&&typeof g.value=="number"?g.value:8;a.verbose;let f=await this.extractImageData(e,c,m,u,i,l,a);if(!f.success||!f.imageData)return a.verbose,null;let h=f.imageData,x=f.mimeType||"image/jpeg",d=f.extension||"jpg",b=`img_p${t}_${r}.${d}`,y="",v=h.length;a.extractImageFiles&&a.imageOutputDir&&(y=T.join(a.imageOutputDir,b),w__default.writeFileSync(y,h),a.verbose);let I=m,P=u;if(h)try{let E=it(Buffer.from(h));E.width&&E.height&&(I=E.width,P=E.height,a.verbose);}catch{a.verbose;}return {id:`img_${r}`,name:b,page:t,position:{x:0,y:0,width:I,height:P},width:I,height:P,format:x==="image/jpeg"?"JPEG":x==="image/png"?"PNG":"unknown",filePath:y}}catch{return a.verbose,null}}async extractImageData(e,t,r,a,s,n,o){try{let c=await import('zlib'),i,g="image/jpeg",m="jpg";if(t){let u=t.toString();if(o.verbose,u.includes("DCTDecode")&&u.includes("FlateDecode")){o.verbose;try{let l=e.contents;i=c.inflateSync(Buffer.from(l)),g="image/jpeg",m="jpg",o.verbose;}catch(l){return o.verbose,{success:!1,error:`Zlib decompression failed: ${l instanceof Error?l.message:"Unknown error"}`}}}else if(u.includes("DCTDecode"))o.verbose,i=Buffer.from(e.contents),g="image/jpeg",m="jpg";else if(u.includes("FlateDecode")){o.verbose;try{let l=e.contents,f=c.inflateSync(Buffer.from(l));o.verbose;let h=this.detectImageFormat(f);if(h.valid)i=f,g=h.mimeType,m=h.extension,o.verbose;else {let x=await this.createPngFromPdfMetadata(f,r,a,s,n,o);if(x.success&&x.pngData)i=x.pngData,g="image/png",m="png",o.verbose;else return o.verbose,{success:!1,error:`PNG creation failed: ${x.error}`}}}catch(l){return 
o.verbose,{success:!1,error:`FlateDecode decompression failed: ${l instanceof Error?l.message:"Unknown error"}`}}}else if(u.includes("JPXDecode")){o.verbose;try{i=Buffer.from(e.contents),g="image/jp2",m="jp2",o.verbose;}catch(l){return o.verbose,{success:!1,error:`JPXDecode extraction failed: ${l instanceof Error?l.message:"Unknown error"}`}}}else {o.verbose;try{let l=await e.asUint8Array();i=Buffer.from(l);let f=this.detectImageFormat(i);f.valid&&(g=f.mimeType,m=f.extension);}catch(l){return o.verbose,{success:!1,error:`Generic decompression failed: ${l instanceof Error?l.message:"Unknown error"}`}}}}else {o.verbose;try{let u=await e.asUint8Array();i=Buffer.from(u);let l=this.detectImageFormat(i);l.valid&&(g=l.mimeType,m=l.extension);}catch(u){return o.verbose,{success:!1,error:`Raw data extraction failed: ${u instanceof Error?u.message:"Unknown error"}`}}}return !i||i.length<100?{success:!1,error:`Image data too small: ${i?.length||0} bytes`}:{success:!0,imageData:i,mimeType:g,extension:m}}catch(c){return o.verbose,{success:false,error:c instanceof Error?c.message:"Unknown error"}}}detectImageFormat(e){return !e||e.length<10?{valid:false}:e[0]===255&&e[1]===216?{valid:true,mimeType:"image/jpeg",extension:"jpg"}:e[0]===137&&e[1]===80&&e[2]===78&&e[3]===71?{valid:true,mimeType:"image/png",extension:"png"}:e[0]===71&&e[1]===73&&e[2]===70?{valid:true,mimeType:"image/gif",extension:"gif"}:e[0]===73&&e[1]===73||e[0]===77&&e[1]===77?{valid:true,mimeType:"image/tiff",extension:"tiff"}:e.length>=12&&e[0]===0&&e[1]===0&&e[2]===0&&e[3]===12&&e[4]===106&&e[5]===80&&e[6]===32&&e[7]===32?{valid:true,mimeType:"image/jp2",extension:"jp2"}:{valid:false}}async createPngFromPdfMetadata(e,t,r,a,s,n){try{let{PNG:o}=await import('pngjs'),c=a?.toString()||"",i=3,g=2;c.includes("DeviceGray")||c.includes("Gray")?(i=1,g=0):c.includes("DeviceRGB")||c.includes("RGB")?(i=3,g=2):(c.includes("DeviceCMYK")||c.includes("CMYK"))&&(i=4,g=2);let 
m=t*r*i*(s/8),u=e.length;if(n.verbose,Math.abs(u-m)>u*.1)return {success:!1,error:`Data size mismatch: expected ${m}, got ${u} bytes`};let l=new o({width:t,height:r,colorType:g===0?0:6,bitDepth:8}),f;if(i===1){f=Buffer.alloc(t*r*4);for(let x=0;x<t*r;x++){let d=e[x]||0,b=x*4;f[b]=d,f[b+1]=d,f[b+2]=d,f[b+3]=255;}}else if(i===3){f=Buffer.alloc(t*r*4);for(let x=0;x<t*r;x++){let d=x*3,b=x*4;f[b]=e[d]||0,f[b+1]=e[d+1]||0,f[b+2]=e[d+2]||0,f[b+3]=255;}}else if(i===4){f=Buffer.alloc(t*r*4);for(let x=0;x<t*r;x++){let d=x*4,b=(e[d]||0)/255,y=(e[d+1]||0)/255,v=(e[d+2]||0)/255,I=(e[d+3]||0)/255,P=x*4;f[P]=Math.round(255*(1-b)*(1-I)),f[P+1]=Math.round(255*(1-y)*(1-I)),f[P+2]=Math.round(255*(1-v)*(1-I)),f[P+3]=255;}}else return {success:!1,error:`Unsupported color space with ${i} components`};l.data=f;let h=o.sync.write(l);return n.verbose,{success:!0,pngData:h}}catch(o){return {success:false,error:`PNG creation error: ${o instanceof Error?o.message:"Unknown error"}`}}}};var G=class{pdfLibDoc=null;pdfLibPages=[];textData=[];constructor(){this.initializePdfjs();}initializePdfjs(){if(!F.GlobalWorkerOptions.workerSrc){let e=createRequire(import.meta.url),t=T.dirname(e.resolve("pdfjs-dist/package.json"));F.GlobalWorkerOptions.workerSrc=T.join(t,"legacy","build","pdf.worker.mjs");}}async processPDF(e){let t=w.readFileSync(e),[r,a]=await Promise.all([this.processPDFLib(t),this.processPDFjs(t)]);this.textData=this.combineResults(r,a);let s=this.textData.map(n=>n.text).join(`
|
|
2
|
+
`).trim();return {totalPages:this.textData.length,pages:this.textData,fullText:s}}async processPDFLib(e){return this.pdfLibDoc=await PDFDocument.load(e,{ignoreEncryption:true}),this.pdfLibPages=this.pdfLibDoc.getPages(),this.pdfLibPages.map((t,r)=>{let{width:a,height:s}=t.getSize();return {pageNumber:r+1,width:a,height:s,rotation:t.getRotation().angle,mediaBox:t.getMediaBox()}})}async processPDFjs(e){let t=new Uint8Array(e),a=await F.getDocument({data:t,verbosity:F.VerbosityLevel.ERRORS}).promise,s=[];try{for(let n=1;n<=a.numPages;n++)try{let o=await a.getPage(n),c=await o.getTextContent({includeMarkedContent:!1,disableNormalization:!1}),i=o.getViewport({scale:1}),g=c.items.filter(h=>"str"in h&&typeof h.str=="string");g.sort((h,x)=>{let d=x.transform[5]-h.transform[5];return Math.abs(d)>2?d:h.transform[4]-x.transform[4]});let m="",u=null,l="";for(let h of g){if(!("str"in h))continue;let x=h.transform[5];u===null?(u=x,l=h.str):Math.abs(x-u)>2?(m+=`${l}
|
|
3
|
+
`,u=x,l=h.str):l+=` ${h.str}`;}l&&(m+=l),m=m.trim();let f={pageNumber:n,text:m,textItems:c.items,pdfParseWidth:i.width,pdfParseHeight:i.height};s.push(f),o.cleanup();}catch{s.push({pageNumber:n,text:"",textItems:[],pdfParseWidth:0,pdfParseHeight:0});}return s.sort((n,o)=>n.pageNumber-o.pageNumber)}finally{await a.destroy();}}combineResults(e,t){return e.map(r=>{let a=t.find(n=>n.pageNumber===r.pageNumber),s=a?.text||"";return {pageNumber:r.pageNumber,text:s,width:r.width,height:r.height,rotation:r.rotation,mediaBox:r.mediaBox,textItems:a?.textItems||[],wordCount:this.countWords(s),characterCount:s.length}})}async extractWithPageMarkers(e,t="--- PAGE {page} ---",r={}){let a=await this.processPDF(e),s=[];if(r.includeImageRefs)try{s=(await new D().extract(e,{extractImageFiles:!1,verbose:!1})).images||[];}catch{}let n="";return a.pages.forEach(o=>{let c=t.replace("{page}",o.pageNumber.toString()),i=o.text;if(r.includeImageRefs&&s.length>0){let g=s.filter(m=>m.page===o.pageNumber);if(g.length>0){let m=g.map(u=>(r.imageRefFormat||"[IMG:{id}] {name}").replace("{id}",`img_${u.id}`).replace("{name}",u.filename||`img_p${u.page}_${u.id}.jpg`)).join(`
|
|
5
4
|
`);if(i.trim()){let u=i.split(`
|
|
6
5
|
`);u.length>1?(u.splice(1,0,m),i=u.join(`
|
|
7
6
|
`)):i=`${i}
|
|
8
|
-
${m}`;}else i=m;}}i.trim()?n+=`${
|
|
7
|
+
${m}`;}else i=m;}}i.trim()?n+=`${c}
|
|
9
8
|
|
|
10
9
|
${i}
|
|
11
|
-
`:n+=`${
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
`;}),{text:n.trim(),cleanText:a.fullText,numPages:a.totalPages,pages:a.pages}}getPage(
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
`),
|
|
18
|
-
|
|
19
|
-
`),
|
|
20
|
-
`).length,o=
|
|
21
|
-
`).trim()}async extractPageRange(t,
|
|
22
|
-
|
|
23
|
-
`)}async searchText(t,
|
|
24
|
-
`);
|
|
25
|
-
`);i.push(`Page ${m+1}: ${
|
|
26
|
-
`),n=Math.ceil(s.length/a)
|
|
27
|
-
`)
|
|
28
|
-
${
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
`)}formatImageReference(t,
|
|
32
|
-
`).trim()}countImageReferences(t
|
|
33
|
-
`)}formatFileSize(
|
|
34
|
-
|
|
35
|
-
`)
|
|
36
|
-
`),a=Math.ceil(r.length/
|
|
37
|
-
`)
|
|
38
|
-
`),n=Math.ceil(s.length/
|
|
39
|
-
`)}countWords(
|
|
40
|
-
|
|
41
|
-
`),
|
|
10
|
+
`:n+=`${c}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
`;}),{text:n.trim(),cleanText:a.fullText,numPages:a.totalPages,pages:a.pages}}getPage(e){return this.textData[e-1]||null}async getDetailedPageInfo(e,t){this.textData.length||await this.processPDF(e);let r=this.getPage(t);if(!r)return null;let a=(r.textItems||[]).map(s=>({text:s.str||"",x:s.transform?.[4]||0,y:s.transform?.[5]||0,width:s.width||0,height:s.height||0,fontName:s.fontName,fontSize:s.transform?.[0]||12}));return {pageNumber:t,text:r.text,textItems:a,dimensions:{width:r.width,height:r.height}}}countWords(e){return !e||e.trim()===""?0:e.split(/\s+/).filter(t=>t.length>0).length}async processSinglePage(e,t){try{let r=w.readFileSync(e),a=await PDFDocument.load(r,{ignoreEncryption:!0});if(t<1||t>a.getPageCount())return null;let n=a.getPages()[t-1];if(!n)return null;let{width:o,height:c}=n.getSize(),i=new Uint8Array(r),m=await F.getDocument({data:i,verbosity:F.VerbosityLevel.ERRORS}).promise,u=[],l="";try{let f=await m.getPage(t),h=await f.getTextContent({includeMarkedContent:!1,disableNormalization:!1});u=h.items,l=h.items.filter(x=>"str"in x).map(x=>x.str||"").join(" ").replace(/\s+/g," ").trim(),f.cleanup();}finally{await m.destroy();}return {pageNumber:t,text:l,width:o,height:c,rotation:n.getRotation().angle,mediaBox:[n.getMediaBox().x,n.getMediaBox().y,n.getMediaBox().width,n.getMediaBox().height],textItems:u,wordCount:this.countWords(l),characterCount:l.length}}catch{return null}}};var W=class{constructor(){this.initializePdfjs();}initializePdfjs(){if(!F.GlobalWorkerOptions.workerSrc){let e=createRequire(import.meta.url),t=T.dirname(e.resolve("pdfjs-dist/package.json"));F.GlobalWorkerOptions.workerSrc=T.join(t,"legacy","build","pdf.worker.mjs");}}async loadDocument(e){let t=w__default.readFileSync(e),r=new Uint8Array(t);return await F.getDocument({data:r,verbosity:F.VerbosityLevel.ERRORS}).promise}async getPageText(e){let t=await e.getTextContent({includeMarkedContent:false,disableNormalization:false}),r=[];for(let a of t.items)"str"in 
a&&(r.push(a.str),a.hasEOL&&r.push(`
|
|
14
|
+
`));return r.join("")}async extract(e){let t=null;try{t=await this.loadDocument(e);let r=await t.getMetadata(),a=[];for(let n=1;n<=t.numPages;n++){let o=await t.getPage(n),c=await this.getPageText(o);a.push(c),o.cleanup();}return {text:a.filter(n=>n&&n.length>0).join(`
|
|
15
|
+
|
|
16
|
+
`),numPages:t.numPages,info:r.info,metadata:r.metadata,version:r.info?.PDFFormatVersion||"1.0"}}catch(r){throw new Error(`Failed to extract text from PDF: ${r instanceof Error?r.message:"Unknown error"}`)}finally{t&&await t.destroy();}}async extractWithMetadata(e){let t=await this.extract(e);return {text:t.text,metadata:{numPages:t.numPages,info:t.info,metadata:t.metadata,version:t.version}}}async extractWithPages(e){let t=null;try{t=await this.loadDocument(e);let r=await t.getMetadata(),a=[];for(let n=1;n<=t.numPages;n++){let o=await t.getPage(n),c=await this.getPageText(o);a.push(c),o.cleanup();}return {text:a.filter(n=>n&&n.length>0).join(`
|
|
17
|
+
|
|
18
|
+
`),numPages:t.numPages,info:r.info,metadata:r.metadata,version:r.info?.PDFFormatVersion||"1.0",pages:a}}catch(r){throw new Error(`Failed to extract text with pages: ${r instanceof Error?r.message:"Unknown error"}`)}finally{t&&await t.destroy();}}async extractTextItems(e,t={}){let r=null;try{r=await this.loadDocument(e);let a=[],s=0;for(let n=1;n<=r.numPages;n++){let o=await r.getPage(n),c=await o.getTextContent({includeMarkedContent:!1,disableNormalization:!1});for(let i of c.items){if(!("str"in i)||!i.str.trim())continue;let g="text",m=i.height||12;m>14?g="heading":i.str.length>100?g="paragraph":i.str.length<30&&(g="caption");let u={id:`text_${++s}`,content:i.str,position:{x:i.transform[4],y:i.transform[5],width:i.width,height:i.height},font:{name:i.fontName||"Unknown",size:m,style:"normal"},page:n,type:g,fontSize:m,color:"#000000"};a.push(u);}o.cleanup();}return t.verbose,a}catch(a){throw new Error(`Failed to extract text items: ${a instanceof Error?a.message:"Unknown error"}`)}finally{r&&await r.destroy();}}async extractStatistics(e){let t=await this.extract(e),r=t.text,a=r.length,s=r.split(/\s+/).filter(g=>g.length>0).length,n=r.split(`
|
|
19
|
+
`).length,o=t.numPages,c=Math.round(s/o),i=Math.ceil(s/200);return {characterCount:a,wordCount:s,lineCount:n,pageCount:o,averageWordsPerPage:c,readingTime:i}}async extractWithFontInfo(e){return this.extract(e)}cleanText(e){return e.replace(/\s+/g," ").replace(/\n\s*\n/g,`
|
|
20
|
+
`).trim()}async extractPageRange(e,t,r){let a=await this.extractWithPages(e);if(t<1||r>a.numPages||t>r)throw new Error(`Invalid page range: ${t}-${r}. Document has ${a.numPages} pages.`);return a.pages.slice(t-1,r).join(`
|
|
21
|
+
|
|
22
|
+
`)}async searchText(e,t,r=false){let a=await this.extractWithPages(e),s=r?"g":"gi",n=new RegExp(t,s),o=0,c=[],i=[];return a.pages.forEach((g,m)=>{let u=g.match(n);if(u){o+=u.length,c.push(m+1);let l=g.split(`
|
|
23
|
+
`);l.forEach((f,h)=>{if(n.test(f)){let x=Math.max(0,h-1),d=Math.min(l.length,h+2),b=l.slice(x,d).join(`
|
|
24
|
+
`);i.push(`Page ${m+1}: ${b}`);}});}}),{found:o>0,occurrences:o,pages:c,context:i}}async extractWithPageMarkers(e,t="--- PAGE {page} ---",r={}){try{let a=new G,s={includeImageRefs:r.includeImageRefs??!0,imageRefFormat:r.imageRefFormat||"[IMG:{id}] {name}"},n=await a.extractWithPageMarkers(e,t,s),o=n.pages.map(c=>({pageNumber:c.pageNumber+(r.pageOffset||0),text:{content:c.text,rawText:c.text,wordCount:c.wordCount,characterCount:c.characterCount},images:[],imageCount:0}));return {text:n.text,pages:o}}catch(a){throw new Error(`Failed to extract text with page markers: ${a instanceof Error?a.message:"Unknown error"}`)}}async extractWithAccuratePages(e){let r=await new G().processPDF(e),a=r.pages.map(s=>({pageNumber:s.pageNumber,text:{content:s.text,rawText:s.text,wordCount:s.wordCount,characterCount:s.characterCount},images:[],imageCount:0}));return {fullText:r.fullText,pages:a,totalPages:r.totalPages}}};var q=class{pdfjs=null;async getPdfjs(){if(!this.pdfjs){this.pdfjs=await import('pdfjs-dist/legacy/build/pdf.mjs');let{createRequire:e}=await import('module'),t=e(import.meta.url),r=T.dirname(t.resolve("pdfjs-dist/package.json"));this.pdfjs.GlobalWorkerOptions.workerSrc=T.join(r,"legacy","build","pdf.worker.mjs");}return this.pdfjs}async convertToImages(e,t={}){let{outputDir:r="./page-images",format:a="png",quality:s=90,dpi:n=72,scale:o=1,pages:c,pageRange:i,filenamePattern:g="page-{page}.{ext}",backgroundColor:m="#FFFFFF",transparent:u=false,onProgress:l,onPageComplete:f,verbose:h=false}=t;w__default.existsSync(r)||w__default.mkdirSync(r,{recursive:true});let x=await this.getPdfjs(),d=new Uint8Array(w__default.readFileSync(e)),y=await x.getDocument({data:d,useWorkerFetch:false,isEvalSupported:false,useSystemFonts:true}).promise,v=y.numPages,I=this.getPageNumbers(v,c,i),P=[],k=0;for(let E=0;E<I.length;E++){let j=I[E];if(!j)continue;if(l){let _e=Math.round((E+1)/I.length*100);l(E+1,I.length,_e);}let L=await y.getPage(j),_=await 
this.renderPageToBuffer(L,{format:a,quality:s,dpi:n,scale:o,backgroundColor:m,transparent:u},y),J=this.generateFilename(g,j,v,T.basename(e,".pdf"),a),ge=T.join(r,J);w__default.writeFileSync(ge,_);let ve=_.length;k+=ve;let we=L.getViewport({scale:o*(n/72)}),Qe={page:j,filepath:ge,width:Math.floor(we.width),height:Math.floor(we.height),fileSize:ve,format:a};P.push(Qe),f&&f(j,ge);}return {images:P,totalPages:I.length,outputDir:r,totalSize:k}}async convertPage(e,t,r,a={}){let s=await this.convertPageToBuffer(e,t,a),n=T.dirname(r);w__default.existsSync(n)||w__default.mkdirSync(n,{recursive:true}),w__default.writeFileSync(r,s);let o=a.format||"png",c=await this.getPdfjs(),i=new Uint8Array(w__default.readFileSync(e)),l=(await(await c.getDocument({data:i}).promise).getPage(t)).getViewport({scale:(a.scale||1)*((a.dpi||72)/72)});return {page:t,filepath:r,width:Math.floor(l.width),height:Math.floor(l.height),fileSize:s.length,format:o}}async convertPageToBuffer(e,t,r={}){let a=await this.getPdfjs(),s=new Uint8Array(w__default.readFileSync(e)),o=await a.getDocument({data:s}).promise,c=await o.getPage(t);return this.renderPageToBuffer(c,r,o)}async convertPageToBase64(e,t,r={}){return (await this.convertPageToBuffer(e,t,r)).toString("base64")}async generateThumbnails(e,t={}){let{maxWidth:r=200,maxHeight:a=200,maintainAspectRatio:s=true,...n}=t,o={...n,outputDir:t.outputDir||"./thumbnails",format:t.format||"jpg",quality:t.quality||70,dpi:72,scale:.25,filenamePattern:"thumb-{page}.{ext}"};return this.convertToImages(e,o)}async renderPageToBuffer(e,t,r){let{format:a="png",quality:s=90,dpi:n=72,scale:o=1,backgroundColor:c="#FFFFFF",transparent:i=false}=t,g=e.getViewport({scale:o*(n/72)}),{canvas:m}=r.canvasFactory.create(g.width,g.height,i);return await e.render({canvas:m,viewport:g,background:i?"transparent":c}).promise,this.canvasToBuffer(m,a,s)}canvasToBuffer(e,t,r){let a=t==="jpg"?"jpeg":t;if(a==="png")return e.toBuffer("image/png");if(a==="jpeg")return 
e.toBuffer("image/jpeg",{quality:r/100});if(a==="webp")return e.toBuffer("image/webp",{quality:r/100});throw new Error(`Unsupported format: ${t}`)}getPageNumbers(e,t,r){return t&&t.length>0?t.filter(a=>a>=1&&a<=e):r?this.parsePageRange(r,e):Array.from({length:e},(a,s)=>s+1)}parsePageRange(e,t){let r=new Set,a=e.split(",");for(let s of a){let n=s.trim();if(n.includes("-")){let[o,c]=n.split("-"),i=parseInt(o?.trim()||"0"),g=parseInt(c?.trim()||"0");if(!isNaN(i)&&!isNaN(g))for(let m=i;m<=g&&m<=t;m++)m>=1&&r.add(m);}else {let o=parseInt(n);!isNaN(o)&&o>=1&&o<=t&&r.add(o);}}return Array.from(r).sort((s,n)=>s-n)}generateFilename(e,t,r,a,s){let n=s==="jpg"?"jpg":s;return e.replace("{page}",t.toString().padStart(3,"0")).replace("{total}",r.toString()).replace("{name}",a).replace("{ext}",n)}formatBytes(e){return e<1024?`${e} B`:e<1024*1024?`${(e/1024).toFixed(1)} KB`:`${(e/(1024*1024)).toFixed(1)} MB`}};var N=class{generateTextWithImageRefs(e,t,r,a){if(!e||t.length===0)return e||"";let s=e.split(`
|
|
25
|
+
`),n=Math.ceil(s.length/a);return Array.from({length:a},(i,g)=>g+1).map(i=>{let g=(i-1)*n,m=Math.min(g+n,s.length),u=s.slice(g,m).join(`
|
|
26
|
+
`),l=u.trim()?u:"",h=t.filter(b=>b.page===i).map(b=>`
|
|
27
|
+
${this.formatImageReference(b,r,t.indexOf(b)+1)}
|
|
28
|
+
`).join(""),x=l+h,d=i<a&&u.trim()?`
|
|
29
|
+
`:"";return x+d}).join("").trim()}generateImageOnlyRefs(e,t){return e.map((r,a)=>this.formatImageReference(r,t,a+1)).join(`
|
|
30
|
+
`)}formatImageReference(e,t,r){let a={id:e.id,name:e.name||e.id,page:e.page,index:r,path:e.filePath||e.id};return this.replacePlaceholders(t,a)}replacePlaceholders(e,t){return e.replace(/\{id\}/g,t.id).replace(/\{name\}/g,t.name||t.id).replace(/\{page\}/g,t.page.toString()).replace(/\{index\}/g,t.index.toString()).replace(/\{path\}/g,t.path||t.id)}extractPlaceholders(e){let t=/\{([^}]+)\}/g,a=Array.from(e.matchAll(t)).map(s=>s[1]).filter(s=>s!==void 0);return [...new Set(a)]}isValidFormat(e){let t=["id","name","page","index","path"];return this.extractPlaceholders(e).every(a=>t.includes(a))}getDefaultFormat(e=false){return e?"[IMAGE:{path}]":"[IMAGE:{id}]"}cleanTextFromImageRefs(e,t){let r=t.replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/\\?\{id\\?\}/g,"[^\\s\\]]+").replace(/\\?\{name\\?\}/g,"[^\\s\\]]+").replace(/\\?\{page\\?\}/g,"\\d+").replace(/\\?\{index\\?\}/g,"\\d+").replace(/\\?\{path\\?\}/g,"[^\\s\\]]+"),a=new RegExp(r,"g");return e.replace(a,"").replace(/\n\s*\n/g,`
|
|
31
|
+
`).trim()}countImageReferences(e,t){let r=t.replace(/[.*+?^${}()|[\]\\]/g,"\\$&").replace(/\\?\{id\\?\}/g,"[^\\s\\]]+").replace(/\\?\{name\\?\}/g,"[^\\s\\]]+").replace(/\\?\{page\\?\}/g,"\\d+").replace(/\\?\{index\\?\}/g,"\\d+").replace(/\\?\{path\\?\}/g,"[^\\s\\]]+"),a=new RegExp(r,"g"),s=e.match(a);return s?s.length:0}generateSummary(e,t,r,a,s){let n=(r/e).toFixed(2),o=["\u{1F4C4} Document Summary",` Pages: ${e}`,` Text items: ${t}`,` Images: ${r} (avg ${n} per page)`,` Text length: ${a.toLocaleString()} characters`];return s&&o.push(` Processing time: ${s}ms`),o.join(`
|
|
32
|
+
`)}formatFileSize(e){let t=["B","KB","MB","GB"],r=t.reduce((a,s,n)=>a.size>=1024&&n<t.length-1?{size:a.size/1024,unitIndex:n+1}:a,{size:e,unitIndex:0});return `${r.size.toFixed(1)} ${t[r.unitIndex]}`}formatDuration(e){if(e<1e3)return `${e}ms`;let t=Math.floor(e/1e3);if(t<60)return `${t}s`;let r=Math.floor(t/60),a=t%60;return `${r}m ${a}s`}};var oe=class{extractRawText(e){return e.replace(/--- PAGE \d+ ---\s*/g,"").replace(/🎨 ART BASEL PAGE \d+ 🎨\s*/g,"").replace(/PAGE \d+\s*/g,"").replace(/\[IMG:\w+\]\s*\w*\s*/g,"").replace(/\[IMG-\w+\]\s*[^[\n]*\s*/g,"").replace(/📷\s*[^-\n]*-\s*Page\s*\d+\s*-\s*Image\s*#\d+\s*/g,"").replace(/🎨\s*Art\s*Basel\s*Image\s*\d+\s*\(Page\s*\d+\)\s*/g,"").replace(/\n\s*\n\s*\n/g,`
|
|
33
|
+
|
|
34
|
+
`).replace(/^\s+|\s+$/g,"").replace(/[ \t]+/g," ")}generateStructuredData(e,t,r,a,s,n,o){let c=this.splitTextIntoPages(t,a),i=this.createPageDataArray(c,r,a,n,o);return {metadata:{filename:e,extractedAt:new Date().toISOString(),totalPages:a,totalTextLength:t.length,totalImages:r.length,extractionOptions:s},pages:i}}splitTextIntoPages(e,t){if(t<=1)return [e];let r=/(?:--- PAGE \d+ ---|🎨 ART BASEL PAGE \d+ 🎨|PAGE \d+)/g,a=e.match(r);return a&&a.length>0?this.splitByPageMarkers(e,r):this.splitByEstimatedLength(e,t)}splitByPageMarkers(e,t){let a=e.split(t).slice(1).map(s=>s.trim()).filter(s=>s.length>0);return a.length===0?[e]:a}splitByEstimatedLength(e,t){let r=e.split(`
|
|
35
|
+
`),a=Math.ceil(r.length/t);return Array.from({length:t},(o,c)=>c).map(o=>{let c=o*a,i=Math.min((o+1)*a,r.length);return r.slice(c,i).join(`
|
|
36
|
+
`)})}createPageDataArray(e,t,r,a,s){return Array.from({length:r},(c,i)=>i).map(c=>{let i=c+1,g=e[c]||"",m=this.getImagesForPage(t,i),u=this.extractRawText(g),l={pageNumber:i,text:{content:g,rawText:u,wordCount:this.countWords(u),characterCount:u.length},images:m,imageCount:m.length};if(a&&a.has(i)&&(l.pageImage=a.get(i)),s&&s.has(i)&&(l.thumbnail=s.get(i)),a&&a.has(i)){let f=a.get(i);f.variants&&f.variants.length>0&&(l.pageImageVariants=f.variants);}return l})}getImagesForPage(e,t){return e.filter(r=>r.page===t).map(r=>{let a={id:r.id,name:r.name||`image_${r.id}`,position:r.position,format:r.format||"unknown"};if("filename"in r&&r.filename!==void 0&&(a.filename=r.filename),"path"in r){let s=r.path;s!==void 0&&(a.path=s);}if("filepath"in r&&r.filepath!==void 0&&(a.path=r.filepath),"filePath"in r){let s=r.filePath;s!==void 0&&(a.path=s);}return "size"in r&&r.size!==void 0&&(a.size=r.size),"width"in r&&r.width!==void 0&&(a.width=r.width),"height"in r&&r.height!==void 0&&(a.height=r.height),"mimeType"in r&&r.mimeType!==void 0&&(a.mimeType=r.mimeType),a})}countWords(e){return e.trim()?e.trim().split(/\s+/).length:0}generateJSONString(e,t=2){return JSON.stringify(e,null,t)}generateSummary(e){let t=e.pages.reduce((n,o)=>n+o.text.wordCount,0),r=e.pages.reduce((n,o)=>n+o.text.characterCount,0),a=e.pages.filter(n=>n.text.content.trim().length>0).length,s=e.pages.filter(n=>n.imageCount>0).length;return {totalWords:t,totalCharacters:r,averageWordsPerPage:Math.round(t/e.pages.length),averageImagesPerPage:Math.round(e.metadata.totalImages/e.pages.length*10)/10,pagesWithText:a,pagesWithImages:s}}};var ie=class{cacheDir;constructor(e="./tmp/pdf-cache"){this.cacheDir=e,this.ensureCacheDir();}generateCacheKey(e){let t=T.resolve(e),r=w__default.statSync(t),a=`${t}:${r.mtime.getTime()}:${r.size}`;return ft.createHash("md5").update(a).digest("hex")}getCacheDir(e){let t=this.generateCacheKey(e);return 
T.join(this.cacheDir,t)}ensureCacheDir(){w__default.existsSync(this.cacheDir)||w__default.mkdirSync(this.cacheDir,{recursive:true});}isCached(e){try{let t=this.getCacheDir(e),r=T.join(t,"cache-info.json");return w__default.existsSync(r)}catch{return false}}getCacheInfo(e){try{let t=this.getCacheDir(e),r=T.join(t,"cache-info.json");return w__default.existsSync(r)?JSON.parse(w__default.readFileSync(r,"utf-8")):null}catch{return null}}createCache(e,t){let r=this.getCacheDir(e);w__default.existsSync(r)||w__default.mkdirSync(r,{recursive:true});let a=w__default.statSync(e),s={pdfPath:T.resolve(e),lastModified:a.mtime.getTime(),totalPages:t,cacheDir:r,created:new Date().toISOString()},n=T.join(r,"cache-info.json");return w__default.writeFileSync(n,JSON.stringify(s,null,2)),r}cachePageResult(e,t,r){try{let a=this.getCacheDir(e),s=T.join(a,`page-${t}.json`);w__default.writeFileSync(s,JSON.stringify(r,null,2));}catch{}}getCachedPageResult(e,t){try{let r=this.getCacheDir(e),a=T.join(r,`page-${t}.json`);return w__default.existsSync(a)?JSON.parse(w__default.readFileSync(a,"utf-8")):null}catch{return null}}getAllCachedPages(e){try{let t=this.getCacheDir(e),r=[];if(!w__default.existsSync(t))return r;let s=w__default.readdirSync(t).filter(n=>n.startsWith("page-")&&n.endsWith(".json"));for(let n of s)try{let o=T.join(t,n),c=JSON.parse(w__default.readFileSync(o,"utf-8"));r.push(c);}catch{}return r.sort((n,o)=>n.pageNumber-o.pageNumber),r}catch{return []}}clearCache(e){try{let t=this.getCacheDir(e);w__default.existsSync(t)&&w__default.rmSync(t,{recursive:!0,force:!0});}catch{}}clearAllCache(){try{w__default.existsSync(this.cacheDir)&&w__default.rmSync(this.cacheDir,{recursive:!0,force:!0}),this.ensureCacheDir();}catch{}}getCacheStats(){try{if(!w__default.existsSync(this.cacheDir))return {totalCachedPdfs:0,totalCachedPages:0,totalCacheSize:0,cacheDir:this.cacheDir};let e=w__default.readdirSync(this.cacheDir),t=e.length,{totalCachedPages:r,totalCacheSize:a}=e.reduce((s,n)=>{let 
o=T.join(this.cacheDir,n);if(!w__default.statSync(o).isDirectory())return s;let c=w__default.readdirSync(o),i=c.filter(m=>m.startsWith("page-")&&m.endsWith(".json")),g=c.reduce((m,u)=>{let l=T.join(o,u);return m+w__default.statSync(l).size},0);return {totalCachedPages:s.totalCachedPages+i.length,totalCacheSize:s.totalCacheSize+g}},{totalCachedPages:0,totalCacheSize:0});return {totalCachedPdfs:t,totalCachedPages:r,totalCacheSize:a,cacheDir:this.cacheDir}}catch{return {totalCachedPdfs:0,totalCachedPages:0,totalCacheSize:0,cacheDir:this.cacheDir}}}};var M=class{textExtractor;imageExtractor;pageToImageConverter;formatProcessor;structuredDataGenerator;cacheManager;constructor(e){this.textExtractor=new W,this.imageExtractor=new D,this.pageToImageConverter=new q,this.formatProcessor=new N,this.structuredDataGenerator=new oe,this.cacheManager=new ie(e);}async extract(e,t={}){let r={pdfPath:e,outputDir:t.imageOutputDir||"./extracted-images",options:{extractText:true,extractImages:true,extractImageFiles:false,useImagePaths:false,imageRefFormat:"[IMAGE:{id}]",verbose:false,includePageMarkers:true,pageMarkerFormat:"--- PAGE {page} ---",...t}},a=this.validateConfiguration(r);if(a.length>0)throw this.createValidationError("Invalid configuration",a);try{if(!w__default.existsSync(e))throw new Error(`PDF file not found: ${e}`);let s=Date.now();this.reportProgress(r.options,{currentPage:0,totalPages:0,phase:"processing"});let n=null,o=null;if(r.options.extractText&&(r.options.verbose,n=await this.textExtractor.extract(e),r.options.includePageMarkers||r.options.includeImageRefs)){let l=r.options.pageMarkerFormat||"--- PAGE {page} ---",h={pageOffset:r.options.pageOffset||0,includeImageRefs:r.options.includeImageRefs??!1,imageRefFormat:r.options.imageRefFormat??"[IMG:{id}] {name}"};o=await this.textExtractor.extractWithPageMarkers(e,l,h);}let c=[];r.options.extractTextItems&&r.options.extractText&&(r.options.verbose,c=await this.textExtractor.extractTextItems(e,r.options));let 
i=null;r.options.extractImages&&(r.options.verbose,i=await this.imageExtractor.extract(e,r.options));let g=null,m=null;if(r.options.generatePageImages||r.options.generateThumbnails){let l=i?.totalPages||n?.numPages||0,f=r.options.pageNumbers||Array.from({length:l},(h,x)=>x+1);r.options.generatePageImages&&(g=await this.generatePageImagesWithVariants(e,f,r.options)),r.options.generateThumbnails&&(m=await this.generatePageThumbnails(e,f,r.options));}let u=await this.processResults(e,n,o,i,c,r.options,s,g,m);return this.reportProgress(r.options,{currentPage:u.document.pages,totalPages:u.document.pages,phase:"complete"}),u}catch(s){throw r.options.verbose,this.createExtractionError("PDF content extraction failed",s)}}async extractText(e,t={}){return (await this.extract(e,{...t,extractText:true,extractImages:false})).cleanText}async extractImages(e,t={}){return (await this.extract(e,{...t,extractText:false,extractImages:true})).images}async extractImageFiles(e,t="./extracted-images",r={}){return (await this.extract(e,{...r,extractImageFiles:true,imageOutputDir:t,useImagePaths:true})).images.filter(s=>s.filePath).map(s=>s.filePath)}validateConfiguration(e){return X(e)}async processResults(e,t,r,a,s,n,o,c,i){let g=T.basename(e),u=this.extractRawText(t?.text||""),l={document:{filename:g,pages:a?.totalPages||t?.numPages||0,textLength:t?.text?.length||0,extractedAt:new Date().toISOString(),metadata:t?.info||{},options:n},pages:[],images:a?.images||[],textItems:s,text:u,textWithRefs:"",cleanText:u};if(n.extractText&&n.extractImages&&t&&a)if(r?.text&&n.includeImageRefs)l.textWithRefs=r.text;else if(n.includeImageRefs){let f=r?.text||t.text;l.textWithRefs=this.formatProcessor.generateTextWithImageRefs(f,a.images,n.imageRefFormat||"[IMAGE:{id}]",l.document.pages);}else l.textWithRefs=r?.text||t.text;else 
n.extractText&&t?l.textWithRefs=r?.text||t.text:n.extractImages&&a&&(l.textWithRefs=this.formatProcessor.generateImageOnlyRefs(a.images,n.imageRefFormat||"[IMAGE:{id}]"));if(l.summary={totalPages:l.document.pages,totalTextItems:0,totalImages:l.images.length,totalTextLength:l.document.textLength,averageImagesPerPage:(l.images.length/l.document.pages).toFixed(2),pagesWithImages:new Set(l.images.map(f=>f.page)).size},n.generateStructuredData){let f=l.textWithRefs||l.cleanText;l.structuredData=this.structuredDataGenerator.generateStructuredData(g,f,l.images,l.document.pages,n,c,i),n.verbose;}return n.verbose,l}async getText(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractImages:false})).text}async getImages(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:false,extractImages:true})).images}async getTextItems(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractTextItems:true})).textItems}async getRawText(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractImages:false})).rawText}async getPage(e,t,r={}){if(r.useCache!==false){let m=this.cacheManager.getCachedPageResult(e,t);if(m)return r.verbose,m}let a={...r,specificPages:[t]},s=await this.extract(e,a),n=this.extractPageText(s.textWithRefs||s.cleanText,t),o=s.images.filter(m=>m.page===t),c=s.textItems?.filter(m=>m.page===t)||[],i=this.extractRawText(n),g={pageNumber:t,text:n,rawText:i,textItems:c,images:o,metadata:{wordCount:this.countWords(i),characterCount:i.length,imageCount:o.length}};return r.useCache!==false&&this.cacheManager.cachePageResult(e,t,g),g}extractPageText(e,t){let r=/(?:--- PAGE (\d+) ---|🎨 ART BASEL PAGE (\d+) 🎨|PAGE (\d+))/g,a=e.split(r);if(a.length>1){for(let i=1;i<a.length;i+=4)if(parseInt(a[i]||a[i+1]||a[i+2]||"0",10)===t)return a[i+3]||""}let s=e.split(`
|
|
37
|
+
`),n=Math.ceil(s.length/t),o=(t-1)*n,c=Math.min(t*n,s.length);return s.slice(o,c).join(`
|
|
38
|
+
`)}countWords(e){return e.trim()?e.trim().split(/\s+/).length:0}extractRawText(e){let t=e;return t=t.replace(/--- PAGE \d+ ---\s*/g,""),t=t.replace(/🎨 ART BASEL PAGE \d+ 🎨\s*/g,""),t=t.replace(/PAGE \d+\s*/g,""),t=t.replace(/\[IMG:\w+\]\s*\w*\s*/g,""),t=t.replace(/\[IMG-\w+\]\s*[^[\n]*\s*/g,""),t=t.replace(/📷\s*[^-\n]*-\s*Page\s*\d+\s*-\s*Image\s*#\d+\s*/g,""),t=t.replace(/🎨\s*Art\s*Basel\s*Image\s*\d+\s*\(Page\s*\d+\)\s*/g,""),t=t.replace(/\n\s*\n\s*\n/g,`
|
|
39
|
+
|
|
40
|
+
`),t=t.replace(/^\s+|\s+$/g,""),t=t.replace(/[ \t]+/g," "),t}clearCache(e){this.cacheManager.clearCache(e);}getCacheStats(){return this.cacheManager.getCacheStats()}async generatePageImagesWithVariants(e,t,r){let a=new Map,s=r.imageOutputDir||"./page-images",n=r.pageImageFormat||"png",o=r.pageImageDpi||150,c=r.pageImageQualities||[r.pageImageQuality||90];r.verbose;let i=c[0],g={outputDir:T.join(s,n),format:n,quality:i,dpi:o,pages:t,verbose:r.verbose??false},m=await this.pageToImageConverter.convertToImages(e,g);for(let u of m.images){let l=w__default.statSync(u.filepath);a.set(u.page,{path:u.filepath,format:u.format,width:u.width,height:u.height,size:l.size,dpi:o,quality:i,variants:[]});}if(c.length>1)for(let u of c.slice(1)){let l={outputDir:T.join(s,`${n}-q${u}`),format:n,quality:u,dpi:o,pages:t,verbose:false},f=await this.pageToImageConverter.convertToImages(e,l);for(let h of f.images){let x=w__default.statSync(h.filepath),d=a.get(h.page);d&&d.variants.push({path:h.filepath,format:h.format,width:h.width,height:h.height,size:x.size,quality:u,dpi:o});}}return r.verbose,a}async generatePageThumbnails(e,t,r){let a=new Map,s=r.imageOutputDir||"./page-images",n=r.thumbnailQuality||80;r.verbose;let o={outputDir:T.join(s,"thumbnails"),format:"jpg",quality:n,dpi:72,scale:.25,pages:t,verbose:r.verbose??false,filenamePattern:"thumb-{page}.{ext}"},c=await this.pageToImageConverter.convertToImages(e,o);for(let i of c.images){let g=w__default.statSync(i.filepath);a.set(i.page,{path:i.filepath,format:i.format,width:i.width,height:i.height,size:g.size,quality:n});}return r.verbose,a}reportProgress(e,t){e.progressCallback&&e.progressCallback(t);}createValidationError(e,t){let r=new Error(e);return r.code="VALIDATION_ERROR",r.validationErrors=t,r}createExtractionError(e,t){let r=new Error(e);return r.code="EXTRACTION_ERROR",r.originalError=t,r}},B=new M;var 
Q=class{state;options;pdfPath;extractor;eventQueue=[];resolveNext=null;extractionPromise=null;constructor(e,t={}){this.pdfPath=e,this.options={progressInterval:5,enableBackpressure:true,maxBufferedPages:10,...t},this.extractor=new M,this.state={totalPages:0,pagesProcessed:0,imagesExtracted:0,totalTextLength:0,bytesProcessed:0,startTime:Date.now(),lastProgressTime:Date.now(),isPaused:false,isCancelled:false,isComplete:false,bufferedPages:0,eventQueue:[],callbacks:{}};}async*[Symbol.asyncIterator](){for(this.extractionPromise||(this.extractionPromise=this.startExtraction());;){if(this.state.isCancelled)return;if(this.eventQueue.length>0){let e=this.eventQueue.shift();if(yield e,e.type==="complete"||e.type==="error")return;continue}if(this.state.isComplete)return;await new Promise(e=>{this.resolveNext=()=>e();});}}on(e,t){return e==="start"?this.state.callbacks.onStart=t:e==="page"?this.state.callbacks.onPage=t:e==="image"?this.state.callbacks.onImage=t:e==="progress"?this.state.callbacks.onProgress=t:e==="complete"?this.state.callbacks.onComplete=t:e==="error"?this.state.callbacks.onError=t:e==="any"&&(this.state.callbacks.onAny=t),this}async cancel(){this.state.isCancelled=true,this.resolveNext&&this.resolveNext();}pause(){this.state.isPaused=true;}resume(){this.state.isPaused=false;}getStats(){let e=Date.now()-this.state.startTime,t=this.state.pagesProcessed>0?e/this.state.pagesProcessed:0,r=this.state.totalPages-this.state.pagesProcessed,a=t*r;return {pagesProcessed:this.state.pagesProcessed,totalPages:this.state.totalPages,imagesExtracted:this.state.imagesExtracted,bytesProcessed:this.state.bytesProcessed,startTime:this.state.startTime,elapsedTime:e,isPaused:this.state.isPaused,isCancelled:this.state.isCancelled,isComplete:this.state.isComplete,averagePageTime:t,estimatedTimeRemaining:a}}async emitEvent(e){this.eventQueue.push(e),e.type==="start"&&this.state.callbacks.onStart?await this.state.callbacks.onStart(e):e.type==="page"&&this.state.callbacks.onPage?await 
this.state.callbacks.onPage(e):e.type==="image"&&this.state.callbacks.onImage?await this.state.callbacks.onImage(e):e.type==="progress"&&this.state.callbacks.onProgress?await this.state.callbacks.onProgress(e):e.type==="complete"&&this.state.callbacks.onComplete?await this.state.callbacks.onComplete(e):e.type==="error"&&this.state.callbacks.onError&&await this.state.callbacks.onError(e),this.state.callbacks.onAny&&await this.state.callbacks.onAny(e),this.resolveNext&&(this.resolveNext(),this.resolveNext=null);}async startExtraction(){try{let e=await this.extractor.extract(this.pdfPath,{...this.options,extractImageFiles:!1,extractImages:!1,verbose:!1});this.state.totalPages=e.document.pages||0,await this.emitEvent({type:"start",timestamp:Date.now(),totalPages:this.state.totalPages,pdfPath:this.pdfPath});let t=Array.from({length:this.state.totalPages},(a,s)=>s+1);for(let a of t){if(this.state.isCancelled)break;for(;(this.state.isPaused||this.options.enableBackpressure&&this.state.bufferedPages>=(this.options.maxBufferedPages||10))&&(await new Promise(n=>setTimeout(n,100)),!this.state.isCancelled););let s=await this.extractor.getPage(this.pdfPath,a,this.options);if(this.state.pagesProcessed++,this.state.bufferedPages++,await this.emitEvent({type:"page",timestamp:Date.now(),pageNumber:a,totalPages:this.state.totalPages,textLength:s.text.length||0,imageCount:s.images.length||0}),s.images&&s.images.length>0&&await Promise.all(s.images.map(async(n,o)=>{n&&(this.state.imagesExtracted++,await this.emitEvent({type:"image",timestamp:Date.now(),image:n,pageNumber:a,imageIndex:o+1,totalImages:s.images.length}));})),this.state.totalTextLength+=s.text.length||0,this.state.bufferedPages--,a%(this.options.progressInterval||5)===0||a===this.state.totalPages){let n=this.getStats();await 
this.emitEvent({type:"progress",timestamp:Date.now(),pagesProcessed:this.state.pagesProcessed,totalPages:this.state.totalPages,imagesExtracted:this.state.imagesExtracted,percentComplete:this.state.pagesProcessed/this.state.totalPages*100,estimatedTimeRemaining:n.estimatedTimeRemaining});}}this.state.isComplete=!0;let r=Date.now()-this.state.startTime;await this.emitEvent({type:"complete",timestamp:Date.now(),totalPages:this.state.totalPages,totalImages:this.state.imagesExtracted,totalTextLength:this.state.totalTextLength,duration:r});}catch(e){await this.emitEvent({type:"error",timestamp:Date.now(),error:e instanceof Error?e:new Error(String(e)),recoverable:false}),this.state.isComplete=true;}}};ae();ae();async function pt(p,e={}){return e.autoStreamThreshold&&e.streamMode!==false&&e.autoStreamThreshold>0&&(await B.extract(p,{extractText:true,extractImages:false,extractImageFiles:false,verbose:false})).document.pages>e.autoStreamThreshold?(e.verbose,qe(p,{...e,streamMode:true})):B.extract(p,e)}async function ht(p,e={}){return B.extractText(p,e)}async function dt(p,e={}){return B.extractImages(p,e)}async function xt(p,e="./extracted-images",t={}){return B.extractImageFiles(p,e,t)}function qe(p,e={}){return new Q(p,e)}var bt="1.0.3",_r={PDFExtractor:M,pdfExtractor:B,StreamingPDFExtractor:Q,TextExtractor:W,ImageExtractor:D,ImageOptimizer:O,FormatProcessor:N,extractPdfContent:pt,extractText:ht,extractImages:dt,extractImageFiles:xt,extractPdfStream:qe,validateConfig:X,validateImageRefFormat:me,validateFilePath:ue,version:bt};export{N as FormatProcessor,D as ImageExtractor,O as ImageOptimizer,M as PDFExtractor,q as PageToImageConverter,Q as StreamingPDFExtractor,G as StructuredTextExtractor,W as TextExtractor,_r as default,xt as extractImageFiles,dt as extractImages,pt as extractPdfContent,qe as extractPdfStream,ht as extractText,B as pdfExtractor,X as validateConfig,ue as validateFilePath,me as validateImageRefFormat,bt as version};//# sourceMappingURL=index.mjs.map
|
|
42
41
|
//# sourceMappingURL=index.mjs.map
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
'use strict';

/**
 * Worker-thread entry point that decodes image stream data.
 *
 * Incoming message: `{ type: "decode", taskId, data: { buffer, options: { filter } } }`.
 * Reply on success:  `{ success: true, taskId, data, stats: { duration, inputSize, outputSize } }`.
 * Reply on failure:  `{ success: false, taskId, error }`.
 *
 * Only "FlateDecode" is actually decoded (zlib inflate); every other filter
 * value, including "DCTDecode" and "JPXDecode", is returned unchanged.
 */
const { parentPort } = require('worker_threads');
const zlib = require('zlib');
const { promisify } = require('util');

const inflateAsync = promisify(zlib.inflate);

if (!parentPort) {
  throw new Error("This script must be run as a worker thread");
}

/**
 * Decodes `buffer` according to `filter`.
 * @returns the inflated data for "FlateDecode", otherwise `buffer` itself.
 */
async function decodeImageBuffer(buffer, filter) {
  if (filter === "FlateDecode") {
    return inflateAsync(buffer);
  }
  return buffer;
}

parentPort.on("message", async (task) => {
  const startedAt = Date.now();
  try {
    if (task?.type !== "decode") {
      throw new Error(`Invalid task type: ${task?.type}`);
    }
    const { buffer, options } = task.data ?? {};
    if (buffer == null) {
      throw new Error("Decode task is missing data.buffer");
    }
    if (options == null) {
      throw new Error("Decode task is missing data.options");
    }

    const inputSize = buffer.length;
    const decoded = await decodeImageBuffer(buffer, options.filter);

    parentPort.postMessage({
      success: true,
      taskId: task.taskId,
      data: decoded,
      stats: {
        duration: Date.now() - startedAt,
        inputSize,
        outputSize: decoded.length,
      },
    });
  } catch (error) {
    // `task` itself may be null/undefined here, so read taskId defensively;
    // throwing from this handler would leave the sender without any reply.
    parentPort.postMessage({
      success: false,
      taskId: task?.taskId,
      error: error instanceof Error ? error.message : "Unknown decode error",
    });
  }
});
//# sourceMappingURL=image-decoder.worker.js.map
|
|
2
|
+
//# sourceMappingURL=image-decoder.worker.js.map
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
/**
 * Worker-thread entry point that decodes image stream data.
 *
 * Incoming message: `{ type: "decode", taskId, data: { buffer, options: { filter } } }`.
 * Reply on success:  `{ success: true, taskId, data, stats: { duration, inputSize, outputSize } }`.
 * Reply on failure:  `{ success: false, taskId, error }`.
 *
 * Only "FlateDecode" is actually decoded (zlib inflate); every other filter
 * value, including "DCTDecode" and "JPXDecode", is returned unchanged.
 */
import { parentPort } from 'worker_threads';
import zlib from 'zlib';
import { promisify } from 'util';

const inflateAsync = promisify(zlib.inflate);

if (!parentPort) {
  throw new Error("This script must be run as a worker thread");
}

/**
 * Decodes `buffer` according to `filter`.
 * @returns the inflated data for "FlateDecode", otherwise `buffer` itself.
 */
async function decodeImageBuffer(buffer, filter) {
  if (filter === "FlateDecode") {
    return inflateAsync(buffer);
  }
  return buffer;
}

parentPort.on("message", async (task) => {
  const startedAt = Date.now();
  try {
    if (task?.type !== "decode") {
      throw new Error(`Invalid task type: ${task?.type}`);
    }
    const { buffer, options } = task.data ?? {};
    if (buffer == null) {
      throw new Error("Decode task is missing data.buffer");
    }
    if (options == null) {
      throw new Error("Decode task is missing data.options");
    }

    const inputSize = buffer.length;
    const decoded = await decodeImageBuffer(buffer, options.filter);

    parentPort.postMessage({
      success: true,
      taskId: task.taskId,
      data: decoded,
      stats: {
        duration: Date.now() - startedAt,
        inputSize,
        outputSize: decoded.length,
      },
    });
  } catch (error) {
    // `task` itself may be null/undefined here, so read taskId defensively;
    // throwing from this handler would leave the sender without any reply.
    parentPort.postMessage({
      success: false,
      taskId: task?.taskId,
      error: error instanceof Error ? error.message : "Unknown decode error",
    });
  }
});
//# sourceMappingURL=image-decoder.worker.mjs.map
|
|
2
|
+
//# sourceMappingURL=image-decoder.worker.mjs.map
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
'use strict';

/**
 * Worker-thread entry point that converts JPEG 2000 data to JPEG.
 *
 * Incoming message: `{ type: "convert", taskId, data: { buffer, options? } }`
 * where `options.quality` defaults to 100 and `options.useSharp` to false.
 * Reply on success:  `{ success: true, taskId, data, stats: { duration, inputSize, outputSize } }`.
 * Reply on failure:  `{ success: false, taskId, error }`.
 *
 * The buffer is written to a temp file, decoded with the OpenJPEG WASM codec
 * and re-encoded either with sharp (when requested and importable) or Jimp.
 */
const fs = require('fs');
const path = require('path');
const os = require('os');
const jimpModule = require('jimp');
const { parentPort } = require('worker_threads');

// Same interop the bundler applied: use `.default` for ES-module builds,
// otherwise the module object itself.
const Jimp = jimpModule && jimpModule.__esModule ? jimpModule.default : jimpModule;

/** Resolves to true when `sharp` can be imported. */
async function isSharpAvailable() {
  try {
    await import('sharp');
    return true;
  } catch {
    return false;
  }
}

/** Resolves to sharp's default export, or null when it cannot be imported. */
async function loadSharp() {
  try {
    return (await import('sharp')).default;
  } catch {
    return null;
  }
}

let codecPromise = null;

/**
 * Initialises the OpenJPEG codec once and shares it between calls. The
 * promise (not the resolved module) is cached so that overlapping tasks do
 * not each start their own initialisation; a failed load is not cached.
 */
function loadOpenJpegCodec() {
  if (!codecPromise) {
    codecPromise = import('@cornerstonejs/codec-openjpeg')
      .then((mod) => mod.default({ print: () => {}, printErr: () => {} }))
      .catch((error) => {
        codecPromise = null;
        throw error;
      });
  }
  return codecPromise;
}

/**
 * Output path for a converted file: a trailing `.jp2` is swapped for `.jpg`;
 * any other name gets `.jpg` appended so the output can never coincide with
 * the input (which would otherwise be deleted right after being written).
 */
function jpgPathFor(inputPath) {
  const jp2Suffix = /\.jp2$/i;
  return jp2Suffix.test(inputPath) ? inputPath.replace(jp2Suffix, ".jpg") : `${inputPath}.jpg`;
}

/** Decodes a JPEG 2000 file and returns a private copy of its pixels plus frame info. */
async function decodeJp2File(inputPath) {
  const encoded = fs.readFileSync(inputPath);
  const codec = await loadOpenJpegCodec();
  const decoder = new codec.J2KDecoder();
  try {
    decoder.getEncodedBuffer(encoded.length).set(encoded);
    decoder.decode();
    // Buffer.from() on a typed array copies, so the pixels outlive the decoder.
    const pixels = Buffer.from(decoder.getDecodedBuffer());
    const frameInfo = decoder.getFrameInfo();
    return { pixels, frameInfo };
  } finally {
    // Presumably an embind-style handle that must be released explicitly;
    // called only when the method exists — TODO confirm against the codec.
    decoder.delete?.();
  }
}

/**
 * Shared conversion skeleton: checks the input, decodes it, lets `encode`
 * write the JPEG, then reports sizes. Failures are returned as
 * `{ success: false, error }` rather than thrown.
 */
async function convertWith(inputPath, options, encode) {
  const quality = options.quality !== undefined ? options.quality : 100;
  const deleteOriginal = options.deleteOriginal !== undefined ? options.deleteOriginal : true;

  if (!fs.existsSync(inputPath)) {
    return { success: false, error: `File not found: ${inputPath}` };
  }
  try {
    const originalSize = fs.statSync(inputPath).size;
    const newPath = jpgPathFor(inputPath);
    const { pixels, frameInfo } = await decodeJp2File(inputPath);
    await encode(pixels, frameInfo, newPath, quality);
    const newSize = fs.statSync(newPath).size;
    if (deleteOriginal) {
      fs.unlinkSync(inputPath);
    }
    return { success: true, newPath, originalSize, newSize };
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    return { success: false, error: `Conversion failed: ${reason}` };
  }
}

/** Converts using the WASM decoder and Jimp as the JPEG encoder. */
async function convertJp2ToJpgWasm(inputPath, options = {}) {
  return convertWith(inputPath, options, async (pixels, frameInfo, newPath, quality) => {
    // NOTE(review): the decoded buffer is handed to Jimp as-is; confirm that
    // every component count the decoder can produce is accepted here.
    const image = new Jimp({ data: pixels, width: frameInfo.width, height: frameInfo.height });
    await image.quality(quality).writeAsync(newPath);
  });
}

/** Converts using the WASM decoder and sharp as the JPEG encoder. */
async function convertJp2ToJpgSharp(inputPath, options = {}) {
  return convertWith(inputPath, options, async (pixels, frameInfo, newPath, quality) => {
    const sharp = await loadSharp();
    if (!sharp) {
      throw new Error("Sharp module not available");
    }
    await sharp(pixels, {
      raw: {
        width: frameInfo.width,
        height: frameInfo.height,
        channels: frameInfo.componentCount,
      },
    })
      .jpeg({ quality, chromaSubsampling: "4:4:4", mozjpeg: true })
      .toFile(newPath);
  });
}

/** Picks the sharp path when requested and importable, otherwise Jimp. */
async function convertJp2ToJpg(inputPath, options = {}) {
  if (options.useSharp && (await isSharpAvailable())) {
    return convertJp2ToJpgSharp(inputPath, options);
  }
  return convertJp2ToJpgWasm(inputPath, options);
}

/** Best-effort delete; each file is handled independently of the others. */
function removeIfExists(filePath) {
  try {
    if (fs.existsSync(filePath)) {
      fs.unlinkSync(filePath);
    }
  } catch {
    // Cleanup only: a leftover temp file must not fail the task.
  }
}

if (!parentPort) {
  throw new Error("This script must be run as a worker thread");
}

parentPort.on("message", async (task) => {
  const startedAt = Date.now();
  try {
    if (task?.type !== "convert") {
      throw new Error(`Invalid task type: ${task?.type}`);
    }
    const { buffer, options } = task.data ?? {};
    if (buffer == null) {
      throw new Error("Convert task is missing data.buffer");
    }
    const quality = options?.quality ?? 100;
    const useSharp = options?.useSharp ?? false;

    const tempDir = os.tmpdir();
    const tempJp2 = path.join(tempDir, `worker-${task.taskId}.jp2`);
    const tempJpg = path.join(tempDir, `worker-${task.taskId}.jpg`);

    try {
      fs.writeFileSync(tempJp2, buffer);
      const result = await convertJp2ToJpg(tempJp2, {
        quality,
        verbose: false,
        deleteOriginal: true,
        useSharp,
      });
      if (!result.success || !result.newPath) {
        throw new Error(result.error || "Conversion failed");
      }
      const jpegData = fs.readFileSync(result.newPath);
      removeIfExists(result.newPath);

      parentPort.postMessage({
        success: true,
        taskId: task.taskId,
        data: jpegData,
        stats: {
          duration: Date.now() - startedAt,
          inputSize: buffer.length,
          outputSize: jpegData.length,
        },
      });
    } finally {
      removeIfExists(tempJp2);
      removeIfExists(tempJpg);
    }
  } catch (error) {
    // `task` itself may be null/undefined here, so read taskId defensively;
    // throwing from this handler would leave the sender without any reply.
    parentPort.postMessage({
      success: false,
      taskId: task?.taskId,
      error: error instanceof Error ? error.message : "Unknown conversion error",
    });
  }
});
//# sourceMappingURL=jp2-converter.worker.js.map
|
|
2
|
+
//# sourceMappingURL=jp2-converter.worker.js.map
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
/**
 * Worker-thread entry point that converts JPEG 2000 data to JPEG.
 *
 * Incoming message: `{ type: "convert", taskId, data: { buffer, options? } }`
 * where `options.quality` defaults to 100 and `options.useSharp` to false.
 * Reply on success:  `{ success: true, taskId, data, stats: { duration, inputSize, outputSize } }`.
 * Reply on failure:  `{ success: false, taskId, error }`.
 *
 * The buffer is written to a temp file, decoded with the OpenJPEG WASM codec
 * and re-encoded either with sharp (when requested and importable) or Jimp.
 */
import fs from 'fs';
import path from 'path';
import os from 'os';
import Jimp from 'jimp';
import { parentPort } from 'worker_threads';

/** Resolves to true when `sharp` can be imported. */
async function isSharpAvailable() {
  try {
    await import('sharp');
    return true;
  } catch {
    return false;
  }
}

/** Resolves to sharp's default export, or null when it cannot be imported. */
async function loadSharp() {
  try {
    return (await import('sharp')).default;
  } catch {
    return null;
  }
}

let codecPromise = null;

/**
 * Initialises the OpenJPEG codec once and shares it between calls. The
 * promise (not the resolved module) is cached so that overlapping tasks do
 * not each start their own initialisation; a failed load is not cached.
 */
function loadOpenJpegCodec() {
  if (!codecPromise) {
    codecPromise = import('@cornerstonejs/codec-openjpeg')
      .then((mod) => mod.default({ print: () => {}, printErr: () => {} }))
      .catch((error) => {
        codecPromise = null;
        throw error;
      });
  }
  return codecPromise;
}

/**
 * Output path for a converted file: a trailing `.jp2` is swapped for `.jpg`;
 * any other name gets `.jpg` appended so the output can never coincide with
 * the input (which would otherwise be deleted right after being written).
 */
function jpgPathFor(inputPath) {
  const jp2Suffix = /\.jp2$/i;
  return jp2Suffix.test(inputPath) ? inputPath.replace(jp2Suffix, ".jpg") : `${inputPath}.jpg`;
}

/** Decodes a JPEG 2000 file and returns a private copy of its pixels plus frame info. */
async function decodeJp2File(inputPath) {
  const encoded = fs.readFileSync(inputPath);
  const codec = await loadOpenJpegCodec();
  const decoder = new codec.J2KDecoder();
  try {
    decoder.getEncodedBuffer(encoded.length).set(encoded);
    decoder.decode();
    // Buffer.from() on a typed array copies, so the pixels outlive the decoder.
    const pixels = Buffer.from(decoder.getDecodedBuffer());
    const frameInfo = decoder.getFrameInfo();
    return { pixels, frameInfo };
  } finally {
    // Presumably an embind-style handle that must be released explicitly;
    // called only when the method exists — TODO confirm against the codec.
    decoder.delete?.();
  }
}

/**
 * Shared conversion skeleton: checks the input, decodes it, lets `encode`
 * write the JPEG, then reports sizes. Failures are returned as
 * `{ success: false, error }` rather than thrown.
 */
async function convertWith(inputPath, options, encode) {
  const quality = options.quality !== undefined ? options.quality : 100;
  const deleteOriginal = options.deleteOriginal !== undefined ? options.deleteOriginal : true;

  if (!fs.existsSync(inputPath)) {
    return { success: false, error: `File not found: ${inputPath}` };
  }
  try {
    const originalSize = fs.statSync(inputPath).size;
    const newPath = jpgPathFor(inputPath);
    const { pixels, frameInfo } = await decodeJp2File(inputPath);
    await encode(pixels, frameInfo, newPath, quality);
    const newSize = fs.statSync(newPath).size;
    if (deleteOriginal) {
      fs.unlinkSync(inputPath);
    }
    return { success: true, newPath, originalSize, newSize };
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    return { success: false, error: `Conversion failed: ${reason}` };
  }
}

/** Converts using the WASM decoder and Jimp as the JPEG encoder. */
async function convertJp2ToJpgWasm(inputPath, options = {}) {
  return convertWith(inputPath, options, async (pixels, frameInfo, newPath, quality) => {
    // NOTE(review): the decoded buffer is handed to Jimp as-is; confirm that
    // every component count the decoder can produce is accepted here.
    const image = new Jimp({ data: pixels, width: frameInfo.width, height: frameInfo.height });
    await image.quality(quality).writeAsync(newPath);
  });
}

/** Converts using the WASM decoder and sharp as the JPEG encoder. */
async function convertJp2ToJpgSharp(inputPath, options = {}) {
  return convertWith(inputPath, options, async (pixels, frameInfo, newPath, quality) => {
    const sharp = await loadSharp();
    if (!sharp) {
      throw new Error("Sharp module not available");
    }
    await sharp(pixels, {
      raw: {
        width: frameInfo.width,
        height: frameInfo.height,
        channels: frameInfo.componentCount,
      },
    })
      .jpeg({ quality, chromaSubsampling: "4:4:4", mozjpeg: true })
      .toFile(newPath);
  });
}

/** Picks the sharp path when requested and importable, otherwise Jimp. */
async function convertJp2ToJpg(inputPath, options = {}) {
  if (options.useSharp && (await isSharpAvailable())) {
    return convertJp2ToJpgSharp(inputPath, options);
  }
  return convertJp2ToJpgWasm(inputPath, options);
}

/** Best-effort delete; each file is handled independently of the others. */
function removeIfExists(filePath) {
  try {
    if (fs.existsSync(filePath)) {
      fs.unlinkSync(filePath);
    }
  } catch {
    // Cleanup only: a leftover temp file must not fail the task.
  }
}

if (!parentPort) {
  throw new Error("This script must be run as a worker thread");
}

parentPort.on("message", async (task) => {
  const startedAt = Date.now();
  try {
    if (task?.type !== "convert") {
      throw new Error(`Invalid task type: ${task?.type}`);
    }
    const { buffer, options } = task.data ?? {};
    if (buffer == null) {
      throw new Error("Convert task is missing data.buffer");
    }
    const quality = options?.quality ?? 100;
    const useSharp = options?.useSharp ?? false;

    const tempDir = os.tmpdir();
    const tempJp2 = path.join(tempDir, `worker-${task.taskId}.jp2`);
    const tempJpg = path.join(tempDir, `worker-${task.taskId}.jpg`);

    try {
      fs.writeFileSync(tempJp2, buffer);
      const result = await convertJp2ToJpg(tempJp2, {
        quality,
        verbose: false,
        deleteOriginal: true,
        useSharp,
      });
      if (!result.success || !result.newPath) {
        throw new Error(result.error || "Conversion failed");
      }
      const jpegData = fs.readFileSync(result.newPath);
      removeIfExists(result.newPath);

      parentPort.postMessage({
        success: true,
        taskId: task.taskId,
        data: jpegData,
        stats: {
          duration: Date.now() - startedAt,
          inputSize: buffer.length,
          outputSize: jpegData.length,
        },
      });
    } finally {
      removeIfExists(tempJp2);
      removeIfExists(tempJpg);
    }
  } catch (error) {
    // `task` itself may be null/undefined here, so read taskId defensively;
    // throwing from this handler would leave the sender without any reply.
    parentPort.postMessage({
      success: false,
      taskId: task?.taskId,
      error: error instanceof Error ? error.message : "Unknown conversion error",
    });
  }
});
//# sourceMappingURL=jp2-converter.worker.mjs.map
|
|
2
|
+
//# sourceMappingURL=jp2-converter.worker.mjs.map
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pdf-plus",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.4",
|
|
4
4
|
"description": "A comprehensive PDF content extraction library with support for text, images, and structured data",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/index.mjs",
|
|
@@ -45,21 +45,31 @@
|
|
|
45
45
|
"node": ">=18.0.0"
|
|
46
46
|
},
|
|
47
47
|
"dependencies": {
|
|
48
|
-
"
|
|
48
|
+
"@cornerstonejs/codec-openjpeg": "^1.2.4",
|
|
49
|
+
"canvas": "^3.2.0",
|
|
50
|
+
"canvas-5-polyfill": "^0.1.5",
|
|
51
|
+
"file-type": "^21.0.0",
|
|
52
|
+
"image-size": "^1.1.1",
|
|
53
|
+
"jimp": "0.22.12",
|
|
49
54
|
"pdf-lib": "^1.17.1",
|
|
50
|
-
"pdf-parse": "github:iamh2o/pdf-parse#1.1.3",
|
|
51
55
|
"pdfjs-dist": "^5.4.149",
|
|
52
|
-
"pngjs": "^7.0.0"
|
|
56
|
+
"pngjs": "^7.0.0",
|
|
57
|
+
"utif": "^3.1.0"
|
|
58
|
+
},
|
|
59
|
+
"optionalDependencies": {
|
|
60
|
+
"sharp": "^0.33.5"
|
|
53
61
|
},
|
|
54
62
|
"devDependencies": {
|
|
55
63
|
"@biomejs/biome": "^2.2.4",
|
|
56
64
|
"@types/node": "^24.5.2",
|
|
57
|
-
"@types/pdf-parse": "^1.1.5",
|
|
58
65
|
"@types/pngjs": "^6.0.5",
|
|
66
|
+
"@types/utif": "^3.0.6",
|
|
67
|
+
"@vitest/ui": "^3.2.4",
|
|
59
68
|
"rimraf": "^6.0.1",
|
|
60
69
|
"tsup": "^8.3.5",
|
|
61
70
|
"typedoc": "^0.28.13",
|
|
62
|
-
"typescript": "^5.9.2"
|
|
71
|
+
"typescript": "^5.9.2",
|
|
72
|
+
"vitest": "^3.2.4"
|
|
63
73
|
},
|
|
64
74
|
"scripts": {
|
|
65
75
|
"build": "tsup",
|
|
@@ -72,7 +82,12 @@
|
|
|
72
82
|
"check": "biome check src",
|
|
73
83
|
"check:fix": "biome check --write src",
|
|
74
84
|
"typecheck": "tsc --noEmit",
|
|
75
|
-
"test
|
|
85
|
+
"test": "vitest run",
|
|
86
|
+
"test:watch": "vitest watch",
|
|
87
|
+
"test:ui": "vitest --ui",
|
|
88
|
+
"test:coverage": "vitest run --coverage",
|
|
89
|
+
"test:unit": "vitest run tests/unit",
|
|
90
|
+
"test:integration": "vitest run tests/integration",
|
|
76
91
|
"clean": "rimraf dist",
|
|
77
92
|
"docs:dev": "typedoc --watch",
|
|
78
93
|
"docs:build": "typedoc"
|