pdf-plus 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +1 -1
- package/dist/index.mjs +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -33,7 +33,7 @@ ${this.formatImageReference(b,r,t.indexOf(b)+1)}
|
|
|
33
33
|
|
|
34
34
|
`).replace(/^\s+|\s+$/g,"").replace(/[ \t]+/g," ")}generateStructuredData(e,t,r,a,s,n,o){let c=this.splitTextIntoPages(t,a),i=this.createPageDataArray(c,r,a,n,o);return {metadata:{filename:e,extractedAt:new Date().toISOString(),totalPages:a,totalTextLength:t.length,totalImages:r.length,extractionOptions:s},pages:i}}splitTextIntoPages(e,t){if(t<=1)return [e];let r=/(?:--- PAGE \d+ ---|🎨 ART BASEL PAGE \d+ 🎨|PAGE \d+)/g,a=e.match(r);return a&&a.length>0?this.splitByPageMarkers(e,r):this.splitByEstimatedLength(e,t)}splitByPageMarkers(e,t){let a=e.split(t).slice(1).map(s=>s.trim()).filter(s=>s.length>0);return a.length===0?[e]:a}splitByEstimatedLength(e,t){let r=e.split(`
|
|
35
35
|
`),a=Math.ceil(r.length/t);return Array.from({length:t},(o,c)=>c).map(o=>{let c=o*a,i=Math.min((o+1)*a,r.length);return r.slice(c,i).join(`
|
|
36
|
-
`)})}createPageDataArray(e,t,r,a,s){return Array.from({length:r},(c,i)=>i).map(c=>{let i=c+1,g=e[c]||"",m=this.getImagesForPage(t,i),u=this.extractRawText(g),l={pageNumber:i,text:{content:g,rawText:u,wordCount:this.countWords(u),characterCount:u.length},images:m,imageCount:m.length};if(a&&a.has(i)&&(l.pageImage=a.get(i)),s&&s.has(i)&&(l.thumbnail=s.get(i)),a&&a.has(i)){let f=a.get(i);f.variants&&f.variants.length>0&&(l.pageImageVariants=f.variants);}return l})}getImagesForPage(e,t){return e.filter(r=>r.page===t).map(r=>{let a={id:r.id,name:r.name||`image_${r.id}`,position:r.position,format:r.format||"unknown"};if("filename"in r&&r.filename!==void 0&&(a.filename=r.filename),"path"in r){let s=r.path;s!==void 0&&(a.path=s);}if("filePath"in r){let s=r.filePath;s!==void 0&&(a.path=s);}return "size"in r&&r.size!==void 0&&(a.size=r.size),"width"in r&&r.width!==void 0&&(a.width=r.width),"height"in r&&r.height!==void 0&&(a.height=r.height),"mimeType"in r&&r.mimeType!==void 0&&(a.mimeType=r.mimeType),a})}countWords(e){return e.trim()?e.trim().split(/\s+/).length:0}generateJSONString(e,t=2){return JSON.stringify(e,null,t)}generateSummary(e){let t=e.pages.reduce((n,o)=>n+o.text.wordCount,0),r=e.pages.reduce((n,o)=>n+o.text.characterCount,0),a=e.pages.filter(n=>n.text.content.trim().length>0).length,s=e.pages.filter(n=>n.imageCount>0).length;return {totalWords:t,totalCharacters:r,averageWordsPerPage:Math.round(t/e.pages.length),averageImagesPerPage:Math.round(e.metadata.totalImages/e.pages.length*10)/10,pagesWithText:a,pagesWithImages:s}}};var ie=class{cacheDir;constructor(e="./tmp/pdf-cache"){this.cacheDir=e,this.ensureCacheDir();}generateCacheKey(e){let t=T__default.default.resolve(e),r=w__namespace.default.statSync(t),a=`${t}:${r.mtime.getTime()}:${r.size}`;return ft__default.default.createHash("md5").update(a).digest("hex")}getCacheDir(e){let t=this.generateCacheKey(e);return T__default.default.join(this.cacheDir,t)}ensureCacheDir(){w__namespace.default.existsSync(this.cacheDir)||w__namespace.default.mkdirSync(this.cacheDir,{recursive:true});}isCached(e){try{let t=this.getCacheDir(e),r=T__default.default.join(t,"cache-info.json");return w__namespace.default.existsSync(r)}catch{return false}}getCacheInfo(e){try{let t=this.getCacheDir(e),r=T__default.default.join(t,"cache-info.json");return w__namespace.default.existsSync(r)?JSON.parse(w__namespace.default.readFileSync(r,"utf-8")):null}catch{return null}}createCache(e,t){let r=this.getCacheDir(e);w__namespace.default.existsSync(r)||w__namespace.default.mkdirSync(r,{recursive:true});let a=w__namespace.default.statSync(e),s={pdfPath:T__default.default.resolve(e),lastModified:a.mtime.getTime(),totalPages:t,cacheDir:r,created:new Date().toISOString()},n=T__default.default.join(r,"cache-info.json");return w__namespace.default.writeFileSync(n,JSON.stringify(s,null,2)),r}cachePageResult(e,t,r){try{let a=this.getCacheDir(e),s=T__default.default.join(a,`page-${t}.json`);w__namespace.default.writeFileSync(s,JSON.stringify(r,null,2));}catch{}}getCachedPageResult(e,t){try{let r=this.getCacheDir(e),a=T__default.default.join(r,`page-${t}.json`);return w__namespace.default.existsSync(a)?JSON.parse(w__namespace.default.readFileSync(a,"utf-8")):null}catch{return null}}getAllCachedPages(e){try{let t=this.getCacheDir(e),r=[];if(!w__namespace.default.existsSync(t))return r;let s=w__namespace.default.readdirSync(t).filter(n=>n.startsWith("page-")&&n.endsWith(".json"));for(let n of s)try{let o=T__default.default.join(t,n),c=JSON.parse(w__namespace.default.readFileSync(o,"utf-8"));r.push(c);}catch{}return r.sort((n,o)=>n.pageNumber-o.pageNumber),r}catch{return []}}clearCache(e){try{let t=this.getCacheDir(e);w__namespace.default.existsSync(t)&&w__namespace.default.rmSync(t,{recursive:!0,force:!0});}catch{}}clearAllCache(){try{w__namespace.default.existsSync(this.cacheDir)&&w__namespace.default.rmSync(this.cacheDir,{recursive:!0,force:!0}),this.ensureCacheDir();}catch{}}getCacheStats(){try{if(!w__namespace.default.existsSync(this.cacheDir))return {totalCachedPdfs:0,totalCachedPages:0,totalCacheSize:0,cacheDir:this.cacheDir};let e=w__namespace.default.readdirSync(this.cacheDir),t=e.length,{totalCachedPages:r,totalCacheSize:a}=e.reduce((s,n)=>{let o=T__default.default.join(this.cacheDir,n);if(!w__namespace.default.statSync(o).isDirectory())return s;let c=w__namespace.default.readdirSync(o),i=c.filter(m=>m.startsWith("page-")&&m.endsWith(".json")),g=c.reduce((m,u)=>{let l=T__default.default.join(o,u);return m+w__namespace.default.statSync(l).size},0);return {totalCachedPages:s.totalCachedPages+i.length,totalCacheSize:s.totalCacheSize+g}},{totalCachedPages:0,totalCacheSize:0});return {totalCachedPdfs:t,totalCachedPages:r,totalCacheSize:a,cacheDir:this.cacheDir}}catch{return {totalCachedPdfs:0,totalCachedPages:0,totalCacheSize:0,cacheDir:this.cacheDir}}}};var M=class{textExtractor;imageExtractor;pageToImageConverter;formatProcessor;structuredDataGenerator;cacheManager;constructor(e){this.textExtractor=new W,this.imageExtractor=new D,this.pageToImageConverter=new q,this.formatProcessor=new N,this.structuredDataGenerator=new oe,this.cacheManager=new ie(e);}async extract(e,t={}){let r={pdfPath:e,outputDir:t.imageOutputDir||"./extracted-images",options:{extractText:true,extractImages:true,extractImageFiles:false,useImagePaths:false,imageRefFormat:"[IMAGE:{id}]",verbose:false,includePageMarkers:true,pageMarkerFormat:"--- PAGE {page} ---",...t}},a=this.validateConfiguration(r);if(a.length>0)throw this.createValidationError("Invalid configuration",a);try{if(!w__namespace.default.existsSync(e))throw new Error(`PDF file not found: ${e}`);let s=Date.now();this.reportProgress(r.options,{currentPage:0,totalPages:0,phase:"processing"});let n=null,o=null;if(r.options.extractText&&(r.options.verbose,n=await this.textExtractor.extract(e),r.options.includePageMarkers||r.options.includeImageRefs)){let l=r.options.pageMarkerFormat||"--- PAGE {page} ---",h={pageOffset:r.options.pageOffset||0,includeImageRefs:r.options.includeImageRefs??!1,imageRefFormat:r.options.imageRefFormat??"[IMG:{id}] {name}"};o=await this.textExtractor.extractWithPageMarkers(e,l,h);}let c=[];r.options.extractTextItems&&r.options.extractText&&(r.options.verbose,c=await this.textExtractor.extractTextItems(e,r.options));let i=null;r.options.extractImages&&(r.options.verbose,i=await this.imageExtractor.extract(e,r.options));let g=null,m=null;if(r.options.generatePageImages||r.options.generateThumbnails){let l=i?.totalPages||n?.numPages||0,f=r.options.pageNumbers||Array.from({length:l},(h,x)=>x+1);r.options.generatePageImages&&(g=await this.generatePageImagesWithVariants(e,f,r.options)),r.options.generateThumbnails&&(m=await this.generatePageThumbnails(e,f,r.options));}let u=await this.processResults(e,n,o,i,c,r.options,s,g,m);return this.reportProgress(r.options,{currentPage:u.document.pages,totalPages:u.document.pages,phase:"complete"}),u}catch(s){throw r.options.verbose,this.createExtractionError("PDF content extraction failed",s)}}async extractText(e,t={}){return (await this.extract(e,{...t,extractText:true,extractImages:false})).cleanText}async extractImages(e,t={}){return (await this.extract(e,{...t,extractText:false,extractImages:true})).images}async extractImageFiles(e,t="./extracted-images",r={}){return (await this.extract(e,{...r,extractImageFiles:true,imageOutputDir:t,useImagePaths:true})).images.filter(s=>s.filePath).map(s=>s.filePath)}validateConfiguration(e){return X(e)}async processResults(e,t,r,a,s,n,o,c,i){let g=T__default.default.basename(e),u=this.extractRawText(t?.text||""),l={document:{filename:g,pages:a?.totalPages||t?.numPages||0,textLength:t?.text?.length||0,extractedAt:new Date().toISOString(),metadata:t?.info||{},options:n},pages:[],images:a?.images||[],textItems:s,text:u,textWithRefs:"",cleanText:u};if(n.extractText&&n.extractImages&&t&&a)if(r?.text&&n.includeImageRefs)l.textWithRefs=r.text;else if(n.includeImageRefs){let f=r?.text||t.text;l.textWithRefs=this.formatProcessor.generateTextWithImageRefs(f,a.images,n.imageRefFormat||"[IMAGE:{id}]",l.document.pages);}else l.textWithRefs=r?.text||t.text;else n.extractText&&t?l.textWithRefs=r?.text||t.text:n.extractImages&&a&&(l.textWithRefs=this.formatProcessor.generateImageOnlyRefs(a.images,n.imageRefFormat||"[IMAGE:{id}]"));if(l.summary={totalPages:l.document.pages,totalTextItems:0,totalImages:l.images.length,totalTextLength:l.document.textLength,averageImagesPerPage:(l.images.length/l.document.pages).toFixed(2),pagesWithImages:new Set(l.images.map(f=>f.page)).size},n.generateStructuredData){let f=l.textWithRefs||l.cleanText;l.structuredData=this.structuredDataGenerator.generateStructuredData(g,f,l.images,l.document.pages,n,c,i),n.verbose;}return n.verbose,l}async getText(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractImages:false})).text}async getImages(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:false,extractImages:true})).images}async getTextItems(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractTextItems:true})).textItems}async getRawText(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractImages:false})).rawText}async getPage(e,t,r={}){if(r.useCache!==false){let m=this.cacheManager.getCachedPageResult(e,t);if(m)return r.verbose,m}let a={...r,specificPages:[t]},s=await this.extract(e,a),n=this.extractPageText(s.textWithRefs||s.cleanText,t),o=s.images.filter(m=>m.page===t),c=s.textItems?.filter(m=>m.page===t)||[],i=this.extractRawText(n),g={pageNumber:t,text:n,rawText:i,textItems:c,images:o,metadata:{wordCount:this.countWords(i),characterCount:i.length,imageCount:o.length}};return r.useCache!==false&&this.cacheManager.cachePageResult(e,t,g),g}extractPageText(e,t){let r=/(?:--- PAGE (\d+) ---|🎨 ART BASEL PAGE (\d+) 🎨|PAGE (\d+))/g,a=e.split(r);if(a.length>1){for(let i=1;i<a.length;i+=4)if(parseInt(a[i]||a[i+1]||a[i+2]||"0",10)===t)return a[i+3]||""}let s=e.split(`
|
|
36
|
+
`)})}createPageDataArray(e,t,r,a,s){return Array.from({length:r},(c,i)=>i).map(c=>{let i=c+1,g=e[c]||"",m=this.getImagesForPage(t,i),u=this.extractRawText(g),l={pageNumber:i,text:{content:g,rawText:u,wordCount:this.countWords(u),characterCount:u.length},images:m,imageCount:m.length};if(a&&a.has(i)&&(l.pageImage=a.get(i)),s&&s.has(i)&&(l.thumbnail=s.get(i)),a&&a.has(i)){let f=a.get(i);f.variants&&f.variants.length>0&&(l.pageImageVariants=f.variants);}return l})}getImagesForPage(e,t){return e.filter(r=>r.page===t).map(r=>{let a={id:r.id,name:r.name||`image_${r.id}`,position:r.position,format:r.format||"unknown"};if("filename"in r&&r.filename!==void 0&&(a.filename=r.filename),"path"in r){let s=r.path;s!==void 0&&(a.path=s);}if("filepath"in r&&r.filepath!==void 0&&(a.path=r.filepath),"filePath"in r){let s=r.filePath;s!==void 0&&(a.path=s);}return "size"in r&&r.size!==void 0&&(a.size=r.size),"width"in r&&r.width!==void 0&&(a.width=r.width),"height"in r&&r.height!==void 0&&(a.height=r.height),"mimeType"in r&&r.mimeType!==void 0&&(a.mimeType=r.mimeType),a})}countWords(e){return e.trim()?e.trim().split(/\s+/).length:0}generateJSONString(e,t=2){return JSON.stringify(e,null,t)}generateSummary(e){let t=e.pages.reduce((n,o)=>n+o.text.wordCount,0),r=e.pages.reduce((n,o)=>n+o.text.characterCount,0),a=e.pages.filter(n=>n.text.content.trim().length>0).length,s=e.pages.filter(n=>n.imageCount>0).length;return {totalWords:t,totalCharacters:r,averageWordsPerPage:Math.round(t/e.pages.length),averageImagesPerPage:Math.round(e.metadata.totalImages/e.pages.length*10)/10,pagesWithText:a,pagesWithImages:s}}};var ie=class{cacheDir;constructor(e="./tmp/pdf-cache"){this.cacheDir=e,this.ensureCacheDir();}generateCacheKey(e){let t=T__default.default.resolve(e),r=w__namespace.default.statSync(t),a=`${t}:${r.mtime.getTime()}:${r.size}`;return ft__default.default.createHash("md5").update(a).digest("hex")}getCacheDir(e){let t=this.generateCacheKey(e);return T__default.default.join(this.cacheDir,t)}ensureCacheDir(){w__namespace.default.existsSync(this.cacheDir)||w__namespace.default.mkdirSync(this.cacheDir,{recursive:true});}isCached(e){try{let t=this.getCacheDir(e),r=T__default.default.join(t,"cache-info.json");return w__namespace.default.existsSync(r)}catch{return false}}getCacheInfo(e){try{let t=this.getCacheDir(e),r=T__default.default.join(t,"cache-info.json");return w__namespace.default.existsSync(r)?JSON.parse(w__namespace.default.readFileSync(r,"utf-8")):null}catch{return null}}createCache(e,t){let r=this.getCacheDir(e);w__namespace.default.existsSync(r)||w__namespace.default.mkdirSync(r,{recursive:true});let a=w__namespace.default.statSync(e),s={pdfPath:T__default.default.resolve(e),lastModified:a.mtime.getTime(),totalPages:t,cacheDir:r,created:new Date().toISOString()},n=T__default.default.join(r,"cache-info.json");return w__namespace.default.writeFileSync(n,JSON.stringify(s,null,2)),r}cachePageResult(e,t,r){try{let a=this.getCacheDir(e),s=T__default.default.join(a,`page-${t}.json`);w__namespace.default.writeFileSync(s,JSON.stringify(r,null,2));}catch{}}getCachedPageResult(e,t){try{let r=this.getCacheDir(e),a=T__default.default.join(r,`page-${t}.json`);return w__namespace.default.existsSync(a)?JSON.parse(w__namespace.default.readFileSync(a,"utf-8")):null}catch{return null}}getAllCachedPages(e){try{let t=this.getCacheDir(e),r=[];if(!w__namespace.default.existsSync(t))return r;let s=w__namespace.default.readdirSync(t).filter(n=>n.startsWith("page-")&&n.endsWith(".json"));for(let n of s)try{let o=T__default.default.join(t,n),c=JSON.parse(w__namespace.default.readFileSync(o,"utf-8"));r.push(c);}catch{}return r.sort((n,o)=>n.pageNumber-o.pageNumber),r}catch{return []}}clearCache(e){try{let t=this.getCacheDir(e);w__namespace.default.existsSync(t)&&w__namespace.default.rmSync(t,{recursive:!0,force:!0});}catch{}}clearAllCache(){try{w__namespace.default.existsSync(this.cacheDir)&&w__namespace.default.rmSync(this.cacheDir,{recursive:!0,force:!0}),this.ensureCacheDir();}catch{}}getCacheStats(){try{if(!w__namespace.default.existsSync(this.cacheDir))return {totalCachedPdfs:0,totalCachedPages:0,totalCacheSize:0,cacheDir:this.cacheDir};let e=w__namespace.default.readdirSync(this.cacheDir),t=e.length,{totalCachedPages:r,totalCacheSize:a}=e.reduce((s,n)=>{let o=T__default.default.join(this.cacheDir,n);if(!w__namespace.default.statSync(o).isDirectory())return s;let c=w__namespace.default.readdirSync(o),i=c.filter(m=>m.startsWith("page-")&&m.endsWith(".json")),g=c.reduce((m,u)=>{let l=T__default.default.join(o,u);return m+w__namespace.default.statSync(l).size},0);return {totalCachedPages:s.totalCachedPages+i.length,totalCacheSize:s.totalCacheSize+g}},{totalCachedPages:0,totalCacheSize:0});return {totalCachedPdfs:t,totalCachedPages:r,totalCacheSize:a,cacheDir:this.cacheDir}}catch{return {totalCachedPdfs:0,totalCachedPages:0,totalCacheSize:0,cacheDir:this.cacheDir}}}};var M=class{textExtractor;imageExtractor;pageToImageConverter;formatProcessor;structuredDataGenerator;cacheManager;constructor(e){this.textExtractor=new W,this.imageExtractor=new D,this.pageToImageConverter=new q,this.formatProcessor=new N,this.structuredDataGenerator=new oe,this.cacheManager=new ie(e);}async extract(e,t={}){let r={pdfPath:e,outputDir:t.imageOutputDir||"./extracted-images",options:{extractText:true,extractImages:true,extractImageFiles:false,useImagePaths:false,imageRefFormat:"[IMAGE:{id}]",verbose:false,includePageMarkers:true,pageMarkerFormat:"--- PAGE {page} ---",...t}},a=this.validateConfiguration(r);if(a.length>0)throw this.createValidationError("Invalid configuration",a);try{if(!w__namespace.default.existsSync(e))throw new Error(`PDF file not found: ${e}`);let s=Date.now();this.reportProgress(r.options,{currentPage:0,totalPages:0,phase:"processing"});let n=null,o=null;if(r.options.extractText&&(r.options.verbose,n=await this.textExtractor.extract(e),r.options.includePageMarkers||r.options.includeImageRefs)){let l=r.options.pageMarkerFormat||"--- PAGE {page} ---",h={pageOffset:r.options.pageOffset||0,includeImageRefs:r.options.includeImageRefs??!1,imageRefFormat:r.options.imageRefFormat??"[IMG:{id}] {name}"};o=await this.textExtractor.extractWithPageMarkers(e,l,h);}let c=[];r.options.extractTextItems&&r.options.extractText&&(r.options.verbose,c=await this.textExtractor.extractTextItems(e,r.options));let i=null;r.options.extractImages&&(r.options.verbose,i=await this.imageExtractor.extract(e,r.options));let g=null,m=null;if(r.options.generatePageImages||r.options.generateThumbnails){let l=i?.totalPages||n?.numPages||0,f=r.options.pageNumbers||Array.from({length:l},(h,x)=>x+1);r.options.generatePageImages&&(g=await this.generatePageImagesWithVariants(e,f,r.options)),r.options.generateThumbnails&&(m=await this.generatePageThumbnails(e,f,r.options));}let u=await this.processResults(e,n,o,i,c,r.options,s,g,m);return this.reportProgress(r.options,{currentPage:u.document.pages,totalPages:u.document.pages,phase:"complete"}),u}catch(s){throw r.options.verbose,this.createExtractionError("PDF content extraction failed",s)}}async extractText(e,t={}){return (await this.extract(e,{...t,extractText:true,extractImages:false})).cleanText}async extractImages(e,t={}){return (await this.extract(e,{...t,extractText:false,extractImages:true})).images}async extractImageFiles(e,t="./extracted-images",r={}){return (await this.extract(e,{...r,extractImageFiles:true,imageOutputDir:t,useImagePaths:true})).images.filter(s=>s.filePath).map(s=>s.filePath)}validateConfiguration(e){return X(e)}async processResults(e,t,r,a,s,n,o,c,i){let g=T__default.default.basename(e),u=this.extractRawText(t?.text||""),l={document:{filename:g,pages:a?.totalPages||t?.numPages||0,textLength:t?.text?.length||0,extractedAt:new Date().toISOString(),metadata:t?.info||{},options:n},pages:[],images:a?.images||[],textItems:s,text:u,textWithRefs:"",cleanText:u};if(n.extractText&&n.extractImages&&t&&a)if(r?.text&&n.includeImageRefs)l.textWithRefs=r.text;else if(n.includeImageRefs){let f=r?.text||t.text;l.textWithRefs=this.formatProcessor.generateTextWithImageRefs(f,a.images,n.imageRefFormat||"[IMAGE:{id}]",l.document.pages);}else l.textWithRefs=r?.text||t.text;else n.extractText&&t?l.textWithRefs=r?.text||t.text:n.extractImages&&a&&(l.textWithRefs=this.formatProcessor.generateImageOnlyRefs(a.images,n.imageRefFormat||"[IMAGE:{id}]"));if(l.summary={totalPages:l.document.pages,totalTextItems:0,totalImages:l.images.length,totalTextLength:l.document.textLength,averageImagesPerPage:(l.images.length/l.document.pages).toFixed(2),pagesWithImages:new Set(l.images.map(f=>f.page)).size},n.generateStructuredData){let f=l.textWithRefs||l.cleanText;l.structuredData=this.structuredDataGenerator.generateStructuredData(g,f,l.images,l.document.pages,n,c,i),n.verbose;}return n.verbose,l}async getText(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractImages:false})).text}async getImages(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:false,extractImages:true})).images}async getTextItems(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractTextItems:true})).textItems}async getRawText(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractImages:false})).rawText}async getPage(e,t,r={}){if(r.useCache!==false){let m=this.cacheManager.getCachedPageResult(e,t);if(m)return r.verbose,m}let a={...r,specificPages:[t]},s=await this.extract(e,a),n=this.extractPageText(s.textWithRefs||s.cleanText,t),o=s.images.filter(m=>m.page===t),c=s.textItems?.filter(m=>m.page===t)||[],i=this.extractRawText(n),g={pageNumber:t,text:n,rawText:i,textItems:c,images:o,metadata:{wordCount:this.countWords(i),characterCount:i.length,imageCount:o.length}};return r.useCache!==false&&this.cacheManager.cachePageResult(e,t,g),g}extractPageText(e,t){let r=/(?:--- PAGE (\d+) ---|🎨 ART BASEL PAGE (\d+) 🎨|PAGE (\d+))/g,a=e.split(r);if(a.length>1){for(let i=1;i<a.length;i+=4)if(parseInt(a[i]||a[i+1]||a[i+2]||"0",10)===t)return a[i+3]||""}let s=e.split(`
|
|
37
37
|
`),n=Math.ceil(s.length/t),o=(t-1)*n,c=Math.min(t*n,s.length);return s.slice(o,c).join(`
|
|
38
38
|
`)}countWords(e){return e.trim()?e.trim().split(/\s+/).length:0}extractRawText(e){let t=e;return t=t.replace(/--- PAGE \d+ ---\s*/g,""),t=t.replace(/🎨 ART BASEL PAGE \d+ 🎨\s*/g,""),t=t.replace(/PAGE \d+\s*/g,""),t=t.replace(/\[IMG:\w+\]\s*\w*\s*/g,""),t=t.replace(/\[IMG-\w+\]\s*[^[\n]*\s*/g,""),t=t.replace(/📷\s*[^-\n]*-\s*Page\s*\d+\s*-\s*Image\s*#\d+\s*/g,""),t=t.replace(/🎨\s*Art\s*Basel\s*Image\s*\d+\s*\(Page\s*\d+\)\s*/g,""),t=t.replace(/\n\s*\n\s*\n/g,`
|
|
39
39
|
|
package/dist/index.mjs
CHANGED
|
@@ -33,7 +33,7 @@ ${this.formatImageReference(b,r,t.indexOf(b)+1)}
|
|
|
33
33
|
|
|
34
34
|
`).replace(/^\s+|\s+$/g,"").replace(/[ \t]+/g," ")}generateStructuredData(e,t,r,a,s,n,o){let c=this.splitTextIntoPages(t,a),i=this.createPageDataArray(c,r,a,n,o);return {metadata:{filename:e,extractedAt:new Date().toISOString(),totalPages:a,totalTextLength:t.length,totalImages:r.length,extractionOptions:s},pages:i}}splitTextIntoPages(e,t){if(t<=1)return [e];let r=/(?:--- PAGE \d+ ---|🎨 ART BASEL PAGE \d+ 🎨|PAGE \d+)/g,a=e.match(r);return a&&a.length>0?this.splitByPageMarkers(e,r):this.splitByEstimatedLength(e,t)}splitByPageMarkers(e,t){let a=e.split(t).slice(1).map(s=>s.trim()).filter(s=>s.length>0);return a.length===0?[e]:a}splitByEstimatedLength(e,t){let r=e.split(`
|
|
35
35
|
`),a=Math.ceil(r.length/t);return Array.from({length:t},(o,c)=>c).map(o=>{let c=o*a,i=Math.min((o+1)*a,r.length);return r.slice(c,i).join(`
|
|
36
|
-
`)})}createPageDataArray(e,t,r,a,s){return Array.from({length:r},(c,i)=>i).map(c=>{let i=c+1,g=e[c]||"",m=this.getImagesForPage(t,i),u=this.extractRawText(g),l={pageNumber:i,text:{content:g,rawText:u,wordCount:this.countWords(u),characterCount:u.length},images:m,imageCount:m.length};if(a&&a.has(i)&&(l.pageImage=a.get(i)),s&&s.has(i)&&(l.thumbnail=s.get(i)),a&&a.has(i)){let f=a.get(i);f.variants&&f.variants.length>0&&(l.pageImageVariants=f.variants);}return l})}getImagesForPage(e,t){return e.filter(r=>r.page===t).map(r=>{let a={id:r.id,name:r.name||`image_${r.id}`,position:r.position,format:r.format||"unknown"};if("filename"in r&&r.filename!==void 0&&(a.filename=r.filename),"path"in r){let s=r.path;s!==void 0&&(a.path=s);}if("filePath"in r){let s=r.filePath;s!==void 0&&(a.path=s);}return "size"in r&&r.size!==void 0&&(a.size=r.size),"width"in r&&r.width!==void 0&&(a.width=r.width),"height"in r&&r.height!==void 0&&(a.height=r.height),"mimeType"in r&&r.mimeType!==void 0&&(a.mimeType=r.mimeType),a})}countWords(e){return e.trim()?e.trim().split(/\s+/).length:0}generateJSONString(e,t=2){return JSON.stringify(e,null,t)}generateSummary(e){let t=e.pages.reduce((n,o)=>n+o.text.wordCount,0),r=e.pages.reduce((n,o)=>n+o.text.characterCount,0),a=e.pages.filter(n=>n.text.content.trim().length>0).length,s=e.pages.filter(n=>n.imageCount>0).length;return {totalWords:t,totalCharacters:r,averageWordsPerPage:Math.round(t/e.pages.length),averageImagesPerPage:Math.round(e.metadata.totalImages/e.pages.length*10)/10,pagesWithText:a,pagesWithImages:s}}};var ie=class{cacheDir;constructor(e="./tmp/pdf-cache"){this.cacheDir=e,this.ensureCacheDir();}generateCacheKey(e){let t=T.resolve(e),r=w__default.statSync(t),a=`${t}:${r.mtime.getTime()}:${r.size}`;return ft.createHash("md5").update(a).digest("hex")}getCacheDir(e){let t=this.generateCacheKey(e);return T.join(this.cacheDir,t)}ensureCacheDir(){w__default.existsSync(this.cacheDir)||w__default.mkdirSync(this.cacheDir,{recursive:true});}isCached(e){try{let t=this.getCacheDir(e),r=T.join(t,"cache-info.json");return w__default.existsSync(r)}catch{return false}}getCacheInfo(e){try{let t=this.getCacheDir(e),r=T.join(t,"cache-info.json");return w__default.existsSync(r)?JSON.parse(w__default.readFileSync(r,"utf-8")):null}catch{return null}}createCache(e,t){let r=this.getCacheDir(e);w__default.existsSync(r)||w__default.mkdirSync(r,{recursive:true});let a=w__default.statSync(e),s={pdfPath:T.resolve(e),lastModified:a.mtime.getTime(),totalPages:t,cacheDir:r,created:new Date().toISOString()},n=T.join(r,"cache-info.json");return w__default.writeFileSync(n,JSON.stringify(s,null,2)),r}cachePageResult(e,t,r){try{let a=this.getCacheDir(e),s=T.join(a,`page-${t}.json`);w__default.writeFileSync(s,JSON.stringify(r,null,2));}catch{}}getCachedPageResult(e,t){try{let r=this.getCacheDir(e),a=T.join(r,`page-${t}.json`);return w__default.existsSync(a)?JSON.parse(w__default.readFileSync(a,"utf-8")):null}catch{return null}}getAllCachedPages(e){try{let t=this.getCacheDir(e),r=[];if(!w__default.existsSync(t))return r;let s=w__default.readdirSync(t).filter(n=>n.startsWith("page-")&&n.endsWith(".json"));for(let n of s)try{let o=T.join(t,n),c=JSON.parse(w__default.readFileSync(o,"utf-8"));r.push(c);}catch{}return r.sort((n,o)=>n.pageNumber-o.pageNumber),r}catch{return []}}clearCache(e){try{let t=this.getCacheDir(e);w__default.existsSync(t)&&w__default.rmSync(t,{recursive:!0,force:!0});}catch{}}clearAllCache(){try{w__default.existsSync(this.cacheDir)&&w__default.rmSync(this.cacheDir,{recursive:!0,force:!0}),this.ensureCacheDir();}catch{}}getCacheStats(){try{if(!w__default.existsSync(this.cacheDir))return {totalCachedPdfs:0,totalCachedPages:0,totalCacheSize:0,cacheDir:this.cacheDir};let e=w__default.readdirSync(this.cacheDir),t=e.length,{totalCachedPages:r,totalCacheSize:a}=e.reduce((s,n)=>{let o=T.join(this.cacheDir,n);if(!w__default.statSync(o).isDirectory())return s;let c=w__default.readdirSync(o),i=c.filter(m=>m.startsWith("page-")&&m.endsWith(".json")),g=c.reduce((m,u)=>{let l=T.join(o,u);return m+w__default.statSync(l).size},0);return {totalCachedPages:s.totalCachedPages+i.length,totalCacheSize:s.totalCacheSize+g}},{totalCachedPages:0,totalCacheSize:0});return {totalCachedPdfs:t,totalCachedPages:r,totalCacheSize:a,cacheDir:this.cacheDir}}catch{return {totalCachedPdfs:0,totalCachedPages:0,totalCacheSize:0,cacheDir:this.cacheDir}}}};var M=class{textExtractor;imageExtractor;pageToImageConverter;formatProcessor;structuredDataGenerator;cacheManager;constructor(e){this.textExtractor=new W,this.imageExtractor=new D,this.pageToImageConverter=new q,this.formatProcessor=new N,this.structuredDataGenerator=new oe,this.cacheManager=new ie(e);}async extract(e,t={}){let r={pdfPath:e,outputDir:t.imageOutputDir||"./extracted-images",options:{extractText:true,extractImages:true,extractImageFiles:false,useImagePaths:false,imageRefFormat:"[IMAGE:{id}]",verbose:false,includePageMarkers:true,pageMarkerFormat:"--- PAGE {page} ---",...t}},a=this.validateConfiguration(r);if(a.length>0)throw this.createValidationError("Invalid configuration",a);try{if(!w__default.existsSync(e))throw new Error(`PDF file not found: ${e}`);let s=Date.now();this.reportProgress(r.options,{currentPage:0,totalPages:0,phase:"processing"});let n=null,o=null;if(r.options.extractText&&(r.options.verbose,n=await this.textExtractor.extract(e),r.options.includePageMarkers||r.options.includeImageRefs)){let l=r.options.pageMarkerFormat||"--- PAGE {page} ---",h={pageOffset:r.options.pageOffset||0,includeImageRefs:r.options.includeImageRefs??!1,imageRefFormat:r.options.imageRefFormat??"[IMG:{id}] {name}"};o=await this.textExtractor.extractWithPageMarkers(e,l,h);}let c=[];r.options.extractTextItems&&r.options.extractText&&(r.options.verbose,c=await this.textExtractor.extractTextItems(e,r.options));let i=null;r.options.extractImages&&(r.options.verbose,i=await this.imageExtractor.extract(e,r.options));let g=null,m=null;if(r.options.generatePageImages||r.options.generateThumbnails){let l=i?.totalPages||n?.numPages||0,f=r.options.pageNumbers||Array.from({length:l},(h,x)=>x+1);r.options.generatePageImages&&(g=await this.generatePageImagesWithVariants(e,f,r.options)),r.options.generateThumbnails&&(m=await this.generatePageThumbnails(e,f,r.options));}let u=await this.processResults(e,n,o,i,c,r.options,s,g,m);return this.reportProgress(r.options,{currentPage:u.document.pages,totalPages:u.document.pages,phase:"complete"}),u}catch(s){throw r.options.verbose,this.createExtractionError("PDF content extraction failed",s)}}async extractText(e,t={}){return (await this.extract(e,{...t,extractText:true,extractImages:false})).cleanText}async extractImages(e,t={}){return (await this.extract(e,{...t,extractText:false,extractImages:true})).images}async extractImageFiles(e,t="./extracted-images",r={}){return (await this.extract(e,{...r,extractImageFiles:true,imageOutputDir:t,useImagePaths:true})).images.filter(s=>s.filePath).map(s=>s.filePath)}validateConfiguration(e){return X(e)}async processResults(e,t,r,a,s,n,o,c,i){let g=T.basename(e),u=this.extractRawText(t?.text||""),l={document:{filename:g,pages:a?.totalPages||t?.numPages||0,textLength:t?.text?.length||0,extractedAt:new Date().toISOString(),metadata:t?.info||{},options:n},pages:[],images:a?.images||[],textItems:s,text:u,textWithRefs:"",cleanText:u};if(n.extractText&&n.extractImages&&t&&a)if(r?.text&&n.includeImageRefs)l.textWithRefs=r.text;else if(n.includeImageRefs){let f=r?.text||t.text;l.textWithRefs=this.formatProcessor.generateTextWithImageRefs(f,a.images,n.imageRefFormat||"[IMAGE:{id}]",l.document.pages);}else l.textWithRefs=r?.text||t.text;else n.extractText&&t?l.textWithRefs=r?.text||t.text:n.extractImages&&a&&(l.textWithRefs=this.formatProcessor.generateImageOnlyRefs(a.images,n.imageRefFormat||"[IMAGE:{id}]"));if(l.summary={totalPages:l.document.pages,totalTextItems:0,totalImages:l.images.length,totalTextLength:l.document.textLength,averageImagesPerPage:(l.images.length/l.document.pages).toFixed(2),pagesWithImages:new Set(l.images.map(f=>f.page)).size},n.generateStructuredData){let f=l.textWithRefs||l.cleanText;l.structuredData=this.structuredDataGenerator.generateStructuredData(g,f,l.images,l.document.pages,n,c,i),n.verbose;}return n.verbose,l}async getText(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractImages:false})).text}async getImages(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:false,extractImages:true})).images}async getTextItems(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractTextItems:true})).textItems}async getRawText(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractImages:false})).rawText}async getPage(e,t,r={}){if(r.useCache!==false){let m=this.cacheManager.getCachedPageResult(e,t);if(m)return r.verbose,m}let a={...r,specificPages:[t]},s=await this.extract(e,a),n=this.extractPageText(s.textWithRefs||s.cleanText,t),o=s.images.filter(m=>m.page===t),c=s.textItems?.filter(m=>m.page===t)||[],i=this.extractRawText(n),g={pageNumber:t,text:n,rawText:i,textItems:c,images:o,metadata:{wordCount:this.countWords(i),characterCount:i.length,imageCount:o.length}};return r.useCache!==false&&this.cacheManager.cachePageResult(e,t,g),g}extractPageText(e,t){let r=/(?:--- PAGE (\d+) ---|🎨 ART BASEL PAGE (\d+) 🎨|PAGE (\d+))/g,a=e.split(r);if(a.length>1){for(let i=1;i<a.length;i+=4)if(parseInt(a[i]||a[i+1]||a[i+2]||"0",10)===t)return a[i+3]||""}let s=e.split(`
|
|
36
|
+
`)})}createPageDataArray(e,t,r,a,s){return Array.from({length:r},(c,i)=>i).map(c=>{let i=c+1,g=e[c]||"",m=this.getImagesForPage(t,i),u=this.extractRawText(g),l={pageNumber:i,text:{content:g,rawText:u,wordCount:this.countWords(u),characterCount:u.length},images:m,imageCount:m.length};if(a&&a.has(i)&&(l.pageImage=a.get(i)),s&&s.has(i)&&(l.thumbnail=s.get(i)),a&&a.has(i)){let f=a.get(i);f.variants&&f.variants.length>0&&(l.pageImageVariants=f.variants);}return l})}getImagesForPage(e,t){return e.filter(r=>r.page===t).map(r=>{let a={id:r.id,name:r.name||`image_${r.id}`,position:r.position,format:r.format||"unknown"};if("filename"in r&&r.filename!==void 0&&(a.filename=r.filename),"path"in r){let s=r.path;s!==void 0&&(a.path=s);}if("filepath"in r&&r.filepath!==void 0&&(a.path=r.filepath),"filePath"in r){let s=r.filePath;s!==void 0&&(a.path=s);}return "size"in r&&r.size!==void 0&&(a.size=r.size),"width"in r&&r.width!==void 0&&(a.width=r.width),"height"in r&&r.height!==void 0&&(a.height=r.height),"mimeType"in r&&r.mimeType!==void 0&&(a.mimeType=r.mimeType),a})}countWords(e){return e.trim()?e.trim().split(/\s+/).length:0}generateJSONString(e,t=2){return JSON.stringify(e,null,t)}generateSummary(e){let t=e.pages.reduce((n,o)=>n+o.text.wordCount,0),r=e.pages.reduce((n,o)=>n+o.text.characterCount,0),a=e.pages.filter(n=>n.text.content.trim().length>0).length,s=e.pages.filter(n=>n.imageCount>0).length;return {totalWords:t,totalCharacters:r,averageWordsPerPage:Math.round(t/e.pages.length),averageImagesPerPage:Math.round(e.metadata.totalImages/e.pages.length*10)/10,pagesWithText:a,pagesWithImages:s}}};var ie=class{cacheDir;constructor(e="./tmp/pdf-cache"){this.cacheDir=e,this.ensureCacheDir();}generateCacheKey(e){let t=T.resolve(e),r=w__default.statSync(t),a=`${t}:${r.mtime.getTime()}:${r.size}`;return ft.createHash("md5").update(a).digest("hex")}getCacheDir(e){let t=this.generateCacheKey(e);return T.join(this.cacheDir,t)}ensureCacheDir(){w__default.existsSync(this.cacheDir)||w__default.mkdirSync(this.cacheDir,{recursive:true});}isCached(e){try{let t=this.getCacheDir(e),r=T.join(t,"cache-info.json");return w__default.existsSync(r)}catch{return false}}getCacheInfo(e){try{let t=this.getCacheDir(e),r=T.join(t,"cache-info.json");return w__default.existsSync(r)?JSON.parse(w__default.readFileSync(r,"utf-8")):null}catch{return null}}createCache(e,t){let r=this.getCacheDir(e);w__default.existsSync(r)||w__default.mkdirSync(r,{recursive:true});let a=w__default.statSync(e),s={pdfPath:T.resolve(e),lastModified:a.mtime.getTime(),totalPages:t,cacheDir:r,created:new Date().toISOString()},n=T.join(r,"cache-info.json");return w__default.writeFileSync(n,JSON.stringify(s,null,2)),r}cachePageResult(e,t,r){try{let a=this.getCacheDir(e),s=T.join(a,`page-${t}.json`);w__default.writeFileSync(s,JSON.stringify(r,null,2));}catch{}}getCachedPageResult(e,t){try{let r=this.getCacheDir(e),a=T.join(r,`page-${t}.json`);return w__default.existsSync(a)?JSON.parse(w__default.readFileSync(a,"utf-8")):null}catch{return null}}getAllCachedPages(e){try{let t=this.getCacheDir(e),r=[];if(!w__default.existsSync(t))return r;let s=w__default.readdirSync(t).filter(n=>n.startsWith("page-")&&n.endsWith(".json"));for(let n of s)try{let o=T.join(t,n),c=JSON.parse(w__default.readFileSync(o,"utf-8"));r.push(c);}catch{}return r.sort((n,o)=>n.pageNumber-o.pageNumber),r}catch{return []}}clearCache(e){try{let t=this.getCacheDir(e);w__default.existsSync(t)&&w__default.rmSync(t,{recursive:!0,force:!0});}catch{}}clearAllCache(){try{w__default.existsSync(this.cacheDir)&&w__default.rmSync(this.cacheDir,{recursive:!0,force:!0}),this.ensureCacheDir();}catch{}}getCacheStats(){try{if(!w__default.existsSync(this.cacheDir))return {totalCachedPdfs:0,totalCachedPages:0,totalCacheSize:0,cacheDir:this.cacheDir};let e=w__default.readdirSync(this.cacheDir),t=e.length,{totalCachedPages:r,totalCacheSize:a}=e.reduce((s,n)=>{let o=T.join(this.cacheDir,n);if(!w__default.statSync(o).isDirectory())return s;let c=w__default.readdirSync(o),i=c.filter(m=>m.startsWith("page-")&&m.endsWith(".json")),g=c.reduce((m,u)=>{let l=T.join(o,u);return m+w__default.statSync(l).size},0);return {totalCachedPages:s.totalCachedPages+i.length,totalCacheSize:s.totalCacheSize+g}},{totalCachedPages:0,totalCacheSize:0});return {totalCachedPdfs:t,totalCachedPages:r,totalCacheSize:a,cacheDir:this.cacheDir}}catch{return {totalCachedPdfs:0,totalCachedPages:0,totalCacheSize:0,cacheDir:this.cacheDir}}}};var M=class{textExtractor;imageExtractor;pageToImageConverter;formatProcessor;structuredDataGenerator;cacheManager;constructor(e){this.textExtractor=new W,this.imageExtractor=new D,this.pageToImageConverter=new q,this.formatProcessor=new N,this.structuredDataGenerator=new oe,this.cacheManager=new ie(e);}async extract(e,t={}){let r={pdfPath:e,outputDir:t.imageOutputDir||"./extracted-images",options:{extractText:true,extractImages:true,extractImageFiles:false,useImagePaths:false,imageRefFormat:"[IMAGE:{id}]",verbose:false,includePageMarkers:true,pageMarkerFormat:"--- PAGE {page} ---",...t}},a=this.validateConfiguration(r);if(a.length>0)throw this.createValidationError("Invalid configuration",a);try{if(!w__default.existsSync(e))throw new Error(`PDF file not found: ${e}`);let s=Date.now();this.reportProgress(r.options,{currentPage:0,totalPages:0,phase:"processing"});let n=null,o=null;if(r.options.extractText&&(r.options.verbose,n=await this.textExtractor.extract(e),r.options.includePageMarkers||r.options.includeImageRefs)){let l=r.options.pageMarkerFormat||"--- PAGE {page} ---",h={pageOffset:r.options.pageOffset||0,includeImageRefs:r.options.includeImageRefs??!1,imageRefFormat:r.options.imageRefFormat??"[IMG:{id}] {name}"};o=await this.textExtractor.extractWithPageMarkers(e,l,h);}let c=[];r.options.extractTextItems&&r.options.extractText&&(r.options.verbose,c=await this.textExtractor.extractTextItems(e,r.options));let i=null;r.options.extractImages&&(r.options.verbose,i=await this.imageExtractor.extract(e,r.options));let g=null,m=null;if(r.options.generatePageImages||r.options.generateThumbnails){let l=i?.totalPages||n?.numPages||0,f=r.options.pageNumbers||Array.from({length:l},(h,x)=>x+1);r.options.generatePageImages&&(g=await this.generatePageImagesWithVariants(e,f,r.options)),r.options.generateThumbnails&&(m=await this.generatePageThumbnails(e,f,r.options));}let u=await this.processResults(e,n,o,i,c,r.options,s,g,m);return this.reportProgress(r.options,{currentPage:u.document.pages,totalPages:u.document.pages,phase:"complete"}),u}catch(s){throw r.options.verbose,this.createExtractionError("PDF content extraction failed",s)}}async extractText(e,t={}){return (await this.extract(e,{...t,extractText:true,extractImages:false})).cleanText}async extractImages(e,t={}){return (await this.extract(e,{...t,extractText:false,extractImages:true})).images}async extractImageFiles(e,t="./extracted-images",r={}){return (await this.extract(e,{...r,extractImageFiles:true,imageOutputDir:t,useImagePaths:true})).images.filter(s=>s.filePath).map(s=>s.filePath)}validateConfiguration(e){return X(e)}async processResults(e,t,r,a,s,n,o,c,i){let g=T.basename(e),u=this.extractRawText(t?.text||""),l={document:{filename:g,pages:a?.totalPages||t?.numPages||0,textLength:t?.text?.length||0,extractedAt:new Date().toISOString(),metadata:t?.info||{},options:n},pages:[],images:a?.images||[],textItems:s,text:u,textWithRefs:"",cleanText:u};if(n.extractText&&n.extractImages&&t&&a)if(r?.text&&n.includeImageRefs)l.textWithRefs=r.text;else if(n.includeImageRefs){let f=r?.text||t.text;l.textWithRefs=this.formatProcessor.generateTextWithImageRefs(f,a.images,n.imageRefFormat||"[IMAGE:{id}]",l.document.pages);}else l.textWithRefs=r?.text||t.text;else n.extractText&&t?l.textWithRefs=r?.text||t.text:n.extractImages&&a&&(l.textWithRefs=this.formatProcessor.generateImageOnlyRefs(a.images,n.imageRefFormat||"[IMAGE:{id}]"));if(l.summary={totalPages:l.document.pages,totalTextItems:0,totalImages:l.images.length,totalTextLength:l.document.textLength,averageImagesPerPage:(l.images.length/l.document.pages).toFixed(2),pagesWithImages:new Set(l.images.map(f=>f.page)).size},n.generateStructuredData){let f=l.textWithRefs||l.cleanText;l.structuredData=this.structuredDataGenerator.generateStructuredData(g,f,l.images,l.document.pages,n,c,i),n.verbose;}return n.verbose,l}async getText(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractImages:false})).text}async getImages(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:false,extractImages:true})).images}async getTextItems(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractTextItems:true})).textItems}async getRawText(e,t,r={}){return (await this.getPage(e,t,{...r,extractText:true,extractImages:false})).rawText}async getPage(e,t,r={}){if(r.useCache!==false){let m=this.cacheManager.getCachedPageResult(e,t);if(m)return r.verbose,m}let a={...r,specificPages:[t]},s=await this.extract(e,a),n=this.extractPageText(s.textWithRefs||s.cleanText,t),o=s.images.filter(m=>m.page===t),c=s.textItems?.filter(m=>m.page===t)||[],i=this.extractRawText(n),g={pageNumber:t,text:n,rawText:i,textItems:c,images:o,metadata:{wordCount:this.countWords(i),characterCount:i.length,imageCount:o.length}};return r.useCache!==false&&this.cacheManager.cachePageResult(e,t,g),g}extractPageText(e,t){let r=/(?:--- PAGE (\d+) ---|🎨 ART BASEL PAGE (\d+) 🎨|PAGE (\d+))/g,a=e.split(r);if(a.length>1){for(let i=1;i<a.length;i+=4)if(parseInt(a[i]||a[i+1]||a[i+2]||"0",10)===t)return a[i+3]||""}let s=e.split(`
|
|
37
37
|
`),n=Math.ceil(s.length/t),o=(t-1)*n,c=Math.min(t*n,s.length);return s.slice(o,c).join(`
|
|
38
38
|
`)}countWords(e){return e.trim()?e.trim().split(/\s+/).length:0}extractRawText(e){let t=e;return t=t.replace(/--- PAGE \d+ ---\s*/g,""),t=t.replace(/🎨 ART BASEL PAGE \d+ 🎨\s*/g,""),t=t.replace(/PAGE \d+\s*/g,""),t=t.replace(/\[IMG:\w+\]\s*\w*\s*/g,""),t=t.replace(/\[IMG-\w+\]\s*[^[\n]*\s*/g,""),t=t.replace(/📷\s*[^-\n]*-\s*Page\s*\d+\s*-\s*Image\s*#\d+\s*/g,""),t=t.replace(/🎨\s*Art\s*Basel\s*Image\s*\d+\s*\(Page\s*\d+\)\s*/g,""),t=t.replace(/\n\s*\n\s*\n/g,`
|
|
39
39
|
|
package/package.json
CHANGED