npm - scribe.js-ocr - Versions diffs - 0.7.2 → 0.7.3 - Mend

scribe.js-ocr 0.7.2 → 0.7.3

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (7)

package/cli/cli.js CHANGED Viewed

@@ -1,3 +1,4 @@
+import { detectPDFType } from './detectPDFType.js';
 import { extract } from './extract.js';
 import {
   check,
@@ -36,7 +37,7 @@ export const evalInternalCLI = async (pdfFile, ocrFile, options) => {
  * @param {string} pdfFile - Path to PDF file.
  * @param {?string} [outputDir='.'] - Output directory.
  * @param {Object} [options]
- * @param {'txt'} [options.format]
+ * @param {"pdf" | "hocr" | "docx" | "xlsx" | "txt" | "text" | "html"} [options.format]
  * @param {boolean} [options.reflow]
  */
 export const extractCLI = async (pdfFile, outputDir, options) => {
@@ -44,6 +45,16 @@ export const extractCLI = async (pdfFile, outputDir, options) => {
   process.exitCode = 0;
 };
+/**
+ *
+ * @param {string} pdfFile - Path to PDF file.
+ * @param {string} [outputPath] - Output file path.
+ */
+export const detectPDFTypeCLI = async (pdfFile, outputPath) => {
+  await detectPDFType(pdfFile, outputPath);
+  process.exitCode = 0;
+};
 /**
  *
  * @param {string} pdfFile - Path to PDF file.

package/cli/detectPDFType.js CHANGED Viewed

@@ -1,20 +1,17 @@
-// Code for adding visualization to OCR output
 import fs from 'fs';
-import { createRequire } from 'module';
-import Worker from 'web-worker';
-import { initMuPDFWorker } from '../mupdf/mupdf-async.js';
-globalThis.Worker = Worker;
-globalThis.require = createRequire(import.meta.url);
+import scribe from '../scribe.js';
-const args = process.argv.slice(2);
+/**
+ *
+ * @param {string} pdfFile - Path to PDF file.
+ * @param {string} [outputPath] - Output file path.
+ *    If provided, the text will be extracted and saved to this path.
+ */
+export const detectPDFType = async (pdfFile, outputPath) => {
+  const mupdfScheduler = await scribe.data.image.getMuPDFScheduler(1);
+  const w = mupdfScheduler.workers[0];
-async function main() {
-  const w = await initMuPDFWorker();
-  const fileData = await fs.readFileSync(args[0]);
-  const outputPath = args[1];
+  const fileData = await fs.readFileSync(pdfFile);
   const pdfDoc = await w.openDocument(fileData, 'file.pdf');
   w.pdfDoc = pdfDoc;
@@ -32,10 +29,6 @@ async function main() {
   console.log('PDF Type:', type);
-  // Terminate all workers
-  w.terminate();
-  process.exitCode = 0;
-}
+  mupdfScheduler.scheduler.terminate();
-main();
+};

package/cli/extract.js CHANGED Viewed

@@ -7,7 +7,7 @@ import scribe from '../scribe.js';
  * @param {string} pdfFile - Path to PDF file.
  * @param {?string} [output='.'] - Output file or directory.
  * @param {Object} [options]
- * @param {'txt'} [options.format]
+ * @param {Parameters<typeof scribe.download>[0]} [options.format]
  * @param {boolean} [options.reflow]
  */
 export const extract = async (pdfFile, output, options) => {
@@ -18,7 +18,9 @@ export const extract = async (pdfFile, output, options) => {
   const outputFile = outputDir === output ? `${path.basename(pdfFile).replace(/\.\w{1,5}$/i, `.${format}`)}` : path.basename(output);
   const outputPath = `${outputDir}/${outputFile}`;
-  scribe.setOptions({ reflow: true, extractText: true });
+  scribe.opt.reflow = true;
+  scribe.opt.extractText = true;
   await scribe.init();
   await scribe.importFiles([pdfFile]);

package/cli/scribe.js CHANGED Viewed

@@ -4,6 +4,7 @@ import {
   checkCLI,
   confCLI,
   debugCLI,
+  detectPDFTypeCLI,
   evalInternalCLI, extractCLI, overlayCLI, recognizeCLI,
 } from './cli.js';
@@ -35,7 +36,7 @@ program
   .command('extract')
   .argument('<pdf_file>', 'Input PDF file.')
   .argument('[output]', 'Output directory or file to save results.', '.')
-  .addOption(new Option('-f, --format <ext>', 'Output format.').choices(['txt']).default('txt'))
+  .addOption(new Option('-f, --format <ext>', 'Output format.').choices(['pdf', 'hocr', 'docx', 'xlsx', 'txt', 'text', 'html']).default('txt'))
   .option('-r, --reflow', 'Reflow text by combining lines into paragraphs.')
   .description('Extract text from PDF file and save in requested format.')
   .action(extractCLI);
@@ -61,6 +62,13 @@ program
   .option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
   .action(recognizeCLI);
+program
+  .command('type')
+  .argument('<pdf_file>', 'Input PDF file.')
+  .argument('[output]', 'Output file path to save text.')
+  .description('Detect PDF file type (\'Text native\', \'Image + OCR text\', or \'Image native\').')
+  .action(detectPDFTypeCLI);
 program
   .command('debug')
   .argument('<pdf_file>', 'Input PDF file.')

package/js/export/export.js CHANGED Viewed

@@ -207,7 +207,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
 /**
  * Runs `exportData` and saves the result as a download (browser) or local file (Node.js).
  * @public
- * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'} format
+ * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'|'html'} format
  * @param {string} fileName
  * @param {number} [minPage=0] - First page to export.
  * @param {number} [maxPage=-1] - Last page to export (inclusive). -1 exports through the last page.

package/js/export/writeHtml.js CHANGED Viewed

@@ -5,6 +5,31 @@ import { assignParagraphs } from '../utils/reflowPars.js';
 import { pageMetricsArr } from '../containers/dataContainer.js';
 import ocr from '../objects/ocrObjects.js';
+/**
+ * Calculate the font metrics for a given font and font size.
+ * This is used to get metrics that match `ctx.measureText`, but without requiring a canvas.
+ * @param {FontContainerFont} fontI
+ * @param {number} fontSize
+ */
+const calcFontMetrics = (fontI, fontSize) => {
+  const os2 = fontI.opentype.tables.os2;
+  const unitsPerEm = fontI.opentype.unitsPerEm;
+  // Bit 7: Use_Typo_Metrics (1 = Yes)
+  // eslint-disable-next-line no-bitwise
+  if (os2.fsSelection >> 7 & 1) {
+    return {
+      fontBoundingBoxAscent: Math.round(os2.sTypoAscender * (fontSize / unitsPerEm)),
+      fontBoundingBoxDescent: Math.round(os2.sTypoDescender * (fontSize / unitsPerEm)),
+    };
+  }
+  return {
+    fontBoundingBoxAscent: Math.round(os2.usWinAscent * (fontSize / unitsPerEm)),
+    fontBoundingBoxDescent: Math.round(os2.usWinDescent * (fontSize / unitsPerEm)),
+  };
+};
 /**
  *
  * @param {string} text
@@ -33,18 +58,11 @@ const makeSmallCapsDivs = (text, fontSizeHTMLSmallCaps) => {
  *    If omitted, all words are included.
  */
 export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, removeMargins = false, wordIds = null) {
-  if (!(typeof process === 'undefined')) {
-    throw new Error('HTML exports are not supported in Node.js');
-  }
-  const canvas = new OffscreenCanvas(1, 1);
-  const ctx = /** @type {OffscreenCanvasRenderingContext2D} */ (canvas.getContext('2d'));
   const fontsUsed = new Set();
   const pad = 5;
-  let bodyStr = '<body>';
+  let bodyStr = '<body>\n';
   if (maxpage === -1) maxpage = ocrCurrent.length - 1;
@@ -71,7 +89,7 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
       }
     }
-    bodyStr += `<div class="scribe-page" id="page${g}" style="position:absolute;top:${top}px;">`;
+    bodyStr += `  <div class="scribe-page" id="page${g}" style="position:absolute;top:${top}px;">\n`;
     if (removeMargins) {
       top += Math.min((maxBottom - minTop) + 200, pageMetricsArr[g].dims.height + 10);
     } else {
@@ -130,9 +148,7 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
         const fontSizeHTML = fontSize * scale;
-        ctx.font = `${fontI.fontFaceStyle} ${fontI.fontFaceWeight} ${fontSizeHTML}px ${fontI.fontFaceName}`;
-        const metrics = ctx.measureText(wordStr);
+        const metrics = calcFontMetrics(fontI, fontSizeHTML);
         const fontSizeHTMLSmallCaps = fontSize * scale * fontI.smallCapsMult;
@@ -174,29 +190,29 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
         // Line height must match the height of the font bounding box for the font metrics to be accurate.
         styleStr += `line-height:${metrics.fontBoundingBoxAscent + metrics.fontBoundingBoxDescent}px;`;
-        bodyStr += `<span class="scribe-word" id="${wordObj.id}" style="${styleStr}">${innerHTML}</span>`;
+        bodyStr += `    <span class="scribe-word" id="${wordObj.id}" style="${styleStr}">${innerHTML}</span>`;
       }
     }
-    bodyStr += '</div>';
+    bodyStr += '\n  </div>\n';
     opt.progressHandler({ n: g, type: 'export', info: { } });
   }
-  let styleStr = '<style>.scribe-word {';
+  let styleStr = '<style>\n  .scribe-word {\n';
-  styleStr += 'position:absolute;';
-  styleStr += `padding-left:${pad}px;`;
-  styleStr += `padding-right:${pad}px;`;
-  styleStr += 'z-index:1;';
-  styleStr += 'white-space:nowrap;';
+  styleStr += '    position:absolute;\n';
+  styleStr += `    padding-left:${pad}px;\n`;
+  styleStr += `    padding-right:${pad}px;\n`;
+  styleStr += '    z-index:1;\n';
+  styleStr += '    white-space:nowrap;\n';
   if (opt.kerning) {
-    styleStr += 'font-kerning:normal;';
+    styleStr += '    font-kerning:normal;\n';
   } else {
-    styleStr += 'font-kerning:none;';
+    styleStr += '    font-kerning:none;\n';
   }
-  styleStr += '}';
+  styleStr += '  }\n';
   for (const fontI of fontsUsed) {
     const cdnPath = 'https://cdn.jsdelivr.net/npm/scribe.js-ocr@0.7.1/fonts/all/';
@@ -205,19 +221,19 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
     const fontName = `${fontI.family}-${styleTitleCase}.woff`;
     const fontPath = cdnPath + fontName;
-    styleStr += `@font-face {
+    styleStr += `  @font-face {
     font-family: '${fontI.fontFaceName}';
     font-style: ${fontI.fontFaceStyle};
     font-weight: ${fontI.fontFaceWeight};
     src: url('${fontPath}');
-}\n`;
+  }\n`;
   }
-  styleStr += '</style>';
+  styleStr += '</style>\n';
-  bodyStr += '</body>';
+  bodyStr += '</body>\n';
-  const htmlStr = `<html><head>${styleStr}</head>${bodyStr}</html>`;
+  const htmlStr = `<html>\n<head>\n${styleStr}</head>\n${bodyStr}</html>`;
   return htmlStr;
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "scribe.js-ocr",
-  "version": "0.7.2",
+  "version": "0.7.3",
   "description": "High-quality OCR and text extraction for images and PDFs.",
   "main": "scribe.js",
   "directories": {