npm - scribe.js-ocr - Versions diffs - 0.7.3 → 0.8.0 - Mend

scribe.js-ocr 0.7.3 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52)

package/cli/scribe.js +2 -0
package/fonts/all/Carlito-BoldItalic.woff +0 -0
package/fonts/all/Century-BoldItalic.woff +0 -0
package/fonts/all/Garamond-BoldItalic.woff +0 -0
package/fonts/all/NimbusMono-BoldItalic.woff +0 -0
package/fonts/all/NimbusRoman-BoldItalic.woff +0 -0
package/fonts/all/NimbusSans-BoldItalic.woff +0 -0
package/fonts/all/Palatino-BoldItalic.woff +0 -0
package/fonts/latin/Carlito-BoldItalic.woff +0 -0
package/fonts/latin/Century-BoldItalic.woff +0 -0
package/fonts/latin/Garamond-BoldItalic.woff +0 -0
package/fonts/latin/NimbusMono-BoldItalic.woff +0 -0
package/fonts/latin/NimbusRoman-BoldItalic.woff +0 -0
package/fonts/latin/NimbusSans-BoldItalic.woff +0 -0
package/fonts/latin/Palatino-BoldItalic.woff +0 -0
package/js/clear.js +5 -6
package/js/containers/app.js +1 -1
package/js/containers/dataContainer.js +0 -3
package/js/containers/fontContainer.js +91 -77
package/js/export/export.js +20 -5
package/js/export/writeHocr.js +20 -18
package/js/export/writeHtml.js +1 -1
package/js/export/writePdf.js +52 -14
package/js/export/writePdfFonts.js +11 -9
package/js/export/writeTabular.js +2 -2
package/js/export/writeText.js +10 -6
package/js/extractTables.js +5 -5
package/js/fontContainerMain.js +92 -49
package/js/fontEval.js +12 -12
package/js/fontStatistics.js +93 -92
package/js/fontSupp.js +20 -20
package/js/generalWorkerMain.js +4 -0
package/js/global.d.ts +39 -4
package/js/import/convertPageAbbyy.js +55 -26
package/js/import/convertPageBlocks.js +2 -2
package/js/import/convertPageHocr.js +10 -20
package/js/import/convertPageShared.js +13 -9
package/js/import/convertPageStext.js +67 -32
package/js/import/import.js +89 -45
package/js/import/importOCR.js +27 -33
package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
package/js/objects/layoutObjects.js +37 -0
package/js/objects/ocrObjects.js +55 -19
package/js/recognizeConvert.js +21 -8
package/js/utils/fontUtils.js +11 -11
package/js/utils/miscUtils.js +43 -6
package/js/worker/compareOCRModule.js +20 -23
package/js/worker/generalWorker.js +5 -5
package/js/worker/optimizeFontModule.js +19 -19
package/mupdf/libmupdf.js +123 -17
package/mupdf/libmupdf.wasm +0 -0
package/package.json +6 -3

package/js/export/writePdf.js CHANGED Viewed

@@ -10,6 +10,7 @@ import { createEmbeddedFontType0, createEmbeddedFontType1 } from './writePdfFont
 import { opt } from '../containers/app.js';
 import { pageMetricsArr } from '../containers/dataContainer.js';
 import ocr from '../objects/ocrObjects.js';
+import { getStyleLookup } from '../utils/miscUtils.js';
 /**
  * @param {number} x
@@ -97,6 +98,7 @@ export async function writePdf(hocrArr, minpage = 0, maxpage = -1, textMode = 'e
       normal: useOpt && FontCont.opt?.[familyKeyI]?.normal ? FontCont.opt[familyKeyI].normal : FontCont.raw[familyKeyI].normal,
       italic: useOpt && FontCont.opt?.[familyKeyI]?.italic ? FontCont.opt[familyKeyI].italic : FontCont.raw[familyKeyI].italic,
       bold: useOpt && FontCont.opt?.[familyKeyI]?.bold ? FontCont.opt[familyKeyI].bold : FontCont.raw[familyKeyI].bold,
+      boldItalic: useOpt && FontCont.opt?.[familyKeyI]?.boldItalic ? FontCont.opt[familyKeyI].boldItalic : FontCont.raw[familyKeyI].boldItalic,
     };
     await addFamilyObj(familyKeyI, familyObjI);
   }
@@ -301,6 +303,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
   const pdfFontsUsed = new Set();
+  const underlines = /** @type {Array<{left: number, right: number, top: number, height: number, fontSize: number, bold: boolean}>} */ ([]);
   // Start 1st object: Text Content
   let textContentObjStr = '';
@@ -349,7 +353,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
     let wordFontOpentype = (wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype);
     if (!wordFontOpentype) {
-      const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
+      const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${getStyleLookup(wordJ.style)})`;
       console.log(`Skipping word due to missing font (${fontNameMessage})`);
       continue;
     }
@@ -359,7 +363,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
     let wordFontSize = word0Metrics.fontSize;
     // Set font and font size
-    const pdfFontCurrent = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
+    const pdfFontCurrent = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][getStyleLookup(wordJ.style)];
     pdfFontNameCurrent = pdfFontCurrent.name;
     pdfFontTypeCurrent = pdfFontCurrent.type;
     pdfFontsUsed.add(pdfFontCurrent);
@@ -372,7 +376,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
     const word0LeftBearing = wordJ.visualCoords ? word0Metrics.leftSideBearing : 0;
     let tz = 100;
-    if (wordJ.dropcap) {
+    if (wordJ.style.dropcap) {
       const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
       tz = (wordWidthActual / word0Metrics.visualWidth) * 100;
     }
@@ -406,6 +410,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
     let spacingAdj = 0;
     let kernSpacing = false;
     let wordLast = wordJ;
+    let underlineLeft = /** @type {?number} */ null;
+    let underlineRight = /** @type {?number} */ null;
     let wordFontOpentypeLast = wordFontOpentype;
     let fontSizeLast = wordFontSize;
     let tsCurrent = 0;
@@ -426,7 +432,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
       wordFontOpentype = wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype;
       if (!wordFontOpentype) {
-        const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
+        const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${getStyleLookup(wordJ.style)})`;
         console.log(`Skipping word due to missing font (${fontNameMessage})`);
         continue;
       }
@@ -446,11 +452,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
         fillColor = wordJ.matchTruth ? '0 1 0.5 rg' : '1 0 0 rg';
       }
-      const angleAdjWord = wordJ.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
+      const angleAdjWord = wordJ.style.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
       const angleAdjWordX = (rotateBackground && Math.abs(angle ?? 0) > 0.05) ? angleAdjWord.x : 0;
       let ts = 0;
-      if (wordJ.sup || wordJ.dropcap) {
+      if (wordJ.style.sup || wordJ.style.dropcap) {
         ts = (lineObj.bbox.bottom + lineObj.baseline[1] + angleAdjLine.y) - (wordJ.bbox.bottom + angleAdjLine.y + angleAdjWord.y);
         if (!wordJ.visualCoords) {
           const fontDesc = wordFont.opentype.descender / wordFont.opentype.unitsPerEm * wordMetrics.fontSize;
@@ -462,12 +468,12 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
       // TODO: This probably fails for Chinese, rethink.
       tz = 100;
-      if (wordJ.dropcap) {
+      if (wordJ.style.dropcap) {
         const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
         tz = (wordWidthActual / wordMetrics.visualWidth) * 100;
       }
-      const pdfFont = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
+      const pdfFont = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][getStyleLookup(wordJ.style)];
       const pdfFontName = pdfFont.name;
       const pdfFontType = pdfFont.type;
       pdfFontsUsed.add(pdfFont);
@@ -480,7 +486,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
         // The space between words determined by:
         // (1) The right bearing of the last word, (2) the left bearing of the current word, (3) the width of the space character between words,
         // (4) the current character spacing value (applied twice--both before and after the space character).
-        const spaceWidthGlyph = wordFontOpentypeLast.charToGlyph(' ').advanceWidth * (fontSizeLast / wordFontOpentypeLast.unitsPerEm);
+        const spaceAdvance = wordFontOpentypeLast.charToGlyph(' ').advanceWidth || wordFontOpentypeLast.unitsPerEm / 2;
+        const spaceWidthGlyph = spaceAdvance * (fontSizeLast / wordFontOpentypeLast.unitsPerEm);
         const wordSpaceExpectedPx = (spaceWidthGlyph + charSpacingLast * 2 + wordRightBearingLast) + wordLeftBearing;
@@ -503,10 +510,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
       // However, this assumption does not hold for single-character words, as there is no space between character to adjust.
       // Therefore, we calculate the difference between the rendered and actual word and apply an adjustment to the width of the next space.
       // (This does not apply to drop caps as those have horizontal scaling applied to exactly match the image.)
-      if (charArr.length === 1 && !wordJ.dropcap) {
+      if (charArr.length === 1 && !wordJ.style.dropcap) {
         const wordLastGlyph = wordFontOpentype.charToGlyph(charArr.at(-1));
         const wordLastGlyphMetrics = wordLastGlyph.getMetrics();
-        const lastCharWidth = (wordLast.visualCoords ? (wordLastGlyphMetrics.xMax - wordLastGlyphMetrics.xMin) : wordLastGlyph.advanceWidth) * (wordFontSize / wordFontOpentype.unitsPerEm);
+        const lastCharAdvance = wordLast.visualCoords ? (wordLastGlyphMetrics.xMax - wordLastGlyphMetrics.xMin) : wordLastGlyph.advanceWidth || wordFontOpentype.unitsPerEm / 2;
+        const lastCharWidth = lastCharAdvance * (wordFontSize / wordFontOpentype.unitsPerEm);
         spacingAdj = wordWidthAdj - lastCharWidth - angleAdjWordX;
       } else {
         spacingAdj = 0 - angleAdjWordX;
@@ -514,7 +522,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
       textContentObjStr += ' ] TJ\n';
-      const fontSize = wordJ.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
+      const fontSize = wordJ.style.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
       if (pdfFontName !== pdfFontNameCurrent || fontSize !== fontSizeLast) {
         textContentObjStr += `${pdfFontName} ${String(fontSize)} Tf\n`;
         pdfFontNameCurrent = pdfFontName;
@@ -541,8 +549,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
       // Non-ASCII and special characters are encoded/escaped using winEncodingLookup
       for (let k = 0; k < charArr.length; k++) {
         const letterSrc = charArr[k];
-        const letter = wordJ.smallCaps ? charArr[k].toUpperCase() : charArr[k];
-        const fontSizeLetter = wordJ.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
+        const letter = wordJ.style.smallCaps ? charArr[k].toUpperCase() : charArr[k];
+        const fontSizeLetter = wordJ.style.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
         const letterEnc = pdfFontTypeCurrent === 0 ? wordFontOpentype.charToGlyphIndex(letter)?.toString(16).padStart(4, '0') : winEncodingLookup[letter];
         if (letterEnc) {
@@ -611,6 +619,28 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
         }
       }
+      if (wordJ.style.underline && underlineLeft === null) {
+        underlineLeft = wordJ.bbox.left;
+      }
+      if (wordJ.style.underline) {
+        underlineRight = wordJ.bbox.right;
+      }
+      if (underlineLeft !== null && (!wordJ.style.underline || j === words.length - 1)) {
+        underlines.push({
+          left: underlineLeft,
+          right: underlineRight,
+          top: lineTopAdj,
+          height: lineObj.bbox.bottom - lineObj.bbox.top,
+          fontSize: wordFontSize,
+          bold: wordJ.style.bold,
+        });
+        underlineLeft = null;
+        underlineRight = null;
+      }
       wordLast = wordJ;
       wordRightBearingLast = wordLast.visualCoords ? wordMetrics.rightSideBearing : 0;
       wordFontOpentypeLast = wordFontOpentype;
@@ -622,5 +652,13 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
   textContentObjStr += 'ET';
+  // Add underlines
+  underlines.forEach((underline) => {
+    const underlineThickness = underline.bold ? Math.ceil(underline.fontSize / 12) : Math.ceil(underline.fontSize / 24);
+    const underlineOffset = Math.ceil(underline.fontSize / 12) + underlineThickness;
+    textContentObjStr += `\n${String(underline.left)} ${String(outputDims.height - underline.top - underlineOffset)} ${String(underline.right - underline.left)} ${underlineThickness} re\nf\n`;
+  });
   return { textContentObjStr, pdfFontsUsed };
 }

package/js/export/writePdfFonts.js CHANGED Viewed

@@ -108,12 +108,12 @@ const generateFontFlags = (serif, italic, smallcap, symbolic) => { /* eslint-dis
  *
  * @param {opentype.Font} font - Opentype.js font object
  * @param {number} objIndex - Index for font descriptor PDF object
- * @param {('normal'|'italic'|'bold')} style - Style of the font
+ * @param {boolean} italic
  * @param {?number} embeddedObjIndex - Index for embedded font file PDF object.
  *  If not provided, the font will not be embedded in the PDF.
  * @returns {string} The font descriptor object string.
  */
-function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex = null) {
+function createFontDescriptor(font, objIndex, italic, embeddedObjIndex = null) {
   let objOut = `${String(objIndex)} 0 obj\n<</Type/FontDescriptor`;
   const namesTable = font.names.windows || font.names;
@@ -155,7 +155,7 @@ function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex
   // Symbolic is always set to false, even if the font contains glyphs outside the Adobe standard Latin character set.
   // This is because symbolic fonts are only used when embedded, and this does not appear to matter for embedded fonts.
-  objOut += `/Flags ${String(generateFontFlags(serif, style === 'italic', false, false))}`;
+  objOut += `/Flags ${String(generateFontFlags(serif, italic, false, false))}`;
   if (embeddedObjIndex === null || embeddedObjIndex === undefined) {
     objOut += '>>\nendobj\n\n';
@@ -175,12 +175,12 @@ function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex
  *
  * @param {opentype.Font} font - Opentype.js font object
  * @param {number} firstObjIndex - Index for the first PDF object
- * @param {('normal'|'italic'|'bold')} style - Style of the font
+ * @param {boolean} [italic=false] - Whether the font is italic.
  * @param {boolean} [isStandardFont=false] - Whether the font is a standard font.
  *  Standard fonts are not embedded in the PDF.
  * @returns {Array<string>}
  */
-export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', isStandardFont = false) {
+export function createEmbeddedFontType1(font, firstObjIndex, italic = false, isStandardFont = false) {
   // Start 1st object: Font Dictionary
   let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Font/Subtype/Type1`;
@@ -193,7 +193,8 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
   fontDictObjStr += '/Widths[';
   for (let i = 0; i < win1252Chars.length; i++) {
-    const advanceNorm = Math.round(font.charToGlyph(win1252Chars[i]).advanceWidth * (1000 / font.unitsPerEm));
+    const advance = font.charToGlyph(win1252Chars[i]).advanceWidth || font.unitsPerEm;
+    const advanceNorm = Math.round(advance * (1000 / font.unitsPerEm));
     fontDictObjStr += `${String(advanceNorm)} `;
   }
   fontDictObjStr += ']/FirstChar 32/LastChar 255';
@@ -201,7 +202,7 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
   fontDictObjStr += `/FontDescriptor ${String(firstObjIndex + 1)} 0 R>>\nendobj\n\n`;
   // Start 2nd object: Font Descriptor
-  const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, style, isStandardFont ? null : firstObjIndex + 2);
+  const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, italic, isStandardFont ? null : firstObjIndex + 2);
   // objOut += `${String(firstObjIndex + 1)} 0 obj\n<</Type/FontDescriptor`;
@@ -249,13 +250,14 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
  *
  * @param {opentype.Font} font - Opentype.js font object
  * @param {number} firstObjIndex - Index for the first PDF object
+ * @param {boolean} [italic=false] - Whether the font is italic.
  *
  * This function does not produce "toUnicode" or "Widths" objects,
  * so any PDF it creates directly will lack usable copy/paste.
  * However, both of these objects will be created from the embedded file
  * when the result is run through mupdf.
  */
-export function createEmbeddedFontType0(font, firstObjIndex, style = 'normal') {
+export function createEmbeddedFontType0(font, firstObjIndex, italic = false) {
   // Start 1st object: Font Dictionary
   let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Font/Subtype/Type0`;
@@ -282,7 +284,7 @@ export function createEmbeddedFontType0(font, firstObjIndex, style = 'normal') {
   toUnicodeStr += '\nendstream\nendobj\n\n';
   // Start 3rd object: FontDescriptor
-  const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, style, firstObjIndex + 3);
+  const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, italic, firstObjIndex + 3);
   // objOut += `${String(firstObjIndex + 2)} 0 obj\n`;

package/js/export/writeTabular.js CHANGED Viewed

@@ -86,9 +86,9 @@ function createCellsSingle(ocrTableWords, extraCols = [], startRow = 0, xlsxMode
         if (xlsxMode) {
           let fontStyle;
-          if (wordObj.style === 'italic') {
+          if (wordObj.style.italic) {
             fontStyle = '<i/>';
-          } else if (wordObj.smallCaps) {
+          } else if (wordObj.style.smallCaps) {
             fontStyle = '<smallCaps/>';
           } else {
             fontStyle = '';

package/js/export/writeText.js CHANGED Viewed

@@ -54,17 +54,21 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
         if (docxMode) {
           let fontStyle = '';
-          if (wordObj.style === 'italic') {
+          if (wordObj.style.italic) {
             fontStyle += '<w:i/>';
-          } else if (wordObj.style === 'bold') {
+          } else if (wordObj.style.bold) {
             fontStyle += '<w:b/>';
           }
-          if (wordObj.smallCaps) {
+          if (wordObj.style.smallCaps) {
             fontStyle += '<w:smallCaps/>';
           }
-          if (wordObj.sup) {
+          if (wordObj.style.underline) {
+            fontStyle += '<w:u w:val="single"/>';
+          }
+          if (wordObj.style.sup) {
             fontStyle += '<w:vertAlign w:val="superscript"/>';
           }
@@ -79,7 +83,7 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
             } else if (supPrev) {
               textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve"> `;
             // If this word is a superscript, no space is added between words.
-            } else if (wordObj.sup && i > 0) {
+            } else if (wordObj.style.sup && i > 0) {
               textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
             } else {
               textStr = `${textStr} </w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
@@ -89,7 +93,7 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
           }
           fontStylePrev = fontStyle;
-          supPrev = wordObj.sup;
+          supPrev = wordObj.style.sup;
         } else if (newLine) {
           textStr = `${textStr}\n`;
         } else if (h > 0 || g > 0 || i > 0) {

package/js/extractTables.js CHANGED Viewed

@@ -22,11 +22,11 @@ export function extractTableContent(pageObj, layoutObj) {
 // TODO: This currently creates junk rows with only punctuation, as those bounding boxes are so small they often do not overlap with other lines.
 /**
-   * Extracts words from a page that are within the bounding boxes of the table, organized into arrays of rows and columns.
-   * The output is in the form of a 3D array, where the first dimension is the row, the second dimension is the column, and the third dimension is the word.
-   * @param {OcrPage} pageObj
-   * @param {Array<import('./objects/layoutObjects.js').LayoutBoxBase>} boxes
-   */
+ * Extracts words from a page that are within the bounding boxes of the table, organized into arrays of rows and columns.
+ * The output is in the form of a 3D array, where the first dimension is the row, the second dimension is the column, and the third dimension is the word.
+ * @param {OcrPage} pageObj
+ * @param {Array<import('./objects/layoutObjects.js').LayoutBoxBase>} boxes
+ */
 export function extractSingleTableContent(pageObj, boxes) {
   /** @type {Array<OcrWord>} */
   const wordArr = [];