npm - @turntrout/subfont - Versions diffs - 1.3.0 → 1.3.2 - Mend

@turntrout/subfont 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/lib/collectTextsByPage.js +3 -1
package/lib/extractVisibleText.js +115 -44
package/lib/getFontInfo.js +3 -3
package/lib/subsetFontWithGlyphs.js +109 -47
package/package.json +1 -1

package/lib/collectTextsByPage.js CHANGED Viewed

@@ -231,7 +231,7 @@ function findFontFamiliesWithFeatureSettings(
       if (recorded === true) {
         result = true;
-      } else {
+      } else if (result !== true) {
         if (!result) result = new Set();
         for (const family of recorded) {
           result.add(family.toLowerCase());
@@ -1214,3 +1214,5 @@ module.exports = collectTextsByPage;
 // Exported for testing only
 module.exports._extractFeatureTagsFromDecl = extractFeatureTagsFromDecl;
 module.exports._resolveFeatureSettings = resolveFeatureSettings;
+module.exports._findFontFamiliesWithFeatureSettings =
+  findFontFamiliesWithFeatureSettings;

package/lib/extractVisibleText.js CHANGED Viewed

@@ -1,5 +1,3 @@
-const parse5 = require('parse5');
 const INVISIBLE_ELEMENTS = new Set([
   'script',
   'style',
@@ -12,63 +10,136 @@ const INVISIBLE_ELEMENTS = new Set([
   'embed',
   'datalist',
 ]);
-const TEXT_ATTRIBUTES = new Set([
-  'alt',
-  'title',
-  'placeholder',
-  'value',
-  'aria-label',
-]);
+// Build a regex that strips invisible element blocks (non-greedy, case-insensitive).
+// For void elements like <embed> there is no closing tag — just the opening
+// tag is stripped (which the tag-stripping regex below handles).
+const invisibleBlockTags = [...INVISIBLE_ELEMENTS].filter((t) => t !== 'embed');
+const invisibleBlockRe = new RegExp(
+  `<(${invisibleBlockTags.join('|')})\\b[^>]*>[\\s\\S]*?<\\/\\1\\s*>`,
+  'gi'
+);
+const commentRe = /<!--[\s\S]*?-->/g;
+// Match text-bearing attributes: alt="...", title='...', placeholder=..., etc.
+// Captures the attribute name (group 1) and the value (groups 2, 3, or 4 for
+// double-quoted, single-quoted, and unquoted respectively).
+// Negative lookbehind prevents matching data- prefixed attributes (e.g. data-alt).
+const attrRe =
+  /(?<![-\w])(alt|title|placeholder|value|aria-label)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]*))/gi;
+// Match <input ... type="hidden" ...> or <input ... type=hidden ...>
+// \b only after the unquoted alternative — quotes already delimit the value.
+const hiddenInputRe =
+  /<input\b[^>]*?\btype\s*=\s*(?:"hidden"|'hidden'|hidden\b)[^>]*/gi;
+const tagRe = /<[^>]+>/g;
+// Named and numeric HTML entity decoder.  Covers the XML built-ins plus
+// typographic entities commonly found in blog/article content.  Entities
+// not listed here are left as-is: the literal characters of the reference
+// (`&`, the name, `;`) enter the subset, but the character the entity
+// stands for does not, so add an entry here if its glyph is needed.
+const namedEntities = {
+  amp: '&',
+  lt: '<',
+  gt: '>',
+  quot: '"',
+  apos: "'",
+  nbsp: '\u00A0',
+  // Typographic quotes & dashes
+  ldquo: '\u201C',
+  rdquo: '\u201D',
+  lsquo: '\u2018',
+  rsquo: '\u2019',
+  mdash: '\u2014',
+  ndash: '\u2013',
+  hellip: '\u2026',
+  laquo: '\u00AB',
+  raquo: '\u00BB',
+  // Common symbols
+  bull: '\u2022',
+  middot: '\u00B7',
+  copy: '\u00A9',
+  reg: '\u00AE',
+  trade: '\u2122',
+  times: '\u00D7',
+  divide: '\u00F7',
+  minus: '\u2212',
+  plusmn: '\u00B1',
+  deg: '\u00B0',
+  micro: '\u00B5',
+  para: '\u00B6',
+  sect: '\u00A7',
+  // Currency
+  euro: '\u20AC',
+  pound: '\u00A3',
+  yen: '\u00A5',
+  cent: '\u00A2',
+  // Arrows
+  larr: '\u2190',
+  rarr: '\u2192',
+  uarr: '\u2191',
+  darr: '\u2193',
+};
+const entityRe = /&(?:#x([0-9a-fA-F]+)|#(\d+)|([a-zA-Z]+));/g;
+/**
+ * Decode HTML character references in `str`.
+ *
+ * Numeric references (`&#8212;`, `&#x2014;`) become the code point they
+ * name; named references are looked up case-insensitively in
+ * `namedEntities`.  A reference that cannot be decoded — an unknown name,
+ * or a number that is not a valid Unicode code point — is returned
+ * unchanged.
+ *
+ * @param {string} str - Text that may contain character references.
+ * @returns {string} The text with recognised references decoded.
+ */
+function decodeEntities(str) {
+  return str.replace(entityRe, (match, hex, dec, name) => {
+    if (name !== undefined) {
+      const key = name.toLowerCase();
+      // Own-property check: `&constructor;` must not resolve to
+      // Object.prototype.constructor through the prototype chain.
+      return Object.hasOwn(namedEntities, key) ? namedEntities[key] : match;
+    }
+    const codePoint =
+      hex !== undefined ? parseInt(hex, 16) : parseInt(dec, 10);
+    // String.fromCodePoint throws a RangeError above U+10FFFF
+    // (e.g. `&#x110000;`); leave such references untouched instead.
+    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
+  });
+}
 /**
  * Fast extraction of visible text content from HTML source.
  * Used as a lightweight alternative to full font-tracer for pages
  * that share the same CSS configuration as an already-traced page.
  *
- * Walks the parse5 tree collecting text nodes and content attributes
- * (alt, title, placeholder, value, aria-label), skipping invisible
- * elements (script, style, svg, template).
+ * Uses regex-based stripping instead of a full DOM parse for speed.
+ * Collects the values of content attributes (alt, title, placeholder,
+ * value, aria-label) from the whole source, then the text that remains
+ * after invisible element blocks, comments and tags are stripped.
+ * The `value` of a hidden <input> is skipped, identified by its position
+ * in the source so an equal value on a visible element is still kept.
+ *
+ * @param {string} html - HTML source; a falsy value yields ''.
+ * @returns {string} Attribute values and text content joined by spaces.
  */
 function extractVisibleText(html) {
-  const document = parse5.parse(html);
-  const parts = [];
+  if (!html) return '';
-  function walk(node) {
-    if (node.nodeName && INVISIBLE_ELEMENTS.has(node.nodeName)) {
-      return;
-    }
+  // Reset lastIndex on global regexes — a prior call that threw
+  // mid-function would leave them in an indeterminate state.
+  hiddenInputRe.lastIndex = 0;
+  attrRe.lastIndex = 0;
-    // Collect relevant attribute values
-    if (node.attrs) {
-      const isHiddenInput =
-        node.nodeName === 'input' &&
-        node.attrs.some(
-          (a) => a.name === 'type' && a.value.toLowerCase() === 'hidden'
-        );
-      for (const attr of node.attrs) {
-        if (TEXT_ATTRIBUTES.has(attr.name) && attr.value) {
-          if (attr.name === 'value' && isHiddenInput) {
-            continue;
-          }
-          parts.push(attr.value);
-        }
-      }
-    }
+  const parts = [];
-    // Collect text content
-    if (node.nodeName === '#text' && node.value) {
-      parts.push(node.value);
-    }
+  // Record the source span [start, end) of every hidden <input> tag.
+  // A `value` attribute is skipped only when it lies inside one of these
+  // spans; comparing value strings instead would also drop an identical
+  // value on a visible element (e.g. a submit button) and lose its glyphs.
+  const hiddenInputSpans = [];
+  let hiddenMatch;
+  while ((hiddenMatch = hiddenInputRe.exec(html)) !== null) {
+    const start = hiddenMatch.index;
+    hiddenInputSpans.push([start, start + hiddenMatch[0].length]);
+  }
-    // Recurse into child nodes
-    if (node.childNodes) {
-      for (const child of node.childNodes) {
-        walk(child);
-      }
-    }
+  // Extract text attributes before stripping tags.
+  let attrMatch;
+  while ((attrMatch = attrRe.exec(html)) !== null) {
+    const attrName = attrMatch[1].toLowerCase();
+    const val = attrMatch[2] ?? attrMatch[3] ?? attrMatch[4];
+    if (!val) continue;
+    if (attrName === 'value') {
+      const pos = attrMatch.index;
+      const insideHiddenInput = hiddenInputSpans.some(
+        ([start, end]) => pos >= start && pos < end
+      );
+      if (insideHiddenInput) continue;
+    }
+    parts.push(decodeEntities(val));
   }
-  walk(document);
+  // Strip invisible blocks, comments, and tags to get text content.
+  let text = html;
+  text = text.replace(invisibleBlockRe, ' ');
+  text = text.replace(commentRe, ' ');
+  text = text.replace(tagRe, ' ');
+  text = decodeEntities(text);
+  parts.push(text);
   return parts.join(' ');
 }

package/lib/getFontInfo.js CHANGED Viewed

@@ -1,10 +1,10 @@
-const fontverter = require('fontverter');
+const { toSfnt } = require('./sfntCache');
 async function getFontInfoFromBuffer(buffer) {
   const harfbuzzJs = await require('harfbuzzjs');
-  const blob = harfbuzzJs.createBlob(await fontverter.convert(buffer, 'sfnt')); // Load the font data into something Harfbuzz can use
-  const face = harfbuzzJs.createFace(blob, 0); // Select the first font in the file (there's normally only one!)
+  const blob = harfbuzzJs.createBlob(await toSfnt(buffer));
+  const face = harfbuzzJs.createFace(blob, 0);
   const fontInfo = {
     characterSet: Array.from(face.collectUnicodes()),

package/lib/subsetFontWithGlyphs.js CHANGED Viewed

@@ -1,3 +1,4 @@
+const os = require('os');
 const { readFile } = require('fs').promises;
 const fontverter = require('fontverter');
 const { toSfnt } = require('./sfntCache');
@@ -11,30 +12,74 @@ const HB_SUBSET_SETS_NAME_ID = 4;
 // hb_subset_flags_t
 const HB_SUBSET_FLAGS_NO_HINTING = 0x00000001;
-// All font subsetting goes through harfbuzz directly so we can apply
-// web-specific optimizations (no hinting, minimal name table, table
-// stripping) and support explicit glyph-ID inclusion.
-let _wasmExports;
-let _loadPromise;
-async function loadHarfbuzz() {
-  if (!_loadPromise) {
-    _loadPromise = (async () => {
-      const {
-        instance: { exports },
-      } = await WebAssembly.instantiate(
-        await readFile(require.resolve('harfbuzzjs/hb-subset.wasm'))
-      );
-      _wasmExports = exports;
-      return exports;
+// Pool of WASM instances for parallel subsetting.  Each instance has its
+// own linear memory so concurrent calls are safe.  The module is compiled
+// once and instantiated N times (N = CPU count, capped at 8).
+let _compilePromise;
+/**
+ * Compile the hb-subset WASM module, starting the work at most once.
+ *
+ * @returns {Promise<WebAssembly.Module>} Shared promise for the compiled module.
+ */
+function compileModule() {
+  if (_compilePromise) {
+    return _compilePromise;
+  }
+  // Store the promise before anything is awaited so that overlapping
+  // callers all receive the same one instead of starting a second read.
+  const wasmPath = require.resolve('harfbuzzjs/hb-subset.wasm');
+  _compilePromise = readFile(wasmPath).then((wasmBytes) =>
+    WebAssembly.compile(wasmBytes)
+  );
+  return _compilePromise;
+}
+const _pool = []; // Array of { exports, busy: boolean }
+let _poolReady; // Promise for the one-time pool fill; unset until first use
+// os.cpus() returns an empty array when CPU information is unavailable.
+// Clamp to at least one instance: with a size of 0 the pool would stay
+// empty and every acquireInstance() caller would wait forever.
+const POOL_SIZE = Math.max(1, Math.min(os.cpus().length, 8));
+/**
+ * Fill `_pool` with POOL_SIZE instances of the compiled module.
+ * The work is started by the first call; later calls wait on the same
+ * promise.
+ *
+ * @returns {Promise<void>} Resolves once all instances are in the pool;
+ *   rejects if compilation or any instantiation fails.
+ */
+async function initPool() {
+  if (_poolReady) {
+    return _poolReady;
+  }
+  const fillPool = async () => {
+    const compiled = await compileModule();
+    const pending = Array.from({ length: POOL_SIZE }, () =>
+      WebAssembly.instantiate(compiled).then((instance) => {
+        _pool.push({ exports: instance.exports, busy: false });
+      })
+    );
+    await Promise.all(pending);
+  };
+  _poolReady = fillPool();
+  return _poolReady;
+}
+// Waiters queue: callers waiting for an idle WASM instance.
+const _waiters = [];
+/**
+ * Obtain exclusive use of a pooled WASM instance.
+ *
+ * @returns {Promise<{exports: object, busy: boolean}>} An entry of `_pool`
+ *   marked busy; hand it back with releaseInstance().
+ */
+async function acquireInstance() {
+  await initPool();
+  for (const candidate of _pool) {
+    if (!candidate.busy) {
+      candidate.busy = true;
+      return candidate;
+    }
+  }
+  // Nothing is free: park this caller until releaseInstance() hands
+  // an instance over.
+  return new Promise((resolve) => {
+    _waiters.push(resolve);
+  });
+}
+/**
+ * Give a pooled instance back.  If a caller is queued in `_waiters`, the
+ * instance is handed straight to the oldest one and stays busy;
+ * otherwise it is marked idle.
+ *
+ * @param {{exports: object, busy: boolean}} inst - Entry obtained from
+ *   acquireInstance().
+ */
+function releaseInstance(inst) {
+  const nextWaiter = _waiters.shift();
+  if (nextWaiter === undefined) {
+    inst.busy = false;
+    return;
+  }
+  inst.busy = true;
+  nextWaiter(inst);
+}
+// Serialize fontverter.convert calls — the wawoff2 module (used internally by
+// fontverter for WOFF2 compression) has a shared WASM instance whose memory
+// is corrupted by concurrent calls.
+const convertLimiter = require('p-limit')(1);
 // Re-create on every call — WASM memory.buffer is detached when memory grows,
 // so a cached Uint8Array would silently read/write stale data.
-function getHeapu8() {
-  return new Uint8Array(_wasmExports.memory.buffer);
+function getHeapu8(exports) {
+  return new Uint8Array(exports.memory.buffer);
 }
 function HB_TAG(str) {
@@ -152,7 +197,7 @@ function configureSubsetInput(
   }
 }
-function extractSubsetFont(exports, heapu8, subset) {
+function extractSubsetFont(exports, subset) {
   const result = exports.hb_face_reference_blob(subset);
   const offset = exports.hb_blob_get_data(result, 0);
   const subsetByteLength = exports.hb_blob_get_length(result);
@@ -162,6 +207,9 @@ function extractSubsetFont(exports, heapu8, subset) {
     throw new Error('Failed to create subset font');
   }
+  // Fresh view AFTER the WASM calls above — memory.buffer may have been
+  // detached by a grow during hb_face_reference_blob / hb_blob_get_data.
+  const heapu8 = getHeapu8(exports);
   const subsetFont = Buffer.from(
     heapu8.subarray(offset, offset + subsetByteLength)
   );
@@ -174,45 +222,59 @@ async function subsetFontWithGlyphs(
   text,
   { targetFormat, glyphIds, variationAxes } = {}
 ) {
-  const exports = await loadHarfbuzz();
   // Reuse cached sfnt conversion when available (same buffer may have
   // been converted by getFontInfo or collectFeatureGlyphIds already).
   const ttf = await toSfnt(originalFont);
-  const fontBuffer = exports.malloc(ttf.byteLength);
-  // Fresh view — memory.buffer may have been detached by a prior malloc/grow.
-  getHeapu8().set(new Uint8Array(ttf), fontBuffer);
+  const inst = await acquireInstance();
+  const { exports } = inst;
+  let released = false;
+  try {
+    const fontBuffer = exports.malloc(ttf.byteLength);
+    // Fresh view — memory.buffer may have been detached by a prior malloc/grow.
+    getHeapu8(exports).set(new Uint8Array(ttf), fontBuffer);
-  const blob = exports.hb_blob_create(fontBuffer, ttf.byteLength, 2, 0, 0);
-  const face = exports.hb_face_create(blob, 0);
-  exports.hb_blob_destroy(blob);
+    const blob = exports.hb_blob_create(fontBuffer, ttf.byteLength, 2, 0, 0);
+    const face = exports.hb_face_create(blob, 0);
+    exports.hb_blob_destroy(blob);
-  const input = exports.hb_subset_input_create_or_fail();
-  if (input === 0) {
-    exports.hb_face_destroy(face);
-    exports.free(fontBuffer);
-    throw new Error('hb_subset_input_create_or_fail returned zero');
-  }
+    const input = exports.hb_subset_input_create_or_fail();
+    if (input === 0) {
+      exports.hb_face_destroy(face);
+      exports.free(fontBuffer);
+      throw new Error('hb_subset_input_create_or_fail returned zero');
+    }
-  let subset = 0;
-  try {
-    configureSubsetInput(exports, input, face, text, glyphIds, variationAxes);
+    let subsetFont;
+    let subset = 0;
+    try {
+      configureSubsetInput(exports, input, face, text, glyphIds, variationAxes);
+      subset = exports.hb_subset_or_fail(face, input);
+      if (subset === 0) {
+        throw new Error('hb_subset_or_fail returned zero');
+      }
-    subset = exports.hb_subset_or_fail(face, input);
-    if (subset === 0) {
-      throw new Error('hb_subset_or_fail returned zero');
+      subsetFont = extractSubsetFont(exports, subset);
+    } finally {
+      // Clean up all WASM resources while we still own the instance.
+      if (subset) exports.hb_face_destroy(subset);
+      exports.hb_subset_input_destroy(input);
+      exports.hb_face_destroy(face);
+      exports.free(fontBuffer);
     }
-    const subsetFont = extractSubsetFont(exports, getHeapu8(), subset);
-    return fontverter.convert(subsetFont, targetFormat, 'truetype');
+    // Instance is fully cleaned up — release it so other subsetting
+    // calls can proceed while we wait for the serialized WOFF2 step.
+    released = true;
+    releaseInstance(inst);
+    return convertLimiter(() =>
+      fontverter.convert(subsetFont, targetFormat, 'truetype')
+    );
   } finally {
-    if (subset) exports.hb_face_destroy(subset);
-    exports.hb_subset_input_destroy(input);
-    exports.hb_face_destroy(face);
-    exports.free(fontBuffer);
+    if (!released) releaseInstance(inst);
   }
 }
-const limiter = require('p-limit')(1);
-module.exports = (...args) => limiter(() => subsetFontWithGlyphs(...args));
+module.exports = subsetFontWithGlyphs;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@turntrout/subfont",
-  "version": "1.3.0",
+  "version": "1.3.2",
   "description": "Automatically subset web fonts to only the characters used on your pages. Fork of Munter/subfont with modern defaults.",
   "engines": {
     "node": ">=18.0.0"