@turntrout/subfont 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -231,7 +231,7 @@ function findFontFamiliesWithFeatureSettings(
231
231
 
232
232
  if (recorded === true) {
233
233
  result = true;
234
- } else {
234
+ } else if (result !== true) {
235
235
  if (!result) result = new Set();
236
236
  for (const family of recorded) {
237
237
  result.add(family.toLowerCase());
@@ -1214,3 +1214,5 @@ module.exports = collectTextsByPage;
1214
1214
  // Exported for testing only
1215
1215
  module.exports._extractFeatureTagsFromDecl = extractFeatureTagsFromDecl;
1216
1216
  module.exports._resolveFeatureSettings = resolveFeatureSettings;
1217
+ module.exports._findFontFamiliesWithFeatureSettings =
1218
+ findFontFamiliesWithFeatureSettings;
@@ -1,5 +1,3 @@
1
- const parse5 = require('parse5');
2
-
3
1
  const INVISIBLE_ELEMENTS = new Set([
4
2
  'script',
5
3
  'style',
@@ -12,63 +10,136 @@ const INVISIBLE_ELEMENTS = new Set([
12
10
  'embed',
13
11
  'datalist',
14
12
  ]);
15
- const TEXT_ATTRIBUTES = new Set([
16
- 'alt',
17
- 'title',
18
- 'placeholder',
19
- 'value',
20
- 'aria-label',
21
- ]);
13
+ // Build a regex that strips invisible element blocks (non-greedy, case-insensitive).
14
+ // For void elements like <embed> there is no closing tag — just the opening
15
+ // tag is stripped (which the tag-stripping regex below handles).
16
+ const invisibleBlockTags = [...INVISIBLE_ELEMENTS].filter((t) => t !== 'embed');
17
+ const invisibleBlockRe = new RegExp(
18
+ `<(${invisibleBlockTags.join('|')})\\b[^>]*>[\\s\\S]*?<\\/\\1\\s*>`,
19
+ 'gi'
20
+ );
21
+ const commentRe = /<!--[\s\S]*?-->/g;
22
+
23
+ // Match text-bearing attributes: alt="...", title='...', placeholder=..., etc.
24
+ // Captures the attribute name (group 1) and the value (groups 2, 3, or 4 for
25
+ // double-quoted, single-quoted, and unquoted respectively).
26
+ // Negative lookbehind prevents matching data- prefixed attributes (e.g. data-alt).
27
+ const attrRe =
28
+ /(?<![-\w])(alt|title|placeholder|value|aria-label)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]*))/gi;
29
+ // Match <input ... type="hidden" ...> or <input ... type=hidden ...>
30
+ // \b only after the unquoted alternative — quotes already delimit the value.
31
+ const hiddenInputRe =
32
+ /<input\b[^>]*?\btype\s*=\s*(?:"hidden"|'hidden'|hidden\b)[^>]*/gi;
33
+ const tagRe = /<[^>]+>/g;
34
+
35
+ // Named and numeric HTML entity decoder. Covers the XML built-ins plus
36
+ // typographic entities commonly found in blog/article content. Rare
37
+ // entities are left as-is (their literal characters still enter the
38
+ // subset — a slight overcount — but the character the entity names is not added).
39
+ const namedEntities = {
40
+ amp: '&',
41
+ lt: '<',
42
+ gt: '>',
43
+ quot: '"',
44
+ apos: "'",
45
+ nbsp: '\u00A0',
46
+ // Typographic quotes & dashes
47
+ ldquo: '\u201C',
48
+ rdquo: '\u201D',
49
+ lsquo: '\u2018',
50
+ rsquo: '\u2019',
51
+ mdash: '\u2014',
52
+ ndash: '\u2013',
53
+ hellip: '\u2026',
54
+ laquo: '\u00AB',
55
+ raquo: '\u00BB',
56
+ // Common symbols
57
+ bull: '\u2022',
58
+ middot: '\u00B7',
59
+ copy: '\u00A9',
60
+ reg: '\u00AE',
61
+ trade: '\u2122',
62
+ times: '\u00D7',
63
+ divide: '\u00F7',
64
+ minus: '\u2212',
65
+ plusmn: '\u00B1',
66
+ deg: '\u00B0',
67
+ micro: '\u00B5',
68
+ para: '\u00B6',
69
+ sect: '\u00A7',
70
+ // Currency
71
+ euro: '\u20AC',
72
+ pound: '\u00A3',
73
+ yen: '\u00A5',
74
+ cent: '\u00A2',
75
+ // Arrows
76
+ larr: '\u2190',
77
+ rarr: '\u2192',
78
+ uarr: '\u2191',
79
+ darr: '\u2193',
80
+ };
81
+ const entityRe = /&(?:#x([0-9a-fA-F]+)|#(\d+)|([a-zA-Z]+));/g;
82
+ function decodeEntities(str) {
83
+ return str.replace(entityRe, (match, hex, dec, name) => {
84
+ if (hex) return String.fromCodePoint(parseInt(hex, 16));
85
+ if (dec) return String.fromCodePoint(parseInt(dec, 10));
86
+ if (name && namedEntities[name.toLowerCase()] !== undefined) {
87
+ return namedEntities[name.toLowerCase()];
88
+ }
89
+ return match;
90
+ });
91
+ }
22
92
 
23
93
  /**
24
94
  * Fast extraction of visible text content from HTML source.
25
95
  * Used as a lightweight alternative to full font-tracer for pages
26
96
  * that share the same CSS configuration as an already-traced page.
27
97
  *
28
- * Walks the parse5 tree collecting text nodes and content attributes
29
- * (alt, title, placeholder, value, aria-label), skipping invisible
30
- * elements (script, style, svg, template).
98
+ * Uses regex-based stripping instead of a full DOM parse for speed.
99
+ * Collects text nodes and content attributes (alt, title, placeholder,
100
+ * value, aria-label), skipping invisible elements.
31
101
  */
32
102
  function extractVisibleText(html) {
33
- const document = parse5.parse(html);
34
- const parts = [];
103
+ if (!html) return '';
35
104
 
36
- function walk(node) {
37
- if (node.nodeName && INVISIBLE_ELEMENTS.has(node.nodeName)) {
38
- return;
39
- }
105
+ // Reset lastIndex on global regexes — a prior call that threw
106
+ // mid-function would leave them in an indeterminate state.
107
+ hiddenInputRe.lastIndex = 0;
108
+ attrRe.lastIndex = 0;
40
109
 
41
- // Collect relevant attribute values
42
- if (node.attrs) {
43
- const isHiddenInput =
44
- node.nodeName === 'input' &&
45
- node.attrs.some(
46
- (a) => a.name === 'type' && a.value.toLowerCase() === 'hidden'
47
- );
48
- for (const attr of node.attrs) {
49
- if (TEXT_ATTRIBUTES.has(attr.name) && attr.value) {
50
- if (attr.name === 'value' && isHiddenInput) {
51
- continue;
52
- }
53
- parts.push(attr.value);
54
- }
55
- }
56
- }
110
+ const parts = [];
57
111
 
58
- // Collect text content
59
- if (node.nodeName === '#text' && node.value) {
60
- parts.push(node.value);
112
+ // Collect hidden-input value attrs that should be excluded.
113
+ const hiddenInputValues = new Set();
114
+ let hiddenMatch;
115
+ while ((hiddenMatch = hiddenInputRe.exec(html)) !== null) {
116
+ const fragment = hiddenMatch[0];
117
+ let m;
118
+ const localAttrRe = /\bvalue\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]*))/gi;
119
+ while ((m = localAttrRe.exec(fragment)) !== null) {
120
+ const val = m[1] ?? m[2] ?? m[3];
121
+ if (val) hiddenInputValues.add(val);
61
122
  }
123
+ }
62
124
 
63
- // Recurse into child nodes
64
- if (node.childNodes) {
65
- for (const child of node.childNodes) {
66
- walk(child);
67
- }
68
- }
125
+ // Extract text attributes before stripping tags.
126
+ let attrMatch;
127
+ while ((attrMatch = attrRe.exec(html)) !== null) {
128
+ const attrName = attrMatch[1].toLowerCase();
129
+ const val = attrMatch[2] ?? attrMatch[3] ?? attrMatch[4];
130
+ if (!val) continue;
131
+ if (attrName === 'value' && hiddenInputValues.has(val)) continue;
132
+ parts.push(decodeEntities(val));
69
133
  }
70
134
 
71
- walk(document);
135
+ // Strip invisible blocks, comments, and tags to get text content.
136
+ let text = html;
137
+ text = text.replace(invisibleBlockRe, ' ');
138
+ text = text.replace(commentRe, ' ');
139
+ text = text.replace(tagRe, ' ');
140
+ text = decodeEntities(text);
141
+ parts.push(text);
142
+
72
143
  return parts.join(' ');
73
144
  }
74
145
 
@@ -1,10 +1,10 @@
1
- const fontverter = require('fontverter');
1
+ const { toSfnt } = require('./sfntCache');
2
2
 
3
3
  async function getFontInfoFromBuffer(buffer) {
4
4
  const harfbuzzJs = await require('harfbuzzjs');
5
5
 
6
- const blob = harfbuzzJs.createBlob(await fontverter.convert(buffer, 'sfnt')); // Load the font data into something Harfbuzz can use
7
- const face = harfbuzzJs.createFace(blob, 0); // Select the first font in the file (there's normally only one!)
6
+ const blob = harfbuzzJs.createBlob(await toSfnt(buffer));
7
+ const face = harfbuzzJs.createFace(blob, 0);
8
8
 
9
9
  const fontInfo = {
10
10
  characterSet: Array.from(face.collectUnicodes()),
@@ -1,3 +1,4 @@
1
+ const os = require('os');
1
2
  const { readFile } = require('fs').promises;
2
3
  const fontverter = require('fontverter');
3
4
  const { toSfnt } = require('./sfntCache');
@@ -11,30 +12,74 @@ const HB_SUBSET_SETS_NAME_ID = 4;
11
12
  // hb_subset_flags_t
12
13
  const HB_SUBSET_FLAGS_NO_HINTING = 0x00000001;
13
14
 
14
- // All font subsetting goes through harfbuzz directly so we can apply
15
- // web-specific optimizations (no hinting, minimal name table, table
16
- // stripping) and support explicit glyph-ID inclusion.
17
- let _wasmExports;
18
- let _loadPromise;
19
- async function loadHarfbuzz() {
20
- if (!_loadPromise) {
21
- _loadPromise = (async () => {
22
- const {
23
- instance: { exports },
24
- } = await WebAssembly.instantiate(
25
- await readFile(require.resolve('harfbuzzjs/hb-subset.wasm'))
26
- );
27
- _wasmExports = exports;
28
- return exports;
15
+ // Pool of WASM instances for parallel subsetting. Each instance has its
16
+ // own linear memory so concurrent calls are safe. The module is compiled
17
+ // once and instantiated N times (N = CPU count, capped at 8).
18
+ let _compilePromise;
19
+ function compileModule() {
20
+ if (!_compilePromise) {
21
+ // Assign the promise synchronously so concurrent callers share it
22
+ // (an async function would await readFile before the assignment).
23
+ _compilePromise = readFile(
24
+ require.resolve('harfbuzzjs/hb-subset.wasm')
25
+ ).then((buf) => WebAssembly.compile(buf));
26
+ }
27
+ return _compilePromise;
28
+ }
29
+
30
+ const _pool = []; // Array of { exports, busy: boolean }
31
+ let _poolReady;
32
+ const POOL_SIZE = Math.min(os.cpus().length, 8);
33
+
34
+ async function initPool() {
35
+ if (!_poolReady) {
36
+ _poolReady = (async () => {
37
+ const mod = await compileModule();
38
+ const instantiations = [];
39
+ for (let i = 0; i < POOL_SIZE; i++) {
40
+ instantiations.push(
41
+ WebAssembly.instantiate(mod).then(({ exports }) => {
42
+ _pool.push({ exports, busy: false });
43
+ })
44
+ );
45
+ }
46
+ await Promise.all(instantiations);
29
47
  })();
30
48
  }
31
- return _loadPromise;
49
+ return _poolReady;
50
+ }
51
+
52
+ // Waiters queue: callers waiting for an idle WASM instance.
53
+ const _waiters = [];
54
+
55
+ async function acquireInstance() {
56
+ await initPool();
57
+ const idle = _pool.find((inst) => !inst.busy);
58
+ if (idle) {
59
+ idle.busy = true;
60
+ return idle;
61
+ }
62
+ // All instances busy — wait for one to be released.
63
+ return new Promise((resolve) => _waiters.push(resolve));
64
+ }
65
+
66
+ function releaseInstance(inst) {
67
+ inst.busy = false;
68
+ if (_waiters.length > 0) {
69
+ inst.busy = true;
70
+ _waiters.shift()(inst);
71
+ }
32
72
  }
33
73
 
74
+ // Serialize fontverter.convert calls — the wawoff2 module (used internally by
75
+ // fontverter for WOFF2 compression) has a shared WASM instance whose memory
76
+ // is corrupted by concurrent calls.
77
+ const convertLimiter = require('p-limit')(1);
78
+
34
79
  // Re-create on every call — WASM memory.buffer is detached when memory grows,
35
80
  // so a cached Uint8Array would silently read/write stale data.
36
- function getHeapu8() {
37
- return new Uint8Array(_wasmExports.memory.buffer);
81
+ function getHeapu8(exports) {
82
+ return new Uint8Array(exports.memory.buffer);
38
83
  }
39
84
 
40
85
  function HB_TAG(str) {
@@ -152,7 +197,7 @@ function configureSubsetInput(
152
197
  }
153
198
  }
154
199
 
155
- function extractSubsetFont(exports, heapu8, subset) {
200
+ function extractSubsetFont(exports, subset) {
156
201
  const result = exports.hb_face_reference_blob(subset);
157
202
  const offset = exports.hb_blob_get_data(result, 0);
158
203
  const subsetByteLength = exports.hb_blob_get_length(result);
@@ -162,6 +207,9 @@ function extractSubsetFont(exports, heapu8, subset) {
162
207
  throw new Error('Failed to create subset font');
163
208
  }
164
209
 
210
+ // Fresh view AFTER the WASM calls above — memory.buffer may have been
211
+ // detached by a grow during hb_face_reference_blob / hb_blob_get_data.
212
+ const heapu8 = getHeapu8(exports);
165
213
  const subsetFont = Buffer.from(
166
214
  heapu8.subarray(offset, offset + subsetByteLength)
167
215
  );
@@ -174,45 +222,59 @@ async function subsetFontWithGlyphs(
174
222
  text,
175
223
  { targetFormat, glyphIds, variationAxes } = {}
176
224
  ) {
177
- const exports = await loadHarfbuzz();
178
-
179
225
  // Reuse cached sfnt conversion when available (same buffer may have
180
226
  // been converted by getFontInfo or collectFeatureGlyphIds already).
181
227
  const ttf = await toSfnt(originalFont);
182
228
 
183
- const fontBuffer = exports.malloc(ttf.byteLength);
184
- // Fresh view memory.buffer may have been detached by a prior malloc/grow.
185
- getHeapu8().set(new Uint8Array(ttf), fontBuffer);
229
+ const inst = await acquireInstance();
230
+ const { exports } = inst;
231
+ let released = false;
232
+ try {
233
+ const fontBuffer = exports.malloc(ttf.byteLength);
234
+ // Fresh view — memory.buffer may have been detached by a prior malloc/grow.
235
+ getHeapu8(exports).set(new Uint8Array(ttf), fontBuffer);
186
236
 
187
- const blob = exports.hb_blob_create(fontBuffer, ttf.byteLength, 2, 0, 0);
188
- const face = exports.hb_face_create(blob, 0);
189
- exports.hb_blob_destroy(blob);
237
+ const blob = exports.hb_blob_create(fontBuffer, ttf.byteLength, 2, 0, 0);
238
+ const face = exports.hb_face_create(blob, 0);
239
+ exports.hb_blob_destroy(blob);
190
240
 
191
- const input = exports.hb_subset_input_create_or_fail();
192
- if (input === 0) {
193
- exports.hb_face_destroy(face);
194
- exports.free(fontBuffer);
195
- throw new Error('hb_subset_input_create_or_fail returned zero');
196
- }
241
+ const input = exports.hb_subset_input_create_or_fail();
242
+ if (input === 0) {
243
+ exports.hb_face_destroy(face);
244
+ exports.free(fontBuffer);
245
+ throw new Error('hb_subset_input_create_or_fail returned zero');
246
+ }
197
247
 
198
- let subset = 0;
199
- try {
200
- configureSubsetInput(exports, input, face, text, glyphIds, variationAxes);
248
+ let subsetFont;
249
+ let subset = 0;
250
+ try {
251
+ configureSubsetInput(exports, input, face, text, glyphIds, variationAxes);
252
+
253
+ subset = exports.hb_subset_or_fail(face, input);
254
+ if (subset === 0) {
255
+ throw new Error('hb_subset_or_fail returned zero');
256
+ }
201
257
 
202
- subset = exports.hb_subset_or_fail(face, input);
203
- if (subset === 0) {
204
- throw new Error('hb_subset_or_fail returned zero');
258
+ subsetFont = extractSubsetFont(exports, subset);
259
+ } finally {
260
+ // Clean up all WASM resources while we still own the instance.
261
+ if (subset) exports.hb_face_destroy(subset);
262
+ exports.hb_subset_input_destroy(input);
263
+ exports.hb_face_destroy(face);
264
+ exports.free(fontBuffer);
205
265
  }
206
266
 
207
- const subsetFont = extractSubsetFont(exports, getHeapu8(), subset);
208
- return fontverter.convert(subsetFont, targetFormat, 'truetype');
267
+ // Instance is fully cleaned up — release it so other subsetting
268
+ // calls can proceed while we wait for the serialized WOFF2 step.
269
+ released = true;
270
+ releaseInstance(inst);
271
+
272
+ return convertLimiter(() =>
273
+ fontverter.convert(subsetFont, targetFormat, 'truetype')
274
+ );
209
275
  } finally {
210
- if (subset) exports.hb_face_destroy(subset);
211
- exports.hb_subset_input_destroy(input);
212
- exports.hb_face_destroy(face);
213
- exports.free(fontBuffer);
276
+ if (!released) releaseInstance(inst);
214
277
  }
215
278
  }
216
279
 
217
- const limiter = require('p-limit')(1);
218
- module.exports = (...args) => limiter(() => subsetFontWithGlyphs(...args));
280
+ module.exports = subsetFontWithGlyphs;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@turntrout/subfont",
3
- "version": "1.3.0",
3
+ "version": "1.3.2",
4
4
  "description": "Automatically subset web fonts to only the characters used on your pages. Fork of Munter/subfont with modern defaults.",
5
5
  "engines": {
6
6
  "node": ">=18.0.0"