@turntrout/subfont 1.3.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/collectTextsByPage.js +3 -1
- package/lib/extractVisibleText.js +115 -44
- package/lib/getFontInfo.js +3 -3
- package/lib/subsetFontWithGlyphs.js +109 -47
- package/package.json +1 -1
|
@@ -231,7 +231,7 @@ function findFontFamiliesWithFeatureSettings(
|
|
|
231
231
|
|
|
232
232
|
if (recorded === true) {
|
|
233
233
|
result = true;
|
|
234
|
-
} else {
|
|
234
|
+
} else if (result !== true) {
|
|
235
235
|
if (!result) result = new Set();
|
|
236
236
|
for (const family of recorded) {
|
|
237
237
|
result.add(family.toLowerCase());
|
|
@@ -1214,3 +1214,5 @@ module.exports = collectTextsByPage;
|
|
|
1214
1214
|
// Exported for testing only
|
|
1215
1215
|
module.exports._extractFeatureTagsFromDecl = extractFeatureTagsFromDecl;
|
|
1216
1216
|
module.exports._resolveFeatureSettings = resolveFeatureSettings;
|
|
1217
|
+
module.exports._findFontFamiliesWithFeatureSettings =
|
|
1218
|
+
findFontFamiliesWithFeatureSettings;
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
const parse5 = require('parse5');
|
|
2
|
-
|
|
3
1
|
const INVISIBLE_ELEMENTS = new Set([
|
|
4
2
|
'script',
|
|
5
3
|
'style',
|
|
@@ -12,63 +10,136 @@ const INVISIBLE_ELEMENTS = new Set([
|
|
|
12
10
|
'embed',
|
|
13
11
|
'datalist',
|
|
14
12
|
]);
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
'
|
|
21
|
-
|
|
13
|
+
// Build a regex that strips invisible element blocks (non-greedy, case-insensitive).
|
|
14
|
+
// For void elements like <embed> there is no closing tag — just the opening
|
|
15
|
+
// tag is stripped (which the tag-stripping regex below handles).
|
|
16
|
+
const invisibleBlockTags = [...INVISIBLE_ELEMENTS].filter((t) => t !== 'embed');
|
|
17
|
+
const invisibleBlockRe = new RegExp(
|
|
18
|
+
`<(${invisibleBlockTags.join('|')})\\b[^>]*>[\\s\\S]*?<\\/\\1\\s*>`,
|
|
19
|
+
'gi'
|
|
20
|
+
);
|
|
21
|
+
const commentRe = /<!--[\s\S]*?-->/g;
|
|
22
|
+
|
|
23
|
+
// Match text-bearing attributes: alt="...", title='...', placeholder=..., etc.
|
|
24
|
+
// Captures the attribute name (group 1) and the value (groups 2, 3, or 4 for
|
|
25
|
+
// double-quoted, single-quoted, and unquoted respectively).
|
|
26
|
+
// Negative lookbehind prevents matching data- prefixed attributes (e.g. data-alt).
|
|
27
|
+
const attrRe =
|
|
28
|
+
/(?<![-\w])(alt|title|placeholder|value|aria-label)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]*))/gi;
|
|
29
|
+
// Match <input ... type="hidden" ...> or <input ... type=hidden ...>
|
|
30
|
+
// \b only after the unquoted alternative — quotes already delimit the value.
|
|
31
|
+
const hiddenInputRe =
|
|
32
|
+
/<input\b[^>]*?\btype\s*=\s*(?:"hidden"|'hidden'|hidden\b)[^>]*/gi;
|
|
33
|
+
const tagRe = /<[^>]+>/g;
|
|
34
|
+
|
|
35
|
+
// Named and numeric HTML entity decoder. Covers the XML built-ins plus
|
|
36
|
+
// typographic entities commonly found in blog/article content. Rare
|
|
37
|
+
// entities are left as-is (their literal characters still enter the
|
|
38
|
+
// subset, so glyphs are never lost — just slightly overcounted).
|
|
39
|
+
const namedEntities = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  nbsp: '\u00A0',
  // Typographic quotes & dashes
  ldquo: '\u201C',
  rdquo: '\u201D',
  lsquo: '\u2018',
  rsquo: '\u2019',
  mdash: '\u2014',
  ndash: '\u2013',
  hellip: '\u2026',
  laquo: '\u00AB',
  raquo: '\u00BB',
  // Common symbols
  bull: '\u2022',
  middot: '\u00B7',
  copy: '\u00A9',
  reg: '\u00AE',
  trade: '\u2122',
  times: '\u00D7',
  divide: '\u00F7',
  minus: '\u2212',
  plusmn: '\u00B1',
  deg: '\u00B0',
  micro: '\u00B5',
  para: '\u00B6',
  sect: '\u00A7',
  // Currency
  euro: '\u20AC',
  pound: '\u00A3',
  yen: '\u00A5',
  cent: '\u00A2',
  // Arrows
  larr: '\u2190',
  rarr: '\u2192',
  uarr: '\u2191',
  darr: '\u2193',
};

// Group 1: hex digits of `&#x…;`, group 2: decimal digits of `&#…;`,
// group 3: the name of a named reference such as `&amp;`.
const entityRe = /&(?:#x([0-9a-fA-F]+)|#(\d+)|([a-zA-Z]+));/g;

/**
 * Decode numeric and (a limited set of) named HTML character references.
 *
 * References that cannot be decoded are returned unchanged instead of
 * throwing, so malformed markup can never abort text extraction:
 *   - named references missing from `namedEntities`,
 *   - numeric references above U+10FFFF, for which
 *     `String.fromCodePoint` would raise a RangeError.
 *
 * @param {string} str - Text that may contain character references.
 * @returns {string} The text with every recognised reference replaced.
 */
function decodeEntities(str) {
  return str.replace(entityRe, (match, hex, dec, name) => {
    if (name !== undefined) {
      const key = name.toLowerCase();
      // Own-property check: `namedEntities` inherits from Object.prototype,
      // so a bare lookup would "decode" `&constructor;` or `&tostring;`
      // into a stringified function.
      return Object.prototype.hasOwnProperty.call(namedEntities, key)
        ? namedEntities[key]
        : match;
    }

    const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec, 10);
    // The comparison is also false for Infinity (absurdly long digit runs).
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
  });
}
|
|
22
92
|
|
|
23
93
|
/**
 * Fast extraction of visible text content from HTML source.
 * Used as a lightweight alternative to full font-tracer for pages
 * that share the same CSS configuration as an already-traced page.
 *
 * Uses regex-based stripping instead of a full DOM parse for speed.
 * Text content is taken from the markup after invisible element blocks,
 * comments and tags have been removed. Content attributes (alt, title,
 * placeholder, value, aria-label) are collected from the raw markup; the
 * `value` of an `<input type="hidden">` is skipped.
 *
 * @param {string} html - HTML source of the page.
 * @returns {string} Decoded attribute values followed by the decoded text
 *   content, joined with single spaces; '' for empty/missing input.
 */
function extractVisibleText(html) {
  if (!html) return '';

  // Reset lastIndex on global regexes — a prior call that threw
  // mid-function would leave them in an indeterminate state.
  hiddenInputRe.lastIndex = 0;
  attrRe.lastIndex = 0;

  // Record WHERE each hidden <input> tag sits rather than WHAT its value is:
  // excluding by value would also drop a visible attribute that merely
  // happens to carry the same string, losing its glyphs from the subset.
  const hiddenSpans = [];
  let hiddenMatch;
  while ((hiddenMatch = hiddenInputRe.exec(html)) !== null) {
    hiddenSpans.push([
      hiddenMatch.index,
      hiddenMatch.index + hiddenMatch[0].length,
    ]);
  }

  const parts = [];

  // Extract text attributes before stripping tags. Both match lists are in
  // ascending source order, so a single forward cursor over the spans is
  // enough to answer "is this attribute inside a hidden input?".
  let spanCursor = 0;
  let attrMatch;
  while ((attrMatch = attrRe.exec(html)) !== null) {
    const attrName = attrMatch[1].toLowerCase();
    const val = attrMatch[2] ?? attrMatch[3] ?? attrMatch[4];
    if (!val) continue;

    if (attrName === 'value') {
      while (
        spanCursor < hiddenSpans.length &&
        hiddenSpans[spanCursor][1] <= attrMatch.index
      ) {
        spanCursor += 1;
      }
      const insideHiddenInput =
        spanCursor < hiddenSpans.length &&
        hiddenSpans[spanCursor][0] <= attrMatch.index;
      if (insideHiddenInput) continue;
    }

    parts.push(decodeEntities(val));
  }

  // Strip invisible blocks, comments, and tags to get text content.
  const text = html
    .replace(invisibleBlockRe, ' ')
    .replace(commentRe, ' ')
    .replace(tagRe, ' ');
  parts.push(decodeEntities(text));

  return parts.join(' ');
}
|
|
74
145
|
|
package/lib/getFontInfo.js
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
const
|
|
1
|
+
const { toSfnt } = require('./sfntCache');
|
|
2
2
|
|
|
3
3
|
async function getFontInfoFromBuffer(buffer) {
|
|
4
4
|
const harfbuzzJs = await require('harfbuzzjs');
|
|
5
5
|
|
|
6
|
-
const blob = harfbuzzJs.createBlob(await
|
|
7
|
-
const face = harfbuzzJs.createFace(blob, 0);
|
|
6
|
+
const blob = harfbuzzJs.createBlob(await toSfnt(buffer));
|
|
7
|
+
const face = harfbuzzJs.createFace(blob, 0);
|
|
8
8
|
|
|
9
9
|
const fontInfo = {
|
|
10
10
|
characterSet: Array.from(face.collectUnicodes()),
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
const os = require('os');
|
|
1
2
|
const { readFile } = require('fs').promises;
|
|
2
3
|
const fontverter = require('fontverter');
|
|
3
4
|
const { toSfnt } = require('./sfntCache');
|
|
@@ -11,30 +12,74 @@ const HB_SUBSET_SETS_NAME_ID = 4;
|
|
|
11
12
|
// hb_subset_flags_t
|
|
12
13
|
const HB_SUBSET_FLAGS_NO_HINTING = 0x00000001;
|
|
13
14
|
|
|
14
|
-
//
|
|
15
|
-
//
|
|
16
|
-
//
|
|
17
|
-
let
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
15
|
+
// Pool of WASM instances for parallel subsetting. Each instance has its
|
|
16
|
+
// own linear memory so concurrent calls are safe. The module is compiled
|
|
17
|
+
// once and instantiated N times (N = CPU count, capped at 8).
|
|
18
|
+
// Memoised promise for the compiled hb-subset WebAssembly module.
let _compilePromise;

/**
 * Compile harfbuzzjs' `hb-subset.wasm` at most once.
 *
 * Deliberately not an `async` function: the promise has to be stored
 * synchronously so that callers arriving before the file read finishes
 * all share the same compilation.
 *
 * @returns {Promise<WebAssembly.Module>} The shared compilation promise.
 */
function compileModule() {
  if (_compilePromise) {
    return _compilePromise;
  }

  const wasmPath = require.resolve('harfbuzzjs/hb-subset.wasm');
  _compilePromise = readFile(wasmPath).then((wasmBytes) =>
    WebAssembly.compile(wasmBytes)
  );
  return _compilePromise;
}
|
|
29
|
+
|
|
30
|
+
const _pool = []; // Array of { exports, busy: boolean }
let _poolReady;
// os.cpus() can return an empty array on platforms where CPU information is
// unavailable; a pool of size 0 would leave every acquireInstance() call
// waiting forever, so always keep at least one instance.
const POOL_SIZE = Math.max(1, Math.min(os.cpus().length, 8));

/**
 * Populate the WASM instance pool exactly once.
 *
 * The compiled module is instantiated POOL_SIZE times. Instances are added
 * to `_pool` only after all of them were created, so a failed attempt never
 * leaves a partially filled pool behind; on failure the memoised promise is
 * cleared so a later call can try again.
 *
 * @returns {Promise<void>} Resolves once `_pool` holds POOL_SIZE idle instances.
 */
async function initPool() {
  if (!_poolReady) {
    _poolReady = (async () => {
      const mod = await compileModule();
      const instances = await Promise.all(
        Array.from({ length: POOL_SIZE }, () => WebAssembly.instantiate(mod))
      );
      for (const { exports } of instances) {
        _pool.push({ exports, busy: false });
      }
    })().catch((err) => {
      // Do not cache the failure — let the next caller retry.
      _poolReady = undefined;
      throw err;
    });
  }
  return _poolReady;
}
|
|
51
|
+
|
|
52
|
+
// Waiters queue: callers waiting for an idle WASM instance.
|
|
53
|
+
// FIFO of resolver callbacks, one per caller parked until an instance frees up.
const _waiters = [];

/**
 * Obtain exclusive use of one pooled WASM instance.
 *
 * Waits for the pool to be initialised, then hands out the first idle
 * instance (marking it busy). When every instance is in use the caller is
 * queued in `_waiters` and resolved later with whatever instance is passed
 * to its resolver.
 *
 * @returns {Promise<{exports: object, busy: boolean}>} A pool entry marked busy.
 */
async function acquireInstance() {
  await initPool();

  for (const candidate of _pool) {
    if (!candidate.busy) {
      candidate.busy = true;
      return candidate;
    }
  }

  // All instances busy — wait for one to be released.
  return new Promise((wake) => {
    _waiters.push(wake);
  });
}
|
|
65
|
+
|
|
66
|
+
/**
 * Give a pooled WASM instance back.
 *
 * If a caller is queued in `_waiters`, the instance is handed straight to
 * the oldest one and stays marked busy; otherwise it becomes idle.
 *
 * @param {{busy: boolean}} inst - Pool entry previously acquired.
 */
function releaseInstance(inst) {
  if (_waiters.length === 0) {
    inst.busy = false;
    return;
  }

  // Direct hand-off: the instance never appears idle in between.
  inst.busy = true;
  const wakeNext = _waiters.shift();
  wakeNext(inst);
}
|
|
33
73
|
|
|
74
|
+
// Serialize fontverter.convert calls — the wawoff2 module (used internally by
|
|
75
|
+
// fontverter for WOFF2 compression) has a shared WASM instance whose memory
|
|
76
|
+
// is corrupted by concurrent calls.
|
|
77
|
+
const convertLimiter = require('p-limit')(1);
|
|
78
|
+
|
|
34
79
|
// Re-create on every call — WASM memory.buffer is detached when memory grows,
|
|
35
80
|
// so a cached Uint8Array would silently read/write stale data.
|
|
36
|
-
/**
 * Create a byte view over the current linear memory of a WASM instance.
 *
 * @param {{memory: {buffer: ArrayBuffer}}} exports - Instance exports exposing `memory`.
 * @returns {Uint8Array} A new view over `exports.memory.buffer`.
 */
function getHeapu8(exports) {
  const { buffer } = exports.memory;
  return new Uint8Array(buffer);
}
|
|
39
84
|
|
|
40
85
|
function HB_TAG(str) {
|
|
@@ -152,7 +197,7 @@ function configureSubsetInput(
|
|
|
152
197
|
}
|
|
153
198
|
}
|
|
154
199
|
|
|
155
|
-
function extractSubsetFont(exports,
|
|
200
|
+
function extractSubsetFont(exports, subset) {
|
|
156
201
|
const result = exports.hb_face_reference_blob(subset);
|
|
157
202
|
const offset = exports.hb_blob_get_data(result, 0);
|
|
158
203
|
const subsetByteLength = exports.hb_blob_get_length(result);
|
|
@@ -162,6 +207,9 @@ function extractSubsetFont(exports, heapu8, subset) {
|
|
|
162
207
|
throw new Error('Failed to create subset font');
|
|
163
208
|
}
|
|
164
209
|
|
|
210
|
+
// Fresh view AFTER the WASM calls above — memory.buffer may have been
|
|
211
|
+
// detached by a grow during hb_face_reference_blob / hb_blob_get_data.
|
|
212
|
+
const heapu8 = getHeapu8(exports);
|
|
165
213
|
const subsetFont = Buffer.from(
|
|
166
214
|
heapu8.subarray(offset, offset + subsetByteLength)
|
|
167
215
|
);
|
|
@@ -174,45 +222,59 @@ async function subsetFontWithGlyphs(
|
|
|
174
222
|
text,
|
|
175
223
|
{ targetFormat, glyphIds, variationAxes } = {}
|
|
176
224
|
) {
|
|
177
|
-
const exports = await loadHarfbuzz();
|
|
178
|
-
|
|
179
225
|
// Reuse cached sfnt conversion when available (same buffer may have
|
|
180
226
|
// been converted by getFontInfo or collectFeatureGlyphIds already).
|
|
181
227
|
const ttf = await toSfnt(originalFont);
|
|
182
228
|
|
|
183
|
-
const
|
|
184
|
-
|
|
185
|
-
|
|
229
|
+
const inst = await acquireInstance();
|
|
230
|
+
const { exports } = inst;
|
|
231
|
+
let released = false;
|
|
232
|
+
try {
|
|
233
|
+
const fontBuffer = exports.malloc(ttf.byteLength);
|
|
234
|
+
// Fresh view — memory.buffer may have been detached by a prior malloc/grow.
|
|
235
|
+
getHeapu8(exports).set(new Uint8Array(ttf), fontBuffer);
|
|
186
236
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
237
|
+
const blob = exports.hb_blob_create(fontBuffer, ttf.byteLength, 2, 0, 0);
|
|
238
|
+
const face = exports.hb_face_create(blob, 0);
|
|
239
|
+
exports.hb_blob_destroy(blob);
|
|
190
240
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
241
|
+
const input = exports.hb_subset_input_create_or_fail();
|
|
242
|
+
if (input === 0) {
|
|
243
|
+
exports.hb_face_destroy(face);
|
|
244
|
+
exports.free(fontBuffer);
|
|
245
|
+
throw new Error('hb_subset_input_create_or_fail returned zero');
|
|
246
|
+
}
|
|
197
247
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
248
|
+
let subsetFont;
|
|
249
|
+
let subset = 0;
|
|
250
|
+
try {
|
|
251
|
+
configureSubsetInput(exports, input, face, text, glyphIds, variationAxes);
|
|
252
|
+
|
|
253
|
+
subset = exports.hb_subset_or_fail(face, input);
|
|
254
|
+
if (subset === 0) {
|
|
255
|
+
throw new Error('hb_subset_or_fail returned zero');
|
|
256
|
+
}
|
|
201
257
|
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
258
|
+
subsetFont = extractSubsetFont(exports, subset);
|
|
259
|
+
} finally {
|
|
260
|
+
// Clean up all WASM resources while we still own the instance.
|
|
261
|
+
if (subset) exports.hb_face_destroy(subset);
|
|
262
|
+
exports.hb_subset_input_destroy(input);
|
|
263
|
+
exports.hb_face_destroy(face);
|
|
264
|
+
exports.free(fontBuffer);
|
|
205
265
|
}
|
|
206
266
|
|
|
207
|
-
|
|
208
|
-
|
|
267
|
+
// Instance is fully cleaned up — release it so other subsetting
|
|
268
|
+
// calls can proceed while we wait for the serialized WOFF2 step.
|
|
269
|
+
released = true;
|
|
270
|
+
releaseInstance(inst);
|
|
271
|
+
|
|
272
|
+
return convertLimiter(() =>
|
|
273
|
+
fontverter.convert(subsetFont, targetFormat, 'truetype')
|
|
274
|
+
);
|
|
209
275
|
} finally {
|
|
210
|
-
if (
|
|
211
|
-
exports.hb_subset_input_destroy(input);
|
|
212
|
-
exports.hb_face_destroy(face);
|
|
213
|
-
exports.free(fontBuffer);
|
|
276
|
+
if (!released) releaseInstance(inst);
|
|
214
277
|
}
|
|
215
278
|
}
|
|
216
279
|
|
|
217
|
-
|
|
218
|
-
module.exports = (...args) => limiter(() => subsetFontWithGlyphs(...args));
|
|
280
|
+
module.exports = subsetFontWithGlyphs;
|
package/package.json
CHANGED