@chr33s/pdf-codepoints 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/parser.ts ADDED
@@ -0,0 +1,428 @@
1
+ import { readFileSync } from "node:fs";
2
+ import { dirname, join, resolve } from "node:path";
3
+ import { fileURLToPath } from "node:url";
4
+
5
/** Directory that contains this module file, derived from the module's own URL. */
const moduleDir = dirname(fileURLToPath(import.meta.url));

/** Default data location: the `data` directory next to this module's directory. */
const defaultUcdPath = resolve(moduleDir, "..", "data");
7
+
8
/** Inclusive span of code points; a single code point is represented with `start === end`. */
export type CodePointRange = [start: number, end: number];
9
+
10
/**
 * Everything the loader records about a single code point.
 *
 * Fields that come from supplemental files stay at their initial value
 * (`null`, `false`, `0` or an empty container) when no file mentions the code point.
 */
export interface CodePoint {
  /** Numeric value of the code point. */
  code: number;
  /** Name field of the UnicodeData.txt line; range entries carry the name of the range's closing line. */
  name: string;
  /** Unicode 1.0 name field, or `null` when that field is empty. */
  unicode1Name: string | null;
  /** ISO comment field, or `null` when that field is empty. */
  isoComment: string | null;
  /** General category field (e.g. `"Lu"`). */
  category: string;
  /** Block name taken from Blocks.txt. */
  block: string | null;
  /** Script name taken from Scripts.txt. */
  script: string | null;
  /** Width value taken from EastAsianWidth.txt. */
  eastAsianWidth: string | null;
  /** Canonical combining class parsed as a decimal number; `0` when the field does not parse. */
  combiningClass: number;
  /** Name registered for `combiningClass` by the `ccc` rows of PropertyValueAliases.txt. */
  combiningClassName: string | null;
  /** Bidirectional class field. */
  bidiClass: string;
  /** `true` when the mirrored field is `"Y"`. */
  bidiMirrored: boolean;
  /** Numeric value kept as text; filled from extracted/DerivedNumericValues.txt when UnicodeData.txt has none. */
  numeric: string | null;
  /** Uppercase mapping; replaced by an unconditional SpecialCasing.txt entry when one exists. */
  uppercase: number[] | null;
  /** Lowercase mapping; replaced by an unconditional SpecialCasing.txt entry when one exists. */
  lowercase: number[] | null;
  /** Titlecase mapping; replaced by an unconditional SpecialCasing.txt entry when one exists. */
  titlecase: number[] | null;
  /** Case folding from a `C` or `F` row of CaseFolding.txt, stored only when it differs from `lowercase`. */
  folded: number[] | null;
  /** Condition list of a conditional SpecialCasing.txt entry for this code point. */
  caseConditions: string[] | null;
  /** Decomposition mapping with any leading non-hex tag removed. */
  decomposition: number[];
  /**
   * Pairs this code point takes part in as the second element of a two-part,
   * non-compat, non-excluded decomposition: first element -> composed code point.
   */
  compositions: Record<number, number>;
  /** `true` when the decomposition field started with a token that is not a hex number. */
  isCompat: boolean;
  /** `true` when the code point is listed in CompositionExclusions.txt. */
  isExcluded: boolean;
  /** Joining type from ArabicShaping.txt, expanded through the `jt` aliases. */
  joiningType: string | null;
  /** Joining group from ArabicShaping.txt. */
  joiningGroup: string | null;
  /** Value from IndicSyllabicCategory.txt. */
  indicSyllabicCategory: string | null;
  /** Value from IndicPositionalCategory.txt. */
  indicPositionalCategory: string | null;
  /** Quick-check flag: `0` for `"Y"`, `1` for `"N"`, `2` for any other value; starts at `0`. */
  NFD_QC: number;
  /** Quick-check flag, encoded like `NFD_QC`. */
  NFKD_QC: number;
  /** Quick-check flag, encoded like `NFD_QC`. */
  NFC_QC: number;
  /** Quick-check flag, encoded like `NFD_QC`. */
  NFKC_QC: number;
}
42
+
43
/** Sparse table indexed by code point value; slots without data hold `undefined`. */
export type CodePointTable = (CodePoint | undefined)[];
44
+
45
/** Matches a `#` comment running to the end of the line, together with the whitespace in front of it. */
const COMMENT_PATTERN = /\s*#.*$/;
46
+
47
/** Matches `START..END` written with hexadecimal digits (any letter case), capturing both ends. */
const RANGE_PATTERN = /([a-f0-9]+)\.\.([a-f0-9]+)/i;
48
+
49
/**
 * Converts a space-separated list of hexadecimal tokens into numbers.
 *
 * Tokens that are not hexadecimal are kept as `NaN` rather than dropped.
 *
 * @param code Raw field text; may be missing or empty.
 * @returns The parsed numbers in order, or `null` when `code` is missing or empty.
 */
function parseCodes(code: string | undefined): number[] | null {
  if (!code) {
    return null;
  }

  const values: number[] = [];
  for (const token of code.split(" ")) {
    // Consecutive spaces produce empty tokens; they carry no value.
    if (token !== "") {
      values.push(parseInt(token, 16));
    }
  }

  return values;
}
59
+
60
/**
 * Builds a {@link CodePoint} from the semicolon-separated fields of one UnicodeData.txt line.
 *
 * Properties that are filled from other files start out as `null`, `false`, `0`
 * or an empty container.
 *
 * @param parts Fields of the line, in file order.
 * @returns The populated record.
 * @throws Error when the decimal or digit field is set but differs from the numeric field.
 */
function createCodePoint(parts: string[]): CodePoint {
  const [
    codeField,
    nameField,
    categoryField,
    combiningField,
    bidiField,
    decompositionField,
    decimalField,
    digitField,
    numericField,
    mirroredField,
    legacyNameField,
    commentField,
    upperField,
    lowerField,
    titleField,
  ] = parts;

  const numeric = numericField || null;

  // When present, the decimal and digit fields must repeat the numeric field.
  const hasMismatch = [decimalField, digitField].some(
    (candidate) => Boolean(candidate) && candidate !== numeric,
  );
  if (hasMismatch) {
    throw new Error("Decimal or digit does not match numeric value");
  }

  // A leading token that is not hex (it parses to NaN) marks the mapping as a
  // compat one; the token itself is not part of the mapping.
  const tokens = parseCodes(decompositionField) ?? [];
  const isCompat = tokens.length > 0 && Number.isNaN(tokens[0]);
  const decomposition = isCompat ? tokens.slice(1) : tokens;

  const singleMapping = (hex: string): number[] | null => (hex ? [parseInt(hex, 16)] : null);

  return {
    code: parseInt(codeField, 16),
    name: nameField,
    unicode1Name: legacyNameField || null,
    isoComment: commentField || null,
    category: categoryField,
    block: null,
    script: null,
    eastAsianWidth: null,
    combiningClass: parseInt(combiningField, 10) || 0,
    combiningClassName: null,
    bidiClass: bidiField,
    bidiMirrored: mirroredField === "Y",
    numeric,
    uppercase: singleMapping(upperField),
    lowercase: singleMapping(lowerField),
    titlecase: singleMapping(titleField),
    folded: null,
    caseConditions: null,
    decomposition,
    compositions: {},
    isCompat,
    isExcluded: false,
    joiningType: null,
    joiningGroup: null,
    indicSyllabicCategory: null,
    indicPositionalCategory: null,
    NFD_QC: 0,
    NFKD_QC: 0,
    NFC_QC: 0,
    NFKC_QC: 0,
  };
}
125
+
126
/**
 * Creates an independent copy of `source` that carries a different code point value.
 *
 * The array- and object-valued properties are copied as well, so records
 * expanded from the same range template never share mutable containers;
 * writing to one copy's `compositions` or `decomposition` leaves the template
 * and its sibling copies untouched.
 *
 * @param source Record to copy.
 * @param code Code point value for the copy.
 * @returns A new record equal to `source` except for `code`.
 */
function cloneCodePoint(source: CodePoint, code: number): CodePoint {
  const copyList = <T>(list: T[] | null): T[] | null => (list ? [...list] : null);

  return {
    ...source,
    code,
    uppercase: copyList(source.uppercase),
    lowercase: copyList(source.lowercase),
    titlecase: copyList(source.titlecase),
    folded: copyList(source.folded),
    caseConditions: copyList(source.caseConditions),
    decomposition: [...source.decomposition],
    compositions: { ...source.compositions },
  };
}
129
+
130
/**
 * Reads a semicolon-delimited data file whose first field is a code point or a
 * `START..END` range and hands each data line to `handler`.
 *
 * Blank lines and comment lines are skipped, trailing `#` comments are removed,
 * and lines whose first field is neither a range nor a hex number are ignored.
 *
 * @param ucdPath Directory that holds the data files.
 * @param filename File name relative to `ucdPath`.
 * @param handler Receives the inclusive range followed by the remaining fields.
 */
function readRangeFile(
  ucdPath: string,
  filename: string,
  handler: (parts: [CodePointRange, ...string[]]) => void,
): void {
  const text = readFileSync(join(ucdPath, filename), "ascii");

  for (const rawLine of text.split("\n")) {
    const trimmed = rawLine.trim();
    if (trimmed === "" || trimmed.startsWith("#")) {
      continue;
    }

    const content = trimmed.replace(COMMENT_PATTERN, "");
    if (content === "") {
      continue;
    }

    const [head, ...rest] = content.split(/\s*;\s*/);
    const bounds = RANGE_PATTERN.exec(head);

    if (bounds) {
      handler([[parseInt(bounds[1], 16), parseInt(bounds[2], 16)], ...rest]);
      continue;
    }

    // Not a range: a lone hex value stands for the range [value, value].
    const single = parseInt(head, 16);
    if (Number.isNaN(single)) {
      continue;
    }

    handler([[single, single], ...rest]);
  }
}
170
+
171
/**
 * Reads a semicolon-delimited data file and hands the fields of each data line
 * to `handler`, without interpreting the first field.
 *
 * Blank lines and comment lines are skipped and trailing `#` comments are removed.
 *
 * @param ucdPath Directory that holds the data files.
 * @param filename File name relative to `ucdPath`.
 * @param handler Receives the fields of one line, split on `;` with surrounding whitespace removed.
 */
function readRawFile(ucdPath: string, filename: string, handler: (parts: string[]) => void): void {
  const text = readFileSync(join(ucdPath, filename), "ascii");

  for (const rawLine of text.split("\n")) {
    const trimmed = rawLine.trim();
    const isData = trimmed !== "" && !trimmed.startsWith("#");
    const content = isData ? trimmed.replace(COMMENT_PATTERN, "") : "";

    if (content !== "") {
      handler(content.split(/\s*;\s*/));
    }
  }
}
188
+
189
/**
 * Calls `apply` with every populated entry of `table` whose index lies inside
 * the inclusive `range`; empty slots are skipped.
 */
function forEachInRange(
  table: CodePointTable,
  range: CodePointRange,
  apply: (codePoint: CodePoint) => void,
): void {
  const [start, end] = range;

  for (let code = start; code <= end; code += 1) {
    const codePoint = table[code];
    if (codePoint) {
      apply(codePoint);
    }
  }
}

/**
 * Builds the code point table from the data files found in `ucdPath`.
 *
 * UnicodeData.txt supplies the base records. The remaining files only update
 * records that already exist; code points they mention that have no base
 * record are ignored.
 *
 * @param ucdPath Directory that holds the data files; defaults to the bundled `data` directory.
 * @returns A sparse table indexed by code point value.
 * @throws Error when a `First` range line is not directly followed by its
 *   `Last` line (including when the file ends first), or when a line's
 *   decimal/digit field disagrees with its numeric field.
 */
export default function loadCodePoints(ucdPath: string = defaultUcdPath): CodePointTable {
  const codePoints: CodePointTable = [];
  const firstOfRange = /<.+, First>/;
  const lastOfRange = /<.+, Last>/;

  const unicodeData = readFileSync(join(ucdPath, "UnicodeData.txt"), "ascii");

  let rangeStart = -1;

  for (const rawLine of unicodeData.split("\n")) {
    // Trim like the other readers do, so CRLF line endings neither leave a
    // "\r" in the last field nor turn an empty line into a bogus record.
    const line = rawLine.trim();
    if (!line) {
      continue;
    }

    const parts = line.split(";");
    const name = parts[1];
    const codePoint = createCodePoint(parts);

    if (rangeStart >= 0) {
      // The previous line opened a range, so this one has to close it.
      if (!lastOfRange.test(name)) {
        throw new Error("No range end found");
      }

      // Every code point of the range gets a copy of the closing line's record.
      for (let code = rangeStart; code <= codePoint.code; code += 1) {
        codePoints[code] = cloneCodePoint(codePoint, code);
      }

      rangeStart = -1;
      continue;
    }

    if (firstOfRange.test(name)) {
      rangeStart = codePoint.code;
    } else {
      codePoints[codePoint.code] = codePoint;
    }
  }

  // A range that is still open at end of file would otherwise vanish silently.
  if (rangeStart >= 0) {
    throw new Error("No range end found");
  }

  // Fill in numeric values only where UnicodeData.txt did not provide one.
  readRangeFile(ucdPath, "extracted/DerivedNumericValues.txt", (parts) => {
    const value = parts[3];

    forEachInRange(codePoints, parts[0], (codePoint) => {
      if (!codePoint.numeric) {
        codePoint.numeric = value;
      }
    });
  });

  const combiningClasses: Record<number, string> = {};
  const joiningTypes: Record<string, string> = {};

  readRawFile(ucdPath, "PropertyValueAliases.txt", (parts) => {
    if (parts[0] === "ccc") {
      // ccc rows: numeric class in field 1, long name in field 3.
      combiningClasses[parseInt(parts[1], 10)] = parts[3];
    }

    if (parts[0] === "jt") {
      // jt rows: short alias in field 1, long name in field 2.
      joiningTypes[parts[1]] = parts[2];
    }
  });

  for (const codePoint of codePoints) {
    if (codePoint) {
      codePoint.combiningClassName = combiningClasses[codePoint.combiningClass] ?? null;
    }
  }

  readRangeFile(ucdPath, "Blocks.txt", (parts) => {
    forEachInRange(codePoints, parts[0], (codePoint) => {
      codePoint.block = parts[1];
    });
  });

  readRangeFile(ucdPath, "Scripts.txt", (parts) => {
    forEachInRange(codePoints, parts[0], (codePoint) => {
      codePoint.script = parts[1];
    });
  });

  readRangeFile(ucdPath, "EastAsianWidth.txt", (parts) => {
    forEachInRange(codePoints, parts[0], (codePoint) => {
      codePoint.eastAsianWidth = parts[1];
    });
  });

  readRangeFile(ucdPath, "SpecialCasing.txt", (parts) => {
    const [start] = parts[0];
    const lower = parseCodes(parts[1]);
    const title = parseCodes(parts[2]);
    const upper = parseCodes(parts[3]);
    const conditions = parts[4] ? parts[4].split(/\s+/) : null;

    const codePoint = codePoints[start];
    if (!codePoint) {
      return;
    }

    if (!conditions) {
      // Unconditional entries replace the simple mappings.
      codePoint.uppercase = upper;
      codePoint.lowercase = lower;
      codePoint.titlecase = title;
    } else {
      // Conditional entries only record their conditions; a later entry for
      // the same code point overwrites an earlier one.
      codePoint.caseConditions = conditions;
    }
  });

  readRangeFile(ucdPath, "CaseFolding.txt", (parts) => {
    const [start] = parts[0];
    const type = parts[1];
    const folded = parseCodes(parts[2]) ?? [];

    if (["C", "F"].includes(type)) {
      const codePoint = codePoints[start];
      if (!codePoint) {
        return;
      }

      // Store the folding only when it is not already expressed by `lowercase`.
      const lowercase = codePoint.lowercase?.join("|") ?? "";
      const foldedStr = folded.join("|");
      if (lowercase !== foldedStr) {
        codePoint.folded = folded;
      }
    }
  });

  readRangeFile(ucdPath, "CompositionExclusions.txt", (parts) => {
    const [start] = parts[0];
    const codePoint = codePoints[start];
    if (codePoint) {
      codePoint.isExcluded = true;
    }
  });

  const quickCheckProps = ["NFD_QC", "NFKD_QC", "NFC_QC", "NFKC_QC"] as const;

  readRangeFile(ucdPath, "DerivedNormalizationProps.txt", (parts) => {
    // Rows for any other property are ignored.
    const prop = quickCheckProps.find((candidate) => candidate === parts[1]);
    if (!prop) {
      return;
    }

    const value = parts[2];
    const quickCheckValue = value === "Y" ? 0 : value === "N" ? 1 : 2;

    forEachInRange(codePoints, parts[0], (codePoint) => {
      codePoint[prop] = quickCheckValue;
    });
  });

  readRangeFile(ucdPath, "ArabicShaping.txt", (parts) => {
    const joiningType = parts[2];
    const joiningGroup = parts[3];

    forEachInRange(codePoints, parts[0], (codePoint) => {
      codePoint.joiningType = joiningTypes[joiningType] ?? null;
      codePoint.joiningGroup = joiningGroup || null;
    });
  });

  readRangeFile(ucdPath, "IndicPositionalCategory.txt", (parts) => {
    forEachInRange(codePoints, parts[0], (codePoint) => {
      codePoint.indicPositionalCategory = parts[1];
    });
  });

  readRangeFile(ucdPath, "IndicSyllabicCategory.txt", (parts) => {
    forEachInRange(codePoints, parts[0], (codePoint) => {
      codePoint.indicSyllabicCategory = parts[1];
    });
  });

  // Index pair compositions on the second element of each eligible
  // decomposition: compositions[first element] = composed code point.
  for (const codePoint of codePoints) {
    if (
      codePoint &&
      codePoint.decomposition.length > 1 &&
      !codePoint.isCompat &&
      !codePoint.isExcluded
    ) {
      const base = codePoints[codePoint.decomposition[1]];
      if (base) {
        base.compositions[codePoint.decomposition[0]] = codePoint.code;
      }
    }
  }

  return codePoints;
}
@@ -0,0 +1,77 @@
1
+ import { beforeAll, describe, expect, test } from "vitest";
2
+
3
+ import parser, { type CodePointTable } from "../src/parser.js";
4
+
5
// The table is built once and shared by every test in this file.
let codePoints: CodePointTable;

beforeAll(() => {
  codePoints = parser();
});

describe("parser", () => {
  test("loads metadata for basic Latin letters", () => {
    const [upper, lower] = [codePoints[0x0041], codePoints[0x0061]];

    expect(upper).toBeDefined();
    expect(upper?.name).toBe("LATIN CAPITAL LETTER A");
    expect(upper?.category).toBe("Lu");
    expect(upper?.block).toBe("Basic Latin");
    expect(upper?.script).toBe("Latin");
    expect(upper?.eastAsianWidth).toBe("Na");
    expect(upper?.lowercase).toEqual([0x0061]);

    expect(lower).toBeDefined();
    expect(lower?.name).toBe("LATIN SMALL LETTER A");
    expect(lower?.category).toBe("Ll");
    expect(lower?.block).toBe("Basic Latin");
    expect(lower?.uppercase).toEqual([0x0041]);
  });

  test("marks compatibility decompositions and canonical compositions", () => {
    // U+00A0 has a tagged decomposition to U+0020.
    const nbsp = codePoints[0x00a0];
    expect(nbsp).toBeDefined();
    expect(nbsp?.isCompat).toBe(true);
    expect(nbsp?.decomposition).toEqual([0x0020]);

    // U+0308 is the second element of the decomposition of U+00C4.
    const diaeresis = codePoints[0x0308];
    expect(diaeresis).toBeDefined();
    expect(diaeresis?.isCompat).toBe(false);
    expect(diaeresis?.compositions[0x0041]).toBe(0x00c4);
  });

  test("applies derived metadata from supplemental files", () => {
    const acute = codePoints[0x0301];
    expect(acute).toBeDefined();
    expect(acute?.combiningClass).toBe(230);
    expect(acute?.combiningClassName).toBe("Above");

    const sixtyThousand = codePoints[0x10130];
    expect(sixtyThousand).toBeDefined();
    expect(sixtyThousand?.numeric).toBe("60000");
  });

  test("records normalization quick-check and conditional casing data", () => {
    const dottedI = codePoints[0x0130];

    expect(dottedI).toBeDefined();
    expect(dottedI?.NFD_QC).toBe(1);
    expect(dottedI?.NFKD_QC).toBe(1);
    expect(dottedI?.caseConditions).not.toBeNull();
    expect(dottedI?.caseConditions).toEqual(expect.arrayContaining(["az"]));
  });

  test("fills in range entries and Arabic joining metadata", () => {
    // U+3401 is only present through expansion of a range.
    const rangeMember = codePoints[0x3401];
    expect(rangeMember).toBeDefined();
    expect(rangeMember?.code).toBe(0x3401);

    const beh = codePoints[0x0628];
    expect(beh).toBeDefined();
    expect(beh?.joiningType).toBe("Dual_Joining");
    expect(beh?.joiningGroup).toBe("BEH");
  });
});
package/tsconfig.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "extends": "../../tsconfig.json",
3
+ "compilerOptions": {
4
+ "rootDir": "src",
5
+ "outDir": "dist",
6
+ "declarationDir": "dist"
7
+ },
8
+ "include": ["src/**/*"],
9
+ "exclude": ["dist", "node_modules"]
10
+ }
@@ -0,0 +1,14 @@
1
+ {
2
+ "extends": "./tsconfig.json",
3
+ "compilerOptions": {
4
+ "noEmit": true,
5
+ "rootDir": "."
6
+ },
7
+ "include": [
8
+ "**/*.ts",
9
+ ],
10
+ "exclude": [
11
+ "dist",
12
+ "node_modules"
13
+ ]
14
+ }
@@ -0,0 +1,8 @@
1
+ import { defineConfig } from "vitest/config";
2
+
3
// Test runner configuration: run the files matching test/**/*.test.ts in the
// "node" environment.
export default defineConfig({
  test: {
    environment: "node",
    include: ["test/**/*.test.ts"],
  },
});