@chr33s/pdf-codepoints 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +21 -0
- package/README.md +81 -0
- package/data/ArabicShaping.txt +894 -0
- package/data/Blocks.txt +336 -0
- package/data/CaseFolding.txt +1581 -0
- package/data/CompositionExclusions.txt +208 -0
- package/data/DerivedNormalizationProps.txt +9803 -0
- package/data/EastAsianWidth.txt +2473 -0
- package/data/IndicPositionalCategory.txt +755 -0
- package/data/IndicSyllabicCategory.txt +1286 -0
- package/data/PropertyValueAliases.txt +1541 -0
- package/data/Scripts.txt +2837 -0
- package/data/SpecialCasing.txt +281 -0
- package/data/UnicodeData.txt +32840 -0
- package/data/extracted/DerivedNumericValues.txt +2537 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -0
- package/dist/parser.d.ts +35 -0
- package/dist/parser.js +308 -0
- package/dist/parser.js.map +1 -0
- package/package.json +40 -0
- package/scripts/update-data.ts +64 -0
- package/src/index.ts +7 -0
- package/src/parser.ts +428 -0
- package/test/parser.test.ts +77 -0
- package/tsconfig.json +10 -0
- package/tsconfig.typecheck.json +14 -0
- package/vitest.config.ts +8 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
# CompositionExclusions-12.0.0.txt
|
|
2
|
+
# Date: 2018-08-03, 00:00:00 GMT [KW, LI]
|
|
3
|
+
# © 2018 Unicode®, Inc.
|
|
4
|
+
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
|
5
|
+
#
|
|
6
|
+
# Unicode Character Database
|
|
7
|
+
# For documentation, see http://www.unicode.org/reports/tr44/
|
|
8
|
+
#
|
|
9
|
+
# This file lists the characters for the Composition Exclusion Table
|
|
10
|
+
# defined in UAX #15, Unicode Normalization Forms.
|
|
11
|
+
#
|
|
12
|
+
# This file is a normative contributory data file in the
|
|
13
|
+
# Unicode Character Database.
|
|
14
|
+
#
|
|
15
|
+
# For more information, see
|
|
16
|
+
# http://www.unicode.org/unicode/reports/tr15/#Primary_Exclusion_List_Table
|
|
17
|
+
#
|
|
18
|
+
# For a full derivation of composition exclusions, see the derived property
|
|
19
|
+
# Full_Composition_Exclusion in DerivedNormalizationProps.txt
|
|
20
|
+
#
|
|
21
|
+
|
|
22
|
+
# ================================================
|
|
23
|
+
# (1) Script Specifics
|
|
24
|
+
#
|
|
25
|
+
# This list of characters cannot be derived from the UnicodeData.txt file.
|
|
26
|
+
# ================================================
|
|
27
|
+
|
|
28
|
+
0958 # DEVANAGARI LETTER QA
|
|
29
|
+
0959 # DEVANAGARI LETTER KHHA
|
|
30
|
+
095A # DEVANAGARI LETTER GHHA
|
|
31
|
+
095B # DEVANAGARI LETTER ZA
|
|
32
|
+
095C # DEVANAGARI LETTER DDDHA
|
|
33
|
+
095D # DEVANAGARI LETTER RHA
|
|
34
|
+
095E # DEVANAGARI LETTER FA
|
|
35
|
+
095F # DEVANAGARI LETTER YYA
|
|
36
|
+
09DC # BENGALI LETTER RRA
|
|
37
|
+
09DD # BENGALI LETTER RHA
|
|
38
|
+
09DF # BENGALI LETTER YYA
|
|
39
|
+
0A33 # GURMUKHI LETTER LLA
|
|
40
|
+
0A36 # GURMUKHI LETTER SHA
|
|
41
|
+
0A59 # GURMUKHI LETTER KHHA
|
|
42
|
+
0A5A # GURMUKHI LETTER GHHA
|
|
43
|
+
0A5B # GURMUKHI LETTER ZA
|
|
44
|
+
0A5E # GURMUKHI LETTER FA
|
|
45
|
+
0B5C # ORIYA LETTER RRA
|
|
46
|
+
0B5D # ORIYA LETTER RHA
|
|
47
|
+
0F43 # TIBETAN LETTER GHA
|
|
48
|
+
0F4D # TIBETAN LETTER DDHA
|
|
49
|
+
0F52 # TIBETAN LETTER DHA
|
|
50
|
+
0F57 # TIBETAN LETTER BHA
|
|
51
|
+
0F5C # TIBETAN LETTER DZHA
|
|
52
|
+
0F69 # TIBETAN LETTER KSSA
|
|
53
|
+
0F76 # TIBETAN VOWEL SIGN VOCALIC R
|
|
54
|
+
0F78 # TIBETAN VOWEL SIGN VOCALIC L
|
|
55
|
+
0F93 # TIBETAN SUBJOINED LETTER GHA
|
|
56
|
+
0F9D # TIBETAN SUBJOINED LETTER DDHA
|
|
57
|
+
0FA2 # TIBETAN SUBJOINED LETTER DHA
|
|
58
|
+
0FA7 # TIBETAN SUBJOINED LETTER BHA
|
|
59
|
+
0FAC # TIBETAN SUBJOINED LETTER DZHA
|
|
60
|
+
0FB9 # TIBETAN SUBJOINED LETTER KSSA
|
|
61
|
+
FB1D # HEBREW LETTER YOD WITH HIRIQ
|
|
62
|
+
FB1F # HEBREW LIGATURE YIDDISH YOD YOD PATAH
|
|
63
|
+
FB2A # HEBREW LETTER SHIN WITH SHIN DOT
|
|
64
|
+
FB2B # HEBREW LETTER SHIN WITH SIN DOT
|
|
65
|
+
FB2C # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
|
|
66
|
+
FB2D # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
|
|
67
|
+
FB2E # HEBREW LETTER ALEF WITH PATAH
|
|
68
|
+
FB2F # HEBREW LETTER ALEF WITH QAMATS
|
|
69
|
+
FB30 # HEBREW LETTER ALEF WITH MAPIQ
|
|
70
|
+
FB31 # HEBREW LETTER BET WITH DAGESH
|
|
71
|
+
FB32 # HEBREW LETTER GIMEL WITH DAGESH
|
|
72
|
+
FB33 # HEBREW LETTER DALET WITH DAGESH
|
|
73
|
+
FB34 # HEBREW LETTER HE WITH MAPIQ
|
|
74
|
+
FB35 # HEBREW LETTER VAV WITH DAGESH
|
|
75
|
+
FB36 # HEBREW LETTER ZAYIN WITH DAGESH
|
|
76
|
+
FB38 # HEBREW LETTER TET WITH DAGESH
|
|
77
|
+
FB39 # HEBREW LETTER YOD WITH DAGESH
|
|
78
|
+
FB3A # HEBREW LETTER FINAL KAF WITH DAGESH
|
|
79
|
+
FB3B # HEBREW LETTER KAF WITH DAGESH
|
|
80
|
+
FB3C # HEBREW LETTER LAMED WITH DAGESH
|
|
81
|
+
FB3E # HEBREW LETTER MEM WITH DAGESH
|
|
82
|
+
FB40 # HEBREW LETTER NUN WITH DAGESH
|
|
83
|
+
FB41 # HEBREW LETTER SAMEKH WITH DAGESH
|
|
84
|
+
FB43 # HEBREW LETTER FINAL PE WITH DAGESH
|
|
85
|
+
FB44 # HEBREW LETTER PE WITH DAGESH
|
|
86
|
+
FB46 # HEBREW LETTER TSADI WITH DAGESH
|
|
87
|
+
FB47 # HEBREW LETTER QOF WITH DAGESH
|
|
88
|
+
FB48 # HEBREW LETTER RESH WITH DAGESH
|
|
89
|
+
FB49 # HEBREW LETTER SHIN WITH DAGESH
|
|
90
|
+
FB4A # HEBREW LETTER TAV WITH DAGESH
|
|
91
|
+
FB4B # HEBREW LETTER VAV WITH HOLAM
|
|
92
|
+
FB4C # HEBREW LETTER BET WITH RAFE
|
|
93
|
+
FB4D # HEBREW LETTER KAF WITH RAFE
|
|
94
|
+
FB4E # HEBREW LETTER PE WITH RAFE
|
|
95
|
+
|
|
96
|
+
# Total code points: 67
|
|
97
|
+
|
|
98
|
+
# ================================================
|
|
99
|
+
# (2) Post Composition Version precomposed characters
|
|
100
|
+
#
|
|
101
|
+
# These characters cannot be derived solely from the UnicodeData.txt file
|
|
102
|
+
# in this version of Unicode.
|
|
103
|
+
#
|
|
104
|
+
# Note that characters added to the standard after the
|
|
105
|
+
# Composition Version and which have canonical decomposition mappings
|
|
106
|
+
# are not automatically added to this list of Post Composition
|
|
107
|
+
# Version precomposed characters.
|
|
108
|
+
# ================================================
|
|
109
|
+
|
|
110
|
+
2ADC # FORKING
|
|
111
|
+
1D15E # MUSICAL SYMBOL HALF NOTE
|
|
112
|
+
1D15F # MUSICAL SYMBOL QUARTER NOTE
|
|
113
|
+
1D160 # MUSICAL SYMBOL EIGHTH NOTE
|
|
114
|
+
1D161 # MUSICAL SYMBOL SIXTEENTH NOTE
|
|
115
|
+
1D162 # MUSICAL SYMBOL THIRTY-SECOND NOTE
|
|
116
|
+
1D163 # MUSICAL SYMBOL SIXTY-FOURTH NOTE
|
|
117
|
+
1D164 # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
|
|
118
|
+
1D1BB # MUSICAL SYMBOL MINIMA
|
|
119
|
+
1D1BC # MUSICAL SYMBOL MINIMA BLACK
|
|
120
|
+
1D1BD # MUSICAL SYMBOL SEMIMINIMA WHITE
|
|
121
|
+
1D1BE # MUSICAL SYMBOL SEMIMINIMA BLACK
|
|
122
|
+
1D1BF # MUSICAL SYMBOL FUSA WHITE
|
|
123
|
+
1D1C0 # MUSICAL SYMBOL FUSA BLACK
|
|
124
|
+
|
|
125
|
+
# Total code points: 14
|
|
126
|
+
|
|
127
|
+
# ================================================
|
|
128
|
+
# (3) Singleton Decompositions
|
|
129
|
+
#
|
|
130
|
+
# These characters can be derived from the UnicodeData.txt file
|
|
131
|
+
# by including all canonically decomposable characters whose
|
|
132
|
+
# canonical decomposition consists of a single character.
|
|
133
|
+
#
|
|
134
|
+
# These characters are simply quoted here for reference.
|
|
135
|
+
# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
|
|
136
|
+
# ================================================
|
|
137
|
+
|
|
138
|
+
# 0340..0341 [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK
|
|
139
|
+
# 0343 COMBINING GREEK KORONIS
|
|
140
|
+
# 0374 GREEK NUMERAL SIGN
|
|
141
|
+
# 037E GREEK QUESTION MARK
|
|
142
|
+
# 0387 GREEK ANO TELEIA
|
|
143
|
+
# 1F71 GREEK SMALL LETTER ALPHA WITH OXIA
|
|
144
|
+
# 1F73 GREEK SMALL LETTER EPSILON WITH OXIA
|
|
145
|
+
# 1F75 GREEK SMALL LETTER ETA WITH OXIA
|
|
146
|
+
# 1F77 GREEK SMALL LETTER IOTA WITH OXIA
|
|
147
|
+
# 1F79 GREEK SMALL LETTER OMICRON WITH OXIA
|
|
148
|
+
# 1F7B GREEK SMALL LETTER UPSILON WITH OXIA
|
|
149
|
+
# 1F7D GREEK SMALL LETTER OMEGA WITH OXIA
|
|
150
|
+
# 1FBB GREEK CAPITAL LETTER ALPHA WITH OXIA
|
|
151
|
+
# 1FBE GREEK PROSGEGRAMMENI
|
|
152
|
+
# 1FC9 GREEK CAPITAL LETTER EPSILON WITH OXIA
|
|
153
|
+
# 1FCB GREEK CAPITAL LETTER ETA WITH OXIA
|
|
154
|
+
# 1FD3 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
|
|
155
|
+
# 1FDB GREEK CAPITAL LETTER IOTA WITH OXIA
|
|
156
|
+
# 1FE3 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
|
|
157
|
+
# 1FEB GREEK CAPITAL LETTER UPSILON WITH OXIA
|
|
158
|
+
# 1FEE..1FEF [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA
|
|
159
|
+
# 1FF9 GREEK CAPITAL LETTER OMICRON WITH OXIA
|
|
160
|
+
# 1FFB GREEK CAPITAL LETTER OMEGA WITH OXIA
|
|
161
|
+
# 1FFD GREEK OXIA
|
|
162
|
+
# 2000..2001 [2] EN QUAD..EM QUAD
|
|
163
|
+
# 2126 OHM SIGN
|
|
164
|
+
# 212A..212B [2] KELVIN SIGN..ANGSTROM SIGN
|
|
165
|
+
# 2329 LEFT-POINTING ANGLE BRACKET
|
|
166
|
+
# 232A RIGHT-POINTING ANGLE BRACKET
|
|
167
|
+
# F900..FA0D [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
|
|
168
|
+
# FA10 CJK COMPATIBILITY IDEOGRAPH-FA10
|
|
169
|
+
# FA12 CJK COMPATIBILITY IDEOGRAPH-FA12
|
|
170
|
+
# FA15..FA1E [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
|
|
171
|
+
# FA20 CJK COMPATIBILITY IDEOGRAPH-FA20
|
|
172
|
+
# FA22 CJK COMPATIBILITY IDEOGRAPH-FA22
|
|
173
|
+
# FA25..FA26 [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
|
|
174
|
+
# FA2A..FA6D [68] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA6D
|
|
175
|
+
# FA70..FAD9 [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
|
|
176
|
+
# 2F800..2FA1D [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
|
|
177
|
+
|
|
178
|
+
# Total code points: 1035
|
|
179
|
+
|
|
180
|
+
# ================================================
|
|
181
|
+
# (4) Non-Starter Decompositions
|
|
182
|
+
#
|
|
183
|
+
# These characters can be derived from the UnicodeData.txt file
|
|
184
|
+
# by including each expanding canonical decomposition
|
|
185
|
+
# (i.e., those which canonically decompose to a sequence
|
|
186
|
+
# of characters instead of a single character), such that:
|
|
187
|
+
#
|
|
188
|
+
# A. The character is not a Starter.
|
|
189
|
+
#
|
|
190
|
+
# OR (inclusive)
|
|
191
|
+
#
|
|
192
|
+
# B. The character's canonical decomposition begins
|
|
193
|
+
# with a character that is not a Starter.
|
|
194
|
+
#
|
|
195
|
+
# Note that a "Starter" is any character with a zero combining class.
|
|
196
|
+
#
|
|
197
|
+
# These characters are simply quoted here for reference.
|
|
198
|
+
# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
|
|
199
|
+
# ================================================
|
|
200
|
+
|
|
201
|
+
# 0344 COMBINING GREEK DIALYTIKA TONOS
|
|
202
|
+
# 0F73 TIBETAN VOWEL SIGN II
|
|
203
|
+
# 0F75 TIBETAN VOWEL SIGN UU
|
|
204
|
+
# 0F81 TIBETAN VOWEL SIGN REVERSED II
|
|
205
|
+
|
|
206
|
+
# Total code points: 4
|
|
207
|
+
|
|
208
|
+
# EOF
|