chipper 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
@@ -0,0 +1,469 @@
|
|
1
|
+
|
2
|
+
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
|
3
|
+
// make_unicode_casefold.py >unicode_casefold.cc
|
4
|
+
|
5
|
+
#include "re2/unicode_casefold.h"
|
6
|
+
|
7
|
+
namespace re2 {
|
8
|
+
|
9
|
+
|
10
|
+
// 1029 groups, 2079 pairs, 282 ranges
|
11
|
+
CaseFold unicode_casefold[] = {
|
12
|
+
{ 65, 90, 32 },
|
13
|
+
{ 97, 106, -32 },
|
14
|
+
{ 107, 107, 8383 },
|
15
|
+
{ 108, 114, -32 },
|
16
|
+
{ 115, 115, 268 },
|
17
|
+
{ 116, 122, -32 },
|
18
|
+
{ 181, 181, 743 },
|
19
|
+
{ 192, 214, 32 },
|
20
|
+
{ 216, 222, 32 },
|
21
|
+
{ 223, 223, 7615 },
|
22
|
+
{ 224, 228, -32 },
|
23
|
+
{ 229, 229, 8262 },
|
24
|
+
{ 230, 246, -32 },
|
25
|
+
{ 248, 254, -32 },
|
26
|
+
{ 255, 255, 121 },
|
27
|
+
{ 256, 303, EvenOdd },
|
28
|
+
{ 306, 311, EvenOdd },
|
29
|
+
{ 313, 328, OddEven },
|
30
|
+
{ 330, 375, EvenOdd },
|
31
|
+
{ 376, 376, -121 },
|
32
|
+
{ 377, 382, OddEven },
|
33
|
+
{ 383, 383, -300 },
|
34
|
+
{ 384, 384, 195 },
|
35
|
+
{ 385, 385, 210 },
|
36
|
+
{ 386, 389, EvenOdd },
|
37
|
+
{ 390, 390, 206 },
|
38
|
+
{ 391, 392, OddEven },
|
39
|
+
{ 393, 394, 205 },
|
40
|
+
{ 395, 396, OddEven },
|
41
|
+
{ 398, 398, 79 },
|
42
|
+
{ 399, 399, 202 },
|
43
|
+
{ 400, 400, 203 },
|
44
|
+
{ 401, 402, OddEven },
|
45
|
+
{ 403, 403, 205 },
|
46
|
+
{ 404, 404, 207 },
|
47
|
+
{ 405, 405, 97 },
|
48
|
+
{ 406, 406, 211 },
|
49
|
+
{ 407, 407, 209 },
|
50
|
+
{ 408, 409, EvenOdd },
|
51
|
+
{ 410, 410, 163 },
|
52
|
+
{ 412, 412, 211 },
|
53
|
+
{ 413, 413, 213 },
|
54
|
+
{ 414, 414, 130 },
|
55
|
+
{ 415, 415, 214 },
|
56
|
+
{ 416, 421, EvenOdd },
|
57
|
+
{ 422, 422, 218 },
|
58
|
+
{ 423, 424, OddEven },
|
59
|
+
{ 425, 425, 218 },
|
60
|
+
{ 428, 429, EvenOdd },
|
61
|
+
{ 430, 430, 218 },
|
62
|
+
{ 431, 432, OddEven },
|
63
|
+
{ 433, 434, 217 },
|
64
|
+
{ 435, 438, OddEven },
|
65
|
+
{ 439, 439, 219 },
|
66
|
+
{ 440, 441, EvenOdd },
|
67
|
+
{ 444, 445, EvenOdd },
|
68
|
+
{ 447, 447, 56 },
|
69
|
+
{ 452, 452, EvenOdd },
|
70
|
+
{ 453, 453, OddEven },
|
71
|
+
{ 454, 454, -2 },
|
72
|
+
{ 455, 455, OddEven },
|
73
|
+
{ 456, 456, EvenOdd },
|
74
|
+
{ 457, 457, -2 },
|
75
|
+
{ 458, 458, EvenOdd },
|
76
|
+
{ 459, 459, OddEven },
|
77
|
+
{ 460, 460, -2 },
|
78
|
+
{ 461, 476, OddEven },
|
79
|
+
{ 477, 477, -79 },
|
80
|
+
{ 478, 495, EvenOdd },
|
81
|
+
{ 497, 497, OddEven },
|
82
|
+
{ 498, 498, EvenOdd },
|
83
|
+
{ 499, 499, -2 },
|
84
|
+
{ 500, 501, EvenOdd },
|
85
|
+
{ 502, 502, -97 },
|
86
|
+
{ 503, 503, -56 },
|
87
|
+
{ 504, 543, EvenOdd },
|
88
|
+
{ 544, 544, -130 },
|
89
|
+
{ 546, 563, EvenOdd },
|
90
|
+
{ 570, 570, 10795 },
|
91
|
+
{ 571, 572, OddEven },
|
92
|
+
{ 573, 573, -163 },
|
93
|
+
{ 574, 574, 10792 },
|
94
|
+
{ 575, 576, 10815 },
|
95
|
+
{ 577, 578, OddEven },
|
96
|
+
{ 579, 579, -195 },
|
97
|
+
{ 580, 580, 69 },
|
98
|
+
{ 581, 581, 71 },
|
99
|
+
{ 582, 591, EvenOdd },
|
100
|
+
{ 592, 592, 10783 },
|
101
|
+
{ 593, 593, 10780 },
|
102
|
+
{ 594, 594, 10782 },
|
103
|
+
{ 595, 595, -210 },
|
104
|
+
{ 596, 596, -206 },
|
105
|
+
{ 598, 599, -205 },
|
106
|
+
{ 601, 601, -202 },
|
107
|
+
{ 603, 603, -203 },
|
108
|
+
{ 608, 608, -205 },
|
109
|
+
{ 611, 611, -207 },
|
110
|
+
{ 613, 613, 42280 },
|
111
|
+
{ 616, 616, -209 },
|
112
|
+
{ 617, 617, -211 },
|
113
|
+
{ 619, 619, 10743 },
|
114
|
+
{ 623, 623, -211 },
|
115
|
+
{ 625, 625, 10749 },
|
116
|
+
{ 626, 626, -213 },
|
117
|
+
{ 629, 629, -214 },
|
118
|
+
{ 637, 637, 10727 },
|
119
|
+
{ 640, 640, -218 },
|
120
|
+
{ 643, 643, -218 },
|
121
|
+
{ 648, 648, -218 },
|
122
|
+
{ 649, 649, -69 },
|
123
|
+
{ 650, 651, -217 },
|
124
|
+
{ 652, 652, -71 },
|
125
|
+
{ 658, 658, -219 },
|
126
|
+
{ 837, 837, 84 },
|
127
|
+
{ 880, 883, EvenOdd },
|
128
|
+
{ 886, 887, EvenOdd },
|
129
|
+
{ 891, 893, 130 },
|
130
|
+
{ 902, 902, 38 },
|
131
|
+
{ 904, 906, 37 },
|
132
|
+
{ 908, 908, 64 },
|
133
|
+
{ 910, 911, 63 },
|
134
|
+
{ 913, 929, 32 },
|
135
|
+
{ 931, 931, 31 },
|
136
|
+
{ 932, 939, 32 },
|
137
|
+
{ 940, 940, -38 },
|
138
|
+
{ 941, 943, -37 },
|
139
|
+
{ 945, 945, -32 },
|
140
|
+
{ 946, 946, 30 },
|
141
|
+
{ 947, 948, -32 },
|
142
|
+
{ 949, 949, 64 },
|
143
|
+
{ 950, 951, -32 },
|
144
|
+
{ 952, 952, 25 },
|
145
|
+
{ 953, 953, 7173 },
|
146
|
+
{ 954, 954, 54 },
|
147
|
+
{ 955, 955, -32 },
|
148
|
+
{ 956, 956, -775 },
|
149
|
+
{ 957, 959, -32 },
|
150
|
+
{ 960, 960, 22 },
|
151
|
+
{ 961, 961, 48 },
|
152
|
+
{ 962, 962, EvenOdd },
|
153
|
+
{ 963, 965, -32 },
|
154
|
+
{ 966, 966, 15 },
|
155
|
+
{ 967, 968, -32 },
|
156
|
+
{ 969, 969, 7517 },
|
157
|
+
{ 970, 971, -32 },
|
158
|
+
{ 972, 972, -64 },
|
159
|
+
{ 973, 974, -63 },
|
160
|
+
{ 975, 975, 8 },
|
161
|
+
{ 976, 976, -62 },
|
162
|
+
{ 977, 977, 35 },
|
163
|
+
{ 981, 981, -47 },
|
164
|
+
{ 982, 982, -54 },
|
165
|
+
{ 983, 983, -8 },
|
166
|
+
{ 984, 1007, EvenOdd },
|
167
|
+
{ 1008, 1008, -86 },
|
168
|
+
{ 1009, 1009, -80 },
|
169
|
+
{ 1010, 1010, 7 },
|
170
|
+
{ 1012, 1012, -92 },
|
171
|
+
{ 1013, 1013, -96 },
|
172
|
+
{ 1015, 1016, OddEven },
|
173
|
+
{ 1017, 1017, -7 },
|
174
|
+
{ 1018, 1019, EvenOdd },
|
175
|
+
{ 1021, 1023, -130 },
|
176
|
+
{ 1024, 1039, 80 },
|
177
|
+
{ 1040, 1071, 32 },
|
178
|
+
{ 1072, 1103, -32 },
|
179
|
+
{ 1104, 1119, -80 },
|
180
|
+
{ 1120, 1153, EvenOdd },
|
181
|
+
{ 1162, 1215, EvenOdd },
|
182
|
+
{ 1216, 1216, 15 },
|
183
|
+
{ 1217, 1230, OddEven },
|
184
|
+
{ 1231, 1231, -15 },
|
185
|
+
{ 1232, 1319, EvenOdd },
|
186
|
+
{ 1329, 1366, 48 },
|
187
|
+
{ 1377, 1414, -48 },
|
188
|
+
{ 4256, 4293, 7264 },
|
189
|
+
{ 7545, 7545, 35332 },
|
190
|
+
{ 7549, 7549, 3814 },
|
191
|
+
{ 7680, 7776, EvenOdd },
|
192
|
+
{ 7777, 7777, 58 },
|
193
|
+
{ 7778, 7829, EvenOdd },
|
194
|
+
{ 7835, 7835, -59 },
|
195
|
+
{ 7838, 7838, -7615 },
|
196
|
+
{ 7840, 7935, EvenOdd },
|
197
|
+
{ 7936, 7943, 8 },
|
198
|
+
{ 7944, 7951, -8 },
|
199
|
+
{ 7952, 7957, 8 },
|
200
|
+
{ 7960, 7965, -8 },
|
201
|
+
{ 7968, 7975, 8 },
|
202
|
+
{ 7976, 7983, -8 },
|
203
|
+
{ 7984, 7991, 8 },
|
204
|
+
{ 7992, 7999, -8 },
|
205
|
+
{ 8000, 8005, 8 },
|
206
|
+
{ 8008, 8013, -8 },
|
207
|
+
{ 8017, 8017, 8 },
|
208
|
+
{ 8019, 8019, 8 },
|
209
|
+
{ 8021, 8021, 8 },
|
210
|
+
{ 8023, 8023, 8 },
|
211
|
+
{ 8025, 8025, -8 },
|
212
|
+
{ 8027, 8027, -8 },
|
213
|
+
{ 8029, 8029, -8 },
|
214
|
+
{ 8031, 8031, -8 },
|
215
|
+
{ 8032, 8039, 8 },
|
216
|
+
{ 8040, 8047, -8 },
|
217
|
+
{ 8048, 8049, 74 },
|
218
|
+
{ 8050, 8053, 86 },
|
219
|
+
{ 8054, 8055, 100 },
|
220
|
+
{ 8056, 8057, 128 },
|
221
|
+
{ 8058, 8059, 112 },
|
222
|
+
{ 8060, 8061, 126 },
|
223
|
+
{ 8064, 8071, 8 },
|
224
|
+
{ 8072, 8079, -8 },
|
225
|
+
{ 8080, 8087, 8 },
|
226
|
+
{ 8088, 8095, -8 },
|
227
|
+
{ 8096, 8103, 8 },
|
228
|
+
{ 8104, 8111, -8 },
|
229
|
+
{ 8112, 8113, 8 },
|
230
|
+
{ 8115, 8115, 9 },
|
231
|
+
{ 8120, 8121, -8 },
|
232
|
+
{ 8122, 8123, -74 },
|
233
|
+
{ 8124, 8124, -9 },
|
234
|
+
{ 8126, 8126, -7289 },
|
235
|
+
{ 8131, 8131, 9 },
|
236
|
+
{ 8136, 8139, -86 },
|
237
|
+
{ 8140, 8140, -9 },
|
238
|
+
{ 8144, 8145, 8 },
|
239
|
+
{ 8152, 8153, -8 },
|
240
|
+
{ 8154, 8155, -100 },
|
241
|
+
{ 8160, 8161, 8 },
|
242
|
+
{ 8165, 8165, 7 },
|
243
|
+
{ 8168, 8169, -8 },
|
244
|
+
{ 8170, 8171, -112 },
|
245
|
+
{ 8172, 8172, -7 },
|
246
|
+
{ 8179, 8179, 9 },
|
247
|
+
{ 8184, 8185, -128 },
|
248
|
+
{ 8186, 8187, -126 },
|
249
|
+
{ 8188, 8188, -9 },
|
250
|
+
{ 8486, 8486, -7549 },
|
251
|
+
{ 8490, 8490, -8415 },
|
252
|
+
{ 8491, 8491, -8294 },
|
253
|
+
{ 8498, 8498, 28 },
|
254
|
+
{ 8526, 8526, -28 },
|
255
|
+
{ 8544, 8559, 16 },
|
256
|
+
{ 8560, 8575, -16 },
|
257
|
+
{ 8579, 8580, OddEven },
|
258
|
+
{ 9398, 9423, 26 },
|
259
|
+
{ 9424, 9449, -26 },
|
260
|
+
{ 11264, 11310, 48 },
|
261
|
+
{ 11312, 11358, -48 },
|
262
|
+
{ 11360, 11361, EvenOdd },
|
263
|
+
{ 11362, 11362, -10743 },
|
264
|
+
{ 11363, 11363, -3814 },
|
265
|
+
{ 11364, 11364, -10727 },
|
266
|
+
{ 11365, 11365, -10795 },
|
267
|
+
{ 11366, 11366, -10792 },
|
268
|
+
{ 11367, 11372, OddEven },
|
269
|
+
{ 11373, 11373, -10780 },
|
270
|
+
{ 11374, 11374, -10749 },
|
271
|
+
{ 11375, 11375, -10783 },
|
272
|
+
{ 11376, 11376, -10782 },
|
273
|
+
{ 11378, 11379, EvenOdd },
|
274
|
+
{ 11381, 11382, OddEven },
|
275
|
+
{ 11390, 11391, -10815 },
|
276
|
+
{ 11392, 11491, EvenOdd },
|
277
|
+
{ 11499, 11502, OddEven },
|
278
|
+
{ 11520, 11557, -7264 },
|
279
|
+
{ 42560, 42605, EvenOdd },
|
280
|
+
{ 42624, 42647, EvenOdd },
|
281
|
+
{ 42786, 42799, EvenOdd },
|
282
|
+
{ 42802, 42863, EvenOdd },
|
283
|
+
{ 42873, 42876, OddEven },
|
284
|
+
{ 42877, 42877, -35332 },
|
285
|
+
{ 42878, 42887, EvenOdd },
|
286
|
+
{ 42891, 42892, OddEven },
|
287
|
+
{ 42893, 42893, -42280 },
|
288
|
+
{ 42896, 42897, EvenOdd },
|
289
|
+
{ 42912, 42921, EvenOdd },
|
290
|
+
{ 65313, 65338, 32 },
|
291
|
+
{ 65345, 65370, -32 },
|
292
|
+
{ 66560, 66599, 40 },
|
293
|
+
{ 66600, 66639, -40 },
|
294
|
+
};
|
295
|
+
int num_unicode_casefold = 282;
|
296
|
+
|
297
|
+
// 1029 groups, 1050 pairs, 163 ranges
|
298
|
+
CaseFold unicode_tolower[] = {
|
299
|
+
{ 65, 90, 32 },
|
300
|
+
{ 181, 181, 775 },
|
301
|
+
{ 192, 214, 32 },
|
302
|
+
{ 216, 222, 32 },
|
303
|
+
{ 256, 302, EvenOddSkip },
|
304
|
+
{ 306, 310, EvenOddSkip },
|
305
|
+
{ 313, 327, OddEvenSkip },
|
306
|
+
{ 330, 374, EvenOddSkip },
|
307
|
+
{ 376, 376, -121 },
|
308
|
+
{ 377, 381, OddEvenSkip },
|
309
|
+
{ 383, 383, -268 },
|
310
|
+
{ 385, 385, 210 },
|
311
|
+
{ 386, 388, EvenOddSkip },
|
312
|
+
{ 390, 390, 206 },
|
313
|
+
{ 391, 391, OddEven },
|
314
|
+
{ 393, 394, 205 },
|
315
|
+
{ 395, 395, OddEven },
|
316
|
+
{ 398, 398, 79 },
|
317
|
+
{ 399, 399, 202 },
|
318
|
+
{ 400, 400, 203 },
|
319
|
+
{ 401, 401, OddEven },
|
320
|
+
{ 403, 403, 205 },
|
321
|
+
{ 404, 404, 207 },
|
322
|
+
{ 406, 406, 211 },
|
323
|
+
{ 407, 407, 209 },
|
324
|
+
{ 408, 408, EvenOdd },
|
325
|
+
{ 412, 412, 211 },
|
326
|
+
{ 413, 413, 213 },
|
327
|
+
{ 415, 415, 214 },
|
328
|
+
{ 416, 420, EvenOddSkip },
|
329
|
+
{ 422, 422, 218 },
|
330
|
+
{ 423, 423, OddEven },
|
331
|
+
{ 425, 425, 218 },
|
332
|
+
{ 428, 428, EvenOdd },
|
333
|
+
{ 430, 430, 218 },
|
334
|
+
{ 431, 431, OddEven },
|
335
|
+
{ 433, 434, 217 },
|
336
|
+
{ 435, 437, OddEvenSkip },
|
337
|
+
{ 439, 439, 219 },
|
338
|
+
{ 440, 440, EvenOdd },
|
339
|
+
{ 444, 444, EvenOdd },
|
340
|
+
{ 452, 452, 2 },
|
341
|
+
{ 453, 453, OddEven },
|
342
|
+
{ 455, 455, 2 },
|
343
|
+
{ 456, 456, EvenOdd },
|
344
|
+
{ 458, 458, 2 },
|
345
|
+
{ 459, 475, OddEvenSkip },
|
346
|
+
{ 478, 494, EvenOddSkip },
|
347
|
+
{ 497, 497, 2 },
|
348
|
+
{ 498, 500, EvenOddSkip },
|
349
|
+
{ 502, 502, -97 },
|
350
|
+
{ 503, 503, -56 },
|
351
|
+
{ 504, 542, EvenOddSkip },
|
352
|
+
{ 544, 544, -130 },
|
353
|
+
{ 546, 562, EvenOddSkip },
|
354
|
+
{ 570, 570, 10795 },
|
355
|
+
{ 571, 571, OddEven },
|
356
|
+
{ 573, 573, -163 },
|
357
|
+
{ 574, 574, 10792 },
|
358
|
+
{ 577, 577, OddEven },
|
359
|
+
{ 579, 579, -195 },
|
360
|
+
{ 580, 580, 69 },
|
361
|
+
{ 581, 581, 71 },
|
362
|
+
{ 582, 590, EvenOddSkip },
|
363
|
+
{ 837, 837, 116 },
|
364
|
+
{ 880, 882, EvenOddSkip },
|
365
|
+
{ 886, 886, EvenOdd },
|
366
|
+
{ 902, 902, 38 },
|
367
|
+
{ 904, 906, 37 },
|
368
|
+
{ 908, 908, 64 },
|
369
|
+
{ 910, 911, 63 },
|
370
|
+
{ 913, 929, 32 },
|
371
|
+
{ 931, 939, 32 },
|
372
|
+
{ 962, 962, EvenOdd },
|
373
|
+
{ 975, 975, 8 },
|
374
|
+
{ 976, 976, -30 },
|
375
|
+
{ 977, 977, -25 },
|
376
|
+
{ 981, 981, -15 },
|
377
|
+
{ 982, 982, -22 },
|
378
|
+
{ 984, 1006, EvenOddSkip },
|
379
|
+
{ 1008, 1008, -54 },
|
380
|
+
{ 1009, 1009, -48 },
|
381
|
+
{ 1012, 1012, -60 },
|
382
|
+
{ 1013, 1013, -64 },
|
383
|
+
{ 1015, 1015, OddEven },
|
384
|
+
{ 1017, 1017, -7 },
|
385
|
+
{ 1018, 1018, EvenOdd },
|
386
|
+
{ 1021, 1023, -130 },
|
387
|
+
{ 1024, 1039, 80 },
|
388
|
+
{ 1040, 1071, 32 },
|
389
|
+
{ 1120, 1152, EvenOddSkip },
|
390
|
+
{ 1162, 1214, EvenOddSkip },
|
391
|
+
{ 1216, 1216, 15 },
|
392
|
+
{ 1217, 1229, OddEvenSkip },
|
393
|
+
{ 1232, 1318, EvenOddSkip },
|
394
|
+
{ 1329, 1366, 48 },
|
395
|
+
{ 4256, 4293, 7264 },
|
396
|
+
{ 7680, 7828, EvenOddSkip },
|
397
|
+
{ 7835, 7835, -58 },
|
398
|
+
{ 7838, 7838, -7615 },
|
399
|
+
{ 7840, 7934, EvenOddSkip },
|
400
|
+
{ 7944, 7951, -8 },
|
401
|
+
{ 7960, 7965, -8 },
|
402
|
+
{ 7976, 7983, -8 },
|
403
|
+
{ 7992, 7999, -8 },
|
404
|
+
{ 8008, 8013, -8 },
|
405
|
+
{ 8025, 8025, -8 },
|
406
|
+
{ 8027, 8027, -8 },
|
407
|
+
{ 8029, 8029, -8 },
|
408
|
+
{ 8031, 8031, -8 },
|
409
|
+
{ 8040, 8047, -8 },
|
410
|
+
{ 8072, 8079, -8 },
|
411
|
+
{ 8088, 8095, -8 },
|
412
|
+
{ 8104, 8111, -8 },
|
413
|
+
{ 8120, 8121, -8 },
|
414
|
+
{ 8122, 8123, -74 },
|
415
|
+
{ 8124, 8124, -9 },
|
416
|
+
{ 8126, 8126, -7173 },
|
417
|
+
{ 8136, 8139, -86 },
|
418
|
+
{ 8140, 8140, -9 },
|
419
|
+
{ 8152, 8153, -8 },
|
420
|
+
{ 8154, 8155, -100 },
|
421
|
+
{ 8168, 8169, -8 },
|
422
|
+
{ 8170, 8171, -112 },
|
423
|
+
{ 8172, 8172, -7 },
|
424
|
+
{ 8184, 8185, -128 },
|
425
|
+
{ 8186, 8187, -126 },
|
426
|
+
{ 8188, 8188, -9 },
|
427
|
+
{ 8486, 8486, -7517 },
|
428
|
+
{ 8490, 8490, -8383 },
|
429
|
+
{ 8491, 8491, -8262 },
|
430
|
+
{ 8498, 8498, 28 },
|
431
|
+
{ 8544, 8559, 16 },
|
432
|
+
{ 8579, 8579, OddEven },
|
433
|
+
{ 9398, 9423, 26 },
|
434
|
+
{ 11264, 11310, 48 },
|
435
|
+
{ 11360, 11360, EvenOdd },
|
436
|
+
{ 11362, 11362, -10743 },
|
437
|
+
{ 11363, 11363, -3814 },
|
438
|
+
{ 11364, 11364, -10727 },
|
439
|
+
{ 11367, 11371, OddEvenSkip },
|
440
|
+
{ 11373, 11373, -10780 },
|
441
|
+
{ 11374, 11374, -10749 },
|
442
|
+
{ 11375, 11375, -10783 },
|
443
|
+
{ 11376, 11376, -10782 },
|
444
|
+
{ 11378, 11378, EvenOdd },
|
445
|
+
{ 11381, 11381, OddEven },
|
446
|
+
{ 11390, 11391, -10815 },
|
447
|
+
{ 11392, 11490, EvenOddSkip },
|
448
|
+
{ 11499, 11501, OddEvenSkip },
|
449
|
+
{ 42560, 42604, EvenOddSkip },
|
450
|
+
{ 42624, 42646, EvenOddSkip },
|
451
|
+
{ 42786, 42798, EvenOddSkip },
|
452
|
+
{ 42802, 42862, EvenOddSkip },
|
453
|
+
{ 42873, 42875, OddEvenSkip },
|
454
|
+
{ 42877, 42877, -35332 },
|
455
|
+
{ 42878, 42886, EvenOddSkip },
|
456
|
+
{ 42891, 42891, OddEven },
|
457
|
+
{ 42893, 42893, -42280 },
|
458
|
+
{ 42896, 42896, EvenOdd },
|
459
|
+
{ 42912, 42920, EvenOddSkip },
|
460
|
+
{ 65313, 65338, 32 },
|
461
|
+
{ 66560, 66599, 40 },
|
462
|
+
};
|
463
|
+
int num_unicode_tolower = 163;
|
464
|
+
|
465
|
+
|
466
|
+
|
467
|
+
} // namespace re2
|
468
|
+
|
469
|
+
|
@@ -0,0 +1,75 @@
|
|
1
|
+
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
// Unicode case folding tables.
|
6
|
+
|
7
|
+
// The Unicode case folding tables encode the mapping from one Unicode point
|
8
|
+
// to the next largest Unicode point with equivalent folding. The largest
|
9
|
+
// point wraps back to the first. For example, the tables map:
|
10
|
+
//
|
11
|
+
// 'A' -> 'a'
|
12
|
+
// 'a' -> 'A'
|
13
|
+
//
|
14
|
+
// 'K' -> 'k'
|
15
|
+
// 'k' -> U+212A (KELVIN SIGN, which looks like 'K')
|
16
|
+
// U+212A (KELVIN SIGN) -> 'K' (the largest point wraps back to the first)
|
17
|
+
//
|
18
|
+
// Like everything Unicode, these tables are big. If we represent the table
|
19
|
+
// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB.
|
20
|
+
// Most table entries look like the ones around them:
|
21
|
+
// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
|
22
|
+
// Instead of listing all the pairs explicitly, we make a list of ranges
|
23
|
+
// and deltas, so that the table entries for 'A' through 'Z' can be represented
|
24
|
+
// as a single entry { 'A', 'Z', +32 }.
|
25
|
+
//
|
26
|
+
// In addition to blocks that map to each other (A-Z mapping to a-z)
|
27
|
+
// there are blocks of pairs that individually map to each other
|
28
|
+
// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
|
29
|
+
// For those, the special delta value EvenOdd marks even/odd pairs
|
30
|
+
// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
|
31
|
+
//
|
32
|
+
// In this form, the table has 274 entries, about 3kB. If we were to split
|
33
|
+
// the table into one for 16-bit codes and an overflow table for larger ones,
|
34
|
+
// we could get it down to about 1.5kB, but that's not worth the complexity.
|
35
|
+
//
|
36
|
+
// The grouped form also allows for efficient fold range calculations
|
37
|
+
// rather than looping one character at a time.
|
38
|
+
|
39
|
+
#ifndef RE2_UNICODE_CASEFOLD_H__
|
40
|
+
#define RE2_UNICODE_CASEFOLD_H__
|
41
|
+
|
42
|
+
#include "util/util.h"
|
43
|
+
|
44
|
+
namespace re2 {
|
45
|
+
|
46
|
+
// Special marker values stored in CaseFold::delta in place of a plain
// numeric offset.
enum {
|
47
|
+
// Even/odd pairs: if the code point is even, add 1; if odd, subtract 1.
EvenOdd = 1,
|
48
|
+
// Odd/even pairs: the same pairing with the parities swapped.
OddEven = -1,
|
49
|
+
// NOTE(review): the Skip variants are not described in the file comment.
// In unicode_tolower they tag ranges whose lo and hi share parity
// (e.g. { 256, 302, EvenOddSkip }), so presumably only every other code
// point in the range is mapped -- confirm against ApplyFold.
EvenOddSkip = 1<<30,
|
50
|
+
// No explicit initializer: takes the next value, (1<<30) + 1.
OddEvenSkip,
|
51
|
+
};
|
52
|
+
|
53
|
+
// One entry of a folding table: a contiguous range of code points that all
// share the same mapping rule, e.g. { 'A', 'Z', +32 } for 'A' through 'Z'.
struct CaseFold {
|
54
|
+
uint32 lo;  // first code point of the range
|
55
|
+
uint32 hi;  // last code point of the range (inclusive)
|
56
|
+
// Offset added to a code point in [lo, hi] to get its mapped value, or one
// of the special markers EvenOdd / OddEven / EvenOddSkip / OddEvenSkip.
int32 delta;
|
57
|
+
};
|
58
|
+
|
59
|
+
extern CaseFold unicode_casefold[];
|
60
|
+
extern int num_unicode_casefold;
|
61
|
+
|
62
|
+
extern CaseFold unicode_tolower[];
|
63
|
+
extern int num_unicode_tolower;
|
64
|
+
|
65
|
+
// Returns the CaseFold* in the tables that contains rune.
|
66
|
+
// If rune is not in the tables, returns the first CaseFold* after rune.
|
67
|
+
// If rune is larger than any value in the tables, returns NULL.
|
68
|
+
extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune);
|
69
|
+
|
70
|
+
// Returns the result of applying the fold f to the rune r.
|
71
|
+
extern Rune ApplyFold(CaseFold *f, Rune r);
|
72
|
+
|
73
|
+
} // namespace re2
|
74
|
+
|
75
|
+
#endif // RE2_UNICODE_CASEFOLD_H__
|