chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
@@ -0,0 +1,469 @@
1
+
2
+ // GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
3
+ // make_unicode_casefold.py >unicode_casefold.cc
4
+
5
+ #include "re2/unicode_casefold.h"
6
+
7
+ namespace re2 {
8
+
9
+
10
+ // 1029 groups, 2079 pairs, 282 ranges
11
+ CaseFold unicode_casefold[] = {
12
+ { 65, 90, 32 },
13
+ { 97, 106, -32 },
14
+ { 107, 107, 8383 },
15
+ { 108, 114, -32 },
16
+ { 115, 115, 268 },
17
+ { 116, 122, -32 },
18
+ { 181, 181, 743 },
19
+ { 192, 214, 32 },
20
+ { 216, 222, 32 },
21
+ { 223, 223, 7615 },
22
+ { 224, 228, -32 },
23
+ { 229, 229, 8262 },
24
+ { 230, 246, -32 },
25
+ { 248, 254, -32 },
26
+ { 255, 255, 121 },
27
+ { 256, 303, EvenOdd },
28
+ { 306, 311, EvenOdd },
29
+ { 313, 328, OddEven },
30
+ { 330, 375, EvenOdd },
31
+ { 376, 376, -121 },
32
+ { 377, 382, OddEven },
33
+ { 383, 383, -300 },
34
+ { 384, 384, 195 },
35
+ { 385, 385, 210 },
36
+ { 386, 389, EvenOdd },
37
+ { 390, 390, 206 },
38
+ { 391, 392, OddEven },
39
+ { 393, 394, 205 },
40
+ { 395, 396, OddEven },
41
+ { 398, 398, 79 },
42
+ { 399, 399, 202 },
43
+ { 400, 400, 203 },
44
+ { 401, 402, OddEven },
45
+ { 403, 403, 205 },
46
+ { 404, 404, 207 },
47
+ { 405, 405, 97 },
48
+ { 406, 406, 211 },
49
+ { 407, 407, 209 },
50
+ { 408, 409, EvenOdd },
51
+ { 410, 410, 163 },
52
+ { 412, 412, 211 },
53
+ { 413, 413, 213 },
54
+ { 414, 414, 130 },
55
+ { 415, 415, 214 },
56
+ { 416, 421, EvenOdd },
57
+ { 422, 422, 218 },
58
+ { 423, 424, OddEven },
59
+ { 425, 425, 218 },
60
+ { 428, 429, EvenOdd },
61
+ { 430, 430, 218 },
62
+ { 431, 432, OddEven },
63
+ { 433, 434, 217 },
64
+ { 435, 438, OddEven },
65
+ { 439, 439, 219 },
66
+ { 440, 441, EvenOdd },
67
+ { 444, 445, EvenOdd },
68
+ { 447, 447, 56 },
69
+ { 452, 452, EvenOdd },
70
+ { 453, 453, OddEven },
71
+ { 454, 454, -2 },
72
+ { 455, 455, OddEven },
73
+ { 456, 456, EvenOdd },
74
+ { 457, 457, -2 },
75
+ { 458, 458, EvenOdd },
76
+ { 459, 459, OddEven },
77
+ { 460, 460, -2 },
78
+ { 461, 476, OddEven },
79
+ { 477, 477, -79 },
80
+ { 478, 495, EvenOdd },
81
+ { 497, 497, OddEven },
82
+ { 498, 498, EvenOdd },
83
+ { 499, 499, -2 },
84
+ { 500, 501, EvenOdd },
85
+ { 502, 502, -97 },
86
+ { 503, 503, -56 },
87
+ { 504, 543, EvenOdd },
88
+ { 544, 544, -130 },
89
+ { 546, 563, EvenOdd },
90
+ { 570, 570, 10795 },
91
+ { 571, 572, OddEven },
92
+ { 573, 573, -163 },
93
+ { 574, 574, 10792 },
94
+ { 575, 576, 10815 },
95
+ { 577, 578, OddEven },
96
+ { 579, 579, -195 },
97
+ { 580, 580, 69 },
98
+ { 581, 581, 71 },
99
+ { 582, 591, EvenOdd },
100
+ { 592, 592, 10783 },
101
+ { 593, 593, 10780 },
102
+ { 594, 594, 10782 },
103
+ { 595, 595, -210 },
104
+ { 596, 596, -206 },
105
+ { 598, 599, -205 },
106
+ { 601, 601, -202 },
107
+ { 603, 603, -203 },
108
+ { 608, 608, -205 },
109
+ { 611, 611, -207 },
110
+ { 613, 613, 42280 },
111
+ { 616, 616, -209 },
112
+ { 617, 617, -211 },
113
+ { 619, 619, 10743 },
114
+ { 623, 623, -211 },
115
+ { 625, 625, 10749 },
116
+ { 626, 626, -213 },
117
+ { 629, 629, -214 },
118
+ { 637, 637, 10727 },
119
+ { 640, 640, -218 },
120
+ { 643, 643, -218 },
121
+ { 648, 648, -218 },
122
+ { 649, 649, -69 },
123
+ { 650, 651, -217 },
124
+ { 652, 652, -71 },
125
+ { 658, 658, -219 },
126
+ { 837, 837, 84 },
127
+ { 880, 883, EvenOdd },
128
+ { 886, 887, EvenOdd },
129
+ { 891, 893, 130 },
130
+ { 902, 902, 38 },
131
+ { 904, 906, 37 },
132
+ { 908, 908, 64 },
133
+ { 910, 911, 63 },
134
+ { 913, 929, 32 },
135
+ { 931, 931, 31 },
136
+ { 932, 939, 32 },
137
+ { 940, 940, -38 },
138
+ { 941, 943, -37 },
139
+ { 945, 945, -32 },
140
+ { 946, 946, 30 },
141
+ { 947, 948, -32 },
142
+ { 949, 949, 64 },
143
+ { 950, 951, -32 },
144
+ { 952, 952, 25 },
145
+ { 953, 953, 7173 },
146
+ { 954, 954, 54 },
147
+ { 955, 955, -32 },
148
+ { 956, 956, -775 },
149
+ { 957, 959, -32 },
150
+ { 960, 960, 22 },
151
+ { 961, 961, 48 },
152
+ { 962, 962, EvenOdd },
153
+ { 963, 965, -32 },
154
+ { 966, 966, 15 },
155
+ { 967, 968, -32 },
156
+ { 969, 969, 7517 },
157
+ { 970, 971, -32 },
158
+ { 972, 972, -64 },
159
+ { 973, 974, -63 },
160
+ { 975, 975, 8 },
161
+ { 976, 976, -62 },
162
+ { 977, 977, 35 },
163
+ { 981, 981, -47 },
164
+ { 982, 982, -54 },
165
+ { 983, 983, -8 },
166
+ { 984, 1007, EvenOdd },
167
+ { 1008, 1008, -86 },
168
+ { 1009, 1009, -80 },
169
+ { 1010, 1010, 7 },
170
+ { 1012, 1012, -92 },
171
+ { 1013, 1013, -96 },
172
+ { 1015, 1016, OddEven },
173
+ { 1017, 1017, -7 },
174
+ { 1018, 1019, EvenOdd },
175
+ { 1021, 1023, -130 },
176
+ { 1024, 1039, 80 },
177
+ { 1040, 1071, 32 },
178
+ { 1072, 1103, -32 },
179
+ { 1104, 1119, -80 },
180
+ { 1120, 1153, EvenOdd },
181
+ { 1162, 1215, EvenOdd },
182
+ { 1216, 1216, 15 },
183
+ { 1217, 1230, OddEven },
184
+ { 1231, 1231, -15 },
185
+ { 1232, 1319, EvenOdd },
186
+ { 1329, 1366, 48 },
187
+ { 1377, 1414, -48 },
188
+ { 4256, 4293, 7264 },
189
+ { 7545, 7545, 35332 },
190
+ { 7549, 7549, 3814 },
191
+ { 7680, 7776, EvenOdd },
192
+ { 7777, 7777, 58 },
193
+ { 7778, 7829, EvenOdd },
194
+ { 7835, 7835, -59 },
195
+ { 7838, 7838, -7615 },
196
+ { 7840, 7935, EvenOdd },
197
+ { 7936, 7943, 8 },
198
+ { 7944, 7951, -8 },
199
+ { 7952, 7957, 8 },
200
+ { 7960, 7965, -8 },
201
+ { 7968, 7975, 8 },
202
+ { 7976, 7983, -8 },
203
+ { 7984, 7991, 8 },
204
+ { 7992, 7999, -8 },
205
+ { 8000, 8005, 8 },
206
+ { 8008, 8013, -8 },
207
+ { 8017, 8017, 8 },
208
+ { 8019, 8019, 8 },
209
+ { 8021, 8021, 8 },
210
+ { 8023, 8023, 8 },
211
+ { 8025, 8025, -8 },
212
+ { 8027, 8027, -8 },
213
+ { 8029, 8029, -8 },
214
+ { 8031, 8031, -8 },
215
+ { 8032, 8039, 8 },
216
+ { 8040, 8047, -8 },
217
+ { 8048, 8049, 74 },
218
+ { 8050, 8053, 86 },
219
+ { 8054, 8055, 100 },
220
+ { 8056, 8057, 128 },
221
+ { 8058, 8059, 112 },
222
+ { 8060, 8061, 126 },
223
+ { 8064, 8071, 8 },
224
+ { 8072, 8079, -8 },
225
+ { 8080, 8087, 8 },
226
+ { 8088, 8095, -8 },
227
+ { 8096, 8103, 8 },
228
+ { 8104, 8111, -8 },
229
+ { 8112, 8113, 8 },
230
+ { 8115, 8115, 9 },
231
+ { 8120, 8121, -8 },
232
+ { 8122, 8123, -74 },
233
+ { 8124, 8124, -9 },
234
+ { 8126, 8126, -7289 },
235
+ { 8131, 8131, 9 },
236
+ { 8136, 8139, -86 },
237
+ { 8140, 8140, -9 },
238
+ { 8144, 8145, 8 },
239
+ { 8152, 8153, -8 },
240
+ { 8154, 8155, -100 },
241
+ { 8160, 8161, 8 },
242
+ { 8165, 8165, 7 },
243
+ { 8168, 8169, -8 },
244
+ { 8170, 8171, -112 },
245
+ { 8172, 8172, -7 },
246
+ { 8179, 8179, 9 },
247
+ { 8184, 8185, -128 },
248
+ { 8186, 8187, -126 },
249
+ { 8188, 8188, -9 },
250
+ { 8486, 8486, -7549 },
251
+ { 8490, 8490, -8415 },
252
+ { 8491, 8491, -8294 },
253
+ { 8498, 8498, 28 },
254
+ { 8526, 8526, -28 },
255
+ { 8544, 8559, 16 },
256
+ { 8560, 8575, -16 },
257
+ { 8579, 8580, OddEven },
258
+ { 9398, 9423, 26 },
259
+ { 9424, 9449, -26 },
260
+ { 11264, 11310, 48 },
261
+ { 11312, 11358, -48 },
262
+ { 11360, 11361, EvenOdd },
263
+ { 11362, 11362, -10743 },
264
+ { 11363, 11363, -3814 },
265
+ { 11364, 11364, -10727 },
266
+ { 11365, 11365, -10795 },
267
+ { 11366, 11366, -10792 },
268
+ { 11367, 11372, OddEven },
269
+ { 11373, 11373, -10780 },
270
+ { 11374, 11374, -10749 },
271
+ { 11375, 11375, -10783 },
272
+ { 11376, 11376, -10782 },
273
+ { 11378, 11379, EvenOdd },
274
+ { 11381, 11382, OddEven },
275
+ { 11390, 11391, -10815 },
276
+ { 11392, 11491, EvenOdd },
277
+ { 11499, 11502, OddEven },
278
+ { 11520, 11557, -7264 },
279
+ { 42560, 42605, EvenOdd },
280
+ { 42624, 42647, EvenOdd },
281
+ { 42786, 42799, EvenOdd },
282
+ { 42802, 42863, EvenOdd },
283
+ { 42873, 42876, OddEven },
284
+ { 42877, 42877, -35332 },
285
+ { 42878, 42887, EvenOdd },
286
+ { 42891, 42892, OddEven },
287
+ { 42893, 42893, -42280 },
288
+ { 42896, 42897, EvenOdd },
289
+ { 42912, 42921, EvenOdd },
290
+ { 65313, 65338, 32 },
291
+ { 65345, 65370, -32 },
292
+ { 66560, 66599, 40 },
293
+ { 66600, 66639, -40 },
294
+ };
295
+ int num_unicode_casefold = 282;
296
+
297
+ // 1029 groups, 1050 pairs, 163 ranges
298
+ CaseFold unicode_tolower[] = {
299
+ { 65, 90, 32 },
300
+ { 181, 181, 775 },
301
+ { 192, 214, 32 },
302
+ { 216, 222, 32 },
303
+ { 256, 302, EvenOddSkip },
304
+ { 306, 310, EvenOddSkip },
305
+ { 313, 327, OddEvenSkip },
306
+ { 330, 374, EvenOddSkip },
307
+ { 376, 376, -121 },
308
+ { 377, 381, OddEvenSkip },
309
+ { 383, 383, -268 },
310
+ { 385, 385, 210 },
311
+ { 386, 388, EvenOddSkip },
312
+ { 390, 390, 206 },
313
+ { 391, 391, OddEven },
314
+ { 393, 394, 205 },
315
+ { 395, 395, OddEven },
316
+ { 398, 398, 79 },
317
+ { 399, 399, 202 },
318
+ { 400, 400, 203 },
319
+ { 401, 401, OddEven },
320
+ { 403, 403, 205 },
321
+ { 404, 404, 207 },
322
+ { 406, 406, 211 },
323
+ { 407, 407, 209 },
324
+ { 408, 408, EvenOdd },
325
+ { 412, 412, 211 },
326
+ { 413, 413, 213 },
327
+ { 415, 415, 214 },
328
+ { 416, 420, EvenOddSkip },
329
+ { 422, 422, 218 },
330
+ { 423, 423, OddEven },
331
+ { 425, 425, 218 },
332
+ { 428, 428, EvenOdd },
333
+ { 430, 430, 218 },
334
+ { 431, 431, OddEven },
335
+ { 433, 434, 217 },
336
+ { 435, 437, OddEvenSkip },
337
+ { 439, 439, 219 },
338
+ { 440, 440, EvenOdd },
339
+ { 444, 444, EvenOdd },
340
+ { 452, 452, 2 },
341
+ { 453, 453, OddEven },
342
+ { 455, 455, 2 },
343
+ { 456, 456, EvenOdd },
344
+ { 458, 458, 2 },
345
+ { 459, 475, OddEvenSkip },
346
+ { 478, 494, EvenOddSkip },
347
+ { 497, 497, 2 },
348
+ { 498, 500, EvenOddSkip },
349
+ { 502, 502, -97 },
350
+ { 503, 503, -56 },
351
+ { 504, 542, EvenOddSkip },
352
+ { 544, 544, -130 },
353
+ { 546, 562, EvenOddSkip },
354
+ { 570, 570, 10795 },
355
+ { 571, 571, OddEven },
356
+ { 573, 573, -163 },
357
+ { 574, 574, 10792 },
358
+ { 577, 577, OddEven },
359
+ { 579, 579, -195 },
360
+ { 580, 580, 69 },
361
+ { 581, 581, 71 },
362
+ { 582, 590, EvenOddSkip },
363
+ { 837, 837, 116 },
364
+ { 880, 882, EvenOddSkip },
365
+ { 886, 886, EvenOdd },
366
+ { 902, 902, 38 },
367
+ { 904, 906, 37 },
368
+ { 908, 908, 64 },
369
+ { 910, 911, 63 },
370
+ { 913, 929, 32 },
371
+ { 931, 939, 32 },
372
+ { 962, 962, EvenOdd },
373
+ { 975, 975, 8 },
374
+ { 976, 976, -30 },
375
+ { 977, 977, -25 },
376
+ { 981, 981, -15 },
377
+ { 982, 982, -22 },
378
+ { 984, 1006, EvenOddSkip },
379
+ { 1008, 1008, -54 },
380
+ { 1009, 1009, -48 },
381
+ { 1012, 1012, -60 },
382
+ { 1013, 1013, -64 },
383
+ { 1015, 1015, OddEven },
384
+ { 1017, 1017, -7 },
385
+ { 1018, 1018, EvenOdd },
386
+ { 1021, 1023, -130 },
387
+ { 1024, 1039, 80 },
388
+ { 1040, 1071, 32 },
389
+ { 1120, 1152, EvenOddSkip },
390
+ { 1162, 1214, EvenOddSkip },
391
+ { 1216, 1216, 15 },
392
+ { 1217, 1229, OddEvenSkip },
393
+ { 1232, 1318, EvenOddSkip },
394
+ { 1329, 1366, 48 },
395
+ { 4256, 4293, 7264 },
396
+ { 7680, 7828, EvenOddSkip },
397
+ { 7835, 7835, -58 },
398
+ { 7838, 7838, -7615 },
399
+ { 7840, 7934, EvenOddSkip },
400
+ { 7944, 7951, -8 },
401
+ { 7960, 7965, -8 },
402
+ { 7976, 7983, -8 },
403
+ { 7992, 7999, -8 },
404
+ { 8008, 8013, -8 },
405
+ { 8025, 8025, -8 },
406
+ { 8027, 8027, -8 },
407
+ { 8029, 8029, -8 },
408
+ { 8031, 8031, -8 },
409
+ { 8040, 8047, -8 },
410
+ { 8072, 8079, -8 },
411
+ { 8088, 8095, -8 },
412
+ { 8104, 8111, -8 },
413
+ { 8120, 8121, -8 },
414
+ { 8122, 8123, -74 },
415
+ { 8124, 8124, -9 },
416
+ { 8126, 8126, -7173 },
417
+ { 8136, 8139, -86 },
418
+ { 8140, 8140, -9 },
419
+ { 8152, 8153, -8 },
420
+ { 8154, 8155, -100 },
421
+ { 8168, 8169, -8 },
422
+ { 8170, 8171, -112 },
423
+ { 8172, 8172, -7 },
424
+ { 8184, 8185, -128 },
425
+ { 8186, 8187, -126 },
426
+ { 8188, 8188, -9 },
427
+ { 8486, 8486, -7517 },
428
+ { 8490, 8490, -8383 },
429
+ { 8491, 8491, -8262 },
430
+ { 8498, 8498, 28 },
431
+ { 8544, 8559, 16 },
432
+ { 8579, 8579, OddEven },
433
+ { 9398, 9423, 26 },
434
+ { 11264, 11310, 48 },
435
+ { 11360, 11360, EvenOdd },
436
+ { 11362, 11362, -10743 },
437
+ { 11363, 11363, -3814 },
438
+ { 11364, 11364, -10727 },
439
+ { 11367, 11371, OddEvenSkip },
440
+ { 11373, 11373, -10780 },
441
+ { 11374, 11374, -10749 },
442
+ { 11375, 11375, -10783 },
443
+ { 11376, 11376, -10782 },
444
+ { 11378, 11378, EvenOdd },
445
+ { 11381, 11381, OddEven },
446
+ { 11390, 11391, -10815 },
447
+ { 11392, 11490, EvenOddSkip },
448
+ { 11499, 11501, OddEvenSkip },
449
+ { 42560, 42604, EvenOddSkip },
450
+ { 42624, 42646, EvenOddSkip },
451
+ { 42786, 42798, EvenOddSkip },
452
+ { 42802, 42862, EvenOddSkip },
453
+ { 42873, 42875, OddEvenSkip },
454
+ { 42877, 42877, -35332 },
455
+ { 42878, 42886, EvenOddSkip },
456
+ { 42891, 42891, OddEven },
457
+ { 42893, 42893, -42280 },
458
+ { 42896, 42896, EvenOdd },
459
+ { 42912, 42920, EvenOddSkip },
460
+ { 65313, 65338, 32 },
461
+ { 66560, 66599, 40 },
462
+ };
463
+ int num_unicode_tolower = 163;
464
+
465
+
466
+
467
+ } // namespace re2
468
+
469
+
@@ -0,0 +1,75 @@
1
+ // Copyright 2008 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Unicode case folding tables.
6
+
7
+ // The Unicode case folding tables encode the mapping from one Unicode point
8
+ // to the next largest Unicode point with equivalent folding. The largest
9
+ // point wraps back to the first. For example, the tables map:
10
+ //
11
+ // 'A' -> 'a'
12
+ // 'a' -> 'A'
13
+ //
14
+ // 'K' -> 'k'
15
+ // 'k' -> 'K' (U+212A KELVIN SIGN)
16
+ // 'K' -> 'K' (U+212A KELVIN SIGN wraps back to ASCII 'K')
17
+ //
18
+ // Like everything Unicode, these tables are big. If we represent the table
19
+ // as a sorted list of uint32 pairs, it has 2079 entries and is about 16 kB.
20
+ // Most table entries look like the ones around them:
21
+ // 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
22
+ // Instead of listing all the pairs explicitly, we make a list of ranges
23
+ // and deltas, so that the table entries for 'A' through 'Z' can be represented
24
+ // as a single entry { 'A', 'Z', +32 }.
25
+ //
26
+ // In addition to blocks that map to each other (A-Z mapping to a-z)
27
+ // there are blocks of pairs that individually map to each other
28
+ // (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
29
+ // For those, the special delta value EvenOdd marks even/odd pairs
30
+ // (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
31
+ //
32
+ // In this form, the table has 282 entries, about 3 kB. If we were to split
33
+ // the table into one for 16-bit codes and an overflow table for larger ones,
34
+ // we could get it down to about 1.5kB, but that's not worth the complexity.
35
+ //
36
+ // The grouped form also allows for efficient fold range calculations
37
+ // rather than looping one character at a time.
38
+
39
+ #ifndef RE2_UNICODE_CASEFOLD_H__
40
+ #define RE2_UNICODE_CASEFOLD_H__
41
+
42
+ #include "util/util.h"
43
+
44
+ namespace re2 {
45
+
46
+ enum {
47
+ EvenOdd = 1,
48
+ OddEven = -1,
49
+ EvenOddSkip = 1<<30,
50
+ OddEvenSkip,
51
+ };
52
+
53
+ struct CaseFold {
54
+ uint32 lo;
55
+ uint32 hi;
56
+ int32 delta;
57
+ };
58
+
59
+ extern CaseFold unicode_casefold[];
60
+ extern int num_unicode_casefold;
61
+
62
+ extern CaseFold unicode_tolower[];
63
+ extern int num_unicode_tolower;
64
+
65
+ // Returns the CaseFold* in the tables that contains rune.
66
+ // If rune is not in the tables, returns the first CaseFold* after rune.
67
+ // If rune is larger than any value in the tables, returns NULL.
68
+ extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune);
69
+
70
+ // Returns the result of applying the fold f to the rune r.
71
+ extern Rune ApplyFold(CaseFold *f, Rune r);
72
+
73
+ } // namespace re2
74
+
75
+ #endif // RE2_UNICODE_CASEFOLD_H__