chipper 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
@@ -0,0 +1,469 @@
1
+
2
+ // GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
3
+ // make_unicode_casefold.py >unicode_casefold.cc
4
+
5
+ #include "re2/unicode_casefold.h"
6
+
7
+ namespace re2 {
8
+
9
+
10
+ // 1029 groups, 2079 pairs, 282 ranges
11
+ CaseFold unicode_casefold[] = {
12
+ { 65, 90, 32 },
13
+ { 97, 106, -32 },
14
+ { 107, 107, 8383 },
15
+ { 108, 114, -32 },
16
+ { 115, 115, 268 },
17
+ { 116, 122, -32 },
18
+ { 181, 181, 743 },
19
+ { 192, 214, 32 },
20
+ { 216, 222, 32 },
21
+ { 223, 223, 7615 },
22
+ { 224, 228, -32 },
23
+ { 229, 229, 8262 },
24
+ { 230, 246, -32 },
25
+ { 248, 254, -32 },
26
+ { 255, 255, 121 },
27
+ { 256, 303, EvenOdd },
28
+ { 306, 311, EvenOdd },
29
+ { 313, 328, OddEven },
30
+ { 330, 375, EvenOdd },
31
+ { 376, 376, -121 },
32
+ { 377, 382, OddEven },
33
+ { 383, 383, -300 },
34
+ { 384, 384, 195 },
35
+ { 385, 385, 210 },
36
+ { 386, 389, EvenOdd },
37
+ { 390, 390, 206 },
38
+ { 391, 392, OddEven },
39
+ { 393, 394, 205 },
40
+ { 395, 396, OddEven },
41
+ { 398, 398, 79 },
42
+ { 399, 399, 202 },
43
+ { 400, 400, 203 },
44
+ { 401, 402, OddEven },
45
+ { 403, 403, 205 },
46
+ { 404, 404, 207 },
47
+ { 405, 405, 97 },
48
+ { 406, 406, 211 },
49
+ { 407, 407, 209 },
50
+ { 408, 409, EvenOdd },
51
+ { 410, 410, 163 },
52
+ { 412, 412, 211 },
53
+ { 413, 413, 213 },
54
+ { 414, 414, 130 },
55
+ { 415, 415, 214 },
56
+ { 416, 421, EvenOdd },
57
+ { 422, 422, 218 },
58
+ { 423, 424, OddEven },
59
+ { 425, 425, 218 },
60
+ { 428, 429, EvenOdd },
61
+ { 430, 430, 218 },
62
+ { 431, 432, OddEven },
63
+ { 433, 434, 217 },
64
+ { 435, 438, OddEven },
65
+ { 439, 439, 219 },
66
+ { 440, 441, EvenOdd },
67
+ { 444, 445, EvenOdd },
68
+ { 447, 447, 56 },
69
+ { 452, 452, EvenOdd },
70
+ { 453, 453, OddEven },
71
+ { 454, 454, -2 },
72
+ { 455, 455, OddEven },
73
+ { 456, 456, EvenOdd },
74
+ { 457, 457, -2 },
75
+ { 458, 458, EvenOdd },
76
+ { 459, 459, OddEven },
77
+ { 460, 460, -2 },
78
+ { 461, 476, OddEven },
79
+ { 477, 477, -79 },
80
+ { 478, 495, EvenOdd },
81
+ { 497, 497, OddEven },
82
+ { 498, 498, EvenOdd },
83
+ { 499, 499, -2 },
84
+ { 500, 501, EvenOdd },
85
+ { 502, 502, -97 },
86
+ { 503, 503, -56 },
87
+ { 504, 543, EvenOdd },
88
+ { 544, 544, -130 },
89
+ { 546, 563, EvenOdd },
90
+ { 570, 570, 10795 },
91
+ { 571, 572, OddEven },
92
+ { 573, 573, -163 },
93
+ { 574, 574, 10792 },
94
+ { 575, 576, 10815 },
95
+ { 577, 578, OddEven },
96
+ { 579, 579, -195 },
97
+ { 580, 580, 69 },
98
+ { 581, 581, 71 },
99
+ { 582, 591, EvenOdd },
100
+ { 592, 592, 10783 },
101
+ { 593, 593, 10780 },
102
+ { 594, 594, 10782 },
103
+ { 595, 595, -210 },
104
+ { 596, 596, -206 },
105
+ { 598, 599, -205 },
106
+ { 601, 601, -202 },
107
+ { 603, 603, -203 },
108
+ { 608, 608, -205 },
109
+ { 611, 611, -207 },
110
+ { 613, 613, 42280 },
111
+ { 616, 616, -209 },
112
+ { 617, 617, -211 },
113
+ { 619, 619, 10743 },
114
+ { 623, 623, -211 },
115
+ { 625, 625, 10749 },
116
+ { 626, 626, -213 },
117
+ { 629, 629, -214 },
118
+ { 637, 637, 10727 },
119
+ { 640, 640, -218 },
120
+ { 643, 643, -218 },
121
+ { 648, 648, -218 },
122
+ { 649, 649, -69 },
123
+ { 650, 651, -217 },
124
+ { 652, 652, -71 },
125
+ { 658, 658, -219 },
126
+ { 837, 837, 84 },
127
+ { 880, 883, EvenOdd },
128
+ { 886, 887, EvenOdd },
129
+ { 891, 893, 130 },
130
+ { 902, 902, 38 },
131
+ { 904, 906, 37 },
132
+ { 908, 908, 64 },
133
+ { 910, 911, 63 },
134
+ { 913, 929, 32 },
135
+ { 931, 931, 31 },
136
+ { 932, 939, 32 },
137
+ { 940, 940, -38 },
138
+ { 941, 943, -37 },
139
+ { 945, 945, -32 },
140
+ { 946, 946, 30 },
141
+ { 947, 948, -32 },
142
+ { 949, 949, 64 },
143
+ { 950, 951, -32 },
144
+ { 952, 952, 25 },
145
+ { 953, 953, 7173 },
146
+ { 954, 954, 54 },
147
+ { 955, 955, -32 },
148
+ { 956, 956, -775 },
149
+ { 957, 959, -32 },
150
+ { 960, 960, 22 },
151
+ { 961, 961, 48 },
152
+ { 962, 962, EvenOdd },
153
+ { 963, 965, -32 },
154
+ { 966, 966, 15 },
155
+ { 967, 968, -32 },
156
+ { 969, 969, 7517 },
157
+ { 970, 971, -32 },
158
+ { 972, 972, -64 },
159
+ { 973, 974, -63 },
160
+ { 975, 975, 8 },
161
+ { 976, 976, -62 },
162
+ { 977, 977, 35 },
163
+ { 981, 981, -47 },
164
+ { 982, 982, -54 },
165
+ { 983, 983, -8 },
166
+ { 984, 1007, EvenOdd },
167
+ { 1008, 1008, -86 },
168
+ { 1009, 1009, -80 },
169
+ { 1010, 1010, 7 },
170
+ { 1012, 1012, -92 },
171
+ { 1013, 1013, -96 },
172
+ { 1015, 1016, OddEven },
173
+ { 1017, 1017, -7 },
174
+ { 1018, 1019, EvenOdd },
175
+ { 1021, 1023, -130 },
176
+ { 1024, 1039, 80 },
177
+ { 1040, 1071, 32 },
178
+ { 1072, 1103, -32 },
179
+ { 1104, 1119, -80 },
180
+ { 1120, 1153, EvenOdd },
181
+ { 1162, 1215, EvenOdd },
182
+ { 1216, 1216, 15 },
183
+ { 1217, 1230, OddEven },
184
+ { 1231, 1231, -15 },
185
+ { 1232, 1319, EvenOdd },
186
+ { 1329, 1366, 48 },
187
+ { 1377, 1414, -48 },
188
+ { 4256, 4293, 7264 },
189
+ { 7545, 7545, 35332 },
190
+ { 7549, 7549, 3814 },
191
+ { 7680, 7776, EvenOdd },
192
+ { 7777, 7777, 58 },
193
+ { 7778, 7829, EvenOdd },
194
+ { 7835, 7835, -59 },
195
+ { 7838, 7838, -7615 },
196
+ { 7840, 7935, EvenOdd },
197
+ { 7936, 7943, 8 },
198
+ { 7944, 7951, -8 },
199
+ { 7952, 7957, 8 },
200
+ { 7960, 7965, -8 },
201
+ { 7968, 7975, 8 },
202
+ { 7976, 7983, -8 },
203
+ { 7984, 7991, 8 },
204
+ { 7992, 7999, -8 },
205
+ { 8000, 8005, 8 },
206
+ { 8008, 8013, -8 },
207
+ { 8017, 8017, 8 },
208
+ { 8019, 8019, 8 },
209
+ { 8021, 8021, 8 },
210
+ { 8023, 8023, 8 },
211
+ { 8025, 8025, -8 },
212
+ { 8027, 8027, -8 },
213
+ { 8029, 8029, -8 },
214
+ { 8031, 8031, -8 },
215
+ { 8032, 8039, 8 },
216
+ { 8040, 8047, -8 },
217
+ { 8048, 8049, 74 },
218
+ { 8050, 8053, 86 },
219
+ { 8054, 8055, 100 },
220
+ { 8056, 8057, 128 },
221
+ { 8058, 8059, 112 },
222
+ { 8060, 8061, 126 },
223
+ { 8064, 8071, 8 },
224
+ { 8072, 8079, -8 },
225
+ { 8080, 8087, 8 },
226
+ { 8088, 8095, -8 },
227
+ { 8096, 8103, 8 },
228
+ { 8104, 8111, -8 },
229
+ { 8112, 8113, 8 },
230
+ { 8115, 8115, 9 },
231
+ { 8120, 8121, -8 },
232
+ { 8122, 8123, -74 },
233
+ { 8124, 8124, -9 },
234
+ { 8126, 8126, -7289 },
235
+ { 8131, 8131, 9 },
236
+ { 8136, 8139, -86 },
237
+ { 8140, 8140, -9 },
238
+ { 8144, 8145, 8 },
239
+ { 8152, 8153, -8 },
240
+ { 8154, 8155, -100 },
241
+ { 8160, 8161, 8 },
242
+ { 8165, 8165, 7 },
243
+ { 8168, 8169, -8 },
244
+ { 8170, 8171, -112 },
245
+ { 8172, 8172, -7 },
246
+ { 8179, 8179, 9 },
247
+ { 8184, 8185, -128 },
248
+ { 8186, 8187, -126 },
249
+ { 8188, 8188, -9 },
250
+ { 8486, 8486, -7549 },
251
+ { 8490, 8490, -8415 },
252
+ { 8491, 8491, -8294 },
253
+ { 8498, 8498, 28 },
254
+ { 8526, 8526, -28 },
255
+ { 8544, 8559, 16 },
256
+ { 8560, 8575, -16 },
257
+ { 8579, 8580, OddEven },
258
+ { 9398, 9423, 26 },
259
+ { 9424, 9449, -26 },
260
+ { 11264, 11310, 48 },
261
+ { 11312, 11358, -48 },
262
+ { 11360, 11361, EvenOdd },
263
+ { 11362, 11362, -10743 },
264
+ { 11363, 11363, -3814 },
265
+ { 11364, 11364, -10727 },
266
+ { 11365, 11365, -10795 },
267
+ { 11366, 11366, -10792 },
268
+ { 11367, 11372, OddEven },
269
+ { 11373, 11373, -10780 },
270
+ { 11374, 11374, -10749 },
271
+ { 11375, 11375, -10783 },
272
+ { 11376, 11376, -10782 },
273
+ { 11378, 11379, EvenOdd },
274
+ { 11381, 11382, OddEven },
275
+ { 11390, 11391, -10815 },
276
+ { 11392, 11491, EvenOdd },
277
+ { 11499, 11502, OddEven },
278
+ { 11520, 11557, -7264 },
279
+ { 42560, 42605, EvenOdd },
280
+ { 42624, 42647, EvenOdd },
281
+ { 42786, 42799, EvenOdd },
282
+ { 42802, 42863, EvenOdd },
283
+ { 42873, 42876, OddEven },
284
+ { 42877, 42877, -35332 },
285
+ { 42878, 42887, EvenOdd },
286
+ { 42891, 42892, OddEven },
287
+ { 42893, 42893, -42280 },
288
+ { 42896, 42897, EvenOdd },
289
+ { 42912, 42921, EvenOdd },
290
+ { 65313, 65338, 32 },
291
+ { 65345, 65370, -32 },
292
+ { 66560, 66599, 40 },
293
+ { 66600, 66639, -40 },
294
+ };
295
+ int num_unicode_casefold = 282;
296
+
297
+ // 1029 groups, 1050 pairs, 163 ranges
298
+ CaseFold unicode_tolower[] = {
299
+ { 65, 90, 32 },
300
+ { 181, 181, 775 },
301
+ { 192, 214, 32 },
302
+ { 216, 222, 32 },
303
+ { 256, 302, EvenOddSkip },
304
+ { 306, 310, EvenOddSkip },
305
+ { 313, 327, OddEvenSkip },
306
+ { 330, 374, EvenOddSkip },
307
+ { 376, 376, -121 },
308
+ { 377, 381, OddEvenSkip },
309
+ { 383, 383, -268 },
310
+ { 385, 385, 210 },
311
+ { 386, 388, EvenOddSkip },
312
+ { 390, 390, 206 },
313
+ { 391, 391, OddEven },
314
+ { 393, 394, 205 },
315
+ { 395, 395, OddEven },
316
+ { 398, 398, 79 },
317
+ { 399, 399, 202 },
318
+ { 400, 400, 203 },
319
+ { 401, 401, OddEven },
320
+ { 403, 403, 205 },
321
+ { 404, 404, 207 },
322
+ { 406, 406, 211 },
323
+ { 407, 407, 209 },
324
+ { 408, 408, EvenOdd },
325
+ { 412, 412, 211 },
326
+ { 413, 413, 213 },
327
+ { 415, 415, 214 },
328
+ { 416, 420, EvenOddSkip },
329
+ { 422, 422, 218 },
330
+ { 423, 423, OddEven },
331
+ { 425, 425, 218 },
332
+ { 428, 428, EvenOdd },
333
+ { 430, 430, 218 },
334
+ { 431, 431, OddEven },
335
+ { 433, 434, 217 },
336
+ { 435, 437, OddEvenSkip },
337
+ { 439, 439, 219 },
338
+ { 440, 440, EvenOdd },
339
+ { 444, 444, EvenOdd },
340
+ { 452, 452, 2 },
341
+ { 453, 453, OddEven },
342
+ { 455, 455, 2 },
343
+ { 456, 456, EvenOdd },
344
+ { 458, 458, 2 },
345
+ { 459, 475, OddEvenSkip },
346
+ { 478, 494, EvenOddSkip },
347
+ { 497, 497, 2 },
348
+ { 498, 500, EvenOddSkip },
349
+ { 502, 502, -97 },
350
+ { 503, 503, -56 },
351
+ { 504, 542, EvenOddSkip },
352
+ { 544, 544, -130 },
353
+ { 546, 562, EvenOddSkip },
354
+ { 570, 570, 10795 },
355
+ { 571, 571, OddEven },
356
+ { 573, 573, -163 },
357
+ { 574, 574, 10792 },
358
+ { 577, 577, OddEven },
359
+ { 579, 579, -195 },
360
+ { 580, 580, 69 },
361
+ { 581, 581, 71 },
362
+ { 582, 590, EvenOddSkip },
363
+ { 837, 837, 116 },
364
+ { 880, 882, EvenOddSkip },
365
+ { 886, 886, EvenOdd },
366
+ { 902, 902, 38 },
367
+ { 904, 906, 37 },
368
+ { 908, 908, 64 },
369
+ { 910, 911, 63 },
370
+ { 913, 929, 32 },
371
+ { 931, 939, 32 },
372
+ { 962, 962, EvenOdd },
373
+ { 975, 975, 8 },
374
+ { 976, 976, -30 },
375
+ { 977, 977, -25 },
376
+ { 981, 981, -15 },
377
+ { 982, 982, -22 },
378
+ { 984, 1006, EvenOddSkip },
379
+ { 1008, 1008, -54 },
380
+ { 1009, 1009, -48 },
381
+ { 1012, 1012, -60 },
382
+ { 1013, 1013, -64 },
383
+ { 1015, 1015, OddEven },
384
+ { 1017, 1017, -7 },
385
+ { 1018, 1018, EvenOdd },
386
+ { 1021, 1023, -130 },
387
+ { 1024, 1039, 80 },
388
+ { 1040, 1071, 32 },
389
+ { 1120, 1152, EvenOddSkip },
390
+ { 1162, 1214, EvenOddSkip },
391
+ { 1216, 1216, 15 },
392
+ { 1217, 1229, OddEvenSkip },
393
+ { 1232, 1318, EvenOddSkip },
394
+ { 1329, 1366, 48 },
395
+ { 4256, 4293, 7264 },
396
+ { 7680, 7828, EvenOddSkip },
397
+ { 7835, 7835, -58 },
398
+ { 7838, 7838, -7615 },
399
+ { 7840, 7934, EvenOddSkip },
400
+ { 7944, 7951, -8 },
401
+ { 7960, 7965, -8 },
402
+ { 7976, 7983, -8 },
403
+ { 7992, 7999, -8 },
404
+ { 8008, 8013, -8 },
405
+ { 8025, 8025, -8 },
406
+ { 8027, 8027, -8 },
407
+ { 8029, 8029, -8 },
408
+ { 8031, 8031, -8 },
409
+ { 8040, 8047, -8 },
410
+ { 8072, 8079, -8 },
411
+ { 8088, 8095, -8 },
412
+ { 8104, 8111, -8 },
413
+ { 8120, 8121, -8 },
414
+ { 8122, 8123, -74 },
415
+ { 8124, 8124, -9 },
416
+ { 8126, 8126, -7173 },
417
+ { 8136, 8139, -86 },
418
+ { 8140, 8140, -9 },
419
+ { 8152, 8153, -8 },
420
+ { 8154, 8155, -100 },
421
+ { 8168, 8169, -8 },
422
+ { 8170, 8171, -112 },
423
+ { 8172, 8172, -7 },
424
+ { 8184, 8185, -128 },
425
+ { 8186, 8187, -126 },
426
+ { 8188, 8188, -9 },
427
+ { 8486, 8486, -7517 },
428
+ { 8490, 8490, -8383 },
429
+ { 8491, 8491, -8262 },
430
+ { 8498, 8498, 28 },
431
+ { 8544, 8559, 16 },
432
+ { 8579, 8579, OddEven },
433
+ { 9398, 9423, 26 },
434
+ { 11264, 11310, 48 },
435
+ { 11360, 11360, EvenOdd },
436
+ { 11362, 11362, -10743 },
437
+ { 11363, 11363, -3814 },
438
+ { 11364, 11364, -10727 },
439
+ { 11367, 11371, OddEvenSkip },
440
+ { 11373, 11373, -10780 },
441
+ { 11374, 11374, -10749 },
442
+ { 11375, 11375, -10783 },
443
+ { 11376, 11376, -10782 },
444
+ { 11378, 11378, EvenOdd },
445
+ { 11381, 11381, OddEven },
446
+ { 11390, 11391, -10815 },
447
+ { 11392, 11490, EvenOddSkip },
448
+ { 11499, 11501, OddEvenSkip },
449
+ { 42560, 42604, EvenOddSkip },
450
+ { 42624, 42646, EvenOddSkip },
451
+ { 42786, 42798, EvenOddSkip },
452
+ { 42802, 42862, EvenOddSkip },
453
+ { 42873, 42875, OddEvenSkip },
454
+ { 42877, 42877, -35332 },
455
+ { 42878, 42886, EvenOddSkip },
456
+ { 42891, 42891, OddEven },
457
+ { 42893, 42893, -42280 },
458
+ { 42896, 42896, EvenOdd },
459
+ { 42912, 42920, EvenOddSkip },
460
+ { 65313, 65338, 32 },
461
+ { 66560, 66599, 40 },
462
+ };
463
+ int num_unicode_tolower = 163;
464
+
465
+
466
+
467
+ } // namespace re2
468
+
469
+
@@ -0,0 +1,75 @@
1
+ // Copyright 2008 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Unicode case folding tables.
6
+
7
+ // The Unicode case folding tables encode the mapping from one Unicode point
8
+ // to the next largest Unicode point with equivalent folding. The largest
9
+ // point wraps back to the first. For example, the tables map:
10
+ //
11
+ // 'A' -> 'a'
12
+ // 'a' -> 'A'
13
+ //
14
+ // 'K' -> 'k'
15
+ // 'k' -> 'K' (U+212A, Kelvin symbol)
16
+ // 'K' (U+212A, Kelvin symbol) -> 'K'
17
+ //
18
+ // Like everything Unicode, these tables are big. If we represent the table
19
+ // as a sorted list of uint32 pairs, it has 2079 entries and is 16 kB.
20
+ // Most table entries look like the ones around them:
21
+ // 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
22
+ // Instead of listing all the pairs explicitly, we make a list of ranges
23
+ // and deltas, so that the table entries for 'A' through 'Z' can be represented
24
+ // as a single entry { 'A', 'Z', +32 }.
25
+ //
26
+ // In addition to blocks that map to each other (A-Z mapping to a-z)
27
+ // there are blocks of pairs that individually map to each other
28
+ // (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
29
+ // For those, the special delta value EvenOdd marks even/odd pairs
30
+ // (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
31
+ //
32
+ // In this form, the table has 282 entries, about 3kB. If we were to split
33
+ // the table into one for 16-bit codes and an overflow table for larger ones,
34
+ // we could get it down to about 1.5kB, but that's not worth the complexity.
35
+ //
36
+ // The grouped form also allows for efficient fold range calculations
37
+ // rather than looping one character at a time.
38
+
39
+ #ifndef RE2_UNICODE_CASEFOLD_H__
40
+ #define RE2_UNICODE_CASEFOLD_H__
41
+
42
+ #include "util/util.h"
43
+
44
+ namespace re2 {
45
+
46
+ enum {  // Special values stored in CaseFold::delta in place of a literal offset.
47
+ EvenOdd = 1,  // even/odd pairs: if the code point is even, add 1; if odd, subtract 1
48
+ OddEven = -1,  // odd/even pairs: the mirror image of EvenOdd
49
+ EvenOddSkip = 1<<30,  // NOTE(review): presumably EvenOdd applied only to every other code point of the range; ApplyFold is not visible here -- confirm
50
+ OddEvenSkip,  // value is (1<<30)+1; NOTE(review): presumably the OddEven counterpart of EvenOddSkip -- confirm
51
+ };
52
+
53
+ struct CaseFold {  // One table entry: a range of code points that share a single folding rule.
54
+ uint32 lo;  // first code point of the range
55
+ uint32 hi;  // last code point of the range, inclusive (e.g. 'A'..'Z'; single-point entries have lo == hi)
56
+ int32 delta;  // offset added to a code point in [lo, hi], or one of the special enum markers (EvenOdd, OddEven, ...)
57
+ };
58
+
59
+ extern CaseFold unicode_casefold[];  // folding table: maps a code point to the next largest code point with equivalent folding
60
+ extern int num_unicode_casefold;  // number of entries in unicode_casefold
61
+
62
+ extern CaseFold unicode_tolower[];  // NOTE(review): presumably maps upper-case code points to their lower-case form (e.g. 'A'..'Z' -> +32) -- confirm against callers
63
+ extern int num_unicode_tolower;  // number of entries in unicode_tolower
64
+
65
+ // Returns the CaseFold* in the given table whose range contains rune.
66
+ // If rune is not in the table, returns the first CaseFold* after rune.
67
+ // If rune is larger than any value in the table, returns NULL.
68
+ extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune);  // NOTE(review): unnamed args are presumably the table and its entry count (e.g. unicode_casefold, num_unicode_casefold) -- confirm against the definition
69
+
70
+ // Returns the rune that results from applying the fold entry f to the rune r.
71
+ extern Rune ApplyFold(CaseFold *f, Rune r);  // f->delta is a literal offset or a special marker such as EvenOdd; NOTE(review): presumably r must lie in [f->lo, f->hi] -- confirm
72
+
73
+ } // namespace re2
74
+
75
+ #endif // RE2_UNICODE_CASEFOLD_H__