chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/re2/prog.cc ADDED
@@ -0,0 +1,341 @@
1
+ // Copyright 2007 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Compiled regular expression representation.
6
+ // Tested by compile_test.cc
7
+
8
+ #include "util/util.h"
9
+ #include "util/sparse_set.h"
10
+ #include "re2/prog.h"
11
+ #include "re2/stringpiece.h"
12
+
13
+ namespace re2 {
14
+
15
+ // Constructors per Inst opcode
16
+
17
+ void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
18
+ DCHECK_EQ(out_opcode_, 0);
19
+ set_out_opcode(out, kInstAlt);
20
+ out1_ = out1;
21
+ }
22
+
23
+ void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
24
+ DCHECK_EQ(out_opcode_, 0);
25
+ set_out_opcode(out, kInstByteRange);
26
+ lo_ = lo & 0xFF;
27
+ hi_ = hi & 0xFF;
28
+ foldcase_ = foldcase;
29
+ }
30
+
31
+ void Prog::Inst::InitCapture(int cap, uint32 out) {
32
+ DCHECK_EQ(out_opcode_, 0);
33
+ set_out_opcode(out, kInstCapture);
34
+ cap_ = cap;
35
+ }
36
+
37
+ void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
38
+ DCHECK_EQ(out_opcode_, 0);
39
+ set_out_opcode(out, kInstEmptyWidth);
40
+ empty_ = empty;
41
+ }
42
+
43
+ void Prog::Inst::InitMatch(int32 id) {
44
+ DCHECK_EQ(out_opcode_, 0);
45
+ set_opcode(kInstMatch);
46
+ match_id_ = id;
47
+ }
48
+
49
+ void Prog::Inst::InitNop(uint32 out) {
50
+ DCHECK_EQ(out_opcode_, 0);
51
+ set_opcode(kInstNop);
52
+ }
53
+
54
+ void Prog::Inst::InitFail() {
55
+ DCHECK_EQ(out_opcode_, 0);
56
+ set_opcode(kInstFail);
57
+ }
58
+
59
+ string Prog::Inst::Dump() {
60
+ switch (opcode()) {
61
+ default:
62
+ return StringPrintf("opcode %d", static_cast<int>(opcode()));
63
+
64
+ case kInstAlt:
65
+ return StringPrintf("alt -> %d | %d", out(), out1_);
66
+
67
+ case kInstAltMatch:
68
+ return StringPrintf("altmatch -> %d | %d", out(), out1_);
69
+
70
+ case kInstByteRange:
71
+ return StringPrintf("byte%s [%02x-%02x] -> %d",
72
+ foldcase_ ? "/i" : "",
73
+ lo_, hi_, out());
74
+
75
+ case kInstCapture:
76
+ return StringPrintf("capture %d -> %d", cap_, out());
77
+
78
+ case kInstEmptyWidth:
79
+ return StringPrintf("emptywidth %#x -> %d",
80
+ static_cast<int>(empty_), out());
81
+
82
+ case kInstMatch:
83
+ return StringPrintf("match! %d", match_id());
84
+
85
+ case kInstNop:
86
+ return StringPrintf("nop -> %d", out());
87
+
88
+ case kInstFail:
89
+ return StringPrintf("fail");
90
+ }
91
+ }
92
+
93
+ Prog::Prog()
94
+ : anchor_start_(false),
95
+ anchor_end_(false),
96
+ reversed_(false),
97
+ did_onepass_(false),
98
+ start_(0),
99
+ start_unanchored_(0),
100
+ size_(0),
101
+ byte_inst_count_(0),
102
+ bytemap_range_(0),
103
+ flags_(0),
104
+ onepass_statesize_(0),
105
+ inst_(NULL),
106
+ dfa_first_(NULL),
107
+ dfa_longest_(NULL),
108
+ dfa_mem_(0),
109
+ delete_dfa_(NULL),
110
+ unbytemap_(NULL),
111
+ onepass_nodes_(NULL),
112
+ onepass_start_(NULL) {
113
+ }
114
+
115
+ Prog::~Prog() {
116
+ if (delete_dfa_) {
117
+ if (dfa_first_)
118
+ delete_dfa_(dfa_first_);
119
+ if (dfa_longest_)
120
+ delete_dfa_(dfa_longest_);
121
+ }
122
+ delete[] onepass_nodes_;
123
+ delete[] inst_;
124
+ delete[] unbytemap_;
125
+ }
126
+
127
+ typedef SparseSet Workq;
128
+
129
+ static inline void AddToQueue(Workq* q, int id) {
130
+ if (id != 0)
131
+ q->insert(id);
132
+ }
133
+
134
+ static string ProgToString(Prog* prog, Workq* q) {
135
+ string s;
136
+
137
+ for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
138
+ int id = *i;
139
+ Prog::Inst* ip = prog->inst(id);
140
+ StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
141
+ AddToQueue(q, ip->out());
142
+ if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
143
+ AddToQueue(q, ip->out1());
144
+ }
145
+ return s;
146
+ }
147
+
148
+ string Prog::Dump() {
149
+ string map;
150
+ if (false) { // Debugging
151
+ int lo = 0;
152
+ StringAppendF(&map, "byte map:\n");
153
+ for (int i = 0; i < bytemap_range_; i++) {
154
+ StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
155
+ lo = unbytemap_[i] + 1;
156
+ }
157
+ StringAppendF(&map, "\n");
158
+ }
159
+
160
+ Workq q(size_);
161
+ AddToQueue(&q, start_);
162
+ return map + ProgToString(this, &q);
163
+ }
164
+
165
+ string Prog::DumpUnanchored() {
166
+ Workq q(size_);
167
+ AddToQueue(&q, start_unanchored_);
168
+ return ProgToString(this, &q);
169
+ }
170
+
171
+ static bool IsMatch(Prog*, Prog::Inst*);
172
+
173
+ // Peep-hole optimizer.
174
+ void Prog::Optimize() {
175
+ Workq q(size_);
176
+
177
+ // Eliminate nops. Most are taken out during compilation
178
+ // but a few are hard to avoid.
179
+ q.clear();
180
+ AddToQueue(&q, start_);
181
+ for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
182
+ int id = *i;
183
+
184
+ Inst* ip = inst(id);
185
+ int j = ip->out();
186
+ Inst* jp;
187
+ while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
188
+ j = jp->out();
189
+ }
190
+ ip->set_out(j);
191
+ AddToQueue(&q, ip->out());
192
+
193
+ if (ip->opcode() == kInstAlt) {
194
+ j = ip->out1();
195
+ while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
196
+ j = jp->out();
197
+ }
198
+ ip->out1_ = j;
199
+ AddToQueue(&q, ip->out1());
200
+ }
201
+ }
202
+
203
+ // Insert kInstAltMatch instructions
204
+ // Look for
205
+ // ip: Alt -> j | k
206
+ // j: ByteRange [00-FF] -> ip
207
+ // k: Match
208
+ // or the reverse (the above is the greedy one).
209
+ // Rewrite Alt to AltMatch.
210
+ q.clear();
211
+ AddToQueue(&q, start_);
212
+ for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
213
+ int id = *i;
214
+ Inst* ip = inst(id);
215
+ AddToQueue(&q, ip->out());
216
+ if (ip->opcode() == kInstAlt)
217
+ AddToQueue(&q, ip->out1());
218
+
219
+ if (ip->opcode() == kInstAlt) {
220
+ Inst* j = inst(ip->out());
221
+ Inst* k = inst(ip->out1());
222
+ if (j->opcode() == kInstByteRange && j->out() == id &&
223
+ j->lo() == 0x00 && j->hi() == 0xFF &&
224
+ IsMatch(this, k)) {
225
+ ip->set_opcode(kInstAltMatch);
226
+ continue;
227
+ }
228
+ if (IsMatch(this, j) &&
229
+ k->opcode() == kInstByteRange && k->out() == id &&
230
+ k->lo() == 0x00 && k->hi() == 0xFF) {
231
+ ip->set_opcode(kInstAltMatch);
232
+ }
233
+ }
234
+ }
235
+ }
236
+
237
+ // Is ip a guaranteed match at end of text, perhaps after some capturing?
238
+ static bool IsMatch(Prog* prog, Prog::Inst* ip) {
239
+ for (;;) {
240
+ switch (ip->opcode()) {
241
+ default:
242
+ LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
243
+ return false;
244
+
245
+ case kInstAlt:
246
+ case kInstAltMatch:
247
+ case kInstByteRange:
248
+ case kInstFail:
249
+ case kInstEmptyWidth:
250
+ return false;
251
+
252
+ case kInstCapture:
253
+ case kInstNop:
254
+ ip = prog->inst(ip->out());
255
+ break;
256
+
257
+ case kInstMatch:
258
+ return true;
259
+ }
260
+ }
261
+ }
262
+
263
+ uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
264
+ int flags = 0;
265
+
266
+ // ^ and \A
267
+ if (p == text.begin())
268
+ flags |= kEmptyBeginText | kEmptyBeginLine;
269
+ else if (p[-1] == '\n')
270
+ flags |= kEmptyBeginLine;
271
+
272
+ // $ and \z
273
+ if (p == text.end())
274
+ flags |= kEmptyEndText | kEmptyEndLine;
275
+ else if (p < text.end() && p[0] == '\n')
276
+ flags |= kEmptyEndLine;
277
+
278
+ // \b and \B
279
+ if (p == text.begin() && p == text.end()) {
280
+ // no word boundary here
281
+ } else if (p == text.begin()) {
282
+ if (IsWordChar(p[0]))
283
+ flags |= kEmptyWordBoundary;
284
+ } else if (p == text.end()) {
285
+ if (IsWordChar(p[-1]))
286
+ flags |= kEmptyWordBoundary;
287
+ } else {
288
+ if (IsWordChar(p[-1]) != IsWordChar(p[0]))
289
+ flags |= kEmptyWordBoundary;
290
+ }
291
+ if (!(flags & kEmptyWordBoundary))
292
+ flags |= kEmptyNonWordBoundary;
293
+
294
+ return flags;
295
+ }
296
+
297
+ void Prog::MarkByteRange(int lo, int hi) {
298
+ CHECK_GE(lo, 0);
299
+ CHECK_GE(hi, 0);
300
+ CHECK_LE(lo, 255);
301
+ CHECK_LE(hi, 255);
302
+ if (lo > 0)
303
+ byterange_.Set(lo - 1);
304
+ byterange_.Set(hi);
305
+ }
306
+
307
+ void Prog::ComputeByteMap() {
308
+ // Fill in bytemap with byte classes for prog_.
309
+ // Ranges of bytes that are treated as indistinguishable
310
+ // by the regexp program are mapped to a single byte class.
311
+ // The vector prog_->byterange() marks the end of each
312
+ // such range.
313
+ const Bitmap<256>& v = byterange();
314
+
315
+ COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
316
+ uint8 n = 0;
317
+ uint32 bits = 0;
318
+ for (int i = 0; i < 256; i++) {
319
+ if ((i&31) == 0)
320
+ bits = v.Word(i >> 5);
321
+ bytemap_[i] = n;
322
+ n += bits & 1;
323
+ bits >>= 1;
324
+ }
325
+ bytemap_range_ = bytemap_[255] + 1;
326
+ unbytemap_ = new uint8[bytemap_range_];
327
+ for (int i = 0; i < 256; i++)
328
+ unbytemap_[bytemap_[i]] = i;
329
+
330
+ if (0) { // For debugging: use trivial byte map.
331
+ for (int i = 0; i < 256; i++) {
332
+ bytemap_[i] = i;
333
+ unbytemap_[i] = i;
334
+ }
335
+ bytemap_range_ = 256;
336
+ LOG(INFO) << "Using trivial bytemap.";
337
+ }
338
+ }
339
+
340
+ } // namespace re2
341
+