chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
@@ -0,0 +1,341 @@
1
+ // Copyright 2006 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Format a regular expression structure as a string.
6
+ // Tested by parse_test.cc
7
+
8
+ #include "util/util.h"
9
+ #include "re2/regexp.h"
10
+ #include "re2/walker-inl.h"
11
+
12
+ namespace re2 {
13
+
14
+ enum {
15
+ PrecAtom,
16
+ PrecUnary,
17
+ PrecConcat,
18
+ PrecAlternate,
19
+ PrecEmpty,
20
+ PrecParen,
21
+ PrecToplevel,
22
+ };
23
+
24
+ // Helper function. See description below.
25
+ static void AppendCCRange(string* t, Rune lo, Rune hi);
26
+
27
+ // Walker to generate string in s_.
28
+ // The arg pointers are actually integers giving the
29
+ // context precedence.
30
+ // The child_args are always NULL.
31
+ class ToStringWalker : public Regexp::Walker<int> {
32
+ public:
33
+ explicit ToStringWalker(string* t) : t_(t) {}
34
+
35
+ virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
36
+ virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
37
+ int* child_args, int nchild_args);
38
+ virtual int ShortVisit(Regexp* re, int parent_arg) {
39
+ return 0;
40
+ }
41
+
42
+ private:
43
+ string* t_; // The string the walker appends to.
44
+
45
+ DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker);
46
+ };
47
+
48
+ string Regexp::ToString() {
49
+ string t;
50
+ ToStringWalker w(&t);
51
+ w.WalkExponential(this, PrecToplevel, 100000);
52
+ if (w.stopped_early())
53
+ t += " [truncated]";
54
+ return t;
55
+ }
56
+
57
+ #define ToString DontCallToString // Avoid accidental recursion.
58
+
59
+ // Visits re before children are processed.
60
+ // Appends ( if needed and passes new precedence to children.
61
+ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
62
+ int prec = parent_arg;
63
+ int nprec = PrecAtom;
64
+
65
+ switch (re->op()) {
66
+ case kRegexpNoMatch:
67
+ case kRegexpEmptyMatch:
68
+ case kRegexpLiteral:
69
+ case kRegexpAnyChar:
70
+ case kRegexpAnyByte:
71
+ case kRegexpBeginLine:
72
+ case kRegexpEndLine:
73
+ case kRegexpBeginText:
74
+ case kRegexpEndText:
75
+ case kRegexpWordBoundary:
76
+ case kRegexpNoWordBoundary:
77
+ case kRegexpCharClass:
78
+ case kRegexpHaveMatch:
79
+ nprec = PrecAtom;
80
+ break;
81
+
82
+ case kRegexpConcat:
83
+ case kRegexpLiteralString:
84
+ if (prec < PrecConcat)
85
+ t_->append("(?:");
86
+ nprec = PrecConcat;
87
+ break;
88
+
89
+ case kRegexpAlternate:
90
+ if (prec < PrecAlternate)
91
+ t_->append("(?:");
92
+ nprec = PrecAlternate;
93
+ break;
94
+
95
+ case kRegexpCapture:
96
+ t_->append("(");
97
+ if (re->name()) {
98
+ t_->append("?P<");
99
+ t_->append(*re->name());
100
+ t_->append(">");
101
+ }
102
+ nprec = PrecParen;
103
+ break;
104
+
105
+ case kRegexpStar:
106
+ case kRegexpPlus:
107
+ case kRegexpQuest:
108
+ case kRegexpRepeat:
109
+ if (prec < PrecUnary)
110
+ t_->append("(?:");
111
+ // The subprecedence here is PrecAtom instead of PrecUnary
112
+ // because PCRE treats two unary ops in a row as a parse error.
113
+ nprec = PrecAtom;
114
+ break;
115
+ }
116
+
117
+ return nprec;
118
+ }
119
+
120
+ static void AppendLiteral(string *t, Rune r, bool foldcase) {
121
+ if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
122
+ t->append(1, '\\');
123
+ t->append(1, r);
124
+ } else if (foldcase && 'a' <= r && r <= 'z') {
125
+ if ('a' <= r && r <= 'z')
126
+ r += 'A' - 'a';
127
+ t->append(1, '[');
128
+ t->append(1, r);
129
+ t->append(1, r + 'a' - 'A');
130
+ t->append(1, ']');
131
+ } else {
132
+ AppendCCRange(t, r, r);
133
+ }
134
+ }
135
+
136
+ // Visits re after children are processed.
137
+ // For childless regexps, all the work is done here.
138
+ // For regexps with children, append any unary suffixes or ).
139
+ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
140
+ int* child_args, int nchild_args) {
141
+ int prec = parent_arg;
142
+ switch (re->op()) {
143
+ case kRegexpNoMatch:
144
+ // There's no simple symbol for "no match", but
145
+ // [^0-Runemax] excludes everything.
146
+ t_->append("[^\\x00-\\x{10ffff}]");
147
+ break;
148
+
149
+ case kRegexpEmptyMatch:
150
+ // Append (?:) to make empty string visible,
151
+ // unless this is already being parenthesized.
152
+ if (prec < PrecEmpty)
153
+ t_->append("(?:)");
154
+ break;
155
+
156
+ case kRegexpLiteral:
157
+ AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
158
+ break;
159
+
160
+ case kRegexpLiteralString:
161
+ for (int i = 0; i < re->nrunes(); i++)
162
+ AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
163
+ if (prec < PrecConcat)
164
+ t_->append(")");
165
+ break;
166
+
167
+ case kRegexpConcat:
168
+ if (prec < PrecConcat)
169
+ t_->append(")");
170
+ break;
171
+
172
+ case kRegexpAlternate:
173
+ // Clumsy but workable: the children all appended |
174
+ // at the end of their strings, so just remove the last one.
175
+ if ((*t_)[t_->size()-1] == '|')
176
+ t_->erase(t_->size()-1);
177
+ else
178
+ LOG(DFATAL) << "Bad final char: " << t_;
179
+ if (prec < PrecAlternate)
180
+ t_->append(")");
181
+ break;
182
+
183
+ case kRegexpStar:
184
+ t_->append("*");
185
+ if (re->parse_flags() & Regexp::NonGreedy)
186
+ t_->append("?");
187
+ if (prec < PrecUnary)
188
+ t_->append(")");
189
+ break;
190
+
191
+ case kRegexpPlus:
192
+ t_->append("+");
193
+ if (re->parse_flags() & Regexp::NonGreedy)
194
+ t_->append("?");
195
+ if (prec < PrecUnary)
196
+ t_->append(")");
197
+ break;
198
+
199
+ case kRegexpQuest:
200
+ t_->append("?");
201
+ if (re->parse_flags() & Regexp::NonGreedy)
202
+ t_->append("?");
203
+ if (prec < PrecUnary)
204
+ t_->append(")");
205
+ break;
206
+
207
+ case kRegexpRepeat:
208
+ if (re->max() == -1)
209
+ t_->append(StringPrintf("{%d,}", re->min()));
210
+ else if (re->min() == re->max())
211
+ t_->append(StringPrintf("{%d}", re->min()));
212
+ else
213
+ t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
214
+ if (re->parse_flags() & Regexp::NonGreedy)
215
+ t_->append("?");
216
+ if (prec < PrecUnary)
217
+ t_->append(")");
218
+ break;
219
+
220
+ case kRegexpAnyChar:
221
+ t_->append(".");
222
+ break;
223
+
224
+ case kRegexpAnyByte:
225
+ t_->append("\\C");
226
+ break;
227
+
228
+ case kRegexpBeginLine:
229
+ t_->append("^");
230
+ break;
231
+
232
+ case kRegexpEndLine:
233
+ t_->append("$");
234
+ break;
235
+
236
+ case kRegexpBeginText:
237
+ t_->append("(?-m:^)");
238
+ break;
239
+
240
+ case kRegexpEndText:
241
+ if (re->parse_flags() & Regexp::WasDollar)
242
+ t_->append("(?-m:$)");
243
+ else
244
+ t_->append("\\z");
245
+ break;
246
+
247
+ case kRegexpWordBoundary:
248
+ t_->append("\\b");
249
+ break;
250
+
251
+ case kRegexpNoWordBoundary:
252
+ t_->append("\\B");
253
+ break;
254
+
255
+ case kRegexpCharClass: {
256
+ if (re->cc()->size() == 0) {
257
+ t_->append("[^\\x00-\\x{10ffff}]");
258
+ break;
259
+ }
260
+ t_->append("[");
261
+ // Heuristic: show class as negated if it contains the
262
+ // non-character 0xFFFE.
263
+ CharClass* cc = re->cc();
264
+ if (cc->Contains(0xFFFE)) {
265
+ cc = cc->Negate();
266
+ t_->append("^");
267
+ }
268
+ for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
269
+ AppendCCRange(t_, i->lo, i->hi);
270
+ if (cc != re->cc())
271
+ cc->Delete();
272
+ t_->append("]");
273
+ break;
274
+ }
275
+
276
+ case kRegexpCapture:
277
+ t_->append(")");
278
+ break;
279
+
280
+ case kRegexpHaveMatch:
281
+ // There's no syntax accepted by the parser to generate
282
+ // this node (it is generated by RE2::Set) so make something
283
+ // up that is readable but won't compile.
284
+ t_->append("(?HaveMatch:%d)", re->match_id());
285
+ break;
286
+ }
287
+
288
+ // If the parent is an alternation, append the | for it.
289
+ if (prec == PrecAlternate)
290
+ t_->append("|");
291
+
292
+ return 0;
293
+ }
294
+
295
+ // Appends a rune for use in a character class to the string t.
296
+ static void AppendCCChar(string* t, Rune r) {
297
+ if (0x20 <= r && r <= 0x7E) {
298
+ if (strchr("[]^-\\", r))
299
+ t->append("\\");
300
+ t->append(1, r);
301
+ return;
302
+ }
303
+ switch (r) {
304
+ default:
305
+ break;
306
+
307
+ case '\r':
308
+ t->append("\\r");
309
+ return;
310
+
311
+ case '\t':
312
+ t->append("\\t");
313
+ return;
314
+
315
+ case '\n':
316
+ t->append("\\n");
317
+ return;
318
+
319
+ case '\f':
320
+ t->append("\\f");
321
+ return;
322
+ }
323
+
324
+ if (r < 0x100) {
325
+ StringAppendF(t, "\\x%02x", static_cast<int>(r));
326
+ return;
327
+ }
328
+ StringAppendF(t, "\\x{%x}", static_cast<int>(r));
329
+ }
330
+
331
+ static void AppendCCRange(string* t, Rune lo, Rune hi) {
332
+ if (lo > hi)
333
+ return;
334
+ AppendCCChar(t, lo);
335
+ if (lo < hi) {
336
+ t->append("-");
337
+ AppendCCChar(t, hi);
338
+ }
339
+ }
340
+
341
+ } // namespace re2