chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
@@ -0,0 +1,626 @@
1
+ #include <stdlib.h>
2
+ #include <iostream>
3
+ #include <vector>
4
+ #include "re2/re2.h"
5
+ #include "re2/stringpiece.h"
6
+ #include "libstemmer.h"
7
+ #include "version.h"
8
+
9
+ #if __GNUC__
10
+ #define STRSTR strcasestr
11
+ #else
12
+ #define STRSTR strstr
13
+ #endif
14
+
15
+ #include <ruby/ruby.h>
16
+ #include <ruby/io.h>
17
+
18
+ #define TO_S(v) rb_funcall(v, rb_intern("to_s"), 0)
19
+ #define CSTRING(v) RSTRING_PTR(TO_S(v))
20
+ #define MIN_TAG_SIZE 3
21
+ #define MIN_WORD_SIZE 3
22
+
23
+ using namespace std;
24
+ using namespace re2;
25
+
26
+ RE2 *UserRE;
27
+ RE2 *HashTagRE;
28
+ RE2 *UserStopRE;
29
+ RE2 *HashTagStopRE;
30
+ RE2 *SkipTokenRE;
31
+ RE2 *SkipTokenPatternRE;
32
+
33
+ RE2::Options DefaultMatchOptions;
34
+ VALUE id_users, id_hashtags, id_urls, id_tokens;
35
+
36
+ string build_alternating_expr(VALUE list) {
37
+ VALUE v;
38
+ string expr = "(?:";
39
+ for (int i = 0; i < RARRAY_LEN(list) - 1; i++) {
40
+ v = rb_ary_entry(list, i);
41
+ expr += string(RSTRING_PTR(v), RSTRING_LEN(v)) + "|";
42
+ }
43
+ v = rb_ary_entry(list, RARRAY_LEN(list)-1);
44
+ expr += string(RSTRING_PTR(v), RSTRING_LEN(v)) + ")";
45
+ return expr;
46
+ }
47
+
48
// Overwrites every occurrence of `pattern` in `string` with the byte `c`,
// repeated once per pattern byte, so the string length never changes.
//
// string  - NUL-terminated buffer, modified in place.
// pattern - NUL-terminated byte sequence to look for.
// c       - fill byte written over each match.
//
// An empty pattern is a no-op (strstr would otherwise match at the cursor
// forever without advancing).
void replace(char *string, const char *pattern, int c) {
  const size_t width = strlen(pattern);
  if (width == 0)
    return;

  char *cursor = string;
  char *hit;
  while ((hit = strstr(cursor, pattern))) {
    memset(hit, c, width);
    cursor = hit + width;
  }
}
57
+
58
// Deletes every occurrence of `pattern` from `string`, closing the gaps.
//
// string  - NUL-terminated buffer, modified in place (it only ever shrinks).
// pattern - NUL-terminated byte sequence to delete.
//
// An empty pattern is a no-op. Occurrences that only come into existence once
// an earlier deletion joins two fragments are deleted as well.
void remove(char *string, const char *pattern) {
  const size_t width = strlen(pattern);
  if (width == 0)
    return;

  size_t size = strlen(string);
  char *scan = string;
  char *hit;

  while ((hit = strstr(scan, pattern))) {
    const size_t offset = (size_t)(hit - string);
    const size_t tail = size - offset - width;

    // Source and destination overlap, so memmove is required; the +1 carries
    // the terminating NUL along with the tail.
    memmove(hit, hit + width, tail + 1);
    size -= width;

    // A new match can start at most width-1 bytes before the cut point.
    scan = (offset >= width - 1) ? hit - (width - 1) : string;
  }
}
68
+
69
// Node of a singly linked list of heap-allocated, NUL-terminated strings.
struct List {
  char *text;  // malloc'd string owned by this node
  List *next;  // following node, or 0 at the tail
};

// Node of a singly linked list whose items are themselves List chains.
struct DList {
  List *list;   // head of the List chain stored in this node
  DList *next;  // following node, or 0 at the tail
};
78
+
79
+ void list_free(List *list) {
80
+ List *curr = list;
81
+ while (list) {
82
+ list = curr->next;
83
+ if (curr->text)
84
+ free(curr->text);
85
+ free(curr);
86
+ curr = list;
87
+ }
88
+ }
89
+
90
+ List* list_push(List *root, List *curr, const char *text, int size) {
91
+ List *node = (List *)malloc(sizeof(List));
92
+ if (!node) {
93
+ list_free(root);
94
+ return 0;
95
+ }
96
+
97
+ node->text = (char *)malloc(size + 1);
98
+ if (!node->text) {
99
+ free(node);
100
+ list_free(root);
101
+ return 0;
102
+ }
103
+
104
+ memcpy(node->text, text, size);
105
+
106
+ node->next = 0;
107
+ node->text[size] = 0;
108
+
109
+ if (curr)
110
+ curr->next = node;
111
+
112
+ return node;
113
+ }
114
+
115
+ VALUE list_to_array(List *node, rb_encoding *encoding) {
116
+ List *next;
117
+ VALUE array = rb_ary_new();
118
+
119
+ while (node) {
120
+ rb_ary_push(array, rb_enc_str_new(node->text, strlen(node->text), encoding));
121
+ next = node->next;
122
+ free(node->text);
123
+ free(node);
124
+ node = next;
125
+ }
126
+
127
+ return array;
128
+ }
129
+
130
+ void dlist_free(DList *dlist) {
131
+ DList *curr = dlist;
132
+ while (dlist) {
133
+ dlist = curr->next;
134
+ if (curr->list)
135
+ list_free(curr->list);
136
+ free(curr);
137
+ curr = dlist;
138
+ }
139
+ }
140
+
141
+ DList* dlist_push(DList *root, DList *curr, List *list) {
142
+ DList *node = (DList *)malloc(sizeof(DList));
143
+ if (!node) {
144
+ dlist_free(root);
145
+ list_free(list);
146
+ return 0;
147
+ }
148
+
149
+ node->list = list;
150
+ node->next = 0;
151
+
152
+ if (curr)
153
+ curr->next = node;
154
+
155
+ return node;
156
+ }
157
+
158
+ VALUE dlist_to_array(DList *node, rb_encoding *encoding) {
159
+ DList *next;
160
+ VALUE array = rb_ary_new();
161
+
162
+ while (node) {
163
+ rb_ary_push(array, list_to_array(node->list, encoding));
164
+ next = node->next;
165
+ free(node);
166
+ node = next;
167
+ }
168
+
169
+ return array;
170
+ }
171
+
172
+ List* tbr_users(VALUE text) {
173
+ List *lroot = 0, *lcurr = 0, *lnode;
174
+
175
+ string match;
176
+ StringPiece input;
177
+ input.set(RSTRING_PTR(text), RSTRING_LEN(text));
178
+ while (RE2::FindAndConsume(&input, *UserRE, &match)) {
179
+ if (UserStopRE && RE2::FullMatch(match, *UserStopRE)) continue;
180
+
181
+ if (!(lnode = list_push(lroot, lcurr, match.data(), match.size())))
182
+ rb_raise(rb_eNoMemError, "ran out of memory while storing result");
183
+
184
+ if (lcurr)
185
+ lcurr = lnode;
186
+ else
187
+ lroot = lcurr = lnode;
188
+ }
189
+
190
+ return lroot;
191
+ }
192
+
193
+
194
+ List* tbr_hashtags(VALUE text) {
195
+ List *lroot = 0, *lcurr = 0, *lnode;
196
+
197
+ string match;
198
+ StringPiece input;
199
+ input.set(RSTRING_PTR(text), RSTRING_LEN(text));
200
+ while (RE2::FindAndConsume(&input, *HashTagRE, &match)) {
201
+ if (match.size() < MIN_TAG_SIZE) continue;
202
+ if (HashTagStopRE && RE2::FullMatch(match, *HashTagStopRE)) continue;
203
+
204
+ if (!(lnode = list_push(lroot, lcurr, match.data(), match.size())))
205
+ rb_raise(rb_eNoMemError, "ran out of memory while storing result");
206
+
207
+ if (lcurr)
208
+ lcurr = lnode;
209
+ else
210
+ lroot = lcurr = lnode;
211
+ }
212
+
213
+ return lroot;
214
+ }
215
+
216
// Counts the leading alphanumeric bytes of `ptr`, stopping at `max`.
//
// ptr - NUL-terminated string positioned at the start of the slug.
// max - upper bound on the returned count.
//
// Returns the number of leading bytes for which isalnum() holds, at most `max`.
int tco_slug_size(char *ptr, int max) {
  int size = 0;
  // isalnum() is only defined for unsigned char values (and EOF); a plain
  // char holding a UTF-8 byte may be negative, so cast before classifying.
  while (size < max && *ptr && isalnum((unsigned char)*ptr)) {
    size++;
    ptr++;
  }
  return size;
}
225
+
226
+ List* tbr_urls(VALUE text) {
227
+ int size;
228
+ List *lroot = 0, *lcurr = 0, *lnode;
229
+
230
+ char *token, *ptr, *buffer = (char*)calloc(RSTRING_LEN(text) + 1, 1);
231
+ if (!buffer)
232
+ rb_raise(rb_eNoMemError, "ran out of memory copying tweet text");
233
+
234
+ ptr = buffer;
235
+ bzero(ptr, RSTRING_LEN(text) + 1);
236
+ memcpy(ptr, RSTRING_PTR(text), RSTRING_LEN(text));
237
+
238
+ // TODO: remove duplication
239
+ while ((token = strstr(ptr, "http://t.co/"))) {
240
+ size = 12 + tco_slug_size(token + 12, 10);
241
+
242
+ if (!(lnode = list_push(lroot, lcurr, token, size))) {
243
+ free(buffer);
244
+ rb_raise(rb_eNoMemError, "ran out of memory while storing result");
245
+ }
246
+
247
+ if (lcurr)
248
+ lcurr = lnode;
249
+ else
250
+ lroot = lcurr = lnode;
251
+
252
+ ptr = token + size;
253
+ }
254
+
255
+ ptr = buffer;
256
+ while ((token = strstr(ptr, "https://t.co/"))) {
257
+ size = 13 + tco_slug_size(token + 13, 10);
258
+
259
+ if (!(lnode = list_push(lroot, lcurr, token, size))) {
260
+ free(buffer);
261
+ rb_raise(rb_eNoMemError, "ran out of memory while storing result");
262
+ }
263
+
264
+ if (lcurr)
265
+ lcurr = lnode;
266
+ else
267
+ lroot = lcurr = lnode;
268
+
269
+ ptr = token + size;
270
+ }
271
+
272
+ free(buffer);
273
+ return lroot;
274
+ }
275
+
276
+ void inline dlist_add_segment(DList **dlroot, DList **dlcurr, List **lroot, List **lcurr, sb_stemmer *stemmer) {
277
+ DList *dlnode = dlist_push(*dlroot, *dlcurr, *lroot);
278
+
279
+ if (!dlnode) {
280
+ sb_stemmer_delete(stemmer);
281
+ rb_raise(rb_eNoMemError, "ran out of memory while storing result");
282
+ }
283
+
284
+ if (*dlcurr)
285
+ *dlcurr = dlnode;
286
+ else
287
+ *dlroot = *dlcurr = dlnode;
288
+
289
+ *lroot = *lcurr = 0;
290
+ }
291
+
292
+ DList* tbr_tokens(VALUE text) {
293
+ static const char *phrase_delim = "\r\n:,;'\"{}()[]./\\%*|&!~`$+=<>?^";
294
+ static const char *word_delim = "\t- ";
295
+ static const char *token_delim = "_\t- ";
296
+
297
+ DList *dlroot = 0, *dlcurr = 0;
298
+ List *lroot = 0, *lcurr = 0, *lnode;
299
+
300
+ char *token, *ptr, *buffer = (char*)calloc(RSTRING_LEN(text) + 1, 1), *phrase_ptr, *word_ptr, *token_ptr;
301
+
302
+ if (!buffer)
303
+ rb_raise(rb_eNoMemError, "ran out of memory copying tweet text");
304
+
305
+ ptr = buffer;
306
+ bzero(ptr, RSTRING_LEN(text) + 1);
307
+ memcpy(ptr, RSTRING_PTR(text), RSTRING_LEN(text));
308
+
309
+ // downcase input
310
+ while (*ptr) *ptr++ = tolower(*ptr);
311
+ ptr = buffer;
312
+
313
+ // blank out urls
314
+ char *ptr1, *ptr2 = ptr;
315
+ while ((ptr1 = STRSTR(ptr2, "http://"))) {
316
+ ptr2 = strtok_r(ptr1, "\r\n\t ", &phrase_ptr);
317
+ ptr2 = phrase_ptr ? phrase_ptr : buffer + RSTRING_LEN(text);
318
+ memset(ptr1, '\n', ptr2 - ptr1);
319
+ }
320
+
321
+ ptr2 = ptr;
322
+ while ((ptr1 = STRSTR(ptr2, "https://"))) {
323
+ ptr2 = strtok_r(ptr1, "\r\n\t ", &phrase_ptr);
324
+ ptr2 = phrase_ptr ? phrase_ptr : buffer + RSTRING_LEN(text);
325
+ memset(ptr1, '\n', ptr2 - ptr1);
326
+ }
327
+
328
+ // remove blank out single quotes, prime
329
+ remove(ptr, "'");
330
+ remove(ptr, "\u2019");
331
+ remove(ptr, "\u2032");
332
+
333
+ // segment at unicode quotes
334
+ replace(ptr, "\u2018", '\t');
335
+ replace(ptr, "\u201c", '\t');
336
+ replace(ptr, "\u201d", '\t');
337
+ replace(ptr, "\u201e", '\t');
338
+ replace(ptr, "\u201f", '\t');
339
+ replace(ptr, "\u2033", '\t');
340
+ replace(ptr, "\u2034", '\t');
341
+ replace(ptr, "\u2035", '\t');
342
+ replace(ptr, "\u2036", '\t');
343
+ replace(ptr, "\u2037", '\t');
344
+
345
+ // angle quote
346
+ replace(ptr, "\u2039", '<');
347
+ replace(ptr, "\u203A", '>');
348
+
349
+ // slash
350
+ replace(ptr, "\u2044", '/');
351
+
352
+ // fullwidth AT => @
353
+ replace(ptr, "\uff20", '@');
354
+
355
+ // unicode spaces
356
+ replace(ptr, "\u2000", ' ');
357
+ replace(ptr, "\u2001", ' ');
358
+ replace(ptr, "\u2002", ' ');
359
+ replace(ptr, "\u2003", ' ');
360
+ replace(ptr, "\u2004", ' ');
361
+ replace(ptr, "\u2005", ' ');
362
+ replace(ptr, "\u2006", ' ');
363
+ replace(ptr, "\u2007", ' ');
364
+ replace(ptr, "\u2008", ' ');
365
+ replace(ptr, "\u2009", ' ');
366
+ replace(ptr, "\u200A", ' ');
367
+ replace(ptr, "\u200B", ' ');
368
+ replace(ptr, "\u202F", ' ');
369
+ replace(ptr, "\u3000", ' ');
370
+
371
+ // unicode dashes
372
+ replace(ptr, "\u058A", '-');
373
+ replace(ptr, "\u1806", '-');
374
+ replace(ptr, "\u2010", '-');
375
+ replace(ptr, "\u2011", '-');
376
+ replace(ptr, "\u2012", '-');
377
+ replace(ptr, "\u2013", '-');
378
+ replace(ptr, "\u2014", '-');
379
+ replace(ptr, "\u2015", '-');
380
+ replace(ptr, "\u207B", '-');
381
+ replace(ptr, "\u208B", '-');
382
+ replace(ptr, "\u2212", '-');
383
+ replace(ptr, "\u301C", '-');
384
+ replace(ptr, "\u3030", '-');
385
+
386
+ // corner brackets
387
+ replace(ptr, "\u300C", '<');
388
+ replace(ptr, "\u300E", '<');
389
+ replace(ptr, "\u301D", '<');
390
+ replace(ptr, "\u300D", '>');
391
+ replace(ptr, "\u300F", '>');
392
+ replace(ptr, "\u301F", '>');
393
+
394
+ struct sb_stemmer *en_stemmer = sb_stemmer_new("english", "UTF_8");
395
+ while ((token = strtok_r(ptr, phrase_delim, &phrase_ptr))) {
396
+ ptr = token;
397
+
398
+ while ((token = strtok_r(ptr, word_delim, &word_ptr))) {
399
+ ptr = NULL;
400
+
401
+ if (strlen(token) < MIN_WORD_SIZE || *token == '@' || *token == '#') {
402
+ if (lroot)
403
+ dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
404
+ continue;
405
+ }
406
+
407
+ ptr = token;
408
+ while ((token = strtok_r(ptr, token_delim, &token_ptr))) {
409
+ ptr = NULL;
410
+
411
+ const sb_symbol *sbstem = sb_stemmer_stem(en_stemmer, (sb_symbol *)token, strlen(token));
412
+ uint32_t sbstem_len = sb_stemmer_length(en_stemmer);
413
+
414
+ if (sbstem_len < MIN_WORD_SIZE) {
415
+ if (lroot)
416
+ dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
417
+ continue;
418
+ }
419
+
420
+ if (SkipTokenRE) {
421
+ if (RE2::FullMatch(token, *SkipTokenRE)) {
422
+ if (lroot)
423
+ dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
424
+ continue;
425
+ }
426
+
427
+ string stem((char*)sbstem, sbstem_len);
428
+ if (RE2::FullMatch(stem, *SkipTokenRE)) {
429
+ if (lroot)
430
+ dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
431
+ continue;
432
+ }
433
+ }
434
+
435
+ if (SkipTokenPatternRE && RE2::FullMatch(token, *SkipTokenPatternRE)) {
436
+ if (lroot)
437
+ dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
438
+ continue;
439
+ }
440
+
441
+ if (!(lnode = list_push(lroot, lcurr, token, strlen(token)))) {
442
+ dlist_free(dlroot);
443
+ sb_stemmer_delete(en_stemmer);
444
+ rb_raise(rb_eNoMemError, "ran out of memory while storing result");
445
+ }
446
+
447
+ if (lcurr)
448
+ lcurr = lnode;
449
+ else
450
+ lroot = lcurr = lnode;
451
+ }
452
+
453
+ ptr = NULL;
454
+ }
455
+
456
+ ptr = NULL;
457
+ if (lroot)
458
+ dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
459
+ }
460
+
461
+ sb_stemmer_delete(en_stemmer);
462
+ free(buffer);
463
+ return dlroot;
464
+ }
465
+
466
+ #define TBR_FUNC(a) (VALUE (*)(void*))(a)
467
+ #define TBR_CALL(a, text) rb_thread_blocking_region(TBR_FUNC(a), (void *)text, RUBY_UBF_PROCESS, 0)
468
+
469
+ // API
470
+
471
+ VALUE users(VALUE self, VALUE text, bool validated = false) {
472
+ if (!validated && (NIL_P(text) || TYPE(text) != T_STRING))
473
+ rb_raise(rb_eArgError, "requires tweet text");
474
+ return list_to_array((List*)TBR_CALL(tbr_users, text), rb_enc_get(text));
475
+ }
476
+
477
+ VALUE hashtags(VALUE self, VALUE text, bool validated = false) {
478
+ if (!validated && (NIL_P(text) || TYPE(text) != T_STRING))
479
+ rb_raise(rb_eArgError, "requires tweet text");
480
+ return list_to_array((List*)TBR_CALL(tbr_hashtags, text), rb_enc_get(text));
481
+ }
482
+
483
+ VALUE urls(VALUE self, VALUE text, bool validated = false) {
484
+ if (!validated && (NIL_P(text) || TYPE(text) != T_STRING))
485
+ rb_raise(rb_eArgError, "requires tweet text");
486
+ return list_to_array((List*)TBR_CALL(tbr_urls, text), rb_enc_get(text));
487
+ }
488
+
489
+ VALUE tokens(VALUE self, VALUE text, bool validated = false) {
490
+ if (!validated && (NIL_P(text) || TYPE(text) != T_STRING))
491
+ rb_raise(rb_eArgError, "requires tweet text");
492
+ return dlist_to_array((DList*)TBR_CALL(tbr_tokens, text), rb_enc_get(text));
493
+ }
494
+
495
+ VALUE entities(VALUE self, VALUE text) {
496
+ if (NIL_P(text) || TYPE(text) != T_STRING)
497
+ rb_raise(rb_eArgError, "requires tweet text");
498
+
499
+ VALUE result = rb_hash_new();
500
+ rb_hash_aset(result, id_users, users(self, text, true));
501
+ rb_hash_aset(result, id_hashtags, hashtags(self, text, true));
502
+ rb_hash_aset(result, id_urls, urls(self, text, true));
503
+ rb_hash_aset(result, id_tokens, tokens(self, text, true));
504
+ return result;
505
+ }
506
+
507
+ VALUE skip_users(VALUE self, VALUE list) {
508
+ if (UserStopRE)
509
+ delete UserStopRE;
510
+ UserStopRE = NULL;
511
+
512
+ if (NIL_P(list)) return Qtrue;
513
+
514
+ if (TYPE(list) != T_ARRAY)
515
+ rb_raise(rb_eArgError, "requires a list of screen names minus @");
516
+
517
+ UserStopRE = new RE2("@" + build_alternating_expr(list), DefaultMatchOptions);
518
+ if (!UserStopRE->ok())
519
+ rb_raise(rb_eArgError, "%s", UserStopRE->error().c_str());
520
+
521
+ return Qtrue;
522
+ }
523
+
524
+ VALUE skip_hashtags(VALUE self, VALUE list) {
525
+ if (HashTagStopRE)
526
+ delete HashTagStopRE;
527
+ HashTagStopRE = NULL;
528
+
529
+ if (NIL_P(list)) return Qtrue;
530
+
531
+ if (TYPE(list) != T_ARRAY)
532
+ rb_raise(rb_eArgError, "requires a list of hashtags minus #");
533
+
534
+ HashTagStopRE = new RE2("#" + build_alternating_expr(list), DefaultMatchOptions);
535
+ if (!HashTagStopRE->ok())
536
+ rb_raise(rb_eArgError, "%s", HashTagStopRE->error().c_str());
537
+
538
+ return Qtrue;
539
+ }
540
+
541
+ VALUE skip_tokens(VALUE self, VALUE list) {
542
+ if (SkipTokenRE)
543
+ delete SkipTokenRE;
544
+ SkipTokenRE = NULL;
545
+
546
+ if (NIL_P(list)) return Qtrue;
547
+
548
+ if (TYPE(list) != T_ARRAY)
549
+ rb_raise(rb_eArgError, "requires a list of words");
550
+
551
+ struct sb_stemmer *en_stemmer = sb_stemmer_new("english", "UTF_8");
552
+
553
+ // add stems as well
554
+ int i, max = RARRAY_LEN(list);
555
+ for (int i = 0; i < max; i++) {
556
+ VALUE word = rb_ary_entry(list, i);
557
+ rb_encoding *encoding = rb_enc_get(word);
558
+ const sb_symbol *sbstem = sb_stemmer_stem(en_stemmer, (sb_symbol *)RSTRING_PTR(word), RSTRING_LEN(word));
559
+ uint32_t sbstem_len = sb_stemmer_length(en_stemmer);
560
+ rb_ary_push(list, rb_enc_str_new((char*)sbstem, sbstem_len, encoding));
561
+ }
562
+
563
+ sb_stemmer_delete(en_stemmer);
564
+
565
+ // too bad, no uniq c api
566
+ rb_funcall(list, rb_intern("uniq!"), 0);
567
+ SkipTokenRE = new RE2("^" + build_alternating_expr(list) + "$", DefaultMatchOptions);
568
+ if (!SkipTokenRE->ok())
569
+ rb_raise(rb_eArgError, "%s", SkipTokenRE->error().c_str());
570
+
571
+ return Qtrue;
572
+ }
573
+
574
+ VALUE skip_token_pattern(VALUE self, VALUE re) {
575
+ if (SkipTokenPatternRE)
576
+ delete SkipTokenPatternRE;
577
+
578
+ SkipTokenPatternRE = NULL;
579
+
580
+ if (NIL_P(re)) return Qtrue;
581
+
582
+ SkipTokenPatternRE = new RE2(CSTRING(re), DefaultMatchOptions);
583
+ if (!SkipTokenPatternRE->ok())
584
+ rb_raise(rb_eArgError, "%s", SkipTokenPatternRE->error().c_str());
585
+
586
+ return Qtrue;
587
+ }
588
+
589
+
590
+ extern "C" {
591
+ void Init_chipper(void) {
592
+ UserRE = new RE2("(?:^|[^[:alnum:]])+([@@][[:alnum:]_\\-]+)");
593
+ HashTagRE = new RE2("(?:^|[^[:alnum:]])+(#[[:alnum:]}_]+)");
594
+
595
+ UserStopRE = NULL;
596
+ HashTagStopRE = NULL;
597
+ SkipTokenRE = NULL;
598
+ SkipTokenPatternRE = NULL;
599
+
600
+ DefaultMatchOptions.set_case_sensitive(false);
601
+ DefaultMatchOptions.set_log_errors(false);
602
+
603
+ id_users = ID2SYM(rb_intern("users"));
604
+ id_hashtags = ID2SYM(rb_intern("hashtags"));
605
+ id_urls = ID2SYM(rb_intern("urls"));
606
+ id_tokens = ID2SYM(rb_intern("tokens"));
607
+
608
+ rb_global_variable(&id_users);
609
+ rb_global_variable(&id_hashtags);
610
+ rb_global_variable(&id_urls);
611
+ rb_global_variable(&id_tokens);
612
+
613
+ VALUE mChipper = rb_define_module("Chipper");
614
+ rb_define_module_function(mChipper, "users", RUBY_METHOD_FUNC(users), 1);
615
+ rb_define_module_function(mChipper, "hashtags", RUBY_METHOD_FUNC(hashtags), 1);
616
+ rb_define_module_function(mChipper, "urls", RUBY_METHOD_FUNC(urls), 1);
617
+ rb_define_module_function(mChipper, "tokens", RUBY_METHOD_FUNC(tokens), 1);
618
+ rb_define_module_function(mChipper, "entities", RUBY_METHOD_FUNC(entities), 1);
619
+ rb_define_module_function(mChipper, "skip_users", RUBY_METHOD_FUNC(skip_users), 1);
620
+ rb_define_module_function(mChipper, "skip_hashtags", RUBY_METHOD_FUNC(skip_hashtags), 1);
621
+ rb_define_module_function(mChipper, "skip_tokens", RUBY_METHOD_FUNC(skip_tokens), 1);
622
+ rb_define_module_function(mChipper, "skip_token_pattern", RUBY_METHOD_FUNC(skip_token_pattern), 1);
623
+
624
+ rb_define_const(mChipper, "VERSION", rb_str_new2(CHIPPER_VERSION));
625
+ }
626
+ }
data/ext/src/version.h ADDED
@@ -0,0 +1 @@
1
+ #define CHIPPER_VERSION "0.4.2"
data/ext/stemmer.rb ADDED
@@ -0,0 +1,40 @@
1
#!/usr/bin/env ruby

require 'fileutils'
require 'rbconfig'

################################################################################
################################################################################
## Derived from ruby-stemmer https://github.com/aurelian/ruby-stemmer

# Builds the bundled libstemmer_c library by running make in its directory.

# FreeBSD make is gmake
make = (RUBY_PLATFORM =~ /freebsd/) ? 'gmake' : 'make'

LIBSTEMMER = File.expand_path(File.join(File.dirname(__FILE__), 'libstemmer_c'))

# MacOS architecture mess up
if RUBY_PLATFORM =~ /darwin/
  # see: #issue/3, #issue/5
  begin
    # NOTE(review): ARCHFLAGS is only recomputed when it is already set, which
    # reads as inverted next to the error message below -- confirm against the
    # upstream ruby-stemmer issues before changing the condition.
    unless ENV['ARCHFLAGS'].nil?
      # RbConfig replaces the Config constant, which current Rubies no longer define.
      ruby_bin = File.expand_path(
        File.join(RbConfig::CONFIG['bindir'], RbConfig::CONFIG['RUBY_INSTALL_NAME'])
      )
      # strip (not strip!) so output without surrounding whitespace does not yield nil
      ENV['ARCHFLAGS'] = '-arch ' + %x[file #{ruby_bin}].strip.match(/executable (.+)$/)[1]
    end
  rescue
    $stderr << "Failed to get your ruby executable architecture.\n"
    $stderr << "Please specify one using $ARCHFLAGS environment variable.\n"
    # Report the failure through the exit status, like the make step below.
    exit(false)
  end
  # see: #issue/9, #issue/6
  # see: man compat
  if ENV['COMMAND_MODE'] == 'legacy'
    $stdout << "Setting compat mode to unix2003.\n"
    ENV['COMMAND_MODE'] = 'unix2003'
  end
end

# make libstemmer_c. unless we're cross-compiling.
unless RUBY_PLATFORM =~ /i386-mingw32/
  Dir.chdir(LIBSTEMMER) do
    system(make) || exit(false)
  end
end

################################################################################
################################################################################