chipper 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
@@ -0,0 +1,626 @@
1
+ #include <stdlib.h>
2
+ #include <iostream>
3
+ #include <vector>
4
+ #include "re2/re2.h"
5
+ #include "re2/stringpiece.h"
6
+ #include "libstemmer.h"
7
+ #include "version.h"
8
+
9
+ #if __GNUC__
10
+ #define STRSTR strcasestr
11
+ #else
12
+ #define STRSTR strstr
13
+ #endif
14
+
15
+ #include <ruby/ruby.h>
16
+ #include <ruby/io.h>
17
+
18
+ #define TO_S(v) rb_funcall(v, rb_intern("to_s"), 0)
19
+ #define CSTRING(v) RSTRING_PTR(TO_S(v))
20
+ #define MIN_TAG_SIZE 3
21
+ #define MIN_WORD_SIZE 3
22
+
23
+ using namespace std;
24
+ using namespace re2;
25
+
26
+ RE2 *UserRE;
27
+ RE2 *HashTagRE;
28
+ RE2 *UserStopRE;
29
+ RE2 *HashTagStopRE;
30
+ RE2 *SkipTokenRE;
31
+ RE2 *SkipTokenPatternRE;
32
+
33
+ RE2::Options DefaultMatchOptions;
34
+ VALUE id_users, id_hashtags, id_urls, id_tokens;
35
+
36
+ string build_alternating_expr(VALUE list) {
37
+ VALUE v;
38
+ string expr = "(?:";
39
+ for (int i = 0; i < RARRAY_LEN(list) - 1; i++) {
40
+ v = rb_ary_entry(list, i);
41
+ expr += string(RSTRING_PTR(v), RSTRING_LEN(v)) + "|";
42
+ }
43
+ v = rb_ary_entry(list, RARRAY_LEN(list)-1);
44
+ expr += string(RSTRING_PTR(v), RSTRING_LEN(v)) + ")";
45
+ return expr;
46
+ }
47
+
48
/*
 * Overwrites every occurrence of `pattern` in `string`, in place, with
 * strlen(pattern) copies of the byte `c`. The string length never changes.
 *
 * string  - NUL-terminated, writable buffer.
 * pattern - NUL-terminated byte sequence to look for.
 * c       - replacement byte (passed to memset).
 *
 * An empty pattern is a no-op; it used to spin forever because strstr()
 * kept returning the same position.
 */
void replace(char *string, const char *pattern, int c) {
    const size_t width = strlen(pattern);
    if (width == 0)
        return;

    // Resume the search right after each hit so replaced bytes are not rescanned.
    for (char *hit = strstr(string, pattern); hit; hit = strstr(hit + width, pattern))
        memset(hit, c, width);
}
57
+
58
/*
 * Deletes every occurrence of `pattern` from `string`, in place, shifting
 * the remainder of the string left.
 *
 * string  - NUL-terminated, writable buffer.
 * pattern - NUL-terminated byte sequence to delete.
 *
 * The search restarts from the beginning after each deletion, so an
 * occurrence formed by joining the two sides of a deleted one is removed
 * as well (e.g. removing "ab" from "aabb" leaves "").
 *
 * An empty pattern is a no-op; it used to loop forever.
 */
void remove(char *string, const char *pattern) {
    const size_t width = strlen(pattern);
    if (width == 0)
        return;

    size_t size = strlen(string);
    char *hit;
    while ((hit = strstr(string, pattern))) {
        const size_t tail = size - (hit - string) - width;
        // Source and destination overlap, so memmove is required (memcpy is
        // undefined here). The +1 carries the terminating NUL along.
        memmove(hit, hit + width, tail + 1);
        size -= width;
    }
}
68
+
69
/* Node of a singly linked list of C strings. */
struct List {
    char *text;   // string carried by this node
    List *next;   // following node, or 0 at the end of the list
};
73
+
74
+ typedef struct DList {
75
+ List *list;
76
+ struct DList *next;
77
+ } DList;
78
+
79
+ void list_free(List *list) {
80
+ List *curr = list;
81
+ while (list) {
82
+ list = curr->next;
83
+ if (curr->text)
84
+ free(curr->text);
85
+ free(curr);
86
+ curr = list;
87
+ }
88
+ }
89
+
90
+ List* list_push(List *root, List *curr, const char *text, int size) {
91
+ List *node = (List *)malloc(sizeof(List));
92
+ if (!node) {
93
+ list_free(root);
94
+ return 0;
95
+ }
96
+
97
+ node->text = (char *)malloc(size + 1);
98
+ if (!node->text) {
99
+ free(node);
100
+ list_free(root);
101
+ return 0;
102
+ }
103
+
104
+ memcpy(node->text, text, size);
105
+
106
+ node->next = 0;
107
+ node->text[size] = 0;
108
+
109
+ if (curr)
110
+ curr->next = node;
111
+
112
+ return node;
113
+ }
114
+
115
+ VALUE list_to_array(List *node, rb_encoding *encoding) {
116
+ List *next;
117
+ VALUE array = rb_ary_new();
118
+
119
+ while (node) {
120
+ rb_ary_push(array, rb_enc_str_new(node->text, strlen(node->text), encoding));
121
+ next = node->next;
122
+ free(node->text);
123
+ free(node);
124
+ node = next;
125
+ }
126
+
127
+ return array;
128
+ }
129
+
130
+ void dlist_free(DList *dlist) {
131
+ DList *curr = dlist;
132
+ while (dlist) {
133
+ dlist = curr->next;
134
+ if (curr->list)
135
+ list_free(curr->list);
136
+ free(curr);
137
+ curr = dlist;
138
+ }
139
+ }
140
+
141
+ DList* dlist_push(DList *root, DList *curr, List *list) {
142
+ DList *node = (DList *)malloc(sizeof(DList));
143
+ if (!node) {
144
+ dlist_free(root);
145
+ list_free(list);
146
+ return 0;
147
+ }
148
+
149
+ node->list = list;
150
+ node->next = 0;
151
+
152
+ if (curr)
153
+ curr->next = node;
154
+
155
+ return node;
156
+ }
157
+
158
+ VALUE dlist_to_array(DList *node, rb_encoding *encoding) {
159
+ DList *next;
160
+ VALUE array = rb_ary_new();
161
+
162
+ while (node) {
163
+ rb_ary_push(array, list_to_array(node->list, encoding));
164
+ next = node->next;
165
+ free(node);
166
+ node = next;
167
+ }
168
+
169
+ return array;
170
+ }
171
+
172
+ List* tbr_users(VALUE text) {
173
+ List *lroot = 0, *lcurr = 0, *lnode;
174
+
175
+ string match;
176
+ StringPiece input;
177
+ input.set(RSTRING_PTR(text), RSTRING_LEN(text));
178
+ while (RE2::FindAndConsume(&input, *UserRE, &match)) {
179
+ if (UserStopRE && RE2::FullMatch(match, *UserStopRE)) continue;
180
+
181
+ if (!(lnode = list_push(lroot, lcurr, match.data(), match.size())))
182
+ rb_raise(rb_eNoMemError, "ran out of memory while storing result");
183
+
184
+ if (lcurr)
185
+ lcurr = lnode;
186
+ else
187
+ lroot = lcurr = lnode;
188
+ }
189
+
190
+ return lroot;
191
+ }
192
+
193
+
194
+ List* tbr_hashtags(VALUE text) {
195
+ List *lroot = 0, *lcurr = 0, *lnode;
196
+
197
+ string match;
198
+ StringPiece input;
199
+ input.set(RSTRING_PTR(text), RSTRING_LEN(text));
200
+ while (RE2::FindAndConsume(&input, *HashTagRE, &match)) {
201
+ if (match.size() < MIN_TAG_SIZE) continue;
202
+ if (HashTagStopRE && RE2::FullMatch(match, *HashTagStopRE)) continue;
203
+
204
+ if (!(lnode = list_push(lroot, lcurr, match.data(), match.size())))
205
+ rb_raise(rb_eNoMemError, "ran out of memory while storing result");
206
+
207
+ if (lcurr)
208
+ lcurr = lnode;
209
+ else
210
+ lroot = lcurr = lnode;
211
+ }
212
+
213
+ return lroot;
214
+ }
215
+
216
/*
 * Measures the run of alphanumeric bytes at the start of `ptr`.
 *
 * ptr - NUL-terminated string.
 * max - upper bound on the returned length.
 *
 * Returns the number of leading alphanumeric bytes, capped at `max`
 * (0 when max <= 0).
 */
int tco_slug_size(char *ptr, int max) {
    int size = 0;
    // isalnum() is only defined for unsigned char values and EOF; the text is
    // UTF-8, so bytes >= 0x80 would otherwise be passed as negative ints.
    // The NUL terminator is not alphanumeric and ends the scan by itself.
    while (size < max && isalnum((unsigned char)ptr[size]))
        size++;
    return size;
}
225
+
226
+ List* tbr_urls(VALUE text) {
227
+ int size;
228
+ List *lroot = 0, *lcurr = 0, *lnode;
229
+
230
+ char *token, *ptr, *buffer = (char*)calloc(RSTRING_LEN(text) + 1, 1);
231
+ if (!buffer)
232
+ rb_raise(rb_eNoMemError, "ran out of memory copying tweet text");
233
+
234
+ ptr = buffer;
235
+ bzero(ptr, RSTRING_LEN(text) + 1);
236
+ memcpy(ptr, RSTRING_PTR(text), RSTRING_LEN(text));
237
+
238
+ // TODO: remove duplication
239
+ while ((token = strstr(ptr, "http://t.co/"))) {
240
+ size = 12 + tco_slug_size(token + 12, 10);
241
+
242
+ if (!(lnode = list_push(lroot, lcurr, token, size))) {
243
+ free(buffer);
244
+ rb_raise(rb_eNoMemError, "ran out of memory while storing result");
245
+ }
246
+
247
+ if (lcurr)
248
+ lcurr = lnode;
249
+ else
250
+ lroot = lcurr = lnode;
251
+
252
+ ptr = token + size;
253
+ }
254
+
255
+ ptr = buffer;
256
+ while ((token = strstr(ptr, "https://t.co/"))) {
257
+ size = 13 + tco_slug_size(token + 13, 10);
258
+
259
+ if (!(lnode = list_push(lroot, lcurr, token, size))) {
260
+ free(buffer);
261
+ rb_raise(rb_eNoMemError, "ran out of memory while storing result");
262
+ }
263
+
264
+ if (lcurr)
265
+ lcurr = lnode;
266
+ else
267
+ lroot = lcurr = lnode;
268
+
269
+ ptr = token + size;
270
+ }
271
+
272
+ free(buffer);
273
+ return lroot;
274
+ }
275
+
276
+ void inline dlist_add_segment(DList **dlroot, DList **dlcurr, List **lroot, List **lcurr, sb_stemmer *stemmer) {
277
+ DList *dlnode = dlist_push(*dlroot, *dlcurr, *lroot);
278
+
279
+ if (!dlnode) {
280
+ sb_stemmer_delete(stemmer);
281
+ rb_raise(rb_eNoMemError, "ran out of memory while storing result");
282
+ }
283
+
284
+ if (*dlcurr)
285
+ *dlcurr = dlnode;
286
+ else
287
+ *dlroot = *dlcurr = dlnode;
288
+
289
+ *lroot = *lcurr = 0;
290
+ }
291
+
292
+ DList* tbr_tokens(VALUE text) {
293
+ static const char *phrase_delim = "\r\n:,;'\"{}()[]./\\%*|&!~`$+=<>?^";
294
+ static const char *word_delim = "\t- ";
295
+ static const char *token_delim = "_\t- ";
296
+
297
+ DList *dlroot = 0, *dlcurr = 0;
298
+ List *lroot = 0, *lcurr = 0, *lnode;
299
+
300
+ char *token, *ptr, *buffer = (char*)calloc(RSTRING_LEN(text) + 1, 1), *phrase_ptr, *word_ptr, *token_ptr;
301
+
302
+ if (!buffer)
303
+ rb_raise(rb_eNoMemError, "ran out of memory copying tweet text");
304
+
305
+ ptr = buffer;
306
+ bzero(ptr, RSTRING_LEN(text) + 1);
307
+ memcpy(ptr, RSTRING_PTR(text), RSTRING_LEN(text));
308
+
309
+ // downcase input
310
+ while (*ptr) *ptr++ = tolower(*ptr);
311
+ ptr = buffer;
312
+
313
+ // blank out urls
314
+ char *ptr1, *ptr2 = ptr;
315
+ while ((ptr1 = STRSTR(ptr2, "http://"))) {
316
+ ptr2 = strtok_r(ptr1, "\r\n\t ", &phrase_ptr);
317
+ ptr2 = phrase_ptr ? phrase_ptr : buffer + RSTRING_LEN(text);
318
+ memset(ptr1, '\n', ptr2 - ptr1);
319
+ }
320
+
321
+ ptr2 = ptr;
322
+ while ((ptr1 = STRSTR(ptr2, "https://"))) {
323
+ ptr2 = strtok_r(ptr1, "\r\n\t ", &phrase_ptr);
324
+ ptr2 = phrase_ptr ? phrase_ptr : buffer + RSTRING_LEN(text);
325
+ memset(ptr1, '\n', ptr2 - ptr1);
326
+ }
327
+
328
+ // remove blank out single quotes, prime
329
+ remove(ptr, "'");
330
+ remove(ptr, "\u2019");
331
+ remove(ptr, "\u2032");
332
+
333
+ // segment at unicode quotes
334
+ replace(ptr, "\u2018", '\t');
335
+ replace(ptr, "\u201c", '\t');
336
+ replace(ptr, "\u201d", '\t');
337
+ replace(ptr, "\u201e", '\t');
338
+ replace(ptr, "\u201f", '\t');
339
+ replace(ptr, "\u2033", '\t');
340
+ replace(ptr, "\u2034", '\t');
341
+ replace(ptr, "\u2035", '\t');
342
+ replace(ptr, "\u2036", '\t');
343
+ replace(ptr, "\u2037", '\t');
344
+
345
+ // angle quote
346
+ replace(ptr, "\u2039", '<');
347
+ replace(ptr, "\u203A", '>');
348
+
349
+ // slash
350
+ replace(ptr, "\u2044", '/');
351
+
352
+ // fullwidth AT => @
353
+ replace(ptr, "\uff20", '@');
354
+
355
+ // unicode spaces
356
+ replace(ptr, "\u2000", ' ');
357
+ replace(ptr, "\u2001", ' ');
358
+ replace(ptr, "\u2002", ' ');
359
+ replace(ptr, "\u2003", ' ');
360
+ replace(ptr, "\u2004", ' ');
361
+ replace(ptr, "\u2005", ' ');
362
+ replace(ptr, "\u2006", ' ');
363
+ replace(ptr, "\u2007", ' ');
364
+ replace(ptr, "\u2008", ' ');
365
+ replace(ptr, "\u2009", ' ');
366
+ replace(ptr, "\u200A", ' ');
367
+ replace(ptr, "\u200B", ' ');
368
+ replace(ptr, "\u202F", ' ');
369
+ replace(ptr, "\u3000", ' ');
370
+
371
+ // unicode dashes
372
+ replace(ptr, "\u058A", '-');
373
+ replace(ptr, "\u1806", '-');
374
+ replace(ptr, "\u2010", '-');
375
+ replace(ptr, "\u2011", '-');
376
+ replace(ptr, "\u2012", '-');
377
+ replace(ptr, "\u2013", '-');
378
+ replace(ptr, "\u2014", '-');
379
+ replace(ptr, "\u2015", '-');
380
+ replace(ptr, "\u207B", '-');
381
+ replace(ptr, "\u208B", '-');
382
+ replace(ptr, "\u2212", '-');
383
+ replace(ptr, "\u301C", '-');
384
+ replace(ptr, "\u3030", '-');
385
+
386
+ // corner brackets
387
+ replace(ptr, "\u300C", '<');
388
+ replace(ptr, "\u300E", '<');
389
+ replace(ptr, "\u301D", '<');
390
+ replace(ptr, "\u300D", '>');
391
+ replace(ptr, "\u300F", '>');
392
+ replace(ptr, "\u301F", '>');
393
+
394
+ struct sb_stemmer *en_stemmer = sb_stemmer_new("english", "UTF_8");
395
+ while ((token = strtok_r(ptr, phrase_delim, &phrase_ptr))) {
396
+ ptr = token;
397
+
398
+ while ((token = strtok_r(ptr, word_delim, &word_ptr))) {
399
+ ptr = NULL;
400
+
401
+ if (strlen(token) < MIN_WORD_SIZE || *token == '@' || *token == '#') {
402
+ if (lroot)
403
+ dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
404
+ continue;
405
+ }
406
+
407
+ ptr = token;
408
+ while ((token = strtok_r(ptr, token_delim, &token_ptr))) {
409
+ ptr = NULL;
410
+
411
+ const sb_symbol *sbstem = sb_stemmer_stem(en_stemmer, (sb_symbol *)token, strlen(token));
412
+ uint32_t sbstem_len = sb_stemmer_length(en_stemmer);
413
+
414
+ if (sbstem_len < MIN_WORD_SIZE) {
415
+ if (lroot)
416
+ dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
417
+ continue;
418
+ }
419
+
420
+ if (SkipTokenRE) {
421
+ if (RE2::FullMatch(token, *SkipTokenRE)) {
422
+ if (lroot)
423
+ dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
424
+ continue;
425
+ }
426
+
427
+ string stem((char*)sbstem, sbstem_len);
428
+ if (RE2::FullMatch(stem, *SkipTokenRE)) {
429
+ if (lroot)
430
+ dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
431
+ continue;
432
+ }
433
+ }
434
+
435
+ if (SkipTokenPatternRE && RE2::FullMatch(token, *SkipTokenPatternRE)) {
436
+ if (lroot)
437
+ dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
438
+ continue;
439
+ }
440
+
441
+ if (!(lnode = list_push(lroot, lcurr, token, strlen(token)))) {
442
+ dlist_free(dlroot);
443
+ sb_stemmer_delete(en_stemmer);
444
+ rb_raise(rb_eNoMemError, "ran out of memory while storing result");
445
+ }
446
+
447
+ if (lcurr)
448
+ lcurr = lnode;
449
+ else
450
+ lroot = lcurr = lnode;
451
+ }
452
+
453
+ ptr = NULL;
454
+ }
455
+
456
+ ptr = NULL;
457
+ if (lroot)
458
+ dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
459
+ }
460
+
461
+ sb_stemmer_delete(en_stemmer);
462
+ free(buffer);
463
+ return dlroot;
464
+ }
465
+
466
+ #define TBR_FUNC(a) (VALUE (*)(void*))(a)
467
+ #define TBR_CALL(a, text) rb_thread_blocking_region(TBR_FUNC(a), (void *)text, RUBY_UBF_PROCESS, 0)
468
+
469
+ // API
470
+
471
+ VALUE users(VALUE self, VALUE text, bool validated = false) {
472
+ if (!validated && (NIL_P(text) || TYPE(text) != T_STRING))
473
+ rb_raise(rb_eArgError, "requires tweet text");
474
+ return list_to_array((List*)TBR_CALL(tbr_users, text), rb_enc_get(text));
475
+ }
476
+
477
+ VALUE hashtags(VALUE self, VALUE text, bool validated = false) {
478
+ if (!validated && (NIL_P(text) || TYPE(text) != T_STRING))
479
+ rb_raise(rb_eArgError, "requires tweet text");
480
+ return list_to_array((List*)TBR_CALL(tbr_hashtags, text), rb_enc_get(text));
481
+ }
482
+
483
+ VALUE urls(VALUE self, VALUE text, bool validated = false) {
484
+ if (!validated && (NIL_P(text) || TYPE(text) != T_STRING))
485
+ rb_raise(rb_eArgError, "requires tweet text");
486
+ return list_to_array((List*)TBR_CALL(tbr_urls, text), rb_enc_get(text));
487
+ }
488
+
489
+ VALUE tokens(VALUE self, VALUE text, bool validated = false) {
490
+ if (!validated && (NIL_P(text) || TYPE(text) != T_STRING))
491
+ rb_raise(rb_eArgError, "requires tweet text");
492
+ return dlist_to_array((DList*)TBR_CALL(tbr_tokens, text), rb_enc_get(text));
493
+ }
494
+
495
+ VALUE entities(VALUE self, VALUE text) {
496
+ if (NIL_P(text) || TYPE(text) != T_STRING)
497
+ rb_raise(rb_eArgError, "requires tweet text");
498
+
499
+ VALUE result = rb_hash_new();
500
+ rb_hash_aset(result, id_users, users(self, text, true));
501
+ rb_hash_aset(result, id_hashtags, hashtags(self, text, true));
502
+ rb_hash_aset(result, id_urls, urls(self, text, true));
503
+ rb_hash_aset(result, id_tokens, tokens(self, text, true));
504
+ return result;
505
+ }
506
+
507
+ VALUE skip_users(VALUE self, VALUE list) {
508
+ if (UserStopRE)
509
+ delete UserStopRE;
510
+ UserStopRE = NULL;
511
+
512
+ if (NIL_P(list)) return Qtrue;
513
+
514
+ if (TYPE(list) != T_ARRAY)
515
+ rb_raise(rb_eArgError, "requires a list of screen names minus @");
516
+
517
+ UserStopRE = new RE2("@" + build_alternating_expr(list), DefaultMatchOptions);
518
+ if (!UserStopRE->ok())
519
+ rb_raise(rb_eArgError, "%s", UserStopRE->error().c_str());
520
+
521
+ return Qtrue;
522
+ }
523
+
524
+ VALUE skip_hashtags(VALUE self, VALUE list) {
525
+ if (HashTagStopRE)
526
+ delete HashTagStopRE;
527
+ HashTagStopRE = NULL;
528
+
529
+ if (NIL_P(list)) return Qtrue;
530
+
531
+ if (TYPE(list) != T_ARRAY)
532
+ rb_raise(rb_eArgError, "requires a list of hashtags minus #");
533
+
534
+ HashTagStopRE = new RE2("#" + build_alternating_expr(list), DefaultMatchOptions);
535
+ if (!HashTagStopRE->ok())
536
+ rb_raise(rb_eArgError, "%s", HashTagStopRE->error().c_str());
537
+
538
+ return Qtrue;
539
+ }
540
+
541
+ VALUE skip_tokens(VALUE self, VALUE list) {
542
+ if (SkipTokenRE)
543
+ delete SkipTokenRE;
544
+ SkipTokenRE = NULL;
545
+
546
+ if (NIL_P(list)) return Qtrue;
547
+
548
+ if (TYPE(list) != T_ARRAY)
549
+ rb_raise(rb_eArgError, "requires a list of words");
550
+
551
+ struct sb_stemmer *en_stemmer = sb_stemmer_new("english", "UTF_8");
552
+
553
+ // add stems as well
554
+ int i, max = RARRAY_LEN(list);
555
+ for (int i = 0; i < max; i++) {
556
+ VALUE word = rb_ary_entry(list, i);
557
+ rb_encoding *encoding = rb_enc_get(word);
558
+ const sb_symbol *sbstem = sb_stemmer_stem(en_stemmer, (sb_symbol *)RSTRING_PTR(word), RSTRING_LEN(word));
559
+ uint32_t sbstem_len = sb_stemmer_length(en_stemmer);
560
+ rb_ary_push(list, rb_enc_str_new((char*)sbstem, sbstem_len, encoding));
561
+ }
562
+
563
+ sb_stemmer_delete(en_stemmer);
564
+
565
+ // too bad, no uniq c api
566
+ rb_funcall(list, rb_intern("uniq!"), 0);
567
+ SkipTokenRE = new RE2("^" + build_alternating_expr(list) + "$", DefaultMatchOptions);
568
+ if (!SkipTokenRE->ok())
569
+ rb_raise(rb_eArgError, "%s", SkipTokenRE->error().c_str());
570
+
571
+ return Qtrue;
572
+ }
573
+
574
+ VALUE skip_token_pattern(VALUE self, VALUE re) {
575
+ if (SkipTokenPatternRE)
576
+ delete SkipTokenPatternRE;
577
+
578
+ SkipTokenPatternRE = NULL;
579
+
580
+ if (NIL_P(re)) return Qtrue;
581
+
582
+ SkipTokenPatternRE = new RE2(CSTRING(re), DefaultMatchOptions);
583
+ if (!SkipTokenPatternRE->ok())
584
+ rb_raise(rb_eArgError, "%s", SkipTokenPatternRE->error().c_str());
585
+
586
+ return Qtrue;
587
+ }
588
+
589
+
590
+ extern "C" {
591
+ void Init_chipper(void) {
592
+ UserRE = new RE2("(?:^|[^[:alnum:]])+([@@][[:alnum:]_\\-]+)");
593
+ HashTagRE = new RE2("(?:^|[^[:alnum:]])+(#[[:alnum:]}_]+)");
594
+
595
+ UserStopRE = NULL;
596
+ HashTagStopRE = NULL;
597
+ SkipTokenRE = NULL;
598
+ SkipTokenPatternRE = NULL;
599
+
600
+ DefaultMatchOptions.set_case_sensitive(false);
601
+ DefaultMatchOptions.set_log_errors(false);
602
+
603
+ id_users = ID2SYM(rb_intern("users"));
604
+ id_hashtags = ID2SYM(rb_intern("hashtags"));
605
+ id_urls = ID2SYM(rb_intern("urls"));
606
+ id_tokens = ID2SYM(rb_intern("tokens"));
607
+
608
+ rb_global_variable(&id_users);
609
+ rb_global_variable(&id_hashtags);
610
+ rb_global_variable(&id_urls);
611
+ rb_global_variable(&id_tokens);
612
+
613
+ VALUE mChipper = rb_define_module("Chipper");
614
+ rb_define_module_function(mChipper, "users", RUBY_METHOD_FUNC(users), 1);
615
+ rb_define_module_function(mChipper, "hashtags", RUBY_METHOD_FUNC(hashtags), 1);
616
+ rb_define_module_function(mChipper, "urls", RUBY_METHOD_FUNC(urls), 1);
617
+ rb_define_module_function(mChipper, "tokens", RUBY_METHOD_FUNC(tokens), 1);
618
+ rb_define_module_function(mChipper, "entities", RUBY_METHOD_FUNC(entities), 1);
619
+ rb_define_module_function(mChipper, "skip_users", RUBY_METHOD_FUNC(skip_users), 1);
620
+ rb_define_module_function(mChipper, "skip_hashtags", RUBY_METHOD_FUNC(skip_hashtags), 1);
621
+ rb_define_module_function(mChipper, "skip_tokens", RUBY_METHOD_FUNC(skip_tokens), 1);
622
+ rb_define_module_function(mChipper, "skip_token_pattern", RUBY_METHOD_FUNC(skip_token_pattern), 1);
623
+
624
+ rb_define_const(mChipper, "VERSION", rb_str_new2(CHIPPER_VERSION));
625
+ }
626
+ }
data/ext/src/version.h ADDED
@@ -0,0 +1 @@
1
+ #define CHIPPER_VERSION "0.4.2"
data/ext/stemmer.rb ADDED
@@ -0,0 +1,40 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Builds the bundled libstemmer_c library by running make in its directory.

require 'fileutils'
require 'rbconfig'

################################################################################
################################################################################
## Derived from ruby-stemmer https://github.com/aurelian/ruby-stemmer

# FreeBSD make is gmake
make = (RUBY_PLATFORM =~ /freebsd/) ? 'gmake' : 'make'

LIBSTEMMER = File.expand_path(File.join(File.dirname(__FILE__), 'libstemmer_c'))

# MacOS architecture mess up
if RUBY_PLATFORM =~ /darwin/
  # see: #issue/3, #issue/5
  begin
    # NOTE(review): ARCHFLAGS is only recomputed when it is already set, which
    # looks inverted given the error message below - confirm the intent.
    unless ENV['ARCHFLAGS'].nil?
      # RbConfig replaces the old top-level Config constant, which no longer exists.
      ruby_bin = File.expand_path(File.join(RbConfig::CONFIG['bindir'], RbConfig::CONFIG['RUBY_INSTALL_NAME']))
      # strip (not strip!) so unchanged output does not turn into nil
      ENV['ARCHFLAGS'] = "-arch " + %x[file #{ruby_bin}].strip.match(/executable (.+)$/)[1]
    end
  rescue
    $stderr << "Failed to get your ruby executable architecture.\n"
    $stderr << "Please specify one using $ARCHFLAGS environment variable.\n"
    # report the failure to the caller, like the make step below does
    exit(false)
  end
  # see: #issue/9, #issue/6
  # see: man compat
  if ENV['COMMAND_MODE'] == 'legacy'
    $stdout << "Setting compat mode to unix2003.\n"
    ENV['COMMAND_MODE'] = 'unix2003'
  end
end

# make libstemmer_c. unless we're cross-compiling.
unless RUBY_PLATFORM =~ /i386-mingw32/
  Dir.chdir(LIBSTEMMER) do
    system(make) || exit(false)
  end
end

################################################################################
################################################################################