chipper 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/src/chipper.cc
ADDED
|
@@ -0,0 +1,626 @@
|
|
|
1
|
+
#include <stdlib.h>
|
|
2
|
+
#include <iostream>
|
|
3
|
+
#include <vector>
|
|
4
|
+
#include "re2/re2.h"
|
|
5
|
+
#include "re2/stringpiece.h"
|
|
6
|
+
#include "libstemmer.h"
|
|
7
|
+
#include "version.h"
|
|
8
|
+
|
|
9
|
+
// Substring search used when blanking out URLs: the case-insensitive GNU
// extension where the compiler advertises itself as GNU C, plain strstr
// everywhere else.
#if defined(__GNUC__) && __GNUC__
#  define STRSTR strcasestr
#else
#  define STRSTR strstr
#endif
|
|
14
|
+
|
|
15
|
+
#include <ruby/ruby.h>
|
|
16
|
+
#include <ruby/io.h>
|
|
17
|
+
|
|
18
|
+
// TO_S(v)    : result of calling Ruby's #to_s on v.
// CSTRING(v) : char pointer into the String returned by TO_S(v).
#define TO_S(v)    rb_funcall((v), rb_intern("to_s"), 0)
#define CSTRING(v) RSTRING_PTR(TO_S(v))

// Minimum byte lengths; shorter hashtags and word tokens are dropped.
#define MIN_TAG_SIZE  3
#define MIN_WORD_SIZE 3
|
|
22
|
+
|
|
23
|
+
using namespace std;
|
|
24
|
+
using namespace re2;
|
|
25
|
+
|
|
26
|
+
RE2 *UserRE;
|
|
27
|
+
RE2 *HashTagRE;
|
|
28
|
+
RE2 *UserStopRE;
|
|
29
|
+
RE2 *HashTagStopRE;
|
|
30
|
+
RE2 *SkipTokenRE;
|
|
31
|
+
RE2 *SkipTokenPatternRE;
|
|
32
|
+
|
|
33
|
+
RE2::Options DefaultMatchOptions;
|
|
34
|
+
VALUE id_users, id_hashtags, id_urls, id_tokens;
|
|
35
|
+
|
|
36
|
+
// Builds the non-capturing alternation "(?:a|b|c)" from a Ruby array.
//
// list: a Ruby Array; each entry is converted with #to_s (TO_S), so String
//       entries are used unchanged and other objects no longer get their
//       memory read as if they were Strings.
//
// Returns the expression text. An empty array yields "(?:)" instead of
// dereferencing the nil that rb_ary_entry(list, -1) returns.
// Entries are inserted verbatim: regex metacharacters are NOT escaped.
string build_alternating_expr(VALUE list) {
  string expr = "(?:";
  const long count = RARRAY_LEN(list);   // RARRAY_LEN is a long, not an int

  for (long i = 0; i < count; i++) {
    VALUE entry = TO_S(rb_ary_entry(list, i));
    if (i > 0)
      expr += '|';
    expr.append(RSTRING_PTR(entry), RSTRING_LEN(entry));
  }

  expr += ')';
  return expr;
}
|
|
47
|
+
|
|
48
|
+
// Overwrites every occurrence of `pattern` in `string`, in place, with the
// single byte `c` repeated strlen(pattern) times, so the string keeps its
// length (a 3-byte UTF-8 sequence becomes three copies of `c`).
//
// string:  NUL-terminated, writable buffer.
// pattern: NUL-terminated byte sequence to look for.
// c:       replacement byte (passed to memset).
//
// An empty pattern is a no-op; previously it never terminated because strstr
// kept returning the same position and the cursor advanced by zero bytes.
void replace(char *string, const char *pattern, int c) {
  const size_t width = strlen(pattern);
  if (width == 0)
    return;

  char *hit, *cursor = string;
  while ((hit = strstr(cursor, pattern))) {
    memset(hit, c, width);
    cursor = hit + width;   // resume after the bytes just overwritten
  }
}
|
|
57
|
+
|
|
58
|
+
// Deletes every occurrence of `pattern` from `string`, in place, closing the
// gap each time so the string gets shorter.
//
// string:  NUL-terminated, writable buffer.
// pattern: NUL-terminated byte sequence to delete.
//
// The search restarts from the beginning after each deletion (as before), so
// an occurrence formed by joining the two sides of a deletion is removed too.
// The tail is shifted with memmove because source and destination overlap
// (memcpy on overlapping ranges is undefined). An empty pattern is a no-op;
// previously it looped forever.
void remove(char *string, const char *pattern) {
  const size_t width = strlen(pattern);
  if (width == 0)
    return;

  char *hit;
  while ((hit = strstr(string, pattern))) {
    char *tail = hit + width;
    memmove(hit, tail, strlen(tail) + 1);   // +1 carries the terminator along
  }
}
|
|
68
|
+
|
|
69
|
+
// Node of a singly linked list of heap-allocated, NUL-terminated strings.
struct List {
  char *text;   // malloc'ed copy made by list_push
  List *next;   // following node, or 0 at the tail
};

// Node of a singly linked list whose elements are themselves List chains.
struct DList {
  List  *list;  // head of the chain stored in this node
  DList *next;  // following node, or 0 at the tail
};
|
|
78
|
+
|
|
79
|
+
// Releases every node of the chain starting at `list`, including each
// node's text. Passing 0 does nothing.
void list_free(List *list) {
  for (List *node = list, *following; node; node = following) {
    following = node->next;   // read before the node is released
    free(node->text);         // free(0) is a no-op, so no explicit check
    free(node);
  }
}
|
|
89
|
+
|
|
90
|
+
// Allocates a node holding a NUL-terminated copy of text[0..size) and links
// it after `curr` when `curr` is non-null.
//
// root: head of the chain being built; the whole chain is freed if either
//       allocation fails.
// Returns the new node, or 0 on allocation failure.
List* list_push(List *root, List *curr, const char *text, int size) {
  List *fresh = static_cast<List *>(malloc(sizeof(List)));
  char *copy = fresh ? static_cast<char *>(malloc(size + 1)) : 0;

  if (copy == 0) {
    free(fresh);        // no-op when the node allocation itself failed
    list_free(root);
    return 0;
  }

  memcpy(copy, text, size);
  copy[size] = 0;

  fresh->text = copy;
  fresh->next = 0;

  if (curr != 0)
    curr->next = fresh;

  return fresh;
}
|
|
114
|
+
|
|
115
|
+
// Converts a chain into a Ruby Array of Strings tagged with `encoding`,
// freeing each node (and its text) as soon as it has been copied.
// Each string's length is taken with strlen, so it ends at the first NUL.
VALUE list_to_array(List *node, rb_encoding *encoding) {
  VALUE strings = rb_ary_new();

  while (node) {
    List *spent = node;
    rb_ary_push(strings, rb_enc_str_new(spent->text, strlen(spent->text), encoding));
    node = spent->next;
    free(spent->text);
    free(spent);
  }

  return strings;
}
|
|
129
|
+
|
|
130
|
+
// Releases every node of the outer chain together with the inner chain each
// node holds. Passing 0 does nothing.
void dlist_free(DList *dlist) {
  for (DList *segment = dlist; segment; ) {
    DList *following = segment->next;
    list_free(segment->list);   // list_free accepts 0
    free(segment);
    segment = following;
  }
}
|
|
140
|
+
|
|
141
|
+
// Wraps `list` in a new outer node and links it after `curr` when `curr` is
// non-null.
//
// Returns the new node, or 0 on allocation failure; in that case both the
// outer chain starting at `root` and `list` itself are freed.
DList* dlist_push(DList *root, DList *curr, List *list) {
  DList *segment = static_cast<DList *>(malloc(sizeof(DList)));

  if (segment == 0) {
    dlist_free(root);
    list_free(list);
    return 0;
  }

  segment->next = 0;
  segment->list = list;

  if (curr != 0)
    curr->next = segment;

  return segment;
}
|
|
157
|
+
|
|
158
|
+
// Converts the outer chain into a Ruby Array of Arrays of Strings. Inner
// chains are consumed by list_to_array; outer nodes are freed here.
VALUE dlist_to_array(DList *node, rb_encoding *encoding) {
  VALUE phrases = rb_ary_new();

  for (DList *segment = node; segment; ) {
    DList *following = segment->next;
    rb_ary_push(phrases, list_to_array(segment->list, encoding));
    free(segment);
    segment = following;
  }

  return phrases;
}
|
|
171
|
+
|
|
172
|
+
// Collects every capture of UserRE found in `text`, in order of appearance,
// skipping captures that fully match UserStopRE when a stop list is set.
//
// Returns the head of a List chain (0 when nothing matched).
// NOTE(review): invoked through TBR_CALL (rb_thread_blocking_region) yet it
// calls rb_raise on allocation failure — confirm raising is safe there.
List* tbr_users(VALUE text) {
  List *head = 0, *tail = 0;
  string mention;
  StringPiece remaining(RSTRING_PTR(text), RSTRING_LEN(text));

  while (RE2::FindAndConsume(&remaining, *UserRE, &mention)) {
    const bool stopped = UserStopRE && RE2::FullMatch(mention, *UserStopRE);
    if (stopped)
      continue;

    List *added = list_push(head, tail, mention.data(), mention.size());
    if (added == 0)
      rb_raise(rb_eNoMemError, "ran out of memory while storing result");

    if (head == 0)
      head = added;
    tail = added;
  }

  return head;
}
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
// Collects every capture of HashTagRE found in `text`, in order of
// appearance. Captures shorter than MIN_TAG_SIZE bytes, or fully matching
// HashTagStopRE when a stop list is set, are skipped.
//
// Returns the head of a List chain (0 when nothing matched).
// NOTE(review): invoked through TBR_CALL (rb_thread_blocking_region) yet it
// calls rb_raise on allocation failure — confirm raising is safe there.
List* tbr_hashtags(VALUE text) {
  List *head = 0, *tail = 0;
  string tag;
  StringPiece remaining(RSTRING_PTR(text), RSTRING_LEN(text));

  while (RE2::FindAndConsume(&remaining, *HashTagRE, &tag)) {
    const bool too_short = tag.size() < MIN_TAG_SIZE;
    if (too_short || (HashTagStopRE && RE2::FullMatch(tag, *HashTagStopRE)))
      continue;

    List *added = list_push(head, tail, tag.data(), tag.size());
    if (added == 0)
      rb_raise(rb_eNoMemError, "ran out of memory while storing result");

    if (head == 0)
      head = added;
    tail = added;
  }

  return head;
}
|
|
215
|
+
|
|
216
|
+
// Counts the leading alphanumeric bytes at `ptr`, stopping at the first
// non-alphanumeric byte, at the terminating NUL, or once `max` bytes have
// been counted.
//
// ptr: NUL-terminated string.
// max: upper bound for the result; values <= 0 yield 0.
//
// Bytes are converted to unsigned char before isalnum: handing it a negative
// char (any UTF-8 lead or continuation byte where char is signed) is
// undefined behaviour.
int tco_slug_size(char *ptr, int max) {
  int size = 0;
  while (size < max && isalnum(static_cast<unsigned char>(ptr[size])))
    size++;
  return size;
}
|
|
225
|
+
|
|
226
|
+
List* tbr_urls(VALUE text) {
|
|
227
|
+
int size;
|
|
228
|
+
List *lroot = 0, *lcurr = 0, *lnode;
|
|
229
|
+
|
|
230
|
+
char *token, *ptr, *buffer = (char*)calloc(RSTRING_LEN(text) + 1, 1);
|
|
231
|
+
if (!buffer)
|
|
232
|
+
rb_raise(rb_eNoMemError, "ran out of memory copying tweet text");
|
|
233
|
+
|
|
234
|
+
ptr = buffer;
|
|
235
|
+
bzero(ptr, RSTRING_LEN(text) + 1);
|
|
236
|
+
memcpy(ptr, RSTRING_PTR(text), RSTRING_LEN(text));
|
|
237
|
+
|
|
238
|
+
// TODO: remove duplication
|
|
239
|
+
while ((token = strstr(ptr, "http://t.co/"))) {
|
|
240
|
+
size = 12 + tco_slug_size(token + 12, 10);
|
|
241
|
+
|
|
242
|
+
if (!(lnode = list_push(lroot, lcurr, token, size))) {
|
|
243
|
+
free(buffer);
|
|
244
|
+
rb_raise(rb_eNoMemError, "ran out of memory while storing result");
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
if (lcurr)
|
|
248
|
+
lcurr = lnode;
|
|
249
|
+
else
|
|
250
|
+
lroot = lcurr = lnode;
|
|
251
|
+
|
|
252
|
+
ptr = token + size;
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
ptr = buffer;
|
|
256
|
+
while ((token = strstr(ptr, "https://t.co/"))) {
|
|
257
|
+
size = 13 + tco_slug_size(token + 13, 10);
|
|
258
|
+
|
|
259
|
+
if (!(lnode = list_push(lroot, lcurr, token, size))) {
|
|
260
|
+
free(buffer);
|
|
261
|
+
rb_raise(rb_eNoMemError, "ran out of memory while storing result");
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
if (lcurr)
|
|
265
|
+
lcurr = lnode;
|
|
266
|
+
else
|
|
267
|
+
lroot = lcurr = lnode;
|
|
268
|
+
|
|
269
|
+
ptr = token + size;
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
free(buffer);
|
|
273
|
+
return lroot;
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
// Moves the word chain under construction (*lroot) into a new node at the
// end of the outer chain (*dlroot / *dlcurr), then resets *lroot and *lcurr
// so the caller can start a fresh chain.
//
// If the outer node cannot be allocated, `stemmer` is deleted and
// NoMemoryError is raised (dlist_push has already freed both chains).
inline void dlist_add_segment(DList **dlroot, DList **dlcurr, List **lroot, List **lcurr, sb_stemmer *stemmer) {
  DList *segment = dlist_push(*dlroot, *dlcurr, *lroot);

  if (segment == 0) {
    sb_stemmer_delete(stemmer);
    rb_raise(rb_eNoMemError, "ran out of memory while storing result");
  }

  if (*dlcurr == 0)
    *dlroot = segment;   // first segment also becomes the head
  *dlcurr = segment;

  *lroot = 0;
  *lcurr = 0;
}
|
|
291
|
+
|
|
292
|
+
// Splits tweet text into phrases, each a chain of lowercase word tokens.
//
// Steps, all performed on a private copy of `text`:
//   1. lowercase byte-wise;
//   2. overwrite every "http://" / "https://" URL with newlines;
//   3. delete apostrophe-like characters, fold assorted Unicode punctuation
//      and spaces to ASCII stand-ins;
//   4. cut into phrases (phrase_delim), words (word_delim) and sub-tokens
//      (token_delim). A word that is too short, starts with '@' or '#', or a
//      sub-token that is filtered (short stem, SkipTokenRE on token or stem,
//      SkipTokenPatternRE on token) closes the current phrase.
// The token itself is stored, not its stem.
//
// Returns the head of a DList chain (0 when no token survived). Raises
// NoMemoryError on allocation failure.
//
// Changes from the previous version: lowercasing no longer reads and
// increments the same pointer in one expression and converts to unsigned
// char before tolower; the working buffer is released before raising on a
// list_push failure; a NULL stemmer is detected; the redundant bzero after
// calloc is gone.
// NOTE(review): dlist_add_segment raises on allocation failure without
// freeing `buffer`; fixing that needs a change to its signature.
DList* tbr_tokens(VALUE text) {
  static const char *phrase_delim = "\r\n:,;'\"{}()[]./\\%*|&!~`$+=<>?^";
  static const char *word_delim   = "\t- ";
  static const char *token_delim  = "_\t- ";

  static const char *const url_schemes[] = { "http://", "https://" };

  // single quote, right single quotation mark, prime: removed outright
  static const char *const dropped[] = { "'", "\u2019", "\u2032" };

  // Each pattern is overwritten by `ascii` repeated over the pattern's bytes.
  static const struct { const char *pattern; char ascii; } folded[] = {
    // segment at unicode quotes
    { "\u2018", '\t' }, { "\u201c", '\t' }, { "\u201d", '\t' }, { "\u201e", '\t' },
    { "\u201f", '\t' }, { "\u2033", '\t' }, { "\u2034", '\t' }, { "\u2035", '\t' },
    { "\u2036", '\t' }, { "\u2037", '\t' },
    // angle quote
    { "\u2039", '<' }, { "\u203A", '>' },
    // slash
    { "\u2044", '/' },
    // fullwidth AT => @
    { "\uff20", '@' },
    // unicode spaces
    { "\u2000", ' ' }, { "\u2001", ' ' }, { "\u2002", ' ' }, { "\u2003", ' ' },
    { "\u2004", ' ' }, { "\u2005", ' ' }, { "\u2006", ' ' }, { "\u2007", ' ' },
    { "\u2008", ' ' }, { "\u2009", ' ' }, { "\u200A", ' ' }, { "\u200B", ' ' },
    { "\u202F", ' ' }, { "\u3000", ' ' },
    // unicode dashes
    { "\u058A", '-' }, { "\u1806", '-' }, { "\u2010", '-' }, { "\u2011", '-' },
    { "\u2012", '-' }, { "\u2013", '-' }, { "\u2014", '-' }, { "\u2015", '-' },
    { "\u207B", '-' }, { "\u208B", '-' }, { "\u2212", '-' }, { "\u301C", '-' },
    { "\u3030", '-' },
    // corner brackets
    { "\u300C", '<' }, { "\u300E", '<' }, { "\u301D", '<' },
    { "\u300D", '>' }, { "\u300F", '>' }, { "\u301F", '>' }
  };

  DList *dlroot = 0, *dlcurr = 0;
  List *lroot = 0, *lcurr = 0, *lnode;
  char *token, *ptr, *phrase_ptr, *word_ptr, *token_ptr;

  // NUL-terminated working copy (calloc zero-fills, so the byte after the
  // copied text is already the terminator).
  const long length = RSTRING_LEN(text);
  char *buffer = (char*)calloc(length + 1, 1);
  if (!buffer)
    rb_raise(rb_eNoMemError, "ran out of memory copying tweet text");
  memcpy(buffer, RSTRING_PTR(text), length);

  // downcase input
  for (ptr = buffer; *ptr; ptr++)
    *ptr = (char)tolower((unsigned char)*ptr);

  // blank out urls: everything from the scheme up to and including the next
  // whitespace byte (or the end of the text) becomes '\n'
  for (size_t i = 0; i < sizeof(url_schemes) / sizeof(url_schemes[0]); i++) {
    char *url, *scan = buffer;
    while ((url = STRSTR(scan, url_schemes[i]))) {
      strtok_r(url, "\r\n\t ", &phrase_ptr);
      // some strtok_r implementations leave the save pointer NULL at the end
      scan = phrase_ptr ? phrase_ptr : buffer + length;
      memset(url, '\n', scan - url);
    }
  }

  for (size_t i = 0; i < sizeof(dropped) / sizeof(dropped[0]); i++)
    remove(buffer, dropped[i]);

  for (size_t i = 0; i < sizeof(folded) / sizeof(folded[0]); i++)
    replace(buffer, folded[i].pattern, folded[i].ascii);

  struct sb_stemmer *en_stemmer = sb_stemmer_new("english", "UTF_8");
  if (!en_stemmer) {
    free(buffer);
    rb_raise(rb_eNoMemError, "ran out of memory creating stemmer");
  }

  ptr = buffer;
  while ((token = strtok_r(ptr, phrase_delim, &phrase_ptr))) {
    ptr = token;

    while ((token = strtok_r(ptr, word_delim, &word_ptr))) {
      ptr = NULL;

      if (strlen(token) < MIN_WORD_SIZE || *token == '@' || *token == '#') {
        if (lroot)
          dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
        continue;
      }

      ptr = token;
      while ((token = strtok_r(ptr, token_delim, &token_ptr))) {
        ptr = NULL;

        const sb_symbol *sbstem = sb_stemmer_stem(en_stemmer, (sb_symbol *)token, strlen(token));
        uint32_t sbstem_len = sb_stemmer_length(en_stemmer);

        // Same checks, in the same order, as the former chain of ifs.
        bool skip = sbstem_len < MIN_WORD_SIZE;
        if (!skip && SkipTokenRE) {
          skip = RE2::FullMatch(token, *SkipTokenRE);
          if (!skip)
            skip = RE2::FullMatch(string((char*)sbstem, sbstem_len), *SkipTokenRE);
        }
        if (!skip && SkipTokenPatternRE)
          skip = RE2::FullMatch(token, *SkipTokenPatternRE);

        if (skip) {
          if (lroot)
            dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
          continue;
        }

        if (!(lnode = list_push(lroot, lcurr, token, strlen(token)))) {
          dlist_free(dlroot);
          sb_stemmer_delete(en_stemmer);
          free(buffer);   // was leaked on this path
          rb_raise(rb_eNoMemError, "ran out of memory while storing result");
        }

        if (!lroot)
          lroot = lnode;
        lcurr = lnode;
      }

      ptr = NULL;
    }

    ptr = NULL;
    if (lroot)
      dlist_add_segment(&dlroot, &dlcurr, &lroot, &lcurr, en_stemmer);
  }

  sb_stemmer_delete(en_stemmer);
  free(buffer);
  return dlroot;
}
|
|
465
|
+
|
|
466
|
+
// TBR_FUNC casts a worker function to the VALUE (*)(void*) shape that
// rb_thread_blocking_region takes; TBR_CALL runs worker `a` on `text`
// through rb_thread_blocking_region with RUBY_UBF_PROCESS as the unblock
// function.
#define TBR_FUNC(a) ((VALUE (*)(void*))(a))
#define TBR_CALL(a, text) \
  rb_thread_blocking_region(TBR_FUNC(a), (void *)(text), RUBY_UBF_PROCESS, 0)
|
|
468
|
+
|
|
469
|
+
// API
|
|
470
|
+
|
|
471
|
+
// Chipper.users(text): Array of the @mentions found in `text`.
//
// validated: when true the String check is skipped; entities() passes true
//            after performing the same check itself.
// Raises ArgumentError when the check runs and `text` is not a String.
VALUE users(VALUE self, VALUE text, bool validated = false) {
  const bool acceptable = validated || (!NIL_P(text) && TYPE(text) == T_STRING);
  if (!acceptable)
    rb_raise(rb_eArgError, "requires tweet text");

  List *found = (List*)TBR_CALL(tbr_users, text);
  return list_to_array(found, rb_enc_get(text));
}
|
|
476
|
+
|
|
477
|
+
// Chipper.hashtags(text): Array of the #hashtags found in `text`.
//
// validated: when true the String check is skipped; entities() passes true
//            after performing the same check itself.
// Raises ArgumentError when the check runs and `text` is not a String.
VALUE hashtags(VALUE self, VALUE text, bool validated = false) {
  const bool acceptable = validated || (!NIL_P(text) && TYPE(text) == T_STRING);
  if (!acceptable)
    rb_raise(rb_eArgError, "requires tweet text");

  List *found = (List*)TBR_CALL(tbr_hashtags, text);
  return list_to_array(found, rb_enc_get(text));
}
|
|
482
|
+
|
|
483
|
+
// Chipper.urls(text): Array of the t.co links found in `text`.
//
// validated: when true the String check is skipped; entities() passes true
//            after performing the same check itself.
// Raises ArgumentError when the check runs and `text` is not a String.
VALUE urls(VALUE self, VALUE text, bool validated = false) {
  const bool acceptable = validated || (!NIL_P(text) && TYPE(text) == T_STRING);
  if (!acceptable)
    rb_raise(rb_eArgError, "requires tweet text");

  List *found = (List*)TBR_CALL(tbr_urls, text);
  return list_to_array(found, rb_enc_get(text));
}
|
|
488
|
+
|
|
489
|
+
// Chipper.tokens(text): Array of phrases, each an Array of word tokens.
//
// validated: when true the String check is skipped; entities() passes true
//            after performing the same check itself.
// Raises ArgumentError when the check runs and `text` is not a String.
VALUE tokens(VALUE self, VALUE text, bool validated = false) {
  const bool acceptable = validated || (!NIL_P(text) && TYPE(text) == T_STRING);
  if (!acceptable)
    rb_raise(rb_eArgError, "requires tweet text");

  DList *phrases = (DList*)TBR_CALL(tbr_tokens, text);
  return dlist_to_array(phrases, rb_enc_get(text));
}
|
|
494
|
+
|
|
495
|
+
// Chipper.entities(text): Hash with the keys :users, :hashtags, :urls and
// :tokens, each holding what the extractor of the same name returns.
// `text` is checked once here; the extractors are then called with
// validated = true. Raises ArgumentError when `text` is not a String.
VALUE entities(VALUE self, VALUE text) {
  typedef VALUE (*extractor_fn)(VALUE, VALUE, bool);

  if (NIL_P(text) || TYPE(text) != T_STRING)
    rb_raise(rb_eArgError, "requires tweet text");

  // key / extractor pairs, filled in this order
  const struct { VALUE key; extractor_fn extract; } parts[] = {
    { id_users,    users    },
    { id_hashtags, hashtags },
    { id_urls,     urls     },
    { id_tokens,   tokens   }
  };

  VALUE result = rb_hash_new();
  for (size_t i = 0; i < sizeof(parts) / sizeof(parts[0]); i++)
    rb_hash_aset(result, parts[i].key, parts[i].extract(self, text, true));

  return result;
}
|
|
506
|
+
|
|
507
|
+
// Chipper.skip_users(list): installs (or clears) the screen-name stop list.
//
// list: nil or an empty Array clears the stop list; otherwise an Array of
//       screen names without the leading '@'. Entries are inserted into the
//       pattern verbatim, so regex syntax in them is live.
//
// The previous stop list is discarded before the argument is inspected, as
// before. Returns true. Raises ArgumentError when `list` is neither nil nor
// an Array, or when the resulting pattern does not compile.
//
// A pattern that fails to compile is now deleted instead of being left
// installed in UserStopRE, and an empty Array no longer reaches
// build_alternating_expr.
VALUE skip_users(VALUE self, VALUE list) {
  delete UserStopRE;   // deleting NULL is a no-op
  UserStopRE = NULL;

  if (NIL_P(list)) return Qtrue;

  if (TYPE(list) != T_ARRAY)
    rb_raise(rb_eArgError, "requires a list of screen names minus @");

  if (RARRAY_LEN(list) == 0) return Qtrue;

  RE2 *candidate = new RE2("@" + build_alternating_expr(list), DefaultMatchOptions);
  if (!candidate->ok()) {
    // Copy the message into a Ruby String first: rb_raise does not return,
    // so nothing after it could release the RE2 object.
    VALUE message = rb_str_new2(candidate->error().c_str());
    delete candidate;
    rb_raise(rb_eArgError, "%s", StringValueCStr(message));
  }

  UserStopRE = candidate;
  return Qtrue;
}
|
|
523
|
+
|
|
524
|
+
// Chipper.skip_hashtags(list): installs (or clears) the hashtag stop list.
//
// list: nil or an empty Array clears the stop list; otherwise an Array of
//       hashtags without the leading '#'. Entries are inserted into the
//       pattern verbatim, so regex syntax in them is live.
//
// The previous stop list is discarded before the argument is inspected, as
// before. Returns true. Raises ArgumentError when `list` is neither nil nor
// an Array, or when the resulting pattern does not compile.
//
// A pattern that fails to compile is now deleted instead of being left
// installed in HashTagStopRE, and an empty Array no longer reaches
// build_alternating_expr.
VALUE skip_hashtags(VALUE self, VALUE list) {
  delete HashTagStopRE;   // deleting NULL is a no-op
  HashTagStopRE = NULL;

  if (NIL_P(list)) return Qtrue;

  if (TYPE(list) != T_ARRAY)
    rb_raise(rb_eArgError, "requires a list of hashtags minus #");

  if (RARRAY_LEN(list) == 0) return Qtrue;

  RE2 *candidate = new RE2("#" + build_alternating_expr(list), DefaultMatchOptions);
  if (!candidate->ok()) {
    // Copy the message into a Ruby String first: rb_raise does not return,
    // so nothing after it could release the RE2 object.
    VALUE message = rb_str_new2(candidate->error().c_str());
    delete candidate;
    rb_raise(rb_eArgError, "%s", StringValueCStr(message));
  }

  HashTagStopRE = candidate;
  return Qtrue;
}
|
|
540
|
+
|
|
541
|
+
// Chipper.skip_tokens(list): installs (or clears) the word stop list used by
// tokens().
//
// list: nil or an empty Array clears the stop list; otherwise an Array of
//       Strings. The English stem of every word is added to the set, and
//       duplicates are removed, before the pattern is compiled. Entries are
//       inserted verbatim, so regex syntax in them is live.
//
// The previous stop list is discarded before the argument is inspected, as
// before. Returns true. Raises ArgumentError when `list` is neither nil nor
// an Array or when the pattern does not compile, TypeError when an entry is
// not a String, NoMemoryError when the stemmer cannot be created.
//
// Changes from the previous version: the stems are appended to a private
// copy, so the caller's Array is no longer modified (and a frozen Array
// works); entries are type-checked before the stemmer is allocated, so a bad
// entry can neither be dereferenced nor leak the stemmer; a pattern that
// fails to compile is deleted instead of staying installed.
VALUE skip_tokens(VALUE self, VALUE list) {
  delete SkipTokenRE;   // deleting NULL is a no-op
  SkipTokenRE = NULL;

  if (NIL_P(list)) return Qtrue;

  if (TYPE(list) != T_ARRAY)
    rb_raise(rb_eArgError, "requires a list of words");

  const long max = RARRAY_LEN(list);
  if (max == 0) return Qtrue;

  for (long i = 0; i < max; i++)
    Check_Type(rb_ary_entry(list, i), T_STRING);

  VALUE words = rb_ary_dup(list);

  struct sb_stemmer *en_stemmer = sb_stemmer_new("english", "UTF_8");
  if (!en_stemmer)
    rb_raise(rb_eNoMemError, "ran out of memory creating stemmer");

  // add stems as well
  for (long i = 0; i < max; i++) {
    VALUE word = rb_ary_entry(words, i);
    rb_encoding *encoding = rb_enc_get(word);
    const sb_symbol *sbstem = sb_stemmer_stem(en_stemmer, (sb_symbol *)RSTRING_PTR(word), RSTRING_LEN(word));
    uint32_t sbstem_len = sb_stemmer_length(en_stemmer);
    rb_ary_push(words, rb_enc_str_new((char*)sbstem, sbstem_len, encoding));
  }

  sb_stemmer_delete(en_stemmer);

  // too bad, no uniq c api
  rb_funcall(words, rb_intern("uniq!"), 0);

  RE2 *candidate = new RE2("^" + build_alternating_expr(words) + "$", DefaultMatchOptions);
  RB_GC_GUARD(words);
  if (!candidate->ok()) {
    // Copy the message into a Ruby String first: rb_raise does not return,
    // so nothing after it could release the RE2 object.
    VALUE message = rb_str_new2(candidate->error().c_str());
    delete candidate;
    rb_raise(rb_eArgError, "%s", StringValueCStr(message));
  }

  SkipTokenRE = candidate;
  return Qtrue;
}
|
|
573
|
+
|
|
574
|
+
// Chipper.skip_token_pattern(re): installs (or clears) the pattern that
// tokens() uses to discard tokens.
//
// re: nil clears the pattern; any other object is converted with #to_s and
//     compiled with DefaultMatchOptions.
//
// The previous pattern is discarded first, as before. Returns true. Raises
// ArgumentError when the pattern does not compile; the failed RE2 object is
// now deleted instead of staying installed in SkipTokenPatternRE.
VALUE skip_token_pattern(VALUE self, VALUE re) {
  delete SkipTokenPatternRE;   // deleting NULL is a no-op
  SkipTokenPatternRE = NULL;

  if (NIL_P(re)) return Qtrue;

  RE2 *candidate = new RE2(CSTRING(re), DefaultMatchOptions);
  if (!candidate->ok()) {
    // Copy the message into a Ruby String first: rb_raise does not return,
    // so nothing after it could release the RE2 object.
    VALUE message = rb_str_new2(candidate->error().c_str());
    delete candidate;
    rb_raise(rb_eArgError, "%s", StringValueCStr(message));
  }

  SkipTokenPatternRE = candidate;
  return Qtrue;
}
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
extern "C" {
|
|
591
|
+
void Init_chipper(void) {
|
|
592
|
+
UserRE = new RE2("(?:^|[^[:alnum:]])+([@@][[:alnum:]_\\-]+)");
|
|
593
|
+
HashTagRE = new RE2("(?:^|[^[:alnum:]])+(#[[:alnum:]}_]+)");
|
|
594
|
+
|
|
595
|
+
UserStopRE = NULL;
|
|
596
|
+
HashTagStopRE = NULL;
|
|
597
|
+
SkipTokenRE = NULL;
|
|
598
|
+
SkipTokenPatternRE = NULL;
|
|
599
|
+
|
|
600
|
+
DefaultMatchOptions.set_case_sensitive(false);
|
|
601
|
+
DefaultMatchOptions.set_log_errors(false);
|
|
602
|
+
|
|
603
|
+
id_users = ID2SYM(rb_intern("users"));
|
|
604
|
+
id_hashtags = ID2SYM(rb_intern("hashtags"));
|
|
605
|
+
id_urls = ID2SYM(rb_intern("urls"));
|
|
606
|
+
id_tokens = ID2SYM(rb_intern("tokens"));
|
|
607
|
+
|
|
608
|
+
rb_global_variable(&id_users);
|
|
609
|
+
rb_global_variable(&id_hashtags);
|
|
610
|
+
rb_global_variable(&id_urls);
|
|
611
|
+
rb_global_variable(&id_tokens);
|
|
612
|
+
|
|
613
|
+
VALUE mChipper = rb_define_module("Chipper");
|
|
614
|
+
rb_define_module_function(mChipper, "users", RUBY_METHOD_FUNC(users), 1);
|
|
615
|
+
rb_define_module_function(mChipper, "hashtags", RUBY_METHOD_FUNC(hashtags), 1);
|
|
616
|
+
rb_define_module_function(mChipper, "urls", RUBY_METHOD_FUNC(urls), 1);
|
|
617
|
+
rb_define_module_function(mChipper, "tokens", RUBY_METHOD_FUNC(tokens), 1);
|
|
618
|
+
rb_define_module_function(mChipper, "entities", RUBY_METHOD_FUNC(entities), 1);
|
|
619
|
+
rb_define_module_function(mChipper, "skip_users", RUBY_METHOD_FUNC(skip_users), 1);
|
|
620
|
+
rb_define_module_function(mChipper, "skip_hashtags", RUBY_METHOD_FUNC(skip_hashtags), 1);
|
|
621
|
+
rb_define_module_function(mChipper, "skip_tokens", RUBY_METHOD_FUNC(skip_tokens), 1);
|
|
622
|
+
rb_define_module_function(mChipper, "skip_token_pattern", RUBY_METHOD_FUNC(skip_token_pattern), 1);
|
|
623
|
+
|
|
624
|
+
rb_define_const(mChipper, "VERSION", rb_str_new2(CHIPPER_VERSION));
|
|
625
|
+
}
|
|
626
|
+
}
|
data/ext/src/version.h
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
#define CHIPPER_VERSION "0.4.2"
|
data/ext/stemmer.rb
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
# frozen_string_literal: true

require 'fileutils'
require 'rbconfig'

################################################################################
################################################################################
## Derived from ruby-stemmer https://github.com/aurelian/ruby-stemmer
##
## Builds the bundled libstemmer_c by running make inside ext/libstemmer_c,
## after adjusting a couple of environment variables on macOS.

# FreeBSD make is gmake
make = (RUBY_PLATFORM =~ /freebsd/) ? 'gmake' : 'make'

LIBSTEMMER = File.expand_path(File.join(File.dirname(__FILE__), 'libstemmer_c'))

# MacOS architecture mess up
if RUBY_PLATFORM =~ /darwin/
  # see: #issue/3, #issue/5
  begin
    # NOTE(review): detection only runs when ARCHFLAGS is already set, which
    # replaces the user's value; the message below suggests the opposite was
    # intended. Condition kept as it was — confirm before changing.
    unless ENV['ARCHFLAGS'].nil?
      # RbConfig replaces the Config constant, which current Rubies no longer
      # define.
      ruby_bin = File.expand_path(
        File.join(RbConfig::CONFIG['bindir'], RbConfig::CONFIG['RUBY_INSTALL_NAME'])
      )
      # strip (not strip!) so output without surrounding whitespace still
      # yields a string instead of nil
      arch = %x[file #{ruby_bin}].strip.match(/executable (.+)$/)[1]
      ENV['ARCHFLAGS'] = "-arch #{arch}"
    end
  rescue
    $stderr << "Failed to get your ruby executable architecture.\n"
    $stderr << "Please specify one using $ARCHFLAGS environment variable.\n"
    exit(false) # report the failure; a bare exit ends with success status
  end
  # see: #issue/9, #issue/6
  # see: man compat
  if ENV['COMMAND_MODE'] == 'legacy'
    $stdout << "Setting compat mode to unix2003.\n"
    ENV['COMMAND_MODE'] = 'unix2003'
  end
end

# make libstemmer_c. unless we're cross-compiling.
unless RUBY_PLATFORM =~ /i386-mingw32/
  Dir.chdir(LIBSTEMMER) do
    system(make) || exit(false)
  end
end

################################################################################
################################################################################
|