categorize 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/ext/categorize/categorize.c +370 -0
- data/ext/categorize/extconf.rb +13 -0
- data/lib/categorize/models/cluster.rb +3 -5
- metadata +13 -9
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
ZmM1YWY2ZTJiZDg4MDI1ZjhiNGNiODBiYzgwNTZhMTEzMWRkZWIxZA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
YTBiYmE0ZTZjYjFlNTVjMzQ1ZDQxZmE1ZTM4NDhhY2IxMmY4YmU5Mg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MDNmZmZmYTFjMjk3MDZlYWQ2YjE5N2MyNTM0NWEyODlhODkwOGJmZGVjMjIx
|
10
|
+
OTk0MTAzYzA5Yzg0OWEzNmFlZGRjMmM0ZWJkNWU4ZDVlN2UxNzRhMmM5MDQ1
|
11
|
+
Y2VmMGNhMDgxNjY3ZGQxM2MwMTllMjM2MzZiODJmNjgzNzczN2E=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZGI1MDQzOTZjY2Y1YjFmMGMxMTJmMWU0OTg5MGM5NTJlN2U4ZGZlYzEzYmI4
|
14
|
+
OTEzODY3ODJkMGQ5NzgxZDU2M2YxNGRiMDgyMmI3NGZkNzFmZWEzYTUwMjUy
|
15
|
+
NDViM2E5NDE3MjRkYTcxYzY5NzRjMTNmOTE2MGExNzkyYzQxNjE=
|
@@ -0,0 +1,370 @@
|
|
1
|
+
#include <inttypes.h> /* intptr_t, PRIxPTR */
|
2
|
+
#include <search.h> /* hcreate(), hsearch() */
|
3
|
+
#include <stdio.h> /* perror(), printf() */
|
4
|
+
#include <stdlib.h> /* exit() */
|
5
|
+
#include "ruby.h"
|
6
|
+
|
7
|
+
// START header
|
8
|
+
// For information and references about the module to be stored internally.
|
9
|
+
VALUE Bow = Qnil;
|
10
|
+
|
11
|
+
static VALUE method_model_bow(VALUE, VALUE);
|
12
|
+
static int add_or_update_gram_from_index(int, char *);
|
13
|
+
|
14
|
+
// Store all grams, used in compare_top_grams.
|
15
|
+
static char **all_grams_pp;
|
16
|
+
// END header
|
17
|
+
|
18
|
+
// Initialization method for this module.
|
19
|
+
void Init_bow()
|
20
|
+
{
|
21
|
+
Bow = rb_define_module("Bow");
|
22
|
+
rb_define_method(Bow, "model_bow", method_model_bow, 1);
|
23
|
+
}
|
24
|
+
|
25
|
+
/*
 * Tunables for model_bow. They are only used inside this file, so they get
 * internal linkage instead of exporting generic names from the shared object.
 */

/* When true, progress traces are written to stdout. */
static const bool DEBUG = false;

/* At most this many of the best-ranked grams are candidates for a record's term. */
static const int MAX_BUCKETS = 10;

/* Fraction of the non-empty token lists a gram has to exceed to be kept. */
static const float MIN_SUPPORT = 0.1f;

/* Only this many of the most frequent grams are examined at all. */
static const int NUM_TOP_GRAMS = 250;
|
29
|
+
|
30
|
+
/*
 * Report an unrecoverable error and terminate.
 *
 * message is printed through perror() (so the current errno text is
 * appended) and then handed to rb_fatal().
 */
void fail(const char *message)
{
    perror(message);
    /*
     * rb_fatal() takes a printf-style format. Pass the text as an argument
     * so a '%' inside message can never be interpreted as a conversion.
     */
    rb_fatal("%s", message);
    /* Fallback in case rb_fatal() ever returns. */
    exit(1);
}
|
36
|
+
|
37
|
+
/*
|
38
|
+
* Must hcreate() the hash table before calling fetch() or store().
|
39
|
+
*
|
40
|
+
* Because p->data is a pointer, fetch() and store() cast between
|
41
|
+
* void * and intptr_t.
|
42
|
+
*/
|
43
|
+
|
44
|
+
/*
 * Look up key in the hsearch() table.
 *
 * key:   NUL-terminated string to search for.
 * value: receives the entry's data pointer, converted to intptr_t.
 *
 * Returns 1 and fills *value when the key is present; returns 0 and leaves
 * *value untouched otherwise. The table must have been created with
 * hcreate() beforehand.
 */
int fetch(const char *key, intptr_t *value)
{
    /* ENTRY.key is a plain char *, hence the cast away from const. */
    ENTRY query = { .key = (char *)key, .data = NULL };
    ENTRY *found = hsearch(query, FIND);

    if (found == NULL) {
        return 0;
    }

    *value = (intptr_t)found->data;
    return 1;
}
|
56
|
+
|
57
|
+
/*
 * Insert key with the given value, or overwrite the value of an entry that
 * already uses that key.
 *
 * On insertion the table keeps the key POINTER, not a copy of the text, so
 * the caller must keep the string alive for as long as the table is used.
 * Calls fail("hsearch") when hsearch() cannot enter the item.
 */
void store(const char *key, intptr_t value)
{
    ENTRY item = { .key = (char *)key, .data = (void *)value };
    ENTRY *slot = hsearch(item, ENTER);

    if (slot == NULL) {
        fail("hsearch");
    }

    /*
     * hsearch() ignores item.data when it finds an existing entry, so the
     * value has to be assigned through the returned slot.
     */
    slot->data = (void *)value;
}
|
72
|
+
|
73
|
+
/*
 * Build the per-group hash key "<i>_<str>".
 *
 * i:   group (record) index.
 * str: NUL-terminated gram text.
 *
 * Returns a freshly malloc'd string that the caller owns and must free()
 * (unless it is handed over to the hash table as a key). Aborts through
 * rb_fatal() when the key cannot be formatted or allocated.
 */
char *make_key(int i, char *str)
{
    /*
     * Measure first: a fixed "3 or 4 extra bytes" estimate only fits indices
     * 0..99 and would silently truncate the key for anything else, making
     * different grams collide in the table.
     */
    int len = snprintf(NULL, 0, "%i_%s", i, str);

    if (len < 0) rb_fatal("Cannot format key %i", i);

    size_t size = (size_t)len + 1; /* + terminating NUL */
    char *buf = malloc(size);

    if (buf == NULL) rb_fatal("No memory for key %i", i);

    snprintf(buf, size, "%i_%s", i, str);

    return buf;
}
|
85
|
+
|
86
|
+
/* Statistics kept for one hash table key; stored as the entry's data pointer. */
typedef struct gram {
    int freq;      /* how many times the key has been added */
    float fitness; /* ranking score; new entries start at 0.0 */
} gram;
|
90
|
+
|
91
|
+
int compare_grams(const void *gram1, const void *gram2)
|
92
|
+
{
|
93
|
+
intptr_t g1, g2;
|
94
|
+
|
95
|
+
if (fetch(*(const char **) gram1, &g1) && fetch(*(const char **) gram2, &g2)) {
|
96
|
+
return (*(gram *) g2).freq - (*(gram *) g1).freq;
|
97
|
+
} else
|
98
|
+
fail("compare_grams");
|
99
|
+
|
100
|
+
return 0;
|
101
|
+
}
|
102
|
+
|
103
|
+
int compare_top_grams(const void *idx1, const void *idx2)
|
104
|
+
{
|
105
|
+
char *gram1 = all_grams_pp[*(int *) idx1];
|
106
|
+
char *gram2 = all_grams_pp[*(int *) idx2];
|
107
|
+
intptr_t g1, g2;
|
108
|
+
|
109
|
+
if (fetch(gram1, &g1) && fetch(gram2, &g2))
|
110
|
+
return (*(gram *) g2).fitness - (*(gram *) g1).fitness;
|
111
|
+
else
|
112
|
+
fail("compare_grams");
|
113
|
+
|
114
|
+
return 0;
|
115
|
+
}
|
116
|
+
|
117
|
+
/*
|
118
|
+
* model_bow(array_of_tokens);
|
119
|
+
* ==== Return
|
120
|
+
* Top terms
|
121
|
+
* ==== Parameters
|
122
|
+
* array_of_tokens: Tokens to turn into grams and extract phrases from.
|
123
|
+
*/
|
124
|
+
static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
125
|
+
{
|
126
|
+
int i, j;
|
127
|
+
long array_of_tokens_len = RARRAY_LEN(array_of_tokens);
|
128
|
+
int num_grams = 0;
|
129
|
+
|
130
|
+
for (i = 0; i < array_of_tokens_len; i++) {
|
131
|
+
// n + n - 1 + n - 2 = 3n - 3 = 3(n - 1)
|
132
|
+
// TODO Correct parentheses enclose as (n - 1).
|
133
|
+
num_grams += 3 * RARRAY_LEN(rb_ary_entry(array_of_tokens, i)) - 1;
|
134
|
+
}
|
135
|
+
|
136
|
+
// Create an empty table that can hold 50 entries.
|
137
|
+
if (DEBUG) printf("num grams: %i\n", num_grams);
|
138
|
+
if (hcreate(2 * num_grams) == 0)
|
139
|
+
fail("hcreate");
|
140
|
+
|
141
|
+
// list of all grams
|
142
|
+
all_grams_pp = malloc(sizeof(char *) * num_grams);
|
143
|
+
if (all_grams_pp == NULL) rb_fatal("No memory for all_grams_pp");
|
144
|
+
|
145
|
+
int gram_counter = 0;
|
146
|
+
char *tmp;
|
147
|
+
char *str;
|
148
|
+
char *bigram;
|
149
|
+
char *trigram;
|
150
|
+
char *last_word;
|
151
|
+
char *last_2nd_word;
|
152
|
+
int non_empty_tokens = 0;
|
153
|
+
int tmp_int;
|
154
|
+
|
155
|
+
for (i = 0; i < array_of_tokens_len; i++) {
|
156
|
+
// n grams
|
157
|
+
last_word = 0;
|
158
|
+
last_2nd_word = 0;
|
159
|
+
if (DEBUG) printf("start i: %i\n", i);
|
160
|
+
|
161
|
+
for (j = 0; j < RARRAY_LEN(rb_ary_entry(array_of_tokens, i)); j++) {
|
162
|
+
VALUE rb_str = rb_ary_entry(rb_ary_entry(array_of_tokens, i), j);
|
163
|
+
// store str via malloc so we can free it along with others
|
164
|
+
tmp = StringValueCStr(rb_str);
|
165
|
+
tmp_int = 1 + strlen(tmp);
|
166
|
+
str = malloc(sizeof(char) * tmp_int);
|
167
|
+
snprintf(str, tmp_int, "%s", tmp);
|
168
|
+
|
169
|
+
// add gram
|
170
|
+
if (add_or_update_gram_from_index(i, str))
|
171
|
+
all_grams_pp[gram_counter++] = str;
|
172
|
+
|
173
|
+
if (DEBUG) printf("j: %i, gram: %s", j, str);
|
174
|
+
|
175
|
+
// add bigram
|
176
|
+
if (last_word && strcmp(str, last_word) != 0) {
|
177
|
+
tmp_int = 2 + strlen(str) + strlen(last_word);
|
178
|
+
bigram = malloc(sizeof(char) * tmp_int);
|
179
|
+
|
180
|
+
if (bigram == NULL) rb_fatal("No memory for bigram");
|
181
|
+
snprintf(bigram, tmp_int, "%s %s", last_word, str);
|
182
|
+
|
183
|
+
if (add_or_update_gram_from_index(i, bigram))
|
184
|
+
all_grams_pp[gram_counter++] = bigram;
|
185
|
+
|
186
|
+
if (DEBUG) printf(", bigram: %s", bigram);
|
187
|
+
|
188
|
+
// add trigram
|
189
|
+
if (last_2nd_word &&
|
190
|
+
strcmp(str, last_word) != 0 &&
|
191
|
+
strcmp(str, last_2nd_word) != 0 &&
|
192
|
+
strcmp(last_word, last_2nd_word) != 0) {
|
193
|
+
tmp_int = 2 + strlen(bigram) + strlen(last_2nd_word);
|
194
|
+
trigram = malloc(sizeof(char) * tmp_int);
|
195
|
+
|
196
|
+
if (trigram == NULL) rb_fatal("No memory for trigram");
|
197
|
+
snprintf(trigram, tmp_int, "%s %s", last_2nd_word, bigram);
|
198
|
+
|
199
|
+
if (add_or_update_gram_from_index(i, trigram))
|
200
|
+
all_grams_pp[gram_counter++] = trigram;
|
201
|
+
|
202
|
+
if (DEBUG) printf(", trigram: %s", trigram);
|
203
|
+
}
|
204
|
+
}
|
205
|
+
if (DEBUG) printf("\n");
|
206
|
+
last_2nd_word = last_word;
|
207
|
+
last_word = str;
|
208
|
+
}
|
209
|
+
if (j > 0) non_empty_tokens++;
|
210
|
+
if (DEBUG) printf("end i: %i\n", i);
|
211
|
+
}
|
212
|
+
int min_cover = (int) (MIN_SUPPORT * non_empty_tokens);
|
213
|
+
|
214
|
+
if (DEBUG) printf("added %i grams\n", gram_counter);
|
215
|
+
|
216
|
+
// sort all_grams
|
217
|
+
qsort(all_grams_pp, gram_counter, sizeof(char *), compare_grams);
|
218
|
+
|
219
|
+
// only consider prominent top NUM_TOP_GRAMS grams
|
220
|
+
int num_top_grams = gram_counter < NUM_TOP_GRAMS ? gram_counter : NUM_TOP_GRAMS;
|
221
|
+
|
222
|
+
if (DEBUG) printf("gc %i, ntg %i, atl: %li\n",
|
223
|
+
gram_counter, num_top_grams, array_of_tokens_len);
|
224
|
+
|
225
|
+
int top_grams_p[num_top_grams];
|
226
|
+
|
227
|
+
if (top_grams_p == NULL) rb_fatal("No memory for top_grams_p");
|
228
|
+
|
229
|
+
int top_gram_counter = 0;
|
230
|
+
intptr_t g, all_g;
|
231
|
+
int count;
|
232
|
+
char *key;
|
233
|
+
|
234
|
+
for (i = 0; i < num_top_grams; i++) {
|
235
|
+
count = 0;
|
236
|
+
for (j = 0; j < array_of_tokens_len; j++) {
|
237
|
+
key = make_key(j, all_grams_pp[i]);
|
238
|
+
|
239
|
+
if (fetch(key, &g) && (*(gram *) g).freq > 0 && ++count > min_cover) {
|
240
|
+
top_grams_p[top_gram_counter++] = i;
|
241
|
+
if (DEBUG) printf("%i: covering gram: %s\n",
|
242
|
+
top_gram_counter - 1, all_grams_pp[i]);
|
243
|
+
break;
|
244
|
+
}
|
245
|
+
}
|
246
|
+
}
|
247
|
+
|
248
|
+
if (DEBUG) {
|
249
|
+
printf("after top grams\n");
|
250
|
+
printf("tgc %i\n", top_gram_counter);
|
251
|
+
}
|
252
|
+
|
253
|
+
float max_fitness;
|
254
|
+
char *max_fit;
|
255
|
+
|
256
|
+
for (i = 0; i < array_of_tokens_len; i++) {
|
257
|
+
if (DEBUG) printf("start i: %i\n", i);
|
258
|
+
|
259
|
+
// set fitness for top grams relative to collections
|
260
|
+
for (j = 0; j < top_gram_counter; j++) {
|
261
|
+
key = make_key(i, all_grams_pp[top_grams_p[j]]);
|
262
|
+
|
263
|
+
if (fetch(key, &g) && fetch(all_grams_pp[top_grams_p[j]], &all_g)) {
|
264
|
+
(*(gram *) g).fitness = (float) (*(gram *) g).freq / (float) (*(gram *) all_g).freq;
|
265
|
+
if (DEBUG) printf("fitness %f\n", (*(gram *) g).fitness);
|
266
|
+
}
|
267
|
+
|
268
|
+
free(key);
|
269
|
+
}
|
270
|
+
|
271
|
+
max_fitness = 0.0;
|
272
|
+
max_fit = 0;
|
273
|
+
|
274
|
+
// set fitness for top grams overall
|
275
|
+
for (j = 0; j < RARRAY_LEN(rb_ary_entry(array_of_tokens, i)); j++) {
|
276
|
+
VALUE rb_str = rb_ary_entry(rb_ary_entry(array_of_tokens, i), j);
|
277
|
+
str = StringValueCStr(rb_str);
|
278
|
+
key = make_key(i, str);
|
279
|
+
|
280
|
+
if (fetch(key, &g) && (*(gram *) g).fitness > max_fitness) {
|
281
|
+
max_fitness = (*(gram *) g).fitness;
|
282
|
+
max_fit = str;
|
283
|
+
}
|
284
|
+
|
285
|
+
free(key);
|
286
|
+
// store fitness of gram
|
287
|
+
if (max_fit && fetch(max_fit, &g))
|
288
|
+
(*(gram *) g).fitness += 1.0;
|
289
|
+
}
|
290
|
+
}
|
291
|
+
|
292
|
+
if (DEBUG) printf("after set fitness\n");
|
293
|
+
|
294
|
+
// sort top_grams and take MAX_BUCKETS
|
295
|
+
qsort(top_grams_p, top_gram_counter, sizeof(int), compare_top_grams);
|
296
|
+
if (DEBUG) printf("after qsort top grams\n");
|
297
|
+
|
298
|
+
int max_fit_idx;
|
299
|
+
VALUE term_for_record = rb_ary_new2(array_of_tokens_len);
|
300
|
+
|
301
|
+
for (i = 0; i < array_of_tokens_len; i++) {
|
302
|
+
max_fitness = 0;
|
303
|
+
max_fit_idx = 0;
|
304
|
+
|
305
|
+
for (j = 0; j < MAX_BUCKETS && j < top_gram_counter; j++) {
|
306
|
+
char *key = make_key(i, all_grams_pp[top_grams_p[j]]);
|
307
|
+
|
308
|
+
if (fetch(key, &g) && (*(gram *) g).fitness >= max_fitness) {
|
309
|
+
max_fitness = (*(gram *) g).fitness;
|
310
|
+
max_fit_idx = j;
|
311
|
+
}
|
312
|
+
|
313
|
+
free(key);
|
314
|
+
}
|
315
|
+
|
316
|
+
VALUE term = rb_str_new2(all_grams_pp[top_grams_p[max_fit_idx]]);
|
317
|
+
rb_ary_push(term_for_record, term);
|
318
|
+
}
|
319
|
+
if (DEBUG) printf("after qsort top grams\n");
|
320
|
+
if (DEBUG) printf("freeing\n");
|
321
|
+
|
322
|
+
for (i = 0; i < gram_counter; i++) {
|
323
|
+
for (j = 0; j < array_of_tokens_len; j++) {
|
324
|
+
char *key = make_key(j, all_grams_pp[i]);
|
325
|
+
|
326
|
+
if (fetch(key, &g)) free((void *) g);
|
327
|
+
free(key);
|
328
|
+
}
|
329
|
+
|
330
|
+
fetch(all_grams_pp[i], &g);
|
331
|
+
free((void *) g);
|
332
|
+
free(all_grams_pp[i]);
|
333
|
+
}
|
334
|
+
|
335
|
+
free(all_grams_pp);
|
336
|
+
if (DEBUG) printf("freed all grams\n");
|
337
|
+
hdestroy();
|
338
|
+
if (DEBUG) printf("returning\n");
|
339
|
+
|
340
|
+
return term_for_record;
|
341
|
+
}
|
342
|
+
|
343
|
+
// Return whether gram exists or not
|
344
|
+
int add_or_update_gram(char *key)
|
345
|
+
{
|
346
|
+
intptr_t g;
|
347
|
+
if (fetch(key, &g)) {
|
348
|
+
(*(gram *) g).freq += 1;
|
349
|
+
if (DEBUG) printf("key: %s, freq: %i\n", key, (*(gram *) g).freq);
|
350
|
+
|
351
|
+
return 0;
|
352
|
+
} else {
|
353
|
+
gram *g = malloc(sizeof(gram));
|
354
|
+
if (g == NULL) rb_fatal("No memory for gram");
|
355
|
+
(*g).freq = 1;
|
356
|
+
(*g).fitness = 0.0;
|
357
|
+
store(key, (intptr_t) g);
|
358
|
+
|
359
|
+
return 1;
|
360
|
+
}
|
361
|
+
}
|
362
|
+
|
363
|
+
/*
 * Count one occurrence of str twice: under the per-record key built by
 * make_key(i, str) and under str itself (the overall count).
 *
 * Returns the result of the overall update: 1 when str was entered into the
 * table for the first time (the table then refers to the str pointer),
 * 0 when only its count was incremented.
 */
static int add_or_update_gram_from_index(int i, char *str)
{
    char *key = make_key(i, str);

    /*
     * The table keeps the key pointer only when it inserts a new entry.
     * For an existing entry this copy is unreferenced and must be released.
     */
    if (!add_or_update_gram(key)) {
        free(key);
    }

    return add_or_update_gram(str);
}
|
370
|
+
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# encoding: utf-8

# mkmf provides the helpers used to generate a Makefile for a C extension.
require 'mkmf'

# Name of the extension to build.
ext = 'bow'

# Set up the directory configuration for the extension.
dir_config(ext)

# Write the Makefile.
create_makefile(ext)
|
@@ -18,11 +18,9 @@ module Categorize
|
|
18
18
|
end
|
19
19
|
|
20
20
|
def build_categories(clusters)
|
21
|
-
|
22
|
-
cluster.data_items.map { |v| @vectors.index(v) }
|
23
|
-
end
|
24
|
-
|
25
|
-
clusters_to_records = Hash[(0...@num_clusters).zip(cluster_indices)]
|
21
|
+
clusters_to_records = Hash[clusters.each_with_index.map do |cluster, i|
|
22
|
+
[i, cluster.data_items.map { |v| @vectors.index(v) }]
|
23
|
+
end]
|
26
24
|
|
27
25
|
@query_terms ||= @query.split.map(&:downcase)
|
28
26
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: categorize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Lubell-Doughtie
|
@@ -15,21 +15,25 @@ description: ! "A text categorization library that favors performance.\n
|
|
15
15
|
for use in online systems."
|
16
16
|
email: peter@helioid.com
|
17
17
|
executables: []
|
18
|
-
extensions:
|
18
|
+
extensions:
|
19
|
+
- ext/categorize/extconf.rb
|
19
20
|
extra_rdoc_files: []
|
20
21
|
files:
|
21
|
-
- lib/categorize.rb
|
22
|
-
- lib/categorize/model.rb
|
23
22
|
- lib/categorize/constants.rb
|
23
|
+
- lib/categorize/utils/gram_collection.rb
|
24
|
+
- lib/categorize/utils/grams.rb
|
25
|
+
- lib/categorize/utils/gram_node.rb
|
26
|
+
- lib/categorize/model.rb
|
24
27
|
- lib/categorize/models/abstract_model.rb
|
28
|
+
- lib/categorize/models/hierarchical_cluster.rb
|
25
29
|
- lib/categorize/models/bag_of_words.rb
|
26
30
|
- lib/categorize/models/cluster.rb
|
27
|
-
- lib/categorize
|
28
|
-
-
|
29
|
-
-
|
30
|
-
- lib/categorize/utils/grams.rb
|
31
|
+
- lib/categorize.rb
|
32
|
+
- ext/categorize/categorize.c
|
33
|
+
- ext/categorize/extconf.rb
|
31
34
|
homepage: http://www.helioid.com/
|
32
|
-
licenses:
|
35
|
+
licenses:
|
36
|
+
- BSD3
|
33
37
|
metadata: {}
|
34
38
|
post_install_message:
|
35
39
|
rdoc_options: []
|