categorize 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/ext/categorize/categorize.c +370 -0
- data/ext/categorize/extconf.rb +13 -0
- data/lib/categorize/models/cluster.rb +3 -5
- metadata +13 -9
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
ZmM1YWY2ZTJiZDg4MDI1ZjhiNGNiODBiYzgwNTZhMTEzMWRkZWIxZA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
YTBiYmE0ZTZjYjFlNTVjMzQ1ZDQxZmE1ZTM4NDhhY2IxMmY4YmU5Mg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MDNmZmZmYTFjMjk3MDZlYWQ2YjE5N2MyNTM0NWEyODlhODkwOGJmZGVjMjIx
|
10
|
+
OTk0MTAzYzA5Yzg0OWEzNmFlZGRjMmM0ZWJkNWU4ZDVlN2UxNzRhMmM5MDQ1
|
11
|
+
Y2VmMGNhMDgxNjY3ZGQxM2MwMTllMjM2MzZiODJmNjgzNzczN2E=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZGI1MDQzOTZjY2Y1YjFmMGMxMTJmMWU0OTg5MGM5NTJlN2U4ZGZlYzEzYmI4
|
14
|
+
OTEzODY3ODJkMGQ5NzgxZDU2M2YxNGRiMDgyMmI3NGZkNzFmZWEzYTUwMjUy
|
15
|
+
NDViM2E5NDE3MjRkYTcxYzY5NzRjMTNmOTE2MGExNzkyYzQxNjE=
|
@@ -0,0 +1,370 @@
|
|
1
|
+
#include <inttypes.h> /* intptr_t, PRIxPTR */
|
2
|
+
#include <search.h> /* hcreate(), hsearch() */
|
3
|
+
#include <stdio.h> /* perror(), printf() */
|
4
|
+
#include <stdlib.h> /* exit() */
|
5
|
+
#include "ruby.h"
|
6
|
+
|
7
|
+
// START header
|
8
|
+
// For information and references about the module to be stored internally.
|
9
|
+
VALUE Bow = Qnil;
|
10
|
+
|
11
|
+
static VALUE method_model_bow(VALUE, VALUE);
|
12
|
+
static int add_or_update_gram_from_index(int, char *);
|
13
|
+
|
14
|
+
// Store all grams, used in compare_top_grams.
|
15
|
+
static char **all_grams_pp;
|
16
|
+
// END header
|
17
|
+
|
18
|
+
// Initialization method for this module.
|
19
|
+
void Init_bow()
|
20
|
+
{
|
21
|
+
Bow = rb_define_module("Bow");
|
22
|
+
rb_define_method(Bow, "model_bow", method_model_bow, 1);
|
23
|
+
}
|
24
|
+
|
25
|
+
// When true, the functions in this file print trace output via printf().
const bool DEBUG = false;

// Upper bound on how many of the ranked top grams are considered when picking
// the term for a record.
const int MAX_BUCKETS = 10;

// Multiplied by the number of non-empty records to get the coverage count a
// gram must exceed to be kept as a top gram.
const float MIN_SUPPORT = 0.1;

// Only this many of the most frequent grams are examined for coverage.
const int NUM_TOP_GRAMS = 250;
|
29
|
+
|
30
|
+
/*
 * Report an unrecoverable error and terminate.
 *
 * message: label printed by perror() and then passed on to rb_fatal().
 */
void fail(const char *message)
{
  perror(message);
  // Pass the text as an argument, never as the format string itself, so a
  // '%' inside message cannot be interpreted as a conversion.
  rb_fatal("%s", message);
  // Fallback exit in case control ever comes back from rb_fatal().
  exit(1);
}
|
36
|
+
|
37
|
+
/*
|
38
|
+
* Must hcreate() the hash table before calling fetch() or store().
|
39
|
+
*
|
40
|
+
* Because p->data is a pointer, fetch() and store() cast between
|
41
|
+
* void * and intptr_t.
|
42
|
+
*/
|
43
|
+
|
44
|
+
/* Fetch value from the hash table. */
|
45
|
+
int fetch(const char *key, intptr_t *value)
|
46
|
+
{
|
47
|
+
ENTRY e = {key: (char *)key}, *p;
|
48
|
+
p = hsearch(e, FIND);
|
49
|
+
|
50
|
+
if (p) {
|
51
|
+
*value = (intptr_t)p->data;
|
52
|
+
return 1;
|
53
|
+
} else
|
54
|
+
return 0;
|
55
|
+
}
|
56
|
+
|
57
|
+
/* Store key-value pair into the hash table. */
|
58
|
+
void store(const char *key, intptr_t value)
|
59
|
+
{
|
60
|
+
/*
|
61
|
+
* hsearch() may insert a new entry or find an existing entry
|
62
|
+
* with the same key. hsearch() ignores e.data if it finds an
|
63
|
+
* existing entry. We must call hsearch(), then set p->data.
|
64
|
+
*/
|
65
|
+
ENTRY e = {key: (char *)key}, *p;
|
66
|
+
p = hsearch(e, ENTER);
|
67
|
+
|
68
|
+
if (p == NULL) fail("hsearch");
|
69
|
+
|
70
|
+
p->data = (void *)value;
|
71
|
+
}
|
72
|
+
|
73
|
+
/*
 * Build the per-record hash key "<i>_<str>" in a freshly malloc()ed buffer.
 *
 * i:   record index used as the key prefix.
 * str: gram text.
 *
 * Returns the new NUL-terminated key; the caller owns it and must free() it
 * unless ownership is handed to the hash table.
 */
char *make_key(int i, char *str)
{
  // With a zero-sized buffer snprintf() only reports how many characters the
  // full key needs, so the buffer fits any index (3+ digits, negatives)
  // instead of silently truncating the key.
  int key_len = snprintf(NULL, 0, "%i_%s", i, str);

  if (key_len < 0) rb_fatal("Cannot format key %i", i);

  size_t buf_size = (size_t) key_len + 1;
  char *buf = malloc(buf_size);

  if (buf == NULL) rb_fatal("No memory for key %i", i);

  snprintf(buf, buf_size, "%i_%s", i, str);

  return buf;
}
|
85
|
+
|
86
|
+
/* Per-gram record stored as the data of a hash table entry. */
typedef struct {
  int freq;       /* occurrence count; set to 1 on creation, then incremented */
  float fitness;  /* ranking score; set to 0.0 on creation */
} gram;
|
90
|
+
|
91
|
+
int compare_grams(const void *gram1, const void *gram2)
|
92
|
+
{
|
93
|
+
intptr_t g1, g2;
|
94
|
+
|
95
|
+
if (fetch(*(const char **) gram1, &g1) && fetch(*(const char **) gram2, &g2)) {
|
96
|
+
return (*(gram *) g2).freq - (*(gram *) g1).freq;
|
97
|
+
} else
|
98
|
+
fail("compare_grams");
|
99
|
+
|
100
|
+
return 0;
|
101
|
+
}
|
102
|
+
|
103
|
+
int compare_top_grams(const void *idx1, const void *idx2)
|
104
|
+
{
|
105
|
+
char *gram1 = all_grams_pp[*(int *) idx1];
|
106
|
+
char *gram2 = all_grams_pp[*(int *) idx2];
|
107
|
+
intptr_t g1, g2;
|
108
|
+
|
109
|
+
if (fetch(gram1, &g1) && fetch(gram2, &g2))
|
110
|
+
return (*(gram *) g2).fitness - (*(gram *) g1).fitness;
|
111
|
+
else
|
112
|
+
fail("compare_grams");
|
113
|
+
|
114
|
+
return 0;
|
115
|
+
}
|
116
|
+
|
117
|
+
/*
|
118
|
+
* model_bow(array_of_tokens);
|
119
|
+
* ==== Return
|
120
|
+
* Top terms
|
121
|
+
* ==== Parameters
|
122
|
+
* array_of_tokens: Tokens to turn into grams and extract phrases from.
|
123
|
+
*/
|
124
|
+
static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
125
|
+
{
|
126
|
+
int i, j;
|
127
|
+
long array_of_tokens_len = RARRAY_LEN(array_of_tokens);
|
128
|
+
int num_grams = 0;
|
129
|
+
|
130
|
+
for (i = 0; i < array_of_tokens_len; i++) {
|
131
|
+
// n + n - 1 + n - 2 = 3n - 3 = 3(n - 1)
|
132
|
+
// TODO Correct parentheses enclose as (n - 1).
|
133
|
+
num_grams += 3 * RARRAY_LEN(rb_ary_entry(array_of_tokens, i)) - 1;
|
134
|
+
}
|
135
|
+
|
136
|
+
// Create an empty table that can hold 50 entries.
|
137
|
+
if (DEBUG) printf("num grams: %i\n", num_grams);
|
138
|
+
if (hcreate(2 * num_grams) == 0)
|
139
|
+
fail("hcreate");
|
140
|
+
|
141
|
+
// list of all grams
|
142
|
+
all_grams_pp = malloc(sizeof(char *) * num_grams);
|
143
|
+
if (all_grams_pp == NULL) rb_fatal("No memory for all_grams_pp");
|
144
|
+
|
145
|
+
int gram_counter = 0;
|
146
|
+
char *tmp;
|
147
|
+
char *str;
|
148
|
+
char *bigram;
|
149
|
+
char *trigram;
|
150
|
+
char *last_word;
|
151
|
+
char *last_2nd_word;
|
152
|
+
int non_empty_tokens = 0;
|
153
|
+
int tmp_int;
|
154
|
+
|
155
|
+
for (i = 0; i < array_of_tokens_len; i++) {
|
156
|
+
// n grams
|
157
|
+
last_word = 0;
|
158
|
+
last_2nd_word = 0;
|
159
|
+
if (DEBUG) printf("start i: %i\n", i);
|
160
|
+
|
161
|
+
for (j = 0; j < RARRAY_LEN(rb_ary_entry(array_of_tokens, i)); j++) {
|
162
|
+
VALUE rb_str = rb_ary_entry(rb_ary_entry(array_of_tokens, i), j);
|
163
|
+
// store str via malloc so we can free it along with others
|
164
|
+
tmp = StringValueCStr(rb_str);
|
165
|
+
tmp_int = 1 + strlen(tmp);
|
166
|
+
str = malloc(sizeof(char) * tmp_int);
|
167
|
+
snprintf(str, tmp_int, "%s", tmp);
|
168
|
+
|
169
|
+
// add gram
|
170
|
+
if (add_or_update_gram_from_index(i, str))
|
171
|
+
all_grams_pp[gram_counter++] = str;
|
172
|
+
|
173
|
+
if (DEBUG) printf("j: %i, gram: %s", j, str);
|
174
|
+
|
175
|
+
// add bigram
|
176
|
+
if (last_word && strcmp(str, last_word) != 0) {
|
177
|
+
tmp_int = 2 + strlen(str) + strlen(last_word);
|
178
|
+
bigram = malloc(sizeof(char) * tmp_int);
|
179
|
+
|
180
|
+
if (bigram == NULL) rb_fatal("No memory for bigram");
|
181
|
+
snprintf(bigram, tmp_int, "%s %s", last_word, str);
|
182
|
+
|
183
|
+
if (add_or_update_gram_from_index(i, bigram))
|
184
|
+
all_grams_pp[gram_counter++] = bigram;
|
185
|
+
|
186
|
+
if (DEBUG) printf(", bigram: %s", bigram);
|
187
|
+
|
188
|
+
// add trigram
|
189
|
+
if (last_2nd_word &&
|
190
|
+
strcmp(str, last_word) != 0 &&
|
191
|
+
strcmp(str, last_2nd_word) != 0 &&
|
192
|
+
strcmp(last_word, last_2nd_word) != 0) {
|
193
|
+
tmp_int = 2 + strlen(bigram) + strlen(last_2nd_word);
|
194
|
+
trigram = malloc(sizeof(char) * tmp_int);
|
195
|
+
|
196
|
+
if (trigram == NULL) rb_fatal("No memory for trigram");
|
197
|
+
snprintf(trigram, tmp_int, "%s %s", last_2nd_word, bigram);
|
198
|
+
|
199
|
+
if (add_or_update_gram_from_index(i, trigram))
|
200
|
+
all_grams_pp[gram_counter++] = trigram;
|
201
|
+
|
202
|
+
if (DEBUG) printf(", trigram: %s", trigram);
|
203
|
+
}
|
204
|
+
}
|
205
|
+
if (DEBUG) printf("\n");
|
206
|
+
last_2nd_word = last_word;
|
207
|
+
last_word = str;
|
208
|
+
}
|
209
|
+
if (j > 0) non_empty_tokens++;
|
210
|
+
if (DEBUG) printf("end i: %i\n", i);
|
211
|
+
}
|
212
|
+
int min_cover = (int) (MIN_SUPPORT * non_empty_tokens);
|
213
|
+
|
214
|
+
if (DEBUG) printf("added %i grams\n", gram_counter);
|
215
|
+
|
216
|
+
// sort all_grams
|
217
|
+
qsort(all_grams_pp, gram_counter, sizeof(char *), compare_grams);
|
218
|
+
|
219
|
+
// only consider prominent top NUM_TOP_GRAMS grams
|
220
|
+
int num_top_grams = gram_counter < NUM_TOP_GRAMS ? gram_counter : NUM_TOP_GRAMS;
|
221
|
+
|
222
|
+
if (DEBUG) printf("gc %i, ntg %i, atl: %li\n",
|
223
|
+
gram_counter, num_top_grams, array_of_tokens_len);
|
224
|
+
|
225
|
+
int top_grams_p[num_top_grams];
|
226
|
+
|
227
|
+
if (top_grams_p == NULL) rb_fatal("No memory for top_grams_p");
|
228
|
+
|
229
|
+
int top_gram_counter = 0;
|
230
|
+
intptr_t g, all_g;
|
231
|
+
int count;
|
232
|
+
char *key;
|
233
|
+
|
234
|
+
for (i = 0; i < num_top_grams; i++) {
|
235
|
+
count = 0;
|
236
|
+
for (j = 0; j < array_of_tokens_len; j++) {
|
237
|
+
key = make_key(j, all_grams_pp[i]);
|
238
|
+
|
239
|
+
if (fetch(key, &g) && (*(gram *) g).freq > 0 && ++count > min_cover) {
|
240
|
+
top_grams_p[top_gram_counter++] = i;
|
241
|
+
if (DEBUG) printf("%i: covering gram: %s\n",
|
242
|
+
top_gram_counter - 1, all_grams_pp[i]);
|
243
|
+
break;
|
244
|
+
}
|
245
|
+
}
|
246
|
+
}
|
247
|
+
|
248
|
+
if (DEBUG) {
|
249
|
+
printf("after top grams\n");
|
250
|
+
printf("tgc %i\n", top_gram_counter);
|
251
|
+
}
|
252
|
+
|
253
|
+
float max_fitness;
|
254
|
+
char *max_fit;
|
255
|
+
|
256
|
+
for (i = 0; i < array_of_tokens_len; i++) {
|
257
|
+
if (DEBUG) printf("start i: %i\n", i);
|
258
|
+
|
259
|
+
// set fitness for top grams relative to collections
|
260
|
+
for (j = 0; j < top_gram_counter; j++) {
|
261
|
+
key = make_key(i, all_grams_pp[top_grams_p[j]]);
|
262
|
+
|
263
|
+
if (fetch(key, &g) && fetch(all_grams_pp[top_grams_p[j]], &all_g)) {
|
264
|
+
(*(gram *) g).fitness = (float) (*(gram *) g).freq / (float) (*(gram *) all_g).freq;
|
265
|
+
if (DEBUG) printf("fitness %f\n", (*(gram *) g).fitness);
|
266
|
+
}
|
267
|
+
|
268
|
+
free(key);
|
269
|
+
}
|
270
|
+
|
271
|
+
max_fitness = 0.0;
|
272
|
+
max_fit = 0;
|
273
|
+
|
274
|
+
// set fitness for top grams overall
|
275
|
+
for (j = 0; j < RARRAY_LEN(rb_ary_entry(array_of_tokens, i)); j++) {
|
276
|
+
VALUE rb_str = rb_ary_entry(rb_ary_entry(array_of_tokens, i), j);
|
277
|
+
str = StringValueCStr(rb_str);
|
278
|
+
key = make_key(i, str);
|
279
|
+
|
280
|
+
if (fetch(key, &g) && (*(gram *) g).fitness > max_fitness) {
|
281
|
+
max_fitness = (*(gram *) g).fitness;
|
282
|
+
max_fit = str;
|
283
|
+
}
|
284
|
+
|
285
|
+
free(key);
|
286
|
+
// store fitness of gram
|
287
|
+
if (max_fit && fetch(max_fit, &g))
|
288
|
+
(*(gram *) g).fitness += 1.0;
|
289
|
+
}
|
290
|
+
}
|
291
|
+
|
292
|
+
if (DEBUG) printf("after set fitness\n");
|
293
|
+
|
294
|
+
// sort top_grams and take MAX_BUCKETS
|
295
|
+
qsort(top_grams_p, top_gram_counter, sizeof(int), compare_top_grams);
|
296
|
+
if (DEBUG) printf("after qsort top grams\n");
|
297
|
+
|
298
|
+
int max_fit_idx;
|
299
|
+
VALUE term_for_record = rb_ary_new2(array_of_tokens_len);
|
300
|
+
|
301
|
+
for (i = 0; i < array_of_tokens_len; i++) {
|
302
|
+
max_fitness = 0;
|
303
|
+
max_fit_idx = 0;
|
304
|
+
|
305
|
+
for (j = 0; j < MAX_BUCKETS && j < top_gram_counter; j++) {
|
306
|
+
char *key = make_key(i, all_grams_pp[top_grams_p[j]]);
|
307
|
+
|
308
|
+
if (fetch(key, &g) && (*(gram *) g).fitness >= max_fitness) {
|
309
|
+
max_fitness = (*(gram *) g).fitness;
|
310
|
+
max_fit_idx = j;
|
311
|
+
}
|
312
|
+
|
313
|
+
free(key);
|
314
|
+
}
|
315
|
+
|
316
|
+
VALUE term = rb_str_new2(all_grams_pp[top_grams_p[max_fit_idx]]);
|
317
|
+
rb_ary_push(term_for_record, term);
|
318
|
+
}
|
319
|
+
if (DEBUG) printf("after qsort top grams\n");
|
320
|
+
if (DEBUG) printf("freeing\n");
|
321
|
+
|
322
|
+
for (i = 0; i < gram_counter; i++) {
|
323
|
+
for (j = 0; j < array_of_tokens_len; j++) {
|
324
|
+
char *key = make_key(j, all_grams_pp[i]);
|
325
|
+
|
326
|
+
if (fetch(key, &g)) free((void *) g);
|
327
|
+
free(key);
|
328
|
+
}
|
329
|
+
|
330
|
+
fetch(all_grams_pp[i], &g);
|
331
|
+
free((void *) g);
|
332
|
+
free(all_grams_pp[i]);
|
333
|
+
}
|
334
|
+
|
335
|
+
free(all_grams_pp);
|
336
|
+
if (DEBUG) printf("freed all grams\n");
|
337
|
+
hdestroy();
|
338
|
+
if (DEBUG) printf("returning\n");
|
339
|
+
|
340
|
+
return term_for_record;
|
341
|
+
}
|
342
|
+
|
343
|
+
// Return whether gram exists or not
|
344
|
+
int add_or_update_gram(char *key)
|
345
|
+
{
|
346
|
+
intptr_t g;
|
347
|
+
if (fetch(key, &g)) {
|
348
|
+
(*(gram *) g).freq += 1;
|
349
|
+
if (DEBUG) printf("key: %s, freq: %i\n", key, (*(gram *) g).freq);
|
350
|
+
|
351
|
+
return 0;
|
352
|
+
} else {
|
353
|
+
gram *g = malloc(sizeof(gram));
|
354
|
+
if (g == NULL) rb_fatal("No memory for gram");
|
355
|
+
(*g).freq = 1;
|
356
|
+
(*g).fitness = 0.0;
|
357
|
+
store(key, (intptr_t) g);
|
358
|
+
|
359
|
+
return 1;
|
360
|
+
}
|
361
|
+
}
|
362
|
+
|
363
|
+
/*
 * Count one occurrence of str both for record i (under the key built by
 * make_key) and globally (under str itself).
 *
 * Returns the result of the global update: 1 when str was a new gram, in
 * which case the hash table now references str, and 0 when it already existed.
 */
int add_or_update_gram_from_index(int i, char *str)
{
  char *key = make_key(i, str);

  // The table adopts the key pointer only when it creates a new entry; for
  // an existing entry this copy is unreferenced and must be released.
  if (!add_or_update_gram(key))
    free(key);

  return add_or_update_gram(str);
}
|
370
|
+
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# encoding: utf-8

# mkmf provides the helpers used below to generate a Makefile for a Ruby
# C extension.
require 'mkmf'

# Name of the extension to build.
ext_name = 'bow'

# Configure build directories for the extension.
dir_config(ext_name)

# Generate the Makefile.
create_makefile(ext_name)
|
@@ -18,11 +18,9 @@ module Categorize
|
|
18
18
|
end
|
19
19
|
|
20
20
|
def build_categories(clusters)
|
21
|
-
|
22
|
-
cluster.data_items.map { |v| @vectors.index(v) }
|
23
|
-
end
|
24
|
-
|
25
|
-
clusters_to_records = Hash[(0...@num_clusters).zip(cluster_indices)]
|
21
|
+
clusters_to_records = Hash[clusters.each_with_index.map do |cluster, i|
|
22
|
+
[i, cluster.data_items.map { |v| @vectors.index(v) }]
|
23
|
+
end]
|
26
24
|
|
27
25
|
@query_terms ||= @query.split.map(&:downcase)
|
28
26
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: categorize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Lubell-Doughtie
|
@@ -15,21 +15,25 @@ description: ! "A text categorization library that favors performance.\n
|
|
15
15
|
for use in online systems."
|
16
16
|
email: peter@helioid.com
|
17
17
|
executables: []
|
18
|
-
extensions:
|
18
|
+
extensions:
|
19
|
+
- ext/categorize/extconf.rb
|
19
20
|
extra_rdoc_files: []
|
20
21
|
files:
|
21
|
-
- lib/categorize.rb
|
22
|
-
- lib/categorize/model.rb
|
23
22
|
- lib/categorize/constants.rb
|
23
|
+
- lib/categorize/utils/gram_collection.rb
|
24
|
+
- lib/categorize/utils/grams.rb
|
25
|
+
- lib/categorize/utils/gram_node.rb
|
26
|
+
- lib/categorize/model.rb
|
24
27
|
- lib/categorize/models/abstract_model.rb
|
28
|
+
- lib/categorize/models/hierarchical_cluster.rb
|
25
29
|
- lib/categorize/models/bag_of_words.rb
|
26
30
|
- lib/categorize/models/cluster.rb
|
27
|
-
- lib/categorize
|
28
|
-
-
|
29
|
-
-
|
30
|
-
- lib/categorize/utils/grams.rb
|
31
|
+
- lib/categorize.rb
|
32
|
+
- ext/categorize/categorize.c
|
33
|
+
- ext/categorize/extconf.rb
|
31
34
|
homepage: http://www.helioid.com/
|
32
|
-
licenses:
|
35
|
+
licenses:
|
36
|
+
- BSD3
|
33
37
|
metadata: {}
|
34
38
|
post_install_message:
|
35
39
|
rdoc_options: []
|