categorize 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/ext/{categorize/categorize.c → ccategorize/ccategorize.c} +54 -30
- data/ext/ccategorize/extconf.rb +7 -0
- data/lib/categorize.rb +1 -0
- data/lib/categorize/models/bag_of_words.rb +14 -12
- data/lib/categorize/models/cluster.rb +2 -0
- metadata +4 -4
- data/ext/categorize/extconf.rb +0 -13
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YTFmZGJlMWI3YjUxNGQ2MGE3OGJmMjIzN2ZiYTFmNjVhMzE0OGRkMA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NDAxNDY2MDNlYjBmYjc0NTIzY2JjYmU4NjE3YTBiMTQ1ZjA5ZjZjMg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MDk4YjVhYjBhMWMzNGMwZmZlYmFiNTZlZWNiNjNkNjUxM2QzNjkxNzU2NWRk
|
10
|
+
MGQzNzJkZTY5ZmJhM2Q1ZmMyNDM2ZDEzZjk1Mzg4ODE2NWFlMmVhODBkNGZh
|
11
|
+
YTg0NGY4YTM0YzM0M2E0M2YyNjE5NjY4NzUyOTg4YzI1ODM5NGI=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MmY1MmIyZjAyYWVmMzI1NzExMjk2ODFhMTgxOTg2ZDMzYTJhYzBlYWExZDdj
|
14
|
+
NDM4ODU0NTllMjdjOWI4NjMwMzBmODhhN2UxODE2MmZmMjFhNzBkMTczMzlj
|
15
|
+
ZjBjNWZiMDc5MTM3NDI5ZGI2ZmNiYzZmMTVjYzlhMzIzNjBiZjE=
|
data/ext/{categorize/categorize.c → ccategorize/ccategorize.c}
CHANGED
@@ -4,11 +4,15 @@
|
|
4
4
|
#include <stdlib.h> /* exit() */
|
5
5
|
#include "ruby.h"
|
6
6
|
|
7
|
+
typedef enum { false, true } bool;
|
8
|
+
|
7
9
|
// START header
|
8
10
|
// For information and references about the module to be stored internally.
|
9
|
-
VALUE
|
11
|
+
VALUE Categorize = Qnil;
|
12
|
+
VALUE CBagOfWords = Qnil;
|
13
|
+
VALUE Models = Qnil;
|
10
14
|
|
11
|
-
static VALUE
|
15
|
+
static VALUE method_make_model(VALUE, VALUE);
|
12
16
|
static int add_or_update_gram_from_index(int, char *);
|
13
17
|
|
14
18
|
// Store all grams, used in compare_top_grams.
|
@@ -16,10 +20,12 @@ static char **all_grams_pp;
|
|
16
20
|
// END header
|
17
21
|
|
18
22
|
// Initialization method for this module.
|
19
|
-
void
|
23
|
+
void Init_ccategorize()
|
20
24
|
{
|
21
|
-
|
22
|
-
|
25
|
+
Categorize = rb_define_module("CCategorize");
|
26
|
+
Models = rb_define_module_under(Categorize, "Models");
|
27
|
+
CBagOfWords = rb_define_class_under(Models, "CBagOfWords", rb_cObject);
|
28
|
+
rb_define_method(CBagOfWords, "make_model", method_make_model, 1);
|
23
29
|
}
|
24
30
|
|
25
31
|
const bool DEBUG = false;
|
@@ -92,7 +98,8 @@ int compare_grams(const void *gram1, const void *gram2)
|
|
92
98
|
{
|
93
99
|
intptr_t g1, g2;
|
94
100
|
|
95
|
-
if (fetch(*(const char **) gram1, &g1) &&
|
101
|
+
if (fetch(*(const char **) gram1, &g1) &&
|
102
|
+
fetch(*(const char **) gram2, &g2)) {
|
96
103
|
return (*(gram *) g2).freq - (*(gram *) g1).freq;
|
97
104
|
} else
|
98
105
|
fail("compare_grams");
|
@@ -115,17 +122,37 @@ int compare_top_grams(const void *idx1, const void *idx2)
|
|
115
122
|
}
|
116
123
|
|
117
124
|
/*
|
118
|
-
*
|
125
|
+
* make_model(array_of_tokens);
|
119
126
|
* ==== Return
|
120
127
|
* Top terms
|
121
128
|
* ==== Parameters
|
122
129
|
* array_of_tokens: Tokens to turn into grams and extract phrases from.
|
123
130
|
*/
|
124
|
-
static VALUE
|
131
|
+
static VALUE method_make_model(VALUE self, VALUE array_of_tokens)
|
125
132
|
{
|
126
133
|
int i, j;
|
127
134
|
long array_of_tokens_len = RARRAY_LEN(array_of_tokens);
|
128
135
|
int num_grams = 0;
|
136
|
+
int gram_counter;
|
137
|
+
char *tmp;
|
138
|
+
char *str;
|
139
|
+
char *bigram;
|
140
|
+
char *trigram;
|
141
|
+
char *last_word;
|
142
|
+
char *last_2nd_word;
|
143
|
+
int non_empty_tokens;
|
144
|
+
int tmp_int;
|
145
|
+
int min_cover;
|
146
|
+
int num_top_grams;
|
147
|
+
int top_gram_counter;
|
148
|
+
float max_fitness;
|
149
|
+
int max_fit_idx;
|
150
|
+
VALUE term;
|
151
|
+
VALUE term_for_record;
|
152
|
+
intptr_t g, all_g;
|
153
|
+
int count;
|
154
|
+
char *key;
|
155
|
+
char *max_fit;
|
129
156
|
|
130
157
|
for (i = 0; i < array_of_tokens_len; i++) {
|
131
158
|
// n + n - 1 + n - 2 = 3n - 3 = 3(n - 1)
|
@@ -142,15 +169,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
142
169
|
all_grams_pp = malloc(sizeof(char *) * num_grams);
|
143
170
|
if (all_grams_pp == NULL) rb_fatal("No memory for all_grams_pp");
|
144
171
|
|
145
|
-
|
146
|
-
|
147
|
-
char *str;
|
148
|
-
char *bigram;
|
149
|
-
char *trigram;
|
150
|
-
char *last_word;
|
151
|
-
char *last_2nd_word;
|
152
|
-
int non_empty_tokens = 0;
|
153
|
-
int tmp_int;
|
172
|
+
gram_counter = 0;
|
173
|
+
non_empty_tokens = 0;
|
154
174
|
|
155
175
|
for (i = 0; i < array_of_tokens_len; i++) {
|
156
176
|
// n grams
|
@@ -160,6 +180,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
160
180
|
|
161
181
|
for (j = 0; j < RARRAY_LEN(rb_ary_entry(array_of_tokens, i)); j++) {
|
162
182
|
VALUE rb_str = rb_ary_entry(rb_ary_entry(array_of_tokens, i), j);
|
183
|
+
|
163
184
|
// store str via malloc so we can free it along with others
|
164
185
|
tmp = StringValueCStr(rb_str);
|
165
186
|
tmp_int = 1 + strlen(tmp);
|
@@ -209,7 +230,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
209
230
|
if (j > 0) non_empty_tokens++;
|
210
231
|
if (DEBUG) printf("end i: %i\n", i);
|
211
232
|
}
|
212
|
-
|
233
|
+
|
234
|
+
min_cover = (int) (MIN_SUPPORT * non_empty_tokens);
|
213
235
|
|
214
236
|
if (DEBUG) printf("added %i grams\n", gram_counter);
|
215
237
|
|
@@ -217,7 +239,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
217
239
|
qsort(all_grams_pp, gram_counter, sizeof(char *), compare_grams);
|
218
240
|
|
219
241
|
// only consider prominent top NUM_TOP_GRAMS grams
|
220
|
-
|
242
|
+
num_top_grams = gram_counter < NUM_TOP_GRAMS ? gram_counter :
|
243
|
+
NUM_TOP_GRAMS;
|
221
244
|
|
222
245
|
if (DEBUG) printf("gc %i, ntg %i, atl: %li\n",
|
223
246
|
gram_counter, num_top_grams, array_of_tokens_len);
|
@@ -226,10 +249,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
226
249
|
|
227
250
|
if (top_grams_p == NULL) rb_fatal("No memory for top_grams_p");
|
228
251
|
|
229
|
-
|
230
|
-
intptr_t g, all_g;
|
231
|
-
int count;
|
232
|
-
char *key;
|
252
|
+
top_gram_counter = 0;
|
233
253
|
|
234
254
|
for (i = 0; i < num_top_grams; i++) {
|
235
255
|
count = 0;
|
@@ -238,6 +258,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
238
258
|
|
239
259
|
if (fetch(key, &g) && (*(gram *) g).freq > 0 && ++count > min_cover) {
|
240
260
|
top_grams_p[top_gram_counter++] = i;
|
261
|
+
|
241
262
|
if (DEBUG) printf("%i: covering gram: %s\n",
|
242
263
|
top_gram_counter - 1, all_grams_pp[i]);
|
243
264
|
break;
|
@@ -250,9 +271,6 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
250
271
|
printf("tgc %i\n", top_gram_counter);
|
251
272
|
}
|
252
273
|
|
253
|
-
float max_fitness;
|
254
|
-
char *max_fit;
|
255
|
-
|
256
274
|
for (i = 0; i < array_of_tokens_len; i++) {
|
257
275
|
if (DEBUG) printf("start i: %i\n", i);
|
258
276
|
|
@@ -261,7 +279,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
261
279
|
key = make_key(i, all_grams_pp[top_grams_p[j]]);
|
262
280
|
|
263
281
|
if (fetch(key, &g) && fetch(all_grams_pp[top_grams_p[j]], &all_g)) {
|
264
|
-
(*(gram *) g).fitness = (float) (*(gram *) g).freq /
|
282
|
+
(*(gram *) g).fitness = (float) (*(gram *) g).freq /
|
283
|
+
(float) (*(gram *) all_g).freq;
|
265
284
|
if (DEBUG) printf("fitness %f\n", (*(gram *) g).fitness);
|
266
285
|
}
|
267
286
|
|
@@ -283,6 +302,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
283
302
|
}
|
284
303
|
|
285
304
|
free(key);
|
305
|
+
|
286
306
|
// store fitness of gram
|
287
307
|
if (max_fit && fetch(max_fit, &g))
|
288
308
|
(*(gram *) g).fitness += 1.0;
|
@@ -293,10 +313,10 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
293
313
|
|
294
314
|
// sort top_grams and take MAX_BUCKETS
|
295
315
|
qsort(top_grams_p, top_gram_counter, sizeof(int), compare_top_grams);
|
316
|
+
|
296
317
|
if (DEBUG) printf("after qsort top grams\n");
|
297
318
|
|
298
|
-
|
299
|
-
VALUE term_for_record = rb_ary_new2(array_of_tokens_len);
|
319
|
+
term_for_record = rb_ary_new2(array_of_tokens_len);
|
300
320
|
|
301
321
|
for (i = 0; i < array_of_tokens_len; i++) {
|
302
322
|
max_fitness = 0;
|
@@ -313,9 +333,10 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
313
333
|
free(key);
|
314
334
|
}
|
315
335
|
|
316
|
-
|
336
|
+
term = rb_str_new2(all_grams_pp[top_grams_p[max_fit_idx]]);
|
317
337
|
rb_ary_push(term_for_record, term);
|
318
338
|
}
|
339
|
+
|
319
340
|
if (DEBUG) printf("after qsort top grams\n");
|
320
341
|
if (DEBUG) printf("freeing\n");
|
321
342
|
|
@@ -334,6 +355,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
334
355
|
|
335
356
|
free(all_grams_pp);
|
336
357
|
if (DEBUG) printf("freed all grams\n");
|
358
|
+
|
337
359
|
hdestroy();
|
338
360
|
if (DEBUG) printf("returning\n");
|
339
361
|
|
@@ -344,6 +366,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
344
366
|
int add_or_update_gram(char *key)
|
345
367
|
{
|
346
368
|
intptr_t g;
|
369
|
+
|
347
370
|
if (fetch(key, &g)) {
|
348
371
|
(*(gram *) g).freq += 1;
|
349
372
|
if (DEBUG) printf("key: %s, freq: %i\n", key, (*(gram *) g).freq);
|
@@ -352,6 +375,7 @@ int add_or_update_gram(char *key)
|
|
352
375
|
} else {
|
353
376
|
gram *g = malloc(sizeof(gram));
|
354
377
|
if (g == NULL) rb_fatal("No memory for gram");
|
378
|
+
|
355
379
|
(*g).freq = 1;
|
356
380
|
(*g).fitness = 0.0;
|
357
381
|
store(key, (intptr_t) g);
|
data/lib/categorize.rb
CHANGED
data/lib/categorize/models/bag_of_words.rb
CHANGED
@@ -8,16 +8,19 @@ module Categorize
|
|
8
8
|
include Utils::Grams
|
9
9
|
|
10
10
|
# DEBUG = false
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
11
|
+
attr_accessor :max_buckets, :min_support, :num_top_grams
|
12
|
+
|
13
|
+
# 0 <= min_support <= 1, we like 0.01 <= min_support <= 0.1
|
14
|
+
def initialize
|
15
|
+
@max_buckets = 8
|
16
|
+
# TODO: some gradient descent to choose this number
|
17
|
+
@min_support = 0.07
|
18
|
+
@num_top_grams = 250
|
19
|
+
end
|
17
20
|
|
18
21
|
# function worst case
|
19
22
|
# O(2 x (|frequent_grams| x |gram_collections|) +
|
20
|
-
# |all_grams| +
|
23
|
+
# |all_grams| + @max_buckets x |gram_collections|)
|
21
24
|
def model(query, records_to_tokens)
|
22
25
|
@gram_cover_cache = {}
|
23
26
|
@gram_collections, @all_grams = create_grams(query, records_to_tokens)
|
@@ -25,9 +28,9 @@ module Categorize
|
|
25
28
|
top_grams = determine_frequency_term_sets(@all_grams, query)
|
26
29
|
top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
|
27
30
|
top_grams[gram_c1] <=> top_grams[gram_c2]
|
28
|
-
end.first(
|
31
|
+
end.first(@max_buckets)
|
29
32
|
|
30
|
-
# below block, worst case O(
|
33
|
+
# below block, worst case O(@max_buckets x |gram_collections|)
|
31
34
|
@gram_collections.reduce({}) do |buckets, gram_collection|
|
32
35
|
max_fitness = 0
|
33
36
|
max_fit = nil
|
@@ -55,14 +58,13 @@ module Categorize
|
|
55
58
|
result.grams.nil? || result.grams.empty?
|
56
59
|
end.length
|
57
60
|
|
58
|
-
min_cover_l =
|
59
|
-
# min_cover_h = MIN_SUPP_H * effective_length
|
61
|
+
min_cover_l = @min_support * effective_length
|
60
62
|
|
61
63
|
# for speed only look at top N grams
|
62
64
|
# below block, worst case O(|all_grams|)
|
63
65
|
frequent_grams = all_grams.sort do |gram1, gram2|
|
64
66
|
gram2.frequency <=> gram1.frequency
|
65
|
-
end.first(
|
67
|
+
end.first(@num_top_grams)
|
66
68
|
|
67
69
|
# below block, worst case O(|frequent_grams| x |gram_collections|)
|
68
70
|
frequent_grams = frequent_grams.delete_if do |gram|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: categorize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.9
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Lubell-Doughtie
|
@@ -16,7 +16,7 @@ description: ! "A text categorization library that favors performance.\n
|
|
16
16
|
email: peter@helioid.com
|
17
17
|
executables: []
|
18
18
|
extensions:
|
19
|
-
- ext/categorize/extconf.rb
|
19
|
+
- ext/ccategorize/extconf.rb
|
20
20
|
extra_rdoc_files: []
|
21
21
|
files:
|
22
22
|
- lib/categorize/constants.rb
|
@@ -29,8 +29,8 @@ files:
|
|
29
29
|
- lib/categorize/models/bag_of_words.rb
|
30
30
|
- lib/categorize/models/cluster.rb
|
31
31
|
- lib/categorize.rb
|
32
|
-
- ext/categorize/categorize.c
|
33
|
-
- ext/categorize/extconf.rb
|
32
|
+
- ext/ccategorize/ccategorize.c
|
33
|
+
- ext/ccategorize/extconf.rb
|
34
34
|
homepage: http://www.helioid.com/
|
35
35
|
licenses:
|
36
36
|
- BSD3
|
data/ext/categorize/extconf.rb
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
# Loads mkmf which is used to make makefiles for Ruby extensions
|
4
|
-
require 'mkmf'
|
5
|
-
|
6
|
-
# Give it a name
|
7
|
-
extension_name = 'bow'
|
8
|
-
|
9
|
-
# The destination
|
10
|
-
dir_config(extension_name)
|
11
|
-
|
12
|
-
# Do the work
|
13
|
-
create_makefile(extension_name)
|