categorize 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/ext/{categorize/categorize.c → ccategorize/ccategorize.c} +54 -30
- data/ext/ccategorize/extconf.rb +7 -0
- data/lib/categorize.rb +1 -0
- data/lib/categorize/models/bag_of_words.rb +14 -12
- data/lib/categorize/models/cluster.rb +2 -0
- metadata +4 -4
- data/ext/categorize/extconf.rb +0 -13
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YTFmZGJlMWI3YjUxNGQ2MGE3OGJmMjIzN2ZiYTFmNjVhMzE0OGRkMA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NDAxNDY2MDNlYjBmYjc0NTIzY2JjYmU4NjE3YTBiMTQ1ZjA5ZjZjMg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MDk4YjVhYjBhMWMzNGMwZmZlYmFiNTZlZWNiNjNkNjUxM2QzNjkxNzU2NWRk
|
10
|
+
MGQzNzJkZTY5ZmJhM2Q1ZmMyNDM2ZDEzZjk1Mzg4ODE2NWFlMmVhODBkNGZh
|
11
|
+
YTg0NGY4YTM0YzM0M2E0M2YyNjE5NjY4NzUyOTg4YzI1ODM5NGI=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MmY1MmIyZjAyYWVmMzI1NzExMjk2ODFhMTgxOTg2ZDMzYTJhYzBlYWExZDdj
|
14
|
+
NDM4ODU0NTllMjdjOWI4NjMwMzBmODhhN2UxODE2MmZmMjFhNzBkMTczMzlj
|
15
|
+
ZjBjNWZiMDc5MTM3NDI5ZGI2ZmNiYzZmMTVjYzlhMzIzNjBiZjE=
|
@@ -4,11 +4,15 @@
|
|
4
4
|
#include <stdlib.h> /* exit() */
|
5
5
|
#include "ruby.h"
|
6
6
|
|
7
|
+
typedef enum { false, true } bool;
|
8
|
+
|
7
9
|
// START header
|
8
10
|
// For information and references about the module to be stored internally.
|
9
|
-
VALUE
|
11
|
+
VALUE Categorize = Qnil;
|
12
|
+
VALUE CBagOfWords = Qnil;
|
13
|
+
VALUE Models = Qnil;
|
10
14
|
|
11
|
-
static VALUE method_model_bow(VALUE, VALUE);
|
15
|
+
static VALUE method_make_model(VALUE, VALUE);
|
12
16
|
static int add_or_update_gram_from_index(int, char *);
|
13
17
|
|
14
18
|
// Store all grams, used in compare_top_grams.
|
@@ -16,10 +20,12 @@ static char **all_grams_pp;
|
|
16
20
|
// END header
|
17
21
|
|
18
22
|
// Initialization method for this module.
|
19
|
-
void Init_bow()
|
23
|
+
void Init_ccategorize()
|
20
24
|
{
|
21
|
-
|
22
|
-
|
25
|
+
Categorize = rb_define_module("CCategorize");
|
26
|
+
Models = rb_define_module_under(Categorize, "Models");
|
27
|
+
CBagOfWords = rb_define_class_under(Models, "CBagOfWords", rb_cObject);
|
28
|
+
rb_define_method(CBagOfWords, "make_model", method_make_model, 1);
|
23
29
|
}
|
24
30
|
|
25
31
|
const bool DEBUG = false;
|
@@ -92,7 +98,8 @@ int compare_grams(const void *gram1, const void *gram2)
|
|
92
98
|
{
|
93
99
|
intptr_t g1, g2;
|
94
100
|
|
95
|
-
if (fetch(*(const char **) gram1, &g1) &&
|
101
|
+
if (fetch(*(const char **) gram1, &g1) &&
|
102
|
+
fetch(*(const char **) gram2, &g2)) {
|
96
103
|
return (*(gram *) g2).freq - (*(gram *) g1).freq;
|
97
104
|
} else
|
98
105
|
fail("compare_grams");
|
@@ -115,17 +122,37 @@ int compare_top_grams(const void *idx1, const void *idx2)
|
|
115
122
|
}
|
116
123
|
|
117
124
|
/*
|
118
|
-
*
|
125
|
+
* make_model(array_of_tokens);
|
119
126
|
* ==== Return
|
120
127
|
* Top terms
|
121
128
|
* ==== Parameters
|
122
129
|
* array_of_tokens: Tokens to turn into grams and extract phrases from.
|
123
130
|
*/
|
124
|
-
static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
131
|
+
static VALUE method_make_model(VALUE self, VALUE array_of_tokens)
|
125
132
|
{
|
126
133
|
int i, j;
|
127
134
|
long array_of_tokens_len = RARRAY_LEN(array_of_tokens);
|
128
135
|
int num_grams = 0;
|
136
|
+
int gram_counter;
|
137
|
+
char *tmp;
|
138
|
+
char *str;
|
139
|
+
char *bigram;
|
140
|
+
char *trigram;
|
141
|
+
char *last_word;
|
142
|
+
char *last_2nd_word;
|
143
|
+
int non_empty_tokens;
|
144
|
+
int tmp_int;
|
145
|
+
int min_cover;
|
146
|
+
int num_top_grams;
|
147
|
+
int top_gram_counter;
|
148
|
+
float max_fitness;
|
149
|
+
int max_fit_idx;
|
150
|
+
VALUE term;
|
151
|
+
VALUE term_for_record;
|
152
|
+
intptr_t g, all_g;
|
153
|
+
int count;
|
154
|
+
char *key;
|
155
|
+
char *max_fit;
|
129
156
|
|
130
157
|
for (i = 0; i < array_of_tokens_len; i++) {
|
131
158
|
// n + n - 1 + n - 2 = 3n - 3 = 3(n - 1)
|
@@ -142,15 +169,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
142
169
|
all_grams_pp = malloc(sizeof(char *) * num_grams);
|
143
170
|
if (all_grams_pp == NULL) rb_fatal("No memory for all_grams_pp");
|
144
171
|
|
145
|
-
|
146
|
-
|
147
|
-
char *str;
|
148
|
-
char *bigram;
|
149
|
-
char *trigram;
|
150
|
-
char *last_word;
|
151
|
-
char *last_2nd_word;
|
152
|
-
int non_empty_tokens = 0;
|
153
|
-
int tmp_int;
|
172
|
+
gram_counter = 0;
|
173
|
+
non_empty_tokens = 0;
|
154
174
|
|
155
175
|
for (i = 0; i < array_of_tokens_len; i++) {
|
156
176
|
// n grams
|
@@ -160,6 +180,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
160
180
|
|
161
181
|
for (j = 0; j < RARRAY_LEN(rb_ary_entry(array_of_tokens, i)); j++) {
|
162
182
|
VALUE rb_str = rb_ary_entry(rb_ary_entry(array_of_tokens, i), j);
|
183
|
+
|
163
184
|
// store str via malloc so we can free it along with others
|
164
185
|
tmp = StringValueCStr(rb_str);
|
165
186
|
tmp_int = 1 + strlen(tmp);
|
@@ -209,7 +230,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
209
230
|
if (j > 0) non_empty_tokens++;
|
210
231
|
if (DEBUG) printf("end i: %i\n", i);
|
211
232
|
}
|
212
|
-
|
233
|
+
|
234
|
+
min_cover = (int) (MIN_SUPPORT * non_empty_tokens);
|
213
235
|
|
214
236
|
if (DEBUG) printf("added %i grams\n", gram_counter);
|
215
237
|
|
@@ -217,7 +239,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
217
239
|
qsort(all_grams_pp, gram_counter, sizeof(char *), compare_grams);
|
218
240
|
|
219
241
|
// only consider prominent top NUM_TOP_GRAMS grams
|
220
|
-
|
242
|
+
num_top_grams = gram_counter < NUM_TOP_GRAMS ? gram_counter :
|
243
|
+
NUM_TOP_GRAMS;
|
221
244
|
|
222
245
|
if (DEBUG) printf("gc %i, ntg %i, atl: %li\n",
|
223
246
|
gram_counter, num_top_grams, array_of_tokens_len);
|
@@ -226,10 +249,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
226
249
|
|
227
250
|
if (top_grams_p == NULL) rb_fatal("No memory for top_grams_p");
|
228
251
|
|
229
|
-
|
230
|
-
intptr_t g, all_g;
|
231
|
-
int count;
|
232
|
-
char *key;
|
252
|
+
top_gram_counter = 0;
|
233
253
|
|
234
254
|
for (i = 0; i < num_top_grams; i++) {
|
235
255
|
count = 0;
|
@@ -238,6 +258,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
238
258
|
|
239
259
|
if (fetch(key, &g) && (*(gram *) g).freq > 0 && ++count > min_cover) {
|
240
260
|
top_grams_p[top_gram_counter++] = i;
|
261
|
+
|
241
262
|
if (DEBUG) printf("%i: covering gram: %s\n",
|
242
263
|
top_gram_counter - 1, all_grams_pp[i]);
|
243
264
|
break;
|
@@ -250,9 +271,6 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
250
271
|
printf("tgc %i\n", top_gram_counter);
|
251
272
|
}
|
252
273
|
|
253
|
-
float max_fitness;
|
254
|
-
char *max_fit;
|
255
|
-
|
256
274
|
for (i = 0; i < array_of_tokens_len; i++) {
|
257
275
|
if (DEBUG) printf("start i: %i\n", i);
|
258
276
|
|
@@ -261,7 +279,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
261
279
|
key = make_key(i, all_grams_pp[top_grams_p[j]]);
|
262
280
|
|
263
281
|
if (fetch(key, &g) && fetch(all_grams_pp[top_grams_p[j]], &all_g)) {
|
264
|
-
(*(gram *) g).fitness = (float) (*(gram *) g).freq /
|
282
|
+
(*(gram *) g).fitness = (float) (*(gram *) g).freq /
|
283
|
+
(float) (*(gram *) all_g).freq;
|
265
284
|
if (DEBUG) printf("fitness %f\n", (*(gram *) g).fitness);
|
266
285
|
}
|
267
286
|
|
@@ -283,6 +302,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
283
302
|
}
|
284
303
|
|
285
304
|
free(key);
|
305
|
+
|
286
306
|
// store fitness of gram
|
287
307
|
if (max_fit && fetch(max_fit, &g))
|
288
308
|
(*(gram *) g).fitness += 1.0;
|
@@ -293,10 +313,10 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
293
313
|
|
294
314
|
// sort top_grams and take MAX_BUCKETS
|
295
315
|
qsort(top_grams_p, top_gram_counter, sizeof(int), compare_top_grams);
|
316
|
+
|
296
317
|
if (DEBUG) printf("after qsort top grams\n");
|
297
318
|
|
298
|
-
|
299
|
-
VALUE term_for_record = rb_ary_new2(array_of_tokens_len);
|
319
|
+
term_for_record = rb_ary_new2(array_of_tokens_len);
|
300
320
|
|
301
321
|
for (i = 0; i < array_of_tokens_len; i++) {
|
302
322
|
max_fitness = 0;
|
@@ -313,9 +333,10 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
313
333
|
free(key);
|
314
334
|
}
|
315
335
|
|
316
|
-
|
336
|
+
term = rb_str_new2(all_grams_pp[top_grams_p[max_fit_idx]]);
|
317
337
|
rb_ary_push(term_for_record, term);
|
318
338
|
}
|
339
|
+
|
319
340
|
if (DEBUG) printf("after qsort top grams\n");
|
320
341
|
if (DEBUG) printf("freeing\n");
|
321
342
|
|
@@ -334,6 +355,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
334
355
|
|
335
356
|
free(all_grams_pp);
|
336
357
|
if (DEBUG) printf("freed all grams\n");
|
358
|
+
|
337
359
|
hdestroy();
|
338
360
|
if (DEBUG) printf("returning\n");
|
339
361
|
|
@@ -344,6 +366,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
|
|
344
366
|
int add_or_update_gram(char *key)
|
345
367
|
{
|
346
368
|
intptr_t g;
|
369
|
+
|
347
370
|
if (fetch(key, &g)) {
|
348
371
|
(*(gram *) g).freq += 1;
|
349
372
|
if (DEBUG) printf("key: %s, freq: %i\n", key, (*(gram *) g).freq);
|
@@ -352,6 +375,7 @@ int add_or_update_gram(char *key)
|
|
352
375
|
} else {
|
353
376
|
gram *g = malloc(sizeof(gram));
|
354
377
|
if (g == NULL) rb_fatal("No memory for gram");
|
378
|
+
|
355
379
|
(*g).freq = 1;
|
356
380
|
(*g).fitness = 0.0;
|
357
381
|
store(key, (intptr_t) g);
|
data/lib/categorize.rb
CHANGED
@@ -8,16 +8,19 @@ module Categorize
|
|
8
8
|
include Utils::Grams
|
9
9
|
|
10
10
|
# DEBUG = false
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
11
|
+
attr_accessor :max_buckets, :min_support, :num_top_grams
|
12
|
+
|
13
|
+
# 0 <= min_support <= 1, we like 0.01 <= min_support <= 0.1
|
14
|
+
def initialize
|
15
|
+
@max_buckets = 8
|
16
|
+
# TODO: some gradient descent to choose this number
|
17
|
+
@min_support = 0.07
|
18
|
+
@num_top_grams = 250
|
19
|
+
end
|
17
20
|
|
18
21
|
# function worst case
|
19
22
|
# O(2 x (|frequent_grams| x |gram_collections|) +
|
20
|
-
# |all_grams| +
|
23
|
+
# |all_grams| + @max_buckets x |gram_collections|)
|
21
24
|
def model(query, records_to_tokens)
|
22
25
|
@gram_cover_cache = {}
|
23
26
|
@gram_collections, @all_grams = create_grams(query, records_to_tokens)
|
@@ -25,9 +28,9 @@ module Categorize
|
|
25
28
|
top_grams = determine_frequency_term_sets(@all_grams, query)
|
26
29
|
top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
|
27
30
|
top_grams[gram_c1] <=> top_grams[gram_c2]
|
28
|
-
end.first(
|
31
|
+
end.first(@max_buckets)
|
29
32
|
|
30
|
-
# below block, worst case O(
|
33
|
+
# below block, worst case O(@max_buckets x |gram_collections|)
|
31
34
|
@gram_collections.reduce({}) do |buckets, gram_collection|
|
32
35
|
max_fitness = 0
|
33
36
|
max_fit = nil
|
@@ -55,14 +58,13 @@ module Categorize
|
|
55
58
|
result.grams.nil? || result.grams.empty?
|
56
59
|
end.length
|
57
60
|
|
58
|
-
min_cover_l = MIN_SUPP_L * effective_length
|
59
|
-
# min_cover_h = MIN_SUPP_H * effective_length
|
61
|
+
min_cover_l = @min_support * effective_length
|
60
62
|
|
61
63
|
# for speed only look at top N grams
|
62
64
|
# below block, worst case O(|all_grams|)
|
63
65
|
frequent_grams = all_grams.sort do |gram1, gram2|
|
64
66
|
gram2.frequency <=> gram1.frequency
|
65
|
-
end.first(
|
67
|
+
end.first(@num_top_grams)
|
66
68
|
|
67
69
|
# below block, worst case O(|frequent_grams| x |gram_collections|)
|
68
70
|
frequent_grams = frequent_grams.delete_if do |gram|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: categorize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.9
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Lubell-Doughtie
|
@@ -16,7 +16,7 @@ description: ! "A text categorization library that favors performance.\n
|
|
16
16
|
email: peter@helioid.com
|
17
17
|
executables: []
|
18
18
|
extensions:
|
19
|
-
- ext/categorize/extconf.rb
|
19
|
+
- ext/ccategorize/extconf.rb
|
20
20
|
extra_rdoc_files: []
|
21
21
|
files:
|
22
22
|
- lib/categorize/constants.rb
|
@@ -29,8 +29,8 @@ files:
|
|
29
29
|
- lib/categorize/models/bag_of_words.rb
|
30
30
|
- lib/categorize/models/cluster.rb
|
31
31
|
- lib/categorize.rb
|
32
|
-
- ext/categorize/categorize.c
|
33
|
-
- ext/categorize/extconf.rb
|
32
|
+
- ext/ccategorize/ccategorize.c
|
33
|
+
- ext/ccategorize/extconf.rb
|
34
34
|
homepage: http://www.helioid.com/
|
35
35
|
licenses:
|
36
36
|
- BSD3
|
data/ext/categorize/extconf.rb
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
# Loads mkmf which is used to make makefiles for Ruby extensions
|
4
|
-
require 'mkmf'
|
5
|
-
|
6
|
-
# Give it a name
|
7
|
-
extension_name = 'bow'
|
8
|
-
|
9
|
-
# The destination
|
10
|
-
dir_config(extension_name)
|
11
|
-
|
12
|
-
# Do the work
|
13
|
-
create_makefile(extension_name)
|