categorize 0.0.9 → 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZmM1YWY2ZTJiZDg4MDI1ZjhiNGNiODBiYzgwNTZhMTEzMWRkZWIxZA==
4
+ YTFmZGJlMWI3YjUxNGQ2MGE3OGJmMjIzN2ZiYTFmNjVhMzE0OGRkMA==
5
5
  data.tar.gz: !binary |-
6
- YTBiYmE0ZTZjYjFlNTVjMzQ1ZDQxZmE1ZTM4NDhhY2IxMmY4YmU5Mg==
6
+ NDAxNDY2MDNlYjBmYjc0NTIzY2JjYmU4NjE3YTBiMTQ1ZjA5ZjZjMg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- MDNmZmZmYTFjMjk3MDZlYWQ2YjE5N2MyNTM0NWEyODlhODkwOGJmZGVjMjIx
10
- OTk0MTAzYzA5Yzg0OWEzNmFlZGRjMmM0ZWJkNWU4ZDVlN2UxNzRhMmM5MDQ1
11
- Y2VmMGNhMDgxNjY3ZGQxM2MwMTllMjM2MzZiODJmNjgzNzczN2E=
9
+ MDk4YjVhYjBhMWMzNGMwZmZlYmFiNTZlZWNiNjNkNjUxM2QzNjkxNzU2NWRk
10
+ MGQzNzJkZTY5ZmJhM2Q1ZmMyNDM2ZDEzZjk1Mzg4ODE2NWFlMmVhODBkNGZh
11
+ YTg0NGY4YTM0YzM0M2E0M2YyNjE5NjY4NzUyOTg4YzI1ODM5NGI=
12
12
  data.tar.gz: !binary |-
13
- ZGI1MDQzOTZjY2Y1YjFmMGMxMTJmMWU0OTg5MGM5NTJlN2U4ZGZlYzEzYmI4
14
- OTEzODY3ODJkMGQ5NzgxZDU2M2YxNGRiMDgyMmI3NGZkNzFmZWEzYTUwMjUy
15
- NDViM2E5NDE3MjRkYTcxYzY5NzRjMTNmOTE2MGExNzkyYzQxNjE=
13
+ MmY1MmIyZjAyYWVmMzI1NzExMjk2ODFhMTgxOTg2ZDMzYTJhYzBlYWExZDdj
14
+ NDM4ODU0NTllMjdjOWI4NjMwMzBmODhhN2UxODE2MmZmMjFhNzBkMTczMzlj
15
+ ZjBjNWZiMDc5MTM3NDI5ZGI2ZmNiYzZmMTVjYzlhMzIzNjBiZjE=
data/ext/ccategorize/ccategorize.c CHANGED
@@ -4,11 +4,15 @@
4
4
  #include <stdlib.h> /* exit() */
5
5
  #include "ruby.h"
6
6
 
7
+ typedef enum { false, true } bool;
8
+
7
9
  // START header
8
10
  // For information and references about the module to be stored internally.
9
- VALUE Bow = Qnil;
11
+ VALUE Categorize = Qnil;
12
+ VALUE CBagOfWords = Qnil;
13
+ VALUE Models = Qnil;
10
14
 
11
- static VALUE method_model_bow(VALUE, VALUE);
15
+ static VALUE method_make_model(VALUE, VALUE);
12
16
  static int add_or_update_gram_from_index(int, char *);
13
17
 
14
18
  // Store all grams, used in compare_top_grams.
@@ -16,10 +20,12 @@ static char **all_grams_pp;
16
20
  // END header
17
21
 
18
22
  // Initialization method for this module.
19
- void Init_bow()
23
+ void Init_ccategorize()
20
24
  {
21
- Bow = rb_define_module("Bow");
22
- rb_define_method(Bow, "model_bow", method_model_bow, 1);
25
+ Categorize = rb_define_module("CCategorize");
26
+ Models = rb_define_module_under(Categorize, "Models");
27
+ CBagOfWords = rb_define_class_under(Models, "CBagOfWords", rb_cObject);
28
+ rb_define_method(CBagOfWords, "make_model", method_make_model, 1);
23
29
  }
24
30
 
25
31
  const bool DEBUG = false;
@@ -92,7 +98,8 @@ int compare_grams(const void *gram1, const void *gram2)
92
98
  {
93
99
  intptr_t g1, g2;
94
100
 
95
- if (fetch(*(const char **) gram1, &g1) && fetch(*(const char **) gram2, &g2)) {
101
+ if (fetch(*(const char **) gram1, &g1) &&
102
+ fetch(*(const char **) gram2, &g2)) {
96
103
  return (*(gram *) g2).freq - (*(gram *) g1).freq;
97
104
  } else
98
105
  fail("compare_grams");
@@ -115,17 +122,37 @@ int compare_top_grams(const void *idx1, const void *idx2)
115
122
  }
116
123
 
117
124
  /*
118
- * model_bow(array_of_tokens);
125
+ * make_model(array_of_tokens);
119
126
  * ==== Return
120
127
  * Top terms
121
128
  * ==== Parameters
122
129
  * array_of_tokens: Tokens to turn into grams and extract phrases from.
123
130
  */
124
- static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
131
+ static VALUE method_make_model(VALUE self, VALUE array_of_tokens)
125
132
  {
126
133
  int i, j;
127
134
  long array_of_tokens_len = RARRAY_LEN(array_of_tokens);
128
135
  int num_grams = 0;
136
+ int gram_counter;
137
+ char *tmp;
138
+ char *str;
139
+ char *bigram;
140
+ char *trigram;
141
+ char *last_word;
142
+ char *last_2nd_word;
143
+ int non_empty_tokens;
144
+ int tmp_int;
145
+ int min_cover;
146
+ int num_top_grams;
147
+ int top_gram_counter;
148
+ float max_fitness;
149
+ int max_fit_idx;
150
+ VALUE term;
151
+ VALUE term_for_record;
152
+ intptr_t g, all_g;
153
+ int count;
154
+ char *key;
155
+ char *max_fit;
129
156
 
130
157
  for (i = 0; i < array_of_tokens_len; i++) {
131
158
  // n + n - 1 + n - 2 = 3n - 3 = 3(n - 1)
@@ -142,15 +169,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
142
169
  all_grams_pp = malloc(sizeof(char *) * num_grams);
143
170
  if (all_grams_pp == NULL) rb_fatal("No memory for all_grams_pp");
144
171
 
145
- int gram_counter = 0;
146
- char *tmp;
147
- char *str;
148
- char *bigram;
149
- char *trigram;
150
- char *last_word;
151
- char *last_2nd_word;
152
- int non_empty_tokens = 0;
153
- int tmp_int;
172
+ gram_counter = 0;
173
+ non_empty_tokens = 0;
154
174
 
155
175
  for (i = 0; i < array_of_tokens_len; i++) {
156
176
  // n grams
@@ -160,6 +180,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
160
180
 
161
181
  for (j = 0; j < RARRAY_LEN(rb_ary_entry(array_of_tokens, i)); j++) {
162
182
  VALUE rb_str = rb_ary_entry(rb_ary_entry(array_of_tokens, i), j);
183
+
163
184
  // store str via malloc so we can free it along with others
164
185
  tmp = StringValueCStr(rb_str);
165
186
  tmp_int = 1 + strlen(tmp);
@@ -209,7 +230,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
209
230
  if (j > 0) non_empty_tokens++;
210
231
  if (DEBUG) printf("end i: %i\n", i);
211
232
  }
212
- int min_cover = (int) (MIN_SUPPORT * non_empty_tokens);
233
+
234
+ min_cover = (int) (MIN_SUPPORT * non_empty_tokens);
213
235
 
214
236
  if (DEBUG) printf("added %i grams\n", gram_counter);
215
237
 
@@ -217,7 +239,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
217
239
  qsort(all_grams_pp, gram_counter, sizeof(char *), compare_grams);
218
240
 
219
241
  // only consider prominent top NUM_TOP_GRAMS grams
220
- int num_top_grams = gram_counter < NUM_TOP_GRAMS ? gram_counter : NUM_TOP_GRAMS;
242
+ num_top_grams = gram_counter < NUM_TOP_GRAMS ? gram_counter :
243
+ NUM_TOP_GRAMS;
221
244
 
222
245
  if (DEBUG) printf("gc %i, ntg %i, atl: %li\n",
223
246
  gram_counter, num_top_grams, array_of_tokens_len);
@@ -226,10 +249,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
226
249
 
227
250
  if (top_grams_p == NULL) rb_fatal("No memory for top_grams_p");
228
251
 
229
- int top_gram_counter = 0;
230
- intptr_t g, all_g;
231
- int count;
232
- char *key;
252
+ top_gram_counter = 0;
233
253
 
234
254
  for (i = 0; i < num_top_grams; i++) {
235
255
  count = 0;
@@ -238,6 +258,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
238
258
 
239
259
  if (fetch(key, &g) && (*(gram *) g).freq > 0 && ++count > min_cover) {
240
260
  top_grams_p[top_gram_counter++] = i;
261
+
241
262
  if (DEBUG) printf("%i: covering gram: %s\n",
242
263
  top_gram_counter - 1, all_grams_pp[i]);
243
264
  break;
@@ -250,9 +271,6 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
250
271
  printf("tgc %i\n", top_gram_counter);
251
272
  }
252
273
 
253
- float max_fitness;
254
- char *max_fit;
255
-
256
274
  for (i = 0; i < array_of_tokens_len; i++) {
257
275
  if (DEBUG) printf("start i: %i\n", i);
258
276
 
@@ -261,7 +279,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
261
279
  key = make_key(i, all_grams_pp[top_grams_p[j]]);
262
280
 
263
281
  if (fetch(key, &g) && fetch(all_grams_pp[top_grams_p[j]], &all_g)) {
264
- (*(gram *) g).fitness = (float) (*(gram *) g).freq / (float) (*(gram *) all_g).freq;
282
+ (*(gram *) g).fitness = (float) (*(gram *) g).freq /
283
+ (float) (*(gram *) all_g).freq;
265
284
  if (DEBUG) printf("fitness %f\n", (*(gram *) g).fitness);
266
285
  }
267
286
 
@@ -283,6 +302,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
283
302
  }
284
303
 
285
304
  free(key);
305
+
286
306
  // store fitness of gram
287
307
  if (max_fit && fetch(max_fit, &g))
288
308
  (*(gram *) g).fitness += 1.0;
@@ -293,10 +313,10 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
293
313
 
294
314
  // sort top_grams and take MAX_BUCKETS
295
315
  qsort(top_grams_p, top_gram_counter, sizeof(int), compare_top_grams);
316
+
296
317
  if (DEBUG) printf("after qsort top grams\n");
297
318
 
298
- int max_fit_idx;
299
- VALUE term_for_record = rb_ary_new2(array_of_tokens_len);
319
+ term_for_record = rb_ary_new2(array_of_tokens_len);
300
320
 
301
321
  for (i = 0; i < array_of_tokens_len; i++) {
302
322
  max_fitness = 0;
@@ -313,9 +333,10 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
313
333
  free(key);
314
334
  }
315
335
 
316
- VALUE term = rb_str_new2(all_grams_pp[top_grams_p[max_fit_idx]]);
336
+ term = rb_str_new2(all_grams_pp[top_grams_p[max_fit_idx]]);
317
337
  rb_ary_push(term_for_record, term);
318
338
  }
339
+
319
340
  if (DEBUG) printf("after qsort top grams\n");
320
341
  if (DEBUG) printf("freeing\n");
321
342
 
@@ -334,6 +355,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
334
355
 
335
356
  free(all_grams_pp);
336
357
  if (DEBUG) printf("freed all grams\n");
358
+
337
359
  hdestroy();
338
360
  if (DEBUG) printf("returning\n");
339
361
 
@@ -344,6 +366,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
344
366
  int add_or_update_gram(char *key)
345
367
  {
346
368
  intptr_t g;
369
+
347
370
  if (fetch(key, &g)) {
348
371
  (*(gram *) g).freq += 1;
349
372
  if (DEBUG) printf("key: %s, freq: %i\n", key, (*(gram *) g).freq);
@@ -352,6 +375,7 @@ int add_or_update_gram(char *key)
352
375
  } else {
353
376
  gram *g = malloc(sizeof(gram));
354
377
  if (g == NULL) rb_fatal("No memory for gram");
378
+
355
379
  (*g).freq = 1;
356
380
  (*g).fitness = 0.0;
357
381
  store(key, (intptr_t) g);
data/ext/ccategorize/extconf.rb ADDED
@@ -0,0 +1,7 @@
1
+ # encoding: utf-8
2
+
3
+ # Loads mkmf which is used to make makefiles for Ruby extensions
4
+ require 'mkmf'
5
+
6
+ # Do the work
7
+ create_makefile('ccategorize/ccategorize')
data/lib/categorize.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  # encoding: utf-8
2
2
 
3
+ require 'ccategorize/ccategorize'
3
4
  require 'categorize/models/abstract_model'
4
5
  require 'categorize/models/bag_of_words'
5
6
  require 'categorize/models/cluster'
data/lib/categorize/models/bag_of_words.rb CHANGED
@@ -8,16 +8,19 @@ module Categorize
8
8
  include Utils::Grams
9
9
 
10
10
  # DEBUG = false
11
- # TODO: some gradient descent to choose this number
12
- # 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
13
- MIN_SUPP_L = 0.07
14
- MIN_SUPP_H = 0.1
15
- NUM_TOP_GRAMS = 250
16
- MAX_BUCKETS = 8
11
+ attr_accessor :max_buckets, :min_support, :num_top_grams
12
+
13
+ # 0 <= min_support <= 1, we like 0.01 <= min_support <= 0.1
14
+ def initialize
15
+ @max_buckets = 8
16
+ # TODO: some gradient descent to choose this number
17
+ @min_support = 0.07
18
+ @num_top_grams = 250
19
+ end
17
20
 
18
21
  # function worst case
19
22
  # O(2 x (|frequent_grams| x |gram_collections|) +
20
- # |all_grams| + MAX_BUCKETS x |gram_collections|)
23
+ # |all_grams| + @max_buckets x |gram_collections|)
21
24
  def model(query, records_to_tokens)
22
25
  @gram_cover_cache = {}
23
26
  @gram_collections, @all_grams = create_grams(query, records_to_tokens)
@@ -25,9 +28,9 @@ module Categorize
25
28
  top_grams = determine_frequency_term_sets(@all_grams, query)
26
29
  top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
27
30
  top_grams[gram_c1] <=> top_grams[gram_c2]
28
- end.first(MAX_BUCKETS)
31
+ end.first(@max_buckets)
29
32
 
30
- # below block, worst case O(MAX_BUCKETS x |gram_collections|)
33
+ # below block, worst case O(@max_buckets x |gram_collections|)
31
34
  @gram_collections.reduce({}) do |buckets, gram_collection|
32
35
  max_fitness = 0
33
36
  max_fit = nil
@@ -55,14 +58,13 @@ module Categorize
55
58
  result.grams.nil? || result.grams.empty?
56
59
  end.length
57
60
 
58
- min_cover_l = MIN_SUPP_L * effective_length
59
- # min_cover_h = MIN_SUPP_H * effective_length
61
+ min_cover_l = @min_support * effective_length
60
62
 
61
63
  # for speed only look at top N grams
62
64
  # below block, worst case O(|all_grams|)
63
65
  frequent_grams = all_grams.sort do |gram1, gram2|
64
66
  gram2.frequency <=> gram1.frequency
65
- end.first(NUM_TOP_GRAMS)
67
+ end.first(@num_top_grams)
66
68
 
67
69
  # below block, worst case O(|frequent_grams| x |gram_collections|)
68
70
  frequent_grams = frequent_grams.delete_if do |gram|
data/lib/categorize/models/cluster.rb CHANGED
@@ -4,6 +4,8 @@ module Categorize
4
4
  module Models
5
5
  class Cluster < AbstractModel
6
6
 
7
+ attr_accessor :num_clusters
8
+
7
9
  def initialize
8
10
  @num_clusters = 10
9
11
  @clusterer = Ai4r::Clusterers::WardLinkage.new
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: categorize
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Lubell-Doughtie
@@ -16,7 +16,7 @@ description: ! "A text categorization library that favors performance.\n
16
16
  email: peter@helioid.com
17
17
  executables: []
18
18
  extensions:
19
- - ext/categorize/extconf.rb
19
+ - ext/ccategorize/extconf.rb
20
20
  extra_rdoc_files: []
21
21
  files:
22
22
  - lib/categorize/constants.rb
@@ -29,8 +29,8 @@ files:
29
29
  - lib/categorize/models/bag_of_words.rb
30
30
  - lib/categorize/models/cluster.rb
31
31
  - lib/categorize.rb
32
- - ext/categorize/categorize.c
33
- - ext/categorize/extconf.rb
32
+ - ext/ccategorize/ccategorize.c
33
+ - ext/ccategorize/extconf.rb
34
34
  homepage: http://www.helioid.com/
35
35
  licenses:
36
36
  - BSD3
data/ext/categorize/extconf.rb REMOVED
@@ -1,13 +0,0 @@
1
- # encoding: utf-8
2
-
3
- # Loads mkmf which is used to make makefiles for Ruby extensions
4
- require 'mkmf'
5
-
6
- # Give it a name
7
- extension_name = 'bow'
8
-
9
- # The destination
10
- dir_config(extension_name)
11
-
12
- # Do the work
13
- create_makefile(extension_name)