categorize 0.0.9 → 0.0.10

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZmM1YWY2ZTJiZDg4MDI1ZjhiNGNiODBiYzgwNTZhMTEzMWRkZWIxZA==
4
+ YTFmZGJlMWI3YjUxNGQ2MGE3OGJmMjIzN2ZiYTFmNjVhMzE0OGRkMA==
5
5
  data.tar.gz: !binary |-
6
- YTBiYmE0ZTZjYjFlNTVjMzQ1ZDQxZmE1ZTM4NDhhY2IxMmY4YmU5Mg==
6
+ NDAxNDY2MDNlYjBmYjc0NTIzY2JjYmU4NjE3YTBiMTQ1ZjA5ZjZjMg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- MDNmZmZmYTFjMjk3MDZlYWQ2YjE5N2MyNTM0NWEyODlhODkwOGJmZGVjMjIx
10
- OTk0MTAzYzA5Yzg0OWEzNmFlZGRjMmM0ZWJkNWU4ZDVlN2UxNzRhMmM5MDQ1
11
- Y2VmMGNhMDgxNjY3ZGQxM2MwMTllMjM2MzZiODJmNjgzNzczN2E=
9
+ MDk4YjVhYjBhMWMzNGMwZmZlYmFiNTZlZWNiNjNkNjUxM2QzNjkxNzU2NWRk
10
+ MGQzNzJkZTY5ZmJhM2Q1ZmMyNDM2ZDEzZjk1Mzg4ODE2NWFlMmVhODBkNGZh
11
+ YTg0NGY4YTM0YzM0M2E0M2YyNjE5NjY4NzUyOTg4YzI1ODM5NGI=
12
12
  data.tar.gz: !binary |-
13
- ZGI1MDQzOTZjY2Y1YjFmMGMxMTJmMWU0OTg5MGM5NTJlN2U4ZGZlYzEzYmI4
14
- OTEzODY3ODJkMGQ5NzgxZDU2M2YxNGRiMDgyMmI3NGZkNzFmZWEzYTUwMjUy
15
- NDViM2E5NDE3MjRkYTcxYzY5NzRjMTNmOTE2MGExNzkyYzQxNjE=
13
+ MmY1MmIyZjAyYWVmMzI1NzExMjk2ODFhMTgxOTg2ZDMzYTJhYzBlYWExZDdj
14
+ NDM4ODU0NTllMjdjOWI4NjMwMzBmODhhN2UxODE2MmZmMjFhNzBkMTczMzlj
15
+ ZjBjNWZiMDc5MTM3NDI5ZGI2ZmNiYzZmMTVjYzlhMzIzNjBiZjE=
@@ -4,11 +4,15 @@
4
4
  #include <stdlib.h> /* exit() */
5
5
  #include "ruby.h"
6
6
 
7
+ typedef enum { false, true } bool;
8
+
7
9
  // START header
8
10
  // For information and references about the module to be stored internally.
9
- VALUE Bow = Qnil;
11
+ VALUE Categorize = Qnil;
12
+ VALUE CBagOfWords = Qnil;
13
+ VALUE Models = Qnil;
10
14
 
11
- static VALUE method_model_bow(VALUE, VALUE);
15
+ static VALUE method_make_model(VALUE, VALUE);
12
16
  static int add_or_update_gram_from_index(int, char *);
13
17
 
14
18
  // Store all grams, used in compare_top_grams.
@@ -16,10 +20,12 @@ static char **all_grams_pp;
16
20
  // END header
17
21
 
18
22
  // Initialization method for this module.
19
- void Init_bow()
23
+ void Init_ccategorize()
20
24
  {
21
- Bow = rb_define_module("Bow");
22
- rb_define_method(Bow, "model_bow", method_model_bow, 1);
25
+ Categorize = rb_define_module("CCategorize");
26
+ Models = rb_define_module_under(Categorize, "Models");
27
+ CBagOfWords = rb_define_class_under(Models, "CBagOfWords", rb_cObject);
28
+ rb_define_method(CBagOfWords, "make_model", method_make_model, 1);
23
29
  }
24
30
 
25
31
  const bool DEBUG = false;
@@ -92,7 +98,8 @@ int compare_grams(const void *gram1, const void *gram2)
92
98
  {
93
99
  intptr_t g1, g2;
94
100
 
95
- if (fetch(*(const char **) gram1, &g1) && fetch(*(const char **) gram2, &g2)) {
101
+ if (fetch(*(const char **) gram1, &g1) &&
102
+ fetch(*(const char **) gram2, &g2)) {
96
103
  return (*(gram *) g2).freq - (*(gram *) g1).freq;
97
104
  } else
98
105
  fail("compare_grams");
@@ -115,17 +122,37 @@ int compare_top_grams(const void *idx1, const void *idx2)
115
122
  }
116
123
 
117
124
  /*
118
- * model_bow(array_of_tokens);
125
+ * make_model(array_of_tokens);
119
126
  * ==== Return
120
127
  * Top terms
121
128
  * ==== Parameters
122
129
  * array_of_tokens: Tokens to turn into grams and extract phrases from.
123
130
  */
124
- static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
131
+ static VALUE method_make_model(VALUE self, VALUE array_of_tokens)
125
132
  {
126
133
  int i, j;
127
134
  long array_of_tokens_len = RARRAY_LEN(array_of_tokens);
128
135
  int num_grams = 0;
136
+ int gram_counter;
137
+ char *tmp;
138
+ char *str;
139
+ char *bigram;
140
+ char *trigram;
141
+ char *last_word;
142
+ char *last_2nd_word;
143
+ int non_empty_tokens;
144
+ int tmp_int;
145
+ int min_cover;
146
+ int num_top_grams;
147
+ int top_gram_counter;
148
+ float max_fitness;
149
+ int max_fit_idx;
150
+ VALUE term;
151
+ VALUE term_for_record;
152
+ intptr_t g, all_g;
153
+ int count;
154
+ char *key;
155
+ char *max_fit;
129
156
 
130
157
  for (i = 0; i < array_of_tokens_len; i++) {
131
158
  // n + n - 1 + n - 2 = 3n - 3 = 3(n - 1)
@@ -142,15 +169,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
142
169
  all_grams_pp = malloc(sizeof(char *) * num_grams);
143
170
  if (all_grams_pp == NULL) rb_fatal("No memory for all_grams_pp");
144
171
 
145
- int gram_counter = 0;
146
- char *tmp;
147
- char *str;
148
- char *bigram;
149
- char *trigram;
150
- char *last_word;
151
- char *last_2nd_word;
152
- int non_empty_tokens = 0;
153
- int tmp_int;
172
+ gram_counter = 0;
173
+ non_empty_tokens = 0;
154
174
 
155
175
  for (i = 0; i < array_of_tokens_len; i++) {
156
176
  // n grams
@@ -160,6 +180,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
160
180
 
161
181
  for (j = 0; j < RARRAY_LEN(rb_ary_entry(array_of_tokens, i)); j++) {
162
182
  VALUE rb_str = rb_ary_entry(rb_ary_entry(array_of_tokens, i), j);
183
+
163
184
  // store str via malloc so we can free it along with others
164
185
  tmp = StringValueCStr(rb_str);
165
186
  tmp_int = 1 + strlen(tmp);
@@ -209,7 +230,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
209
230
  if (j > 0) non_empty_tokens++;
210
231
  if (DEBUG) printf("end i: %i\n", i);
211
232
  }
212
- int min_cover = (int) (MIN_SUPPORT * non_empty_tokens);
233
+
234
+ min_cover = (int) (MIN_SUPPORT * non_empty_tokens);
213
235
 
214
236
  if (DEBUG) printf("added %i grams\n", gram_counter);
215
237
 
@@ -217,7 +239,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
217
239
  qsort(all_grams_pp, gram_counter, sizeof(char *), compare_grams);
218
240
 
219
241
  // only consider prominent top NUM_TOP_GRAMS grams
220
- int num_top_grams = gram_counter < NUM_TOP_GRAMS ? gram_counter : NUM_TOP_GRAMS;
242
+ num_top_grams = gram_counter < NUM_TOP_GRAMS ? gram_counter :
243
+ NUM_TOP_GRAMS;
221
244
 
222
245
  if (DEBUG) printf("gc %i, ntg %i, atl: %li\n",
223
246
  gram_counter, num_top_grams, array_of_tokens_len);
@@ -226,10 +249,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
226
249
 
227
250
  if (top_grams_p == NULL) rb_fatal("No memory for top_grams_p");
228
251
 
229
- int top_gram_counter = 0;
230
- intptr_t g, all_g;
231
- int count;
232
- char *key;
252
+ top_gram_counter = 0;
233
253
 
234
254
  for (i = 0; i < num_top_grams; i++) {
235
255
  count = 0;
@@ -238,6 +258,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
238
258
 
239
259
  if (fetch(key, &g) && (*(gram *) g).freq > 0 && ++count > min_cover) {
240
260
  top_grams_p[top_gram_counter++] = i;
261
+
241
262
  if (DEBUG) printf("%i: covering gram: %s\n",
242
263
  top_gram_counter - 1, all_grams_pp[i]);
243
264
  break;
@@ -250,9 +271,6 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
250
271
  printf("tgc %i\n", top_gram_counter);
251
272
  }
252
273
 
253
- float max_fitness;
254
- char *max_fit;
255
-
256
274
  for (i = 0; i < array_of_tokens_len; i++) {
257
275
  if (DEBUG) printf("start i: %i\n", i);
258
276
 
@@ -261,7 +279,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
261
279
  key = make_key(i, all_grams_pp[top_grams_p[j]]);
262
280
 
263
281
  if (fetch(key, &g) && fetch(all_grams_pp[top_grams_p[j]], &all_g)) {
264
- (*(gram *) g).fitness = (float) (*(gram *) g).freq / (float) (*(gram *) all_g).freq;
282
+ (*(gram *) g).fitness = (float) (*(gram *) g).freq /
283
+ (float) (*(gram *) all_g).freq;
265
284
  if (DEBUG) printf("fitness %f\n", (*(gram *) g).fitness);
266
285
  }
267
286
 
@@ -283,6 +302,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
283
302
  }
284
303
 
285
304
  free(key);
305
+
286
306
  // store fitness of gram
287
307
  if (max_fit && fetch(max_fit, &g))
288
308
  (*(gram *) g).fitness += 1.0;
@@ -293,10 +313,10 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
293
313
 
294
314
  // sort top_grams and take MAX_BUCKETS
295
315
  qsort(top_grams_p, top_gram_counter, sizeof(int), compare_top_grams);
316
+
296
317
  if (DEBUG) printf("after qsort top grams\n");
297
318
 
298
- int max_fit_idx;
299
- VALUE term_for_record = rb_ary_new2(array_of_tokens_len);
319
+ term_for_record = rb_ary_new2(array_of_tokens_len);
300
320
 
301
321
  for (i = 0; i < array_of_tokens_len; i++) {
302
322
  max_fitness = 0;
@@ -313,9 +333,10 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
313
333
  free(key);
314
334
  }
315
335
 
316
- VALUE term = rb_str_new2(all_grams_pp[top_grams_p[max_fit_idx]]);
336
+ term = rb_str_new2(all_grams_pp[top_grams_p[max_fit_idx]]);
317
337
  rb_ary_push(term_for_record, term);
318
338
  }
339
+
319
340
  if (DEBUG) printf("after qsort top grams\n");
320
341
  if (DEBUG) printf("freeing\n");
321
342
 
@@ -334,6 +355,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
334
355
 
335
356
  free(all_grams_pp);
336
357
  if (DEBUG) printf("freed all grams\n");
358
+
337
359
  hdestroy();
338
360
  if (DEBUG) printf("returning\n");
339
361
 
@@ -344,6 +366,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
344
366
  int add_or_update_gram(char *key)
345
367
  {
346
368
  intptr_t g;
369
+
347
370
  if (fetch(key, &g)) {
348
371
  (*(gram *) g).freq += 1;
349
372
  if (DEBUG) printf("key: %s, freq: %i\n", key, (*(gram *) g).freq);
@@ -352,6 +375,7 @@ int add_or_update_gram(char *key)
352
375
  } else {
353
376
  gram *g = malloc(sizeof(gram));
354
377
  if (g == NULL) rb_fatal("No memory for gram");
378
+
355
379
  (*g).freq = 1;
356
380
  (*g).fitness = 0.0;
357
381
  store(key, (intptr_t) g);
@@ -0,0 +1,7 @@
1
+ # encoding: utf-8
2
+
3
+ # Loads mkmf which is used to make makefiles for Ruby extensions
4
+ require 'mkmf'
5
+
6
+ # Do the work
7
+ create_makefile('ccategorize/ccategorize')
data/lib/categorize.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  # encoding: utf-8
2
2
 
3
+ require 'ccategorize/ccategorize'
3
4
  require 'categorize/models/abstract_model'
4
5
  require 'categorize/models/bag_of_words'
5
6
  require 'categorize/models/cluster'
@@ -8,16 +8,19 @@ module Categorize
8
8
  include Utils::Grams
9
9
 
10
10
  # DEBUG = false
11
- # TODO: some gradient descent to choose this number
12
- # 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
13
- MIN_SUPP_L = 0.07
14
- MIN_SUPP_H = 0.1
15
- NUM_TOP_GRAMS = 250
16
- MAX_BUCKETS = 8
11
+ attr_accessor :max_buckets, :min_support, :num_top_grams
12
+
13
+ # 0 <= min_support <= 1, we like 0.01 <= min_support <= 0.1
14
+ def initialize
15
+ @max_buckets = 8
16
+ # TODO: some gradient descent to choose this number
17
+ @min_support = 0.07
18
+ @num_top_grams = 250
19
+ end
17
20
 
18
21
  # function worst case
19
22
  # O(2 x (|frequent_grams| x |gram_collections|) +
20
- # |all_grams| + MAX_BUCKETS x |gram_collections|)
23
+ # |all_grams| + @max_buckets x |gram_collections|)
21
24
  def model(query, records_to_tokens)
22
25
  @gram_cover_cache = {}
23
26
  @gram_collections, @all_grams = create_grams(query, records_to_tokens)
@@ -25,9 +28,9 @@ module Categorize
25
28
  top_grams = determine_frequency_term_sets(@all_grams, query)
26
29
  top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
27
30
  top_grams[gram_c1] <=> top_grams[gram_c2]
28
- end.first(MAX_BUCKETS)
31
+ end.first(@max_buckets)
29
32
 
30
- # below block, worst case O(MAX_BUCKETS x |gram_collections|)
33
+ # below block, worst case O(@max_buckets x |gram_collections|)
31
34
  @gram_collections.reduce({}) do |buckets, gram_collection|
32
35
  max_fitness = 0
33
36
  max_fit = nil
@@ -55,14 +58,13 @@ module Categorize
55
58
  result.grams.nil? || result.grams.empty?
56
59
  end.length
57
60
 
58
- min_cover_l = MIN_SUPP_L * effective_length
59
- # min_cover_h = MIN_SUPP_H * effective_length
61
+ min_cover_l = @min_support * effective_length
60
62
 
61
63
  # for speed only look at top N grams
62
64
  # below block, worst case O(|all_grams|)
63
65
  frequent_grams = all_grams.sort do |gram1, gram2|
64
66
  gram2.frequency <=> gram1.frequency
65
- end.first(NUM_TOP_GRAMS)
67
+ end.first(@num_top_grams)
66
68
 
67
69
  # below block, worst case O(|frequent_grams| x |gram_collections|)
68
70
  frequent_grams = frequent_grams.delete_if do |gram|
@@ -4,6 +4,8 @@ module Categorize
4
4
  module Models
5
5
  class Cluster < AbstractModel
6
6
 
7
+ attr_accessor :num_clusters
8
+
7
9
  def initialize
8
10
  @num_clusters = 10
9
11
  @clusterer = Ai4r::Clusterers::WardLinkage.new
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: categorize
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Lubell-Doughtie
@@ -16,7 +16,7 @@ description: ! "A text categorization library that favors performance.\n
16
16
  email: peter@helioid.com
17
17
  executables: []
18
18
  extensions:
19
- - ext/categorize/extconf.rb
19
+ - ext/ccategorize/extconf.rb
20
20
  extra_rdoc_files: []
21
21
  files:
22
22
  - lib/categorize/constants.rb
@@ -29,8 +29,8 @@ files:
29
29
  - lib/categorize/models/bag_of_words.rb
30
30
  - lib/categorize/models/cluster.rb
31
31
  - lib/categorize.rb
32
- - ext/categorize/categorize.c
33
- - ext/categorize/extconf.rb
32
+ - ext/ccategorize/ccategorize.c
33
+ - ext/ccategorize/extconf.rb
34
34
  homepage: http://www.helioid.com/
35
35
  licenses:
36
36
  - BSD3
@@ -1,13 +0,0 @@
1
- # encoding: utf-8
2
-
3
- # Loads mkmf which is used to make makefiles for Ruby extensions
4
- require 'mkmf'
5
-
6
- # Give it a name
7
- extension_name = 'bow'
8
-
9
- # The destination
10
- dir_config(extension_name)
11
-
12
- # Do the work
13
- create_makefile(extension_name)