categorize 0.0.8 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- M2ZlOWRhNzM5YWYwNzQ3Yzg3OTZkMzQyOGNjOWI2ZGVlNGRjYTA5OA==
4
+ ZmM1YWY2ZTJiZDg4MDI1ZjhiNGNiODBiYzgwNTZhMTEzMWRkZWIxZA==
5
5
  data.tar.gz: !binary |-
6
- MjE4ODU1MjUyMjcxNDZkMDFkZWViZGI0MGU2MzMyYzY4NWUwMTE5Nw==
6
+ YTBiYmE0ZTZjYjFlNTVjMzQ1ZDQxZmE1ZTM4NDhhY2IxMmY4YmU5Mg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- YjlhMmZhYzZlZGRiZmZkMWJhODMxM2E3ZjRiNzc3YWY1ODVlZGZjODEzNWQw
10
- OGJlMGUxNGJiMmY3MWIzYjM3YTA1YWY5Mjg4YTc4NjYxY2FjZWMyNGM1ZDUy
11
- NmZlNzkwM2Y3ZTEyZjkyMmMxMzU5Mzk0NTE3MWI3N2Q3NGFkYzM=
9
+ MDNmZmZmYTFjMjk3MDZlYWQ2YjE5N2MyNTM0NWEyODlhODkwOGJmZGVjMjIx
10
+ OTk0MTAzYzA5Yzg0OWEzNmFlZGRjMmM0ZWJkNWU4ZDVlN2UxNzRhMmM5MDQ1
11
+ Y2VmMGNhMDgxNjY3ZGQxM2MwMTllMjM2MzZiODJmNjgzNzczN2E=
12
12
  data.tar.gz: !binary |-
13
- ZWYxODFiNDBmZWQ2NDkwMzEwZmYxNzUwOTZjMjcwMDViM2FhZTU1OWZiOGQ5
14
- NDM0MzJjZmQ2M2IzOGUzNTRmN2ExZDg4MGQxZDYwZjNiYWRmYTBjZGMyMDI1
15
- YmQ3YzA4NTA3YmEwYjhhNjUwMDIzYWJmZmFlMGE2OTQ5ZGFhM2U=
13
+ ZGI1MDQzOTZjY2Y1YjFmMGMxMTJmMWU0OTg5MGM5NTJlN2U4ZGZlYzEzYmI4
14
+ OTEzODY3ODJkMGQ5NzgxZDU2M2YxNGRiMDgyMmI3NGZkNzFmZWEzYTUwMjUy
15
+ NDViM2E5NDE3MjRkYTcxYzY5NzRjMTNmOTE2MGExNzkyYzQxNjE=
@@ -0,0 +1,370 @@
1
#include <inttypes.h> /* intptr_t, PRIxPTR */
#include <search.h> /* hcreate(), hsearch() */
#include <stdbool.h> /* bool, false */
#include <stdio.h> /* perror(), printf() */
#include <stdlib.h> /* exit() */
#include <string.h> /* strlen(), strcmp(), memcpy() */
#include "ruby.h"
6
+
7
+ // START header
8
+ // For information and references about the module to be stored internally.
9
+ VALUE Bow = Qnil;
10
+
11
+ static VALUE method_model_bow(VALUE, VALUE);
12
+ static int add_or_update_gram_from_index(int, char *);
13
+
14
+ // Store all grams, used in compare_top_grams.
15
+ static char **all_grams_pp;
16
+ // END header
17
+
18
+ // Initialization method for this module.
19
+ void Init_bow()
20
+ {
21
+ Bow = rb_define_module("Bow");
22
+ rb_define_method(Bow, "model_bow", method_model_bow, 1);
23
+ }
24
+
25
// When true, the functions in this file trace their progress with printf().
const bool DEBUG = false;
// Upper bound on how many of the fittest top grams are considered when a
// term is chosen for each record.
const int MAX_BUCKETS = 10;
// Fraction of the non-empty records that a gram has to exceed in coverage
// before it is accepted as a top gram.
const float MIN_SUPPORT = 0.1f;
// Only this many of the most frequent grams are examined for coverage.
const int NUM_TOP_GRAMS = 250;
29
+
30
/*
 * Report an unrecoverable error and terminate.
 *
 * message: label for the failing operation. It is printed by perror()
 *          together with the current errno description and then handed
 *          to rb_fatal().
 *
 * The message is passed to rb_fatal() through a "%s" format so that a
 * '%' inside it is never interpreted as a conversion specifier.
 */
void fail(const char *message)
{
  perror(message);
  rb_fatal("%s", message);
  /* Fallback in case rb_fatal() ever returns. */
  exit(1);
}
36
+
37
+ /*
38
+ * Must hcreate() the hash table before calling fetch() or store().
39
+ *
40
+ * Because p->data is a pointer, fetch() and store() cast between
41
+ * void * and intptr_t.
42
+ */
43
+
44
+ /* Fetch value from the hash table. */
45
+ int fetch(const char *key, intptr_t *value)
46
+ {
47
+ ENTRY e = {key: (char *)key}, *p;
48
+ p = hsearch(e, FIND);
49
+
50
+ if (p) {
51
+ *value = (intptr_t)p->data;
52
+ return 1;
53
+ } else
54
+ return 0;
55
+ }
56
+
57
+ /* Store key-value pair into the hash table. */
58
+ void store(const char *key, intptr_t value)
59
+ {
60
+ /*
61
+ * hsearch() may insert a new entry or find an existing entry
62
+ * with the same key. hsearch() ignores e.data if it finds an
63
+ * existing entry. We must call hsearch(), then set p->data.
64
+ */
65
+ ENTRY e = {key: (char *)key}, *p;
66
+ p = hsearch(e, ENTER);
67
+
68
+ if (p == NULL) fail("hsearch");
69
+
70
+ p->data = (void *)value;
71
+ }
72
+
73
/*
 * Build the per-record hash key "<i>_<str>".
 *
 * i:   record (group) index to prefix the gram with.
 * str: NUL-terminated gram text.
 *
 * Returns a newly malloc'd string that the caller must free() unless
 * ownership is handed to the hash table.
 *
 * The required length is measured with snprintf() instead of assuming a
 * one- or two-digit index, so indices of 100 and above (or negative
 * ones) no longer produce silently truncated keys.
 */
char *make_key(int i, char *str)
{
  int needed = snprintf(NULL, 0, "%i_%s", i, str);

  if (needed < 0) rb_fatal("Cannot format key %i", i);

  size_t size = (size_t) needed + 1;
  char *buf = malloc(size);

  if (buf == NULL) rb_fatal("No memory for key %i", i);

  snprintf(buf, size, "%i_%s", i, str);

  return buf;
}
85
+
86
/* Statistics kept in the hash table for one gram key. */
typedef struct {
  int freq;      /* occurrence count; starts at 1 and is incremented on repeats */
  float fitness; /* score assigned by method_model_bow(); starts at 0.0 */
} gram;
90
+
91
+ int compare_grams(const void *gram1, const void *gram2)
92
+ {
93
+ intptr_t g1, g2;
94
+
95
+ if (fetch(*(const char **) gram1, &g1) && fetch(*(const char **) gram2, &g2)) {
96
+ return (*(gram *) g2).freq - (*(gram *) g1).freq;
97
+ } else
98
+ fail("compare_grams");
99
+
100
+ return 0;
101
+ }
102
+
103
+ int compare_top_grams(const void *idx1, const void *idx2)
104
+ {
105
+ char *gram1 = all_grams_pp[*(int *) idx1];
106
+ char *gram2 = all_grams_pp[*(int *) idx2];
107
+ intptr_t g1, g2;
108
+
109
+ if (fetch(gram1, &g1) && fetch(gram2, &g2))
110
+ return (*(gram *) g2).fitness - (*(gram *) g1).fitness;
111
+ else
112
+ fail("compare_grams");
113
+
114
+ return 0;
115
+ }
116
+
117
+ /*
118
+ * model_bow(array_of_tokens);
119
+ * ==== Return
120
+ * Top terms
121
+ * ==== Parameters
122
+ * array_of_tokens: Tokens to turn into grams and extract phrases from.
123
+ */
124
+ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
125
+ {
126
+ int i, j;
127
+ long array_of_tokens_len = RARRAY_LEN(array_of_tokens);
128
+ int num_grams = 0;
129
+
130
+ for (i = 0; i < array_of_tokens_len; i++) {
131
+ // n + n - 1 + n - 2 = 3n - 3 = 3(n - 1)
132
+ // TODO Correct parentheses enclose as (n - 1).
133
+ num_grams += 3 * RARRAY_LEN(rb_ary_entry(array_of_tokens, i)) - 1;
134
+ }
135
+
136
+ // Create an empty table that can hold 50 entries.
137
+ if (DEBUG) printf("num grams: %i\n", num_grams);
138
+ if (hcreate(2 * num_grams) == 0)
139
+ fail("hcreate");
140
+
141
+ // list of all grams
142
+ all_grams_pp = malloc(sizeof(char *) * num_grams);
143
+ if (all_grams_pp == NULL) rb_fatal("No memory for all_grams_pp");
144
+
145
+ int gram_counter = 0;
146
+ char *tmp;
147
+ char *str;
148
+ char *bigram;
149
+ char *trigram;
150
+ char *last_word;
151
+ char *last_2nd_word;
152
+ int non_empty_tokens = 0;
153
+ int tmp_int;
154
+
155
+ for (i = 0; i < array_of_tokens_len; i++) {
156
+ // n grams
157
+ last_word = 0;
158
+ last_2nd_word = 0;
159
+ if (DEBUG) printf("start i: %i\n", i);
160
+
161
+ for (j = 0; j < RARRAY_LEN(rb_ary_entry(array_of_tokens, i)); j++) {
162
+ VALUE rb_str = rb_ary_entry(rb_ary_entry(array_of_tokens, i), j);
163
+ // store str via malloc so we can free it along with others
164
+ tmp = StringValueCStr(rb_str);
165
+ tmp_int = 1 + strlen(tmp);
166
+ str = malloc(sizeof(char) * tmp_int);
167
+ snprintf(str, tmp_int, "%s", tmp);
168
+
169
+ // add gram
170
+ if (add_or_update_gram_from_index(i, str))
171
+ all_grams_pp[gram_counter++] = str;
172
+
173
+ if (DEBUG) printf("j: %i, gram: %s", j, str);
174
+
175
+ // add bigram
176
+ if (last_word && strcmp(str, last_word) != 0) {
177
+ tmp_int = 2 + strlen(str) + strlen(last_word);
178
+ bigram = malloc(sizeof(char) * tmp_int);
179
+
180
+ if (bigram == NULL) rb_fatal("No memory for bigram");
181
+ snprintf(bigram, tmp_int, "%s %s", last_word, str);
182
+
183
+ if (add_or_update_gram_from_index(i, bigram))
184
+ all_grams_pp[gram_counter++] = bigram;
185
+
186
+ if (DEBUG) printf(", bigram: %s", bigram);
187
+
188
+ // add trigram
189
+ if (last_2nd_word &&
190
+ strcmp(str, last_word) != 0 &&
191
+ strcmp(str, last_2nd_word) != 0 &&
192
+ strcmp(last_word, last_2nd_word) != 0) {
193
+ tmp_int = 2 + strlen(bigram) + strlen(last_2nd_word);
194
+ trigram = malloc(sizeof(char) * tmp_int);
195
+
196
+ if (trigram == NULL) rb_fatal("No memory for trigram");
197
+ snprintf(trigram, tmp_int, "%s %s", last_2nd_word, bigram);
198
+
199
+ if (add_or_update_gram_from_index(i, trigram))
200
+ all_grams_pp[gram_counter++] = trigram;
201
+
202
+ if (DEBUG) printf(", trigram: %s", trigram);
203
+ }
204
+ }
205
+ if (DEBUG) printf("\n");
206
+ last_2nd_word = last_word;
207
+ last_word = str;
208
+ }
209
+ if (j > 0) non_empty_tokens++;
210
+ if (DEBUG) printf("end i: %i\n", i);
211
+ }
212
+ int min_cover = (int) (MIN_SUPPORT * non_empty_tokens);
213
+
214
+ if (DEBUG) printf("added %i grams\n", gram_counter);
215
+
216
+ // sort all_grams
217
+ qsort(all_grams_pp, gram_counter, sizeof(char *), compare_grams);
218
+
219
+ // only consider prominent top NUM_TOP_GRAMS grams
220
+ int num_top_grams = gram_counter < NUM_TOP_GRAMS ? gram_counter : NUM_TOP_GRAMS;
221
+
222
+ if (DEBUG) printf("gc %i, ntg %i, atl: %li\n",
223
+ gram_counter, num_top_grams, array_of_tokens_len);
224
+
225
+ int top_grams_p[num_top_grams];
226
+
227
+ if (top_grams_p == NULL) rb_fatal("No memory for top_grams_p");
228
+
229
+ int top_gram_counter = 0;
230
+ intptr_t g, all_g;
231
+ int count;
232
+ char *key;
233
+
234
+ for (i = 0; i < num_top_grams; i++) {
235
+ count = 0;
236
+ for (j = 0; j < array_of_tokens_len; j++) {
237
+ key = make_key(j, all_grams_pp[i]);
238
+
239
+ if (fetch(key, &g) && (*(gram *) g).freq > 0 && ++count > min_cover) {
240
+ top_grams_p[top_gram_counter++] = i;
241
+ if (DEBUG) printf("%i: covering gram: %s\n",
242
+ top_gram_counter - 1, all_grams_pp[i]);
243
+ break;
244
+ }
245
+ }
246
+ }
247
+
248
+ if (DEBUG) {
249
+ printf("after top grams\n");
250
+ printf("tgc %i\n", top_gram_counter);
251
+ }
252
+
253
+ float max_fitness;
254
+ char *max_fit;
255
+
256
+ for (i = 0; i < array_of_tokens_len; i++) {
257
+ if (DEBUG) printf("start i: %i\n", i);
258
+
259
+ // set fitness for top grams relative to collections
260
+ for (j = 0; j < top_gram_counter; j++) {
261
+ key = make_key(i, all_grams_pp[top_grams_p[j]]);
262
+
263
+ if (fetch(key, &g) && fetch(all_grams_pp[top_grams_p[j]], &all_g)) {
264
+ (*(gram *) g).fitness = (float) (*(gram *) g).freq / (float) (*(gram *) all_g).freq;
265
+ if (DEBUG) printf("fitness %f\n", (*(gram *) g).fitness);
266
+ }
267
+
268
+ free(key);
269
+ }
270
+
271
+ max_fitness = 0.0;
272
+ max_fit = 0;
273
+
274
+ // set fitness for top grams overall
275
+ for (j = 0; j < RARRAY_LEN(rb_ary_entry(array_of_tokens, i)); j++) {
276
+ VALUE rb_str = rb_ary_entry(rb_ary_entry(array_of_tokens, i), j);
277
+ str = StringValueCStr(rb_str);
278
+ key = make_key(i, str);
279
+
280
+ if (fetch(key, &g) && (*(gram *) g).fitness > max_fitness) {
281
+ max_fitness = (*(gram *) g).fitness;
282
+ max_fit = str;
283
+ }
284
+
285
+ free(key);
286
+ // store fitness of gram
287
+ if (max_fit && fetch(max_fit, &g))
288
+ (*(gram *) g).fitness += 1.0;
289
+ }
290
+ }
291
+
292
+ if (DEBUG) printf("after set fitness\n");
293
+
294
+ // sort top_grams and take MAX_BUCKETS
295
+ qsort(top_grams_p, top_gram_counter, sizeof(int), compare_top_grams);
296
+ if (DEBUG) printf("after qsort top grams\n");
297
+
298
+ int max_fit_idx;
299
+ VALUE term_for_record = rb_ary_new2(array_of_tokens_len);
300
+
301
+ for (i = 0; i < array_of_tokens_len; i++) {
302
+ max_fitness = 0;
303
+ max_fit_idx = 0;
304
+
305
+ for (j = 0; j < MAX_BUCKETS && j < top_gram_counter; j++) {
306
+ char *key = make_key(i, all_grams_pp[top_grams_p[j]]);
307
+
308
+ if (fetch(key, &g) && (*(gram *) g).fitness >= max_fitness) {
309
+ max_fitness = (*(gram *) g).fitness;
310
+ max_fit_idx = j;
311
+ }
312
+
313
+ free(key);
314
+ }
315
+
316
+ VALUE term = rb_str_new2(all_grams_pp[top_grams_p[max_fit_idx]]);
317
+ rb_ary_push(term_for_record, term);
318
+ }
319
+ if (DEBUG) printf("after qsort top grams\n");
320
+ if (DEBUG) printf("freeing\n");
321
+
322
+ for (i = 0; i < gram_counter; i++) {
323
+ for (j = 0; j < array_of_tokens_len; j++) {
324
+ char *key = make_key(j, all_grams_pp[i]);
325
+
326
+ if (fetch(key, &g)) free((void *) g);
327
+ free(key);
328
+ }
329
+
330
+ fetch(all_grams_pp[i], &g);
331
+ free((void *) g);
332
+ free(all_grams_pp[i]);
333
+ }
334
+
335
+ free(all_grams_pp);
336
+ if (DEBUG) printf("freed all grams\n");
337
+ hdestroy();
338
+ if (DEBUG) printf("returning\n");
339
+
340
+ return term_for_record;
341
+ }
342
+
343
+ // Return whether gram exists or not
344
+ int add_or_update_gram(char *key)
345
+ {
346
+ intptr_t g;
347
+ if (fetch(key, &g)) {
348
+ (*(gram *) g).freq += 1;
349
+ if (DEBUG) printf("key: %s, freq: %i\n", key, (*(gram *) g).freq);
350
+
351
+ return 0;
352
+ } else {
353
+ gram *g = malloc(sizeof(gram));
354
+ if (g == NULL) rb_fatal("No memory for gram");
355
+ (*g).freq = 1;
356
+ (*g).fitness = 0.0;
357
+ store(key, (intptr_t) g);
358
+
359
+ return 1;
360
+ }
361
+ }
362
+
363
/*
 * Count one occurrence of the gram str, both for record i (under the
 * key built by make_key()) and across all records (under str itself).
 *
 * i:   index of the record the gram was found in.
 * str: NUL-terminated gram text.
 *
 * Returns 1 when str was new across all records, in which case the
 * table now references the str pointer, and 0 when it already existed.
 */
int add_or_update_gram_from_index(int i, char *str)
{
  char *key = make_key(i, str);

  /*
   * The table only keeps the key pointer for a newly created entry;
   * when the per-record gram already existed the freshly built key is
   * unused and must be released here.
   */
  if (!add_or_update_gram(key)) free(key);

  return add_or_update_gram(str);
}
370
+
@@ -0,0 +1,13 @@
1
# encoding: utf-8

# mkmf provides the helpers that generate a Makefile for a Ruby extension.
require 'mkmf'

# Name under which the extension is built.
ext_name = 'bow'

# Register the directory configuration options for the extension.
dir_config(ext_name)

# Write the Makefile.
create_makefile(ext_name)
@@ -18,11 +18,9 @@ module Categorize
18
18
  end
19
19
 
20
20
  def build_categories(clusters)
21
- cluster_indices = clusters.map do |cluster|
22
- cluster.data_items.map { |v| @vectors.index(v) }
23
- end
24
-
25
- clusters_to_records = Hash[(0...@num_clusters).zip(cluster_indices)]
21
+ clusters_to_records = Hash[clusters.each_with_index.map do |cluster, i|
22
+ [i, cluster.data_items.map { |v| @vectors.index(v) }]
23
+ end]
26
24
 
27
25
  @query_terms ||= @query.split.map(&:downcase)
28
26
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: categorize
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Lubell-Doughtie
@@ -15,21 +15,25 @@ description: ! "A text categorization library that favors performance.\n
15
15
  for use in online systems."
16
16
  email: peter@helioid.com
17
17
  executables: []
18
- extensions: []
18
+ extensions:
19
+ - ext/categorize/extconf.rb
19
20
  extra_rdoc_files: []
20
21
  files:
21
- - lib/categorize.rb
22
- - lib/categorize/model.rb
23
22
  - lib/categorize/constants.rb
23
+ - lib/categorize/utils/gram_collection.rb
24
+ - lib/categorize/utils/grams.rb
25
+ - lib/categorize/utils/gram_node.rb
26
+ - lib/categorize/model.rb
24
27
  - lib/categorize/models/abstract_model.rb
28
+ - lib/categorize/models/hierarchical_cluster.rb
25
29
  - lib/categorize/models/bag_of_words.rb
26
30
  - lib/categorize/models/cluster.rb
27
- - lib/categorize/models/hierarchical_cluster.rb
28
- - lib/categorize/utils/gram_collection.rb
29
- - lib/categorize/utils/gram_node.rb
30
- - lib/categorize/utils/grams.rb
31
+ - lib/categorize.rb
32
+ - ext/categorize/categorize.c
33
+ - ext/categorize/extconf.rb
31
34
  homepage: http://www.helioid.com/
32
- licenses: []
35
+ licenses:
36
+ - BSD3
33
37
  metadata: {}
34
38
  post_install_message:
35
39
  rdoc_options: []