categorize 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- M2ZlOWRhNzM5YWYwNzQ3Yzg3OTZkMzQyOGNjOWI2ZGVlNGRjYTA5OA==
4
+ ZmM1YWY2ZTJiZDg4MDI1ZjhiNGNiODBiYzgwNTZhMTEzMWRkZWIxZA==
5
5
  data.tar.gz: !binary |-
6
- MjE4ODU1MjUyMjcxNDZkMDFkZWViZGI0MGU2MzMyYzY4NWUwMTE5Nw==
6
+ YTBiYmE0ZTZjYjFlNTVjMzQ1ZDQxZmE1ZTM4NDhhY2IxMmY4YmU5Mg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- YjlhMmZhYzZlZGRiZmZkMWJhODMxM2E3ZjRiNzc3YWY1ODVlZGZjODEzNWQw
10
- OGJlMGUxNGJiMmY3MWIzYjM3YTA1YWY5Mjg4YTc4NjYxY2FjZWMyNGM1ZDUy
11
- NmZlNzkwM2Y3ZTEyZjkyMmMxMzU5Mzk0NTE3MWI3N2Q3NGFkYzM=
9
+ MDNmZmZmYTFjMjk3MDZlYWQ2YjE5N2MyNTM0NWEyODlhODkwOGJmZGVjMjIx
10
+ OTk0MTAzYzA5Yzg0OWEzNmFlZGRjMmM0ZWJkNWU4ZDVlN2UxNzRhMmM5MDQ1
11
+ Y2VmMGNhMDgxNjY3ZGQxM2MwMTllMjM2MzZiODJmNjgzNzczN2E=
12
12
  data.tar.gz: !binary |-
13
- ZWYxODFiNDBmZWQ2NDkwMzEwZmYxNzUwOTZjMjcwMDViM2FhZTU1OWZiOGQ5
14
- NDM0MzJjZmQ2M2IzOGUzNTRmN2ExZDg4MGQxZDYwZjNiYWRmYTBjZGMyMDI1
15
- YmQ3YzA4NTA3YmEwYjhhNjUwMDIzYWJmZmFlMGE2OTQ5ZGFhM2U=
13
+ ZGI1MDQzOTZjY2Y1YjFmMGMxMTJmMWU0OTg5MGM5NTJlN2U4ZGZlYzEzYmI4
14
+ OTEzODY3ODJkMGQ5NzgxZDU2M2YxNGRiMDgyMmI3NGZkNzFmZWEzYTUwMjUy
15
+ NDViM2E5NDE3MjRkYTcxYzY5NzRjMTNmOTE2MGExNzkyYzQxNjE=
@@ -0,0 +1,370 @@
1
+ #include <inttypes.h> /* intptr_t, PRIxPTR */
2
+ #include <search.h> /* hcreate(), hsearch() */
3
+ #include <stdio.h> /* perror(), printf() */
4
+ #include <stdlib.h> /* exit() */
5
+ #include "ruby.h"
6
+
7
+ // START header
8
+ // For information and references about the module to be stored internally.
9
+ VALUE Bow = Qnil;
10
+
11
+ static VALUE method_model_bow(VALUE, VALUE);
12
+ static int add_or_update_gram_from_index(int, char *);
13
+
14
+ // Store all grams, used in compare_top_grams.
15
+ static char **all_grams_pp;
16
+ // END header
17
+
18
+ // Initialization method for this module.
19
+ void Init_bow()
20
+ {
21
+ Bow = rb_define_module("Bow");
22
+ rb_define_method(Bow, "model_bow", method_model_bow, 1);
23
+ }
24
+
25
/* When true, the functions in this file print trace output via printf(). */
const bool DEBUG = false;

/* Upper bound on how many sorted top grams are examined per record. */
const int MAX_BUCKETS = 10;

/* Fraction of non-empty records used to derive the minimum cover count. */
const float MIN_SUPPORT = 0.1;

/* Upper bound on how many of the most frequent grams are considered. */
const int NUM_TOP_GRAMS = 250;
29
+
30
/*
 * Report an unrecoverable error and terminate.
 *
 * message: text passed to perror() (which appends the current errno
 *          description) and then reported through rb_fatal().
 *
 * The message is passed to rb_fatal() as an argument of a "%s" format,
 * never as the format string itself, so '%' characters in it are harmless.
 */
void fail(const char *message)
{
    perror(message);
    rb_fatal("%s", message);
    /* Kept from the original as a fallback; not expected to be reached. */
    exit(1);
}
36
+
37
/*
 * Must hcreate() the hash table before calling fetch() or store().
 *
 * Because p->data is a pointer, fetch() and store() cast between
 * void * and intptr_t.
 */

/*
 * Fetch a value from the hash table.
 *
 * key:   NUL-terminated key to look up.
 * value: on a hit, receives the entry's data cast to intptr_t;
 *        left untouched on a miss.
 *
 * Returns 1 when the key was found, 0 otherwise.
 */
int fetch(const char *key, intptr_t *value)
{
    /* ENTRY.key is a non-const char *, hence the cast. */
    ENTRY query = { .key = (char *)key, .data = NULL };
    ENTRY *found = hsearch(query, FIND);

    if (found == NULL) {
        return 0;
    }

    *value = (intptr_t)found->data;
    return 1;
}
56
+
57
/*
 * Store a key-value pair into the hash table.
 *
 * key:   NUL-terminated key. The pointer itself is handed to hsearch(),
 *        so the caller must keep the buffer alive while the entry is in use.
 * value: stored in the entry's data field (cast to void *).
 *
 * Calls fail("hsearch") when hsearch() cannot enter the item.
 */
void store(const char *key, intptr_t value)
{
    /*
     * hsearch() may insert a new entry or find an existing entry
     * with the same key. hsearch() ignores the data field of the item
     * if it finds an existing entry, so the data is assigned on the
     * returned entry instead.
     */
    ENTRY item = { .key = (char *)key, .data = NULL };
    ENTRY *slot = hsearch(item, ENTER);

    if (slot == NULL) fail("hsearch");

    slot->data = (void *)value;
}
72
+
73
/*
 * Build the per-record hash key "<i>_<str>".
 *
 * i:   record (group) index; any int value is supported.
 * str: NUL-terminated gram text.
 *
 * Returns a freshly malloc()ed string that the caller owns and must free().
 * Calls rb_fatal() if the key cannot be formatted or allocated.
 *
 * The buffer size is measured with snprintf(NULL, 0, ...) rather than
 * assumed from the number of digits in i, so indices of 100 or more (and
 * negative indices) no longer produce a truncated key.
 */
char *make_key(int i, char *str)
{
    int needed = snprintf(NULL, 0, "%i_%s", i, str);

    if (needed < 0) rb_fatal("Cannot format key %i", i);

    size_t size = (size_t) needed + 1; /* +1 for the terminating NUL */
    char *buf = malloc(size);

    if (buf == NULL) rb_fatal("No memory for key %i", i);

    snprintf(buf, size, "%i_%s", i, str);

    return buf;
}
85
+
86
/* Statistics kept for one gram; stored as the data of a hash-table entry. */
typedef struct {
    int   freq;    /* occurrence count, starts at 1 when the gram is created */
    float fitness; /* score assigned in method_model_bow(), starts at 0.0 */
} gram;
90
+
91
+ int compare_grams(const void *gram1, const void *gram2)
92
+ {
93
+ intptr_t g1, g2;
94
+
95
+ if (fetch(*(const char **) gram1, &g1) && fetch(*(const char **) gram2, &g2)) {
96
+ return (*(gram *) g2).freq - (*(gram *) g1).freq;
97
+ } else
98
+ fail("compare_grams");
99
+
100
+ return 0;
101
+ }
102
+
103
+ int compare_top_grams(const void *idx1, const void *idx2)
104
+ {
105
+ char *gram1 = all_grams_pp[*(int *) idx1];
106
+ char *gram2 = all_grams_pp[*(int *) idx2];
107
+ intptr_t g1, g2;
108
+
109
+ if (fetch(gram1, &g1) && fetch(gram2, &g2))
110
+ return (*(gram *) g2).fitness - (*(gram *) g1).fitness;
111
+ else
112
+ fail("compare_grams");
113
+
114
+ return 0;
115
+ }
116
+
117
+ /*
118
+ * model_bow(array_of_tokens);
119
+ * ==== Return
120
+ * Top terms
121
+ * ==== Parameters
122
+ * array_of_tokens: Tokens to turn into grams and extract phrases from.
123
+ */
124
+ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
125
+ {
126
+ int i, j;
127
+ long array_of_tokens_len = RARRAY_LEN(array_of_tokens);
128
+ int num_grams = 0;
129
+
130
+ for (i = 0; i < array_of_tokens_len; i++) {
131
+ // n + n - 1 + n - 2 = 3n - 3 = 3(n - 1)
132
+ // TODO Correct parentheses enclose as (n - 1).
133
+ num_grams += 3 * RARRAY_LEN(rb_ary_entry(array_of_tokens, i)) - 1;
134
+ }
135
+
136
+ // Create an empty table that can hold 50 entries.
137
+ if (DEBUG) printf("num grams: %i\n", num_grams);
138
+ if (hcreate(2 * num_grams) == 0)
139
+ fail("hcreate");
140
+
141
+ // list of all grams
142
+ all_grams_pp = malloc(sizeof(char *) * num_grams);
143
+ if (all_grams_pp == NULL) rb_fatal("No memory for all_grams_pp");
144
+
145
+ int gram_counter = 0;
146
+ char *tmp;
147
+ char *str;
148
+ char *bigram;
149
+ char *trigram;
150
+ char *last_word;
151
+ char *last_2nd_word;
152
+ int non_empty_tokens = 0;
153
+ int tmp_int;
154
+
155
+ for (i = 0; i < array_of_tokens_len; i++) {
156
+ // n grams
157
+ last_word = 0;
158
+ last_2nd_word = 0;
159
+ if (DEBUG) printf("start i: %i\n", i);
160
+
161
+ for (j = 0; j < RARRAY_LEN(rb_ary_entry(array_of_tokens, i)); j++) {
162
+ VALUE rb_str = rb_ary_entry(rb_ary_entry(array_of_tokens, i), j);
163
+ // store str via malloc so we can free it along with others
164
+ tmp = StringValueCStr(rb_str);
165
+ tmp_int = 1 + strlen(tmp);
166
+ str = malloc(sizeof(char) * tmp_int);
167
+ snprintf(str, tmp_int, "%s", tmp);
168
+
169
+ // add gram
170
+ if (add_or_update_gram_from_index(i, str))
171
+ all_grams_pp[gram_counter++] = str;
172
+
173
+ if (DEBUG) printf("j: %i, gram: %s", j, str);
174
+
175
+ // add bigram
176
+ if (last_word && strcmp(str, last_word) != 0) {
177
+ tmp_int = 2 + strlen(str) + strlen(last_word);
178
+ bigram = malloc(sizeof(char) * tmp_int);
179
+
180
+ if (bigram == NULL) rb_fatal("No memory for bigram");
181
+ snprintf(bigram, tmp_int, "%s %s", last_word, str);
182
+
183
+ if (add_or_update_gram_from_index(i, bigram))
184
+ all_grams_pp[gram_counter++] = bigram;
185
+
186
+ if (DEBUG) printf(", bigram: %s", bigram);
187
+
188
+ // add trigram
189
+ if (last_2nd_word &&
190
+ strcmp(str, last_word) != 0 &&
191
+ strcmp(str, last_2nd_word) != 0 &&
192
+ strcmp(last_word, last_2nd_word) != 0) {
193
+ tmp_int = 2 + strlen(bigram) + strlen(last_2nd_word);
194
+ trigram = malloc(sizeof(char) * tmp_int);
195
+
196
+ if (trigram == NULL) rb_fatal("No memory for trigram");
197
+ snprintf(trigram, tmp_int, "%s %s", last_2nd_word, bigram);
198
+
199
+ if (add_or_update_gram_from_index(i, trigram))
200
+ all_grams_pp[gram_counter++] = trigram;
201
+
202
+ if (DEBUG) printf(", trigram: %s", trigram);
203
+ }
204
+ }
205
+ if (DEBUG) printf("\n");
206
+ last_2nd_word = last_word;
207
+ last_word = str;
208
+ }
209
+ if (j > 0) non_empty_tokens++;
210
+ if (DEBUG) printf("end i: %i\n", i);
211
+ }
212
+ int min_cover = (int) (MIN_SUPPORT * non_empty_tokens);
213
+
214
+ if (DEBUG) printf("added %i grams\n", gram_counter);
215
+
216
+ // sort all_grams
217
+ qsort(all_grams_pp, gram_counter, sizeof(char *), compare_grams);
218
+
219
+ // only consider prominent top NUM_TOP_GRAMS grams
220
+ int num_top_grams = gram_counter < NUM_TOP_GRAMS ? gram_counter : NUM_TOP_GRAMS;
221
+
222
+ if (DEBUG) printf("gc %i, ntg %i, atl: %li\n",
223
+ gram_counter, num_top_grams, array_of_tokens_len);
224
+
225
+ int top_grams_p[num_top_grams];
226
+
227
+ if (top_grams_p == NULL) rb_fatal("No memory for top_grams_p");
228
+
229
+ int top_gram_counter = 0;
230
+ intptr_t g, all_g;
231
+ int count;
232
+ char *key;
233
+
234
+ for (i = 0; i < num_top_grams; i++) {
235
+ count = 0;
236
+ for (j = 0; j < array_of_tokens_len; j++) {
237
+ key = make_key(j, all_grams_pp[i]);
238
+
239
+ if (fetch(key, &g) && (*(gram *) g).freq > 0 && ++count > min_cover) {
240
+ top_grams_p[top_gram_counter++] = i;
241
+ if (DEBUG) printf("%i: covering gram: %s\n",
242
+ top_gram_counter - 1, all_grams_pp[i]);
243
+ break;
244
+ }
245
+ }
246
+ }
247
+
248
+ if (DEBUG) {
249
+ printf("after top grams\n");
250
+ printf("tgc %i\n", top_gram_counter);
251
+ }
252
+
253
+ float max_fitness;
254
+ char *max_fit;
255
+
256
+ for (i = 0; i < array_of_tokens_len; i++) {
257
+ if (DEBUG) printf("start i: %i\n", i);
258
+
259
+ // set fitness for top grams relative to collections
260
+ for (j = 0; j < top_gram_counter; j++) {
261
+ key = make_key(i, all_grams_pp[top_grams_p[j]]);
262
+
263
+ if (fetch(key, &g) && fetch(all_grams_pp[top_grams_p[j]], &all_g)) {
264
+ (*(gram *) g).fitness = (float) (*(gram *) g).freq / (float) (*(gram *) all_g).freq;
265
+ if (DEBUG) printf("fitness %f\n", (*(gram *) g).fitness);
266
+ }
267
+
268
+ free(key);
269
+ }
270
+
271
+ max_fitness = 0.0;
272
+ max_fit = 0;
273
+
274
+ // set fitness for top grams overall
275
+ for (j = 0; j < RARRAY_LEN(rb_ary_entry(array_of_tokens, i)); j++) {
276
+ VALUE rb_str = rb_ary_entry(rb_ary_entry(array_of_tokens, i), j);
277
+ str = StringValueCStr(rb_str);
278
+ key = make_key(i, str);
279
+
280
+ if (fetch(key, &g) && (*(gram *) g).fitness > max_fitness) {
281
+ max_fitness = (*(gram *) g).fitness;
282
+ max_fit = str;
283
+ }
284
+
285
+ free(key);
286
+ // store fitness of gram
287
+ if (max_fit && fetch(max_fit, &g))
288
+ (*(gram *) g).fitness += 1.0;
289
+ }
290
+ }
291
+
292
+ if (DEBUG) printf("after set fitness\n");
293
+
294
+ // sort top_grams and take MAX_BUCKETS
295
+ qsort(top_grams_p, top_gram_counter, sizeof(int), compare_top_grams);
296
+ if (DEBUG) printf("after qsort top grams\n");
297
+
298
+ int max_fit_idx;
299
+ VALUE term_for_record = rb_ary_new2(array_of_tokens_len);
300
+
301
+ for (i = 0; i < array_of_tokens_len; i++) {
302
+ max_fitness = 0;
303
+ max_fit_idx = 0;
304
+
305
+ for (j = 0; j < MAX_BUCKETS && j < top_gram_counter; j++) {
306
+ char *key = make_key(i, all_grams_pp[top_grams_p[j]]);
307
+
308
+ if (fetch(key, &g) && (*(gram *) g).fitness >= max_fitness) {
309
+ max_fitness = (*(gram *) g).fitness;
310
+ max_fit_idx = j;
311
+ }
312
+
313
+ free(key);
314
+ }
315
+
316
+ VALUE term = rb_str_new2(all_grams_pp[top_grams_p[max_fit_idx]]);
317
+ rb_ary_push(term_for_record, term);
318
+ }
319
+ if (DEBUG) printf("after qsort top grams\n");
320
+ if (DEBUG) printf("freeing\n");
321
+
322
+ for (i = 0; i < gram_counter; i++) {
323
+ for (j = 0; j < array_of_tokens_len; j++) {
324
+ char *key = make_key(j, all_grams_pp[i]);
325
+
326
+ if (fetch(key, &g)) free((void *) g);
327
+ free(key);
328
+ }
329
+
330
+ fetch(all_grams_pp[i], &g);
331
+ free((void *) g);
332
+ free(all_grams_pp[i]);
333
+ }
334
+
335
+ free(all_grams_pp);
336
+ if (DEBUG) printf("freed all grams\n");
337
+ hdestroy();
338
+ if (DEBUG) printf("returning\n");
339
+
340
+ return term_for_record;
341
+ }
342
+
343
+ // Return whether gram exists or not
344
+ int add_or_update_gram(char *key)
345
+ {
346
+ intptr_t g;
347
+ if (fetch(key, &g)) {
348
+ (*(gram *) g).freq += 1;
349
+ if (DEBUG) printf("key: %s, freq: %i\n", key, (*(gram *) g).freq);
350
+
351
+ return 0;
352
+ } else {
353
+ gram *g = malloc(sizeof(gram));
354
+ if (g == NULL) rb_fatal("No memory for gram");
355
+ (*g).freq = 1;
356
+ (*g).fitness = 0.0;
357
+ store(key, (intptr_t) g);
358
+
359
+ return 1;
360
+ }
361
+ }
362
+
363
/*
 * Count one occurrence of `str` both for record `i` (under the key built by
 * make_key(i, str)) and overall (under `str` itself).
 *
 * Returns the result of the overall update: 1 when `str` was entered into
 * the table as a new gram, 0 when it was already present.
 */
int add_or_update_gram_from_index(int i, char *str)
{
    char *key = make_key(i, str);

    /*
     * A result of 0 means the per-record entry already existed, so the
     * freshly built key was only used for the lookup and can be released.
     * A result of 1 means the key was handed to store() and must stay alive.
     */
    if (!add_or_update_gram(key)) {
        free(key);
    }

    return add_or_update_gram(str);
}
370
+
@@ -0,0 +1,13 @@
1
# encoding: utf-8

# mkmf provides the helpers used below to generate a Makefile
# for a Ruby C extension.
require 'mkmf'

# Name of the extension to build.
target = 'bow'

# Register the directory configuration options for the extension.
dir_config(target)

# Write the Makefile.
create_makefile(target)
@@ -18,11 +18,9 @@ module Categorize
18
18
  end
19
19
 
20
20
  def build_categories(clusters)
21
- cluster_indices = clusters.map do |cluster|
22
- cluster.data_items.map { |v| @vectors.index(v) }
23
- end
24
-
25
- clusters_to_records = Hash[(0...@num_clusters).zip(cluster_indices)]
21
+ clusters_to_records = Hash[clusters.each_with_index.map do |cluster, i|
22
+ [i, cluster.data_items.map { |v| @vectors.index(v) }]
23
+ end]
26
24
 
27
25
  @query_terms ||= @query.split.map(&:downcase)
28
26
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: categorize
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Lubell-Doughtie
@@ -15,21 +15,25 @@ description: ! "A text categorization library that favors performance.\n
15
15
  for use in online systems."
16
16
  email: peter@helioid.com
17
17
  executables: []
18
- extensions: []
18
+ extensions:
19
+ - ext/categorize/extconf.rb
19
20
  extra_rdoc_files: []
20
21
  files:
21
- - lib/categorize.rb
22
- - lib/categorize/model.rb
23
22
  - lib/categorize/constants.rb
23
+ - lib/categorize/utils/gram_collection.rb
24
+ - lib/categorize/utils/grams.rb
25
+ - lib/categorize/utils/gram_node.rb
26
+ - lib/categorize/model.rb
24
27
  - lib/categorize/models/abstract_model.rb
28
+ - lib/categorize/models/hierarchical_cluster.rb
25
29
  - lib/categorize/models/bag_of_words.rb
26
30
  - lib/categorize/models/cluster.rb
27
- - lib/categorize/models/hierarchical_cluster.rb
28
- - lib/categorize/utils/gram_collection.rb
29
- - lib/categorize/utils/gram_node.rb
30
- - lib/categorize/utils/grams.rb
31
+ - lib/categorize.rb
32
+ - ext/categorize/categorize.c
33
+ - ext/categorize/extconf.rb
31
34
  homepage: http://www.helioid.com/
32
- licenses: []
35
+ licenses:
36
+ - BSD3
33
37
  metadata: {}
34
38
  post_install_message:
35
39
  rdoc_options: []