RubyGems - categorize - Versions diffs - 0.0.9 → 0.0.10 - Mend

categorize 0.0.9 → 0.0.10

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml +8 -8
data/ext/{categorize/categorize.c → ccategorize/ccategorize.c} +54 -30
data/ext/ccategorize/extconf.rb +7 -0
data/lib/categorize.rb +1 -0
data/lib/categorize/models/bag_of_words.rb +14 -12
data/lib/categorize/models/cluster.rb +2 -0
metadata +4 -4
data/ext/categorize/extconf.rb +0 -13

checksums.yaml CHANGED Viewed

@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    ZmM1YWY2ZTJiZDg4MDI1ZjhiNGNiODBiYzgwNTZhMTEzMWRkZWIxZA==
+    YTFmZGJlMWI3YjUxNGQ2MGE3OGJmMjIzN2ZiYTFmNjVhMzE0OGRkMA==
   data.tar.gz: !binary |-
-    YTBiYmE0ZTZjYjFlNTVjMzQ1ZDQxZmE1ZTM4NDhhY2IxMmY4YmU5Mg==
+    NDAxNDY2MDNlYjBmYjc0NTIzY2JjYmU4NjE3YTBiMTQ1ZjA5ZjZjMg==
 !binary "U0hBNTEy":
   metadata.gz: !binary |-
-    MDNmZmZmYTFjMjk3MDZlYWQ2YjE5N2MyNTM0NWEyODlhODkwOGJmZGVjMjIx
-    OTk0MTAzYzA5Yzg0OWEzNmFlZGRjMmM0ZWJkNWU4ZDVlN2UxNzRhMmM5MDQ1
-    Y2VmMGNhMDgxNjY3ZGQxM2MwMTllMjM2MzZiODJmNjgzNzczN2E=
+    MDk4YjVhYjBhMWMzNGMwZmZlYmFiNTZlZWNiNjNkNjUxM2QzNjkxNzU2NWRk
+    MGQzNzJkZTY5ZmJhM2Q1ZmMyNDM2ZDEzZjk1Mzg4ODE2NWFlMmVhODBkNGZh
+    YTg0NGY4YTM0YzM0M2E0M2YyNjE5NjY4NzUyOTg4YzI1ODM5NGI=
   data.tar.gz: !binary |-
-    ZGI1MDQzOTZjY2Y1YjFmMGMxMTJmMWU0OTg5MGM5NTJlN2U4ZGZlYzEzYmI4
-    OTEzODY3ODJkMGQ5NzgxZDU2M2YxNGRiMDgyMmI3NGZkNzFmZWEzYTUwMjUy
-    NDViM2E5NDE3MjRkYTcxYzY5NzRjMTNmOTE2MGExNzkyYzQxNjE=
+    MmY1MmIyZjAyYWVmMzI1NzExMjk2ODFhMTgxOTg2ZDMzYTJhYzBlYWExZDdj
+    NDM4ODU0NTllMjdjOWI4NjMwMzBmODhhN2UxODE2MmZmMjFhNzBkMTczMzlj
+    ZjBjNWZiMDc5MTM3NDI5ZGI2ZmNiYzZmMTVjYzlhMzIzNjBiZjE=

data/ext/{categorize/categorize.c → ccategorize/ccategorize.c} RENAMED Viewed

@@ -4,11 +4,15 @@
 #include <stdlib.h> /* exit() */
 #include "ruby.h"
+typedef enum { false, true } bool;
 // START header
 // For information and references about the module to be stored internally.
-VALUE Bow = Qnil;
+VALUE Categorize = Qnil;
+VALUE CBagOfWords = Qnil;
+VALUE Models = Qnil;
-static VALUE method_model_bow(VALUE, VALUE);
+static VALUE method_make_model(VALUE, VALUE);
 static int add_or_update_gram_from_index(int, char *);
 // Store all grams, used in  compare_top_grams.
@@ -16,10 +20,12 @@ static char **all_grams_pp;
 // END header
 // Initialization method for this module.
-void Init_bow()
+void Init_ccategorize()
 {
-  Bow = rb_define_module("Bow");
-  rb_define_method(Bow, "model_bow", method_model_bow, 1);
+  Categorize = rb_define_module("CCategorize");
+  Models = rb_define_module_under(Categorize, "Models");
+  CBagOfWords = rb_define_class_under(Models, "CBagOfWords", rb_cObject);
+  rb_define_method(CBagOfWords, "make_model", method_make_model, 1);
 }
 const bool DEBUG = false;
@@ -92,7 +98,8 @@ int compare_grams(const void *gram1, const void *gram2)
 {
   intptr_t g1, g2;
-  if (fetch(*(const char **) gram1, &g1) && fetch(*(const char **) gram2, &g2)) {
+  if (fetch(*(const char **) gram1, &g1) &&
+      fetch(*(const char **) gram2, &g2)) {
     return (*(gram *) g2).freq - (*(gram *) g1).freq;
   } else
     fail("compare_grams");
@@ -115,17 +122,37 @@ int compare_top_grams(const void *idx1, const void *idx2)
 }
 /*
- * model_bow(array_of_tokens);
+ * make_model(array_of_tokens);
  * ==== Return
  * Top terms
  * ==== Parameters
  * array_of_tokens: Tokens to turn into grams and extract phrases from.
  */
-static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
+static VALUE method_make_model(VALUE self, VALUE array_of_tokens)
 {
   int i, j;
   long array_of_tokens_len = RARRAY_LEN(array_of_tokens);
   int num_grams = 0;
+  int gram_counter;
+  char *tmp;
+  char *str;
+  char *bigram;
+  char *trigram;
+  char *last_word;
+  char *last_2nd_word;
+  int non_empty_tokens;
+  int tmp_int;
+  int min_cover;
+  int num_top_grams;
+  int top_gram_counter;
+  float max_fitness;
+  int max_fit_idx;
+  VALUE term;
+  VALUE term_for_record;
+  intptr_t g, all_g;
+  int count;
+  char *key;
+  char *max_fit;
   for (i = 0; i < array_of_tokens_len; i++) {
     // n + n - 1 + n - 2 = 3n - 3 = 3(n - 1)
@@ -142,15 +169,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
   all_grams_pp = malloc(sizeof(char *) * num_grams);
   if (all_grams_pp == NULL) rb_fatal("No memory for all_grams_pp");
-  int gram_counter = 0;
-  char *tmp;
-  char *str;
-  char *bigram;
-  char *trigram;
-  char *last_word;
-  char *last_2nd_word;
-  int non_empty_tokens = 0;
-  int tmp_int;
+  gram_counter = 0;
+  non_empty_tokens = 0;
   for (i = 0; i < array_of_tokens_len; i++) {
     // n grams
@@ -160,6 +180,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
     for (j = 0; j < RARRAY_LEN(rb_ary_entry(array_of_tokens, i)); j++) {
       VALUE rb_str = rb_ary_entry(rb_ary_entry(array_of_tokens, i), j);
       // store str via malloc so we can free it along with others
       tmp = StringValueCStr(rb_str);
       tmp_int = 1 + strlen(tmp);
@@ -209,7 +230,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
     if (j > 0) non_empty_tokens++;
     if (DEBUG) printf("end i: %i\n", i);
   }
-  int min_cover = (int) (MIN_SUPPORT * non_empty_tokens);
+  min_cover = (int) (MIN_SUPPORT * non_empty_tokens);
   if (DEBUG) printf("added %i grams\n", gram_counter);
@@ -217,7 +239,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
   qsort(all_grams_pp, gram_counter, sizeof(char *), compare_grams);
   // only consider prominent top NUM_TOP_GRAMS grams
-  int num_top_grams = gram_counter < NUM_TOP_GRAMS ? gram_counter : NUM_TOP_GRAMS;
+  num_top_grams = gram_counter < NUM_TOP_GRAMS ? gram_counter :
+                                                     NUM_TOP_GRAMS;
   if (DEBUG) printf("gc %i, ntg %i, atl: %li\n",
                     gram_counter, num_top_grams, array_of_tokens_len);
@@ -226,10 +249,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
   if (top_grams_p == NULL) rb_fatal("No memory for top_grams_p");
-  int top_gram_counter = 0;
-  intptr_t g, all_g;
-  int count;
-  char *key;
+  top_gram_counter = 0;
   for (i = 0; i < num_top_grams; i++) {
     count = 0;
@@ -238,6 +258,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
       if (fetch(key, &g) && (*(gram *) g).freq > 0 && ++count > min_cover) {
         top_grams_p[top_gram_counter++] = i;
         if (DEBUG) printf("%i: covering gram: %s\n",
                           top_gram_counter - 1, all_grams_pp[i]);
         break;
@@ -250,9 +271,6 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
     printf("tgc %i\n", top_gram_counter);
   }
-  float max_fitness;
-  char *max_fit;
   for (i = 0; i < array_of_tokens_len; i++) {
     if (DEBUG) printf("start i: %i\n", i);
@@ -261,7 +279,8 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
       key = make_key(i, all_grams_pp[top_grams_p[j]]);
       if (fetch(key, &g) && fetch(all_grams_pp[top_grams_p[j]], &all_g)) {
-        (*(gram *) g).fitness = (float) (*(gram *) g).freq / (float) (*(gram *) all_g).freq;
+        (*(gram *) g).fitness = (float) (*(gram *) g).freq /
+                                (float) (*(gram *) all_g).freq;
         if (DEBUG) printf("fitness %f\n", (*(gram *) g).fitness);
       }
@@ -283,6 +302,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
       }
       free(key);
       // store fitness of gram
       if (max_fit && fetch(max_fit, &g))
         (*(gram *) g).fitness += 1.0;
@@ -293,10 +313,10 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
   // sort top_grams and take MAX_BUCKETS
   qsort(top_grams_p, top_gram_counter, sizeof(int), compare_top_grams);
   if (DEBUG) printf("after qsort top grams\n");
-  int max_fit_idx;
-  VALUE term_for_record = rb_ary_new2(array_of_tokens_len);
+  term_for_record = rb_ary_new2(array_of_tokens_len);
   for (i = 0; i < array_of_tokens_len; i++) {
     max_fitness = 0;
@@ -313,9 +333,10 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
       free(key);
     }
-    VALUE term = rb_str_new2(all_grams_pp[top_grams_p[max_fit_idx]]);
+    term = rb_str_new2(all_grams_pp[top_grams_p[max_fit_idx]]);
     rb_ary_push(term_for_record, term);
   }
   if (DEBUG) printf("after qsort top grams\n");
   if (DEBUG) printf("freeing\n");
@@ -334,6 +355,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
   free(all_grams_pp);
   if (DEBUG) printf("freed all grams\n");
   hdestroy();
   if (DEBUG) printf("returning\n");
@@ -344,6 +366,7 @@ static VALUE method_model_bow(VALUE self, VALUE array_of_tokens)
 int add_or_update_gram(char *key)
 {
   intptr_t g;
   if (fetch(key, &g)) {
     (*(gram *) g).freq += 1;
     if (DEBUG) printf("key: %s, freq: %i\n", key, (*(gram *) g).freq);
@@ -352,6 +375,7 @@ int add_or_update_gram(char *key)
   } else {
     gram *g = malloc(sizeof(gram));
     if (g == NULL) rb_fatal("No memory for gram");
     (*g).freq = 1;
     (*g).fitness = 0.0;
     store(key, (intptr_t) g);

data/ext/ccategorize/extconf.rb ADDED Viewed

@@ -0,0 +1,7 @@
+# encoding: utf-8
+# Loads mkmf which is used to make makefiles for Ruby extensions
+require 'mkmf'
+# Do the work
+create_makefile('ccategorize/ccategorize')

data/lib/categorize.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 # encoding: utf-8
+require 'ccategorize/ccategorize'
 require 'categorize/models/abstract_model'
 require 'categorize/models/bag_of_words'
 require 'categorize/models/cluster'

data/lib/categorize/models/bag_of_words.rb CHANGED Viewed

@@ -8,16 +8,19 @@ module Categorize
       include Utils::Grams
       # DEBUG = false
-      # TODO: some gradient descent to choose this number
-      # 0 <= MIN_SUPP <= 1, we like 0.01 <= MIN_SUPP <= 0.1
-      MIN_SUPP_L = 0.07
-      MIN_SUPP_H = 0.1
-      NUM_TOP_GRAMS = 250
-      MAX_BUCKETS = 8
+      attr_accessor :max_buckets, :min_support, :num_top_grams
+      # 0 <= min_support <= 1, we like 0.01 <= min_support <= 0.1
+      def initialize
+        @max_buckets = 8
+        # TODO: some gradient descent to choose this number
+        @min_support = 0.07
+        @num_top_grams = 250
+      end
       # function worst case
       # O(2 x (|frequent_grams| x |gram_collections|) +
-      #   |all_grams| + MAX_BUCKETS x |gram_collections|)
+      #   |all_grams| + @max_buckets x |gram_collections|)
       def model(query, records_to_tokens)
         @gram_cover_cache = {}
         @gram_collections, @all_grams = create_grams(query, records_to_tokens)
@@ -25,9 +28,9 @@ module Categorize
         top_grams = determine_frequency_term_sets(@all_grams, query)
         top_grams = top_grams.keys.sort do |gram_c1, gram_c2|
           top_grams[gram_c1] <=> top_grams[gram_c2]
-        end.first(MAX_BUCKETS)
+        end.first(@max_buckets)
-        # below block, worst case O(MAX_BUCKETS x |gram_collections|)
+        # below block, worst case O(@max_buckets x |gram_collections|)
         @gram_collections.reduce({}) do |buckets, gram_collection|
           max_fitness = 0
           max_fit = nil
@@ -55,14 +58,13 @@ module Categorize
           result.grams.nil? || result.grams.empty?
         end.length
-        min_cover_l = MIN_SUPP_L * effective_length
-        # min_cover_h = MIN_SUPP_H * effective_length
+        min_cover_l = @min_support * effective_length
         # for speed only look at top N grams
         # below block, worst case O(|all_grams|)
         frequent_grams = all_grams.sort do |gram1, gram2|
           gram2.frequency <=> gram1.frequency
-        end.first(NUM_TOP_GRAMS)
+        end.first(@num_top_grams)
         # below block, worst case O(|frequent_grams| x |gram_collections|)
         frequent_grams = frequent_grams.delete_if do |gram|

data/lib/categorize/models/cluster.rb CHANGED Viewed

@@ -4,6 +4,8 @@ module Categorize
   module Models
     class Cluster < AbstractModel
+      attr_accessor :num_clusters
       def initialize
         @num_clusters = 10
         @clusterer = Ai4r::Clusterers::WardLinkage.new

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: categorize
 version: !ruby/object:Gem::Version
-  version: 0.0.9
+  version: 0.0.10
 platform: ruby
 authors:
 - Peter Lubell-Doughtie
@@ -16,7 +16,7 @@ description: ! "A text categorization library that favors performance.\n
 email: peter@helioid.com
 executables: []
 extensions:
-- ext/categorize/extconf.rb
+- ext/ccategorize/extconf.rb
 extra_rdoc_files: []
 files:
 - lib/categorize/constants.rb
@@ -29,8 +29,8 @@ files:
 - lib/categorize/models/bag_of_words.rb
 - lib/categorize/models/cluster.rb
 - lib/categorize.rb
-- ext/categorize/categorize.c
-- ext/categorize/extconf.rb
+- ext/ccategorize/ccategorize.c
+- ext/ccategorize/extconf.rb
 homepage: http://www.helioid.com/
 licenses:
 - BSD3

data/ext/categorize/extconf.rb DELETED Viewed

@@ -1,13 +0,0 @@
-# encoding: utf-8
-# Loads mkmf which is used to make makefiles for Ruby extensions
-require 'mkmf'
-# Give it a name
-extension_name = 'bow'
-# The destination
-dir_config(extension_name)
-# Do the work
-create_makefile(extension_name)