RubyGems - word2vec-rb - Versions diffs - 0.1.0 → 0.2.0 - Mend

word2vec-rb 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

data/ext/word2vec/accuracy.c ADDED Viewed

@@ -0,0 +1,91 @@
+#include "common.h"
+// Return the index of the first vocabulary entry equal to `word`, or
+// model->word_count when the word is not in the vocabulary.
+static long long accuracy_find_word(word2vec_model* model, const char* word) {
+  long long i;
+  for (i = 0; i < model->word_count; i++) {
+    if (!strcmp(&model->vocabulary[i * max_w], word)) break;
+  }
+  return i;
+}
+// Read one whitespace-delimited token with the bounded format `fmt` and
+// lower-case it in place. Returns 1 on success; on failure returns 0 and
+// leaves `token` untouched.
+static int accuracy_read_token(FILE* f, const char* fmt, char* token) {
+  if (fscanf(f, fmt, token) != 1) return 0;
+  // tolower() requires a value representable as unsigned char (or EOF).
+  for (char* p = token; *p; p++) *p = (char)tolower((unsigned char)*p);
+  return 1;
+}
+// Output the analog accuracy of the model.
+//
+// Reads a question file made of sections (a ":" token followed by a section
+// name) and questions (four words "a b c d"). For every question whose four
+// words are in the vocabulary, the word with the highest positive dot product
+// against vec(b) - vec(a) + vec(c) (a, b and c excluded) is compared with d.
+// Per-section and running totals are printed to stdout; the first five
+// sections are reported as "semantic", the rest as "syntactic".
+// Raises ArgumentError when the file cannot be opened or when the model's
+// vector dimension does not fit the max_size work buffer.
+void word2vec_model_accuracy(word2vec_model* model, char* file_name) {
+  // Current section: questions scored (TCN) and answered correctly (CCN).
+  int TCN = 0, CCN = 0;
+  // Totals: all / semantic / syntactic questions scored and answered correctly.
+  int TACN = 0, CACN = 0, SECN = 0, SEAC = 0, SYCN = 0, SYAC = 0;
+  // Sections seen, questions read (TQ) and questions actually scored (TQS).
+  int QID = 0, TQ = 0, TQS = 0;
+  long long words = model->word_count;
+  long long size = model->vector_dim;
+  FILE *f = fopen(file_name, "r");
+  if (f == NULL) {
+    rb_raise(rb_eArgError, "Input file not found: %s", file_name);
+    return;
+  }
+  // vec below holds max_size floats; refuse wider models instead of overflowing it.
+  if (size < 0 || (unsigned long long)size > max_size) {
+    fclose(f);
+    rb_raise(rb_eArgError, "Model vector dimension must be %lu max size", max_size);
+    return;
+  }
+  // Bounded "%<max_size - 1>s" so an over-long token cannot overflow a buffer.
+  char fmt[32];
+  snprintf(fmt, sizeof fmt, "%%%lus", max_size - 1);
+  char st1[max_size], st2[max_size], st3[max_size], st4[max_size];
+  st1[0] = st2[0] = st3[0] = st4[0] = 0;
+  float vec[max_size];
+  while (1) {
+    int got = accuracy_read_token(f, fmt, st1);
+    // A failed read is treated like end of input so a read error cannot spin forever.
+    if (!got || !strcmp(st1, ":") || feof(f)) {
+      if (TCN == 0) TCN = 1;
+      if (QID != 0) {
+        printf("ACCURACY TOP1: %.2f %%  (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN);
+        printf("Total accuracy: %.2f %%   Semantic accuracy: %.2f %%   Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100);
+      }
+      QID++;
+      // The section name is printed as written, so it is not lower-cased.
+      if (fscanf(f, fmt, st1) != 1 || feof(f)) break;
+      printf("%s:\n", st1);
+      TCN = 0;
+      CCN = 0;
+      continue;
+    }
+    // A question cut short by end of file keeps the previous tokens, as before;
+    // the next iteration then sees feof() and finishes.
+    accuracy_read_token(f, fmt, st2);
+    accuracy_read_token(f, fmt, st3);
+    accuracy_read_token(f, fmt, st4);
+    long long b1 = accuracy_find_word(model, st1);
+    long long b2 = accuracy_find_word(model, st2);
+    long long b3 = accuracy_find_word(model, st3);
+    TQ++;
+    if (b1 == words || b2 == words || b3 == words) continue;
+    if (accuracy_find_word(model, st4) == words) continue;
+    for (long long k = 0; k < size; k++) {
+      vec[k] = (model->vectors[k + b2 * size] - model->vectors[k + b1 * size]) + model->vectors[k + b3 * size];
+    }
+    TQS++;
+    // Only the top-ranked word is compared, so track a single best candidate.
+    float best_dist = 0;
+    long long best = -1;
+    for (long long c = 0; c < words; c++) {
+      if (c == b1 || c == b2 || c == b3) continue;
+      float dist = 0;
+      for (long long k = 0; k < size; k++) dist += vec[k] * model->vectors[k + c * size];
+      if (dist > best_dist) {
+        best_dist = dist;
+        best = c;
+      }
+    }
+    if (best >= 0 && !strcmp(st4, &model->vocabulary[best * max_w])) {
+      CCN++;
+      CACN++;
+      if (QID <= 5) SEAC++; else SYAC++;
+    }
+    if (QID <= 5) SECN++; else SYCN++;
+    TCN++;
+    TACN++;
+  }
+  fclose(f);
+  printf("Questions seen / total: %d %d   %.2f %% \n", TQS, TQ, TQS/(float)TQ*100);
+  return;
+}

data/ext/word2vec/analogy.c ADDED Viewed

@@ -0,0 +1,85 @@
+#include "common.h"
+// Find the analog word.
+//
+// Ranks the vocabulary by dot product against the normalised vector
+// vec(wordy1) - vec(wordx1) + vec(wordx2), skipping the three input words, and
+// keeps the entries whose score is strictly positive (at most N of them).
+//
+// word_list must have room for N entries. Each filled entry receives the
+// score and `index`, the character offset of the word inside
+// model->vocabulary (word position * max_w).
+// Returns the number of entries written to word_list (0..N).
+// Raises ArgumentError when a word is max_size characters or longer, when a
+// word is not in the vocabulary, or when the model's vector dimension does
+// not fit the max_size work buffer.
+size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]) {
+  if (strlen(wordx1) >= max_size) {
+    rb_raise(rb_eArgError, "First parameter word must be %lu character max size", max_size);
+    return 0;
+  }
+  if (strlen(wordy1) >= max_size) {
+    rb_raise(rb_eArgError, "Second parameter word must be %lu character max size", max_size);
+    return 0;
+  }
+  if (strlen(wordx2) >= max_size) {
+    rb_raise(rb_eArgError, "Third parameter word must be %lu character max size", max_size);
+    return 0;
+  }
+  long long words = model->word_count;
+  long long size = model->vector_dim;
+  // vec below holds max_size floats; refuse wider models instead of overflowing it.
+  if (size < 0 || (unsigned long long)size > max_size) {
+    rb_raise(rb_eArgError, "Model vector dimension must be %lu max size", max_size);
+    return 0;
+  }
+  char* query[3] = { wordx1, wordy1, wordx2 };
+  long long bi[3];
+  for (int q = 0; q < 3; q++) {
+    long long b;
+    for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], query[q])) break;
+    // Position 0 is rejected together with "not found", as in the original code.
+    // NOTE(review): presumably entry 0 is a reserved token -- confirm against the model loader.
+    if (b == words || b == 0) {
+      rb_raise(rb_eArgError, "%s: out of dictionary word!", query[q]);
+      return 0;
+    }
+    bi[q] = b;
+  }
+  float vec[max_size];
+  float len = 0;
+  for (long long k = 0; k < size; k++) {
+    vec[k] = model->vectors[k + bi[1] * size] - model->vectors[k + bi[0] * size] + model->vectors[k + bi[2] * size];
+  }
+  for (long long k = 0; k < size; k++) len += vec[k] * vec[k];
+  len = sqrt(len);
+  // A zero vector cannot be normalised; leaving it as is yields no candidates.
+  if (len > 0) {
+    for (long long k = 0; k < size; k++) vec[k] /= len;
+  }
+  float bestd[N];
+  size_t besti[N];
+  size_t found = 0;
+  for (long long k = 0; k < N; k++) {
+    bestd[k] = 0;
+    besti[k] = 0;
+  }
+  for (long long c = 0; c < words; c++) {
+    if (c == bi[0] || c == bi[1] || c == bi[2]) continue;
+    float dist = 0;
+    for (long long k = 0; k < size; k++) dist += vec[k] * model->vectors[k + c * size];
+    for (long long pos = 0; pos < N; pos++) {
+      if (dist > bestd[pos]) {
+        // Shift lower-ranked entries down, carrying each offset with its score.
+        for (long long d = N - 1; d > pos; d--) {
+          bestd[d] = bestd[d - 1];
+          besti[d] = besti[d - 1];
+        }
+        bestd[pos] = dist;
+        besti[pos] = (size_t)(c * max_w);
+        if (found < (size_t)N) found++;
+        break;
+      }
+    }
+  }
+  // Only the filled slots are reported; unfilled ones never reach the caller.
+  for (size_t k = 0; k < found; k++) {
+    word_list[k].index = besti[k];
+    word_list[k].score = bestd[k];
+  }
+  return found;
+}

data/ext/word2vec/common.c CHANGED Viewed

@@ -1,7 +1,7 @@
 #include "common.h"
 // max length of strings
-const long long max_size = 2000;
+const unsigned long max_size = 2000;
 // number of closest words that will be shown
 const long long N = 40;
 // max length of vocabulary entries
@@ -44,74 +44,4 @@ void word2vec_model_load(word2vec_model* model, char* file_name) {
     for (a = 0; a < size; a++) model->vectors[a + b * size] /= len;
   }
   fclose(f);
-}
-// Find nearest words in the model
-size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]) {
-  if (strlen(word) >= max_size) {
-    rb_raise(rb_eArgError, "word must be %lld character max size", max_size);
-    return 0;
-  }
-  long long size = model->vector_dim;
-  long long a;
-  char *bestw[N];
-  float bestd[N];
-  size_t besti[N];
-  for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
-  a = 0;
-  long long b = 0;
-  long long c = 0;
-  long long words = model->word_count;
-  for (b = 0; b < words; b++) {
-    if (!strcmp(&model->vocabulary[b * max_w], word)) break;
-  }
-  if (b == words) b = -1;
-  long long bi = b;
-  if (b == -1) {
-    rb_raise(rb_eArgError, "Out of dictionary word!");
-    return 0;
-  }
-  float vec[max_size];
-  float dist;
-  long long d;
-  for (a = 0; a < size; a++) vec[a] = 0;
-  for (a = 0; a < size; a++) vec[a] += model->vectors[a + bi * size];
-  float len = 0;
-  for (a = 0; a < size; a++) len += vec[a] * vec[a];
-  len = sqrt(len);
-  for (a = 0; a < size; a++) vec[a] /= len;
-  for (a = 0; a < N; a++) bestd[a] = -1;
-  for (a = 0; a < N; a++) bestw[a][0] = 0;
-  for (c = 0; c < words; c++) {
-    a = 0;
-    if (bi == c) continue;
-    dist = 0;
-    for (a = 0; a < size; a++) dist += vec[a] * model->vectors[a + c * size];
-    for (a = 0; a < N; a++) {
-      if (dist > bestd[a]) {
-        for (d = N - 1; d > a; d--) {
-          bestd[d] = bestd[d - 1];
-          strcpy(bestw[d], bestw[d - 1]);
-          besti[d] = d - 1;
-        }
-        bestd[a] = dist;
-        strcpy(bestw[a], &model->vocabulary[c * max_w]);
-        besti[a] = c * max_w;
-        break;
-      }
-    }
-  }
-  for (a = 0; a < N; a++) {
-    word_list[a].index = besti[a];
-    word_list[a].score = bestd[a];
-  }
-  for (a = 0; a < N; a++) free(bestw[a]);
-  return N;
 }

data/ext/word2vec/common.h CHANGED Viewed

@@ -8,8 +8,14 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/types.h>
+#include <ctype.h>
-extern const long long N;  // number of closest words that will be shown
+// max length of strings
+extern const unsigned long max_size;
+// number of closest words that will be shown
+extern const long long N;
+// max length of vocabulary entries
+extern const long long max_w;
 typedef struct word2vec_model_s {
   long long word_count;
@@ -25,5 +31,7 @@ typedef struct WordSimilarity_s {
 void word2vec_model_load(word2vec_model* model, char* file_name);
 size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]);
+size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]);
+void word2vec_model_accuracy(word2vec_model* model, char* file_name);
 #endif /* _WORD2VEC_COMMON_H */

data/ext/word2vec/distance.c ADDED Viewed

@@ -0,0 +1,71 @@
+#include "common.h"
+// Find nearest words in the model.
+//
+// Ranks the vocabulary (the query word excluded) by dot product against the
+// normalised vector of `word` and keeps the entries whose score is greater
+// than -1 (at most N of them).
+//
+// word_list must have room for N entries. Each filled entry receives the
+// score and `index`, the character offset of the word inside
+// model->vocabulary (word position * max_w).
+// Returns the number of entries written to word_list (0..N).
+// Raises ArgumentError when `word` is max_size characters or longer, when it
+// is not in the vocabulary, or when the model's vector dimension does not fit
+// the max_size work buffer.
+size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]) {
+  if (strlen(word) >= max_size) {
+    rb_raise(rb_eArgError, "word must be %lu character max size", max_size);
+    return 0;
+  }
+  long long words = model->word_count;
+  long long size = model->vector_dim;
+  // vec below holds max_size floats; refuse wider models instead of overflowing it.
+  if (size < 0 || (unsigned long long)size > max_size) {
+    rb_raise(rb_eArgError, "Model vector dimension must be %lu max size", max_size);
+    return 0;
+  }
+  long long bi;
+  for (bi = 0; bi < words; bi++) {
+    if (!strcmp(&model->vocabulary[bi * max_w], word)) break;
+  }
+  // Nothing is heap-allocated before this point: rb_raise does not return, so
+  // anything malloc'ed earlier would be leaked on this path.
+  if (bi == words) {
+    rb_raise(rb_eArgError, "Out of dictionary word!");
+    return 0;
+  }
+  float vec[max_size];
+  float len = 0;
+  for (long long k = 0; k < size; k++) vec[k] = model->vectors[k + bi * size];
+  for (long long k = 0; k < size; k++) len += vec[k] * vec[k];
+  len = sqrt(len);
+  // A zero vector cannot be normalised; leave it untouched in that case.
+  if (len > 0) {
+    for (long long k = 0; k < size; k++) vec[k] /= len;
+  }
+  float bestd[N];
+  size_t besti[N];
+  size_t found = 0;
+  for (long long k = 0; k < N; k++) {
+    bestd[k] = -1;
+    besti[k] = 0;
+  }
+  for (long long c = 0; c < words; c++) {
+    if (c == bi) continue;
+    float dist = 0;
+    for (long long k = 0; k < size; k++) dist += vec[k] * model->vectors[k + c * size];
+    for (long long pos = 0; pos < N; pos++) {
+      if (dist > bestd[pos]) {
+        // Shift lower-ranked entries down, carrying each offset with its score.
+        for (long long d = N - 1; d > pos; d--) {
+          bestd[d] = bestd[d - 1];
+          besti[d] = besti[d - 1];
+        }
+        bestd[pos] = dist;
+        besti[pos] = (size_t)(c * max_w);
+        if (found < (size_t)N) found++;
+        break;
+      }
+    }
+  }
+  // Only the filled slots are reported; unfilled ones never reach the caller.
+  for (size_t k = 0; k < found; k++) {
+    word_list[k].index = besti[k];
+    word_list[k].score = bestd[k];
+  }
+  return found;
+}

data/ext/word2vec/word2vec.c CHANGED Viewed

@@ -16,6 +16,22 @@ static void model_deallocate(word2vec_model *model) {
   }
 }
+/*
+ * Transform a WordSimilarity vector to a Ruby hash.
+ * Each of the first word_count entries becomes one pair: the key is a frozen
+ * UTF-8 string built from the NUL-terminated text at model->vocabulary[index],
+ * the value is the entry's score as a Float. Entries that name the same word
+ * overwrite each other.
+ * @param model       model whose vocabulary buffer the indices refer to
+ * @param word_list   entries to convert
+ * @param word_count  number of valid entries in word_list
+ * @return [Hash<String, Float>]
+ */
+VALUE wordSimilarotyToHash(word2vec_model *model, WordSimilarity word_list[], size_t word_count) {
+  VALUE rb_ret = rb_hash_new();
+  for (size_t i = 0; i < word_count; i++) {
+    // index is unsigned, so no "index >= 0" test is needed: it was always true.
+    size_t index = word_list[i].index;
+    VALUE rb_word = rb_str_freeze(rb_utf8_str_new_cstr(&model->vocabulary[index]));
+    VALUE rb_score = DBL2NUM(word_list[i].score);
+    rb_hash_aset(rb_ret, rb_word, rb_score);
+  }
+  return rb_ret;
+}
 /*
  * model_load
  * load the vectors.bin file from disc
@@ -69,20 +85,48 @@ static VALUE model_distance(VALUE mod, VALUE rb_word) {
   size_t word_count = word2vec_model_distance(model, word, word_list);
-  VALUE rb_ret = rb_hash_new();
+  VALUE rb_ret = wordSimilarotyToHash(model, word_list, word_count);
+  return rb_ret;
+}
+/*
+ * Find the words that complete the analogy
+ * "rb_wordx1 is to rb_wordy1 as rb_wordx2 is to ?".
+ * Converts the three Ruby strings to C strings, calls word2vec_model_analogy
+ * and turns the resulting WordSimilarity list into a hash with
+ * wordSimilarotyToHash.
+ * @param [String] rb_wordx1
+ * @param [String] rb_wordy1
+ * @param [String] rb_wordx2
+ * @return [Hash<String, Float>] candidate words mapped to their scores
+ * @raise [ArgumentError] if a word is too long or is not in the vocabulary
+ */
+static VALUE model_analogy(VALUE mod, VALUE rb_wordx1, VALUE rb_wordy1, VALUE rb_wordx2) {
+  word2vec_model *model;
+  Data_Get_Struct(mod, word2vec_model, model);
+  char* wordx1 = StringValueCStr(rb_wordx1);
+  char* wordy1 = StringValueCStr(rb_wordy1);
+  char* wordx2 = StringValueCStr(rb_wordx2);
+  WordSimilarity word_list[N];
-  for (size_t i = 0 ; i < word_count ; i++) {
-    size_t index = word_list[i].index;
-    if (index >= 0) {
-      VALUE rb_word = rb_str_freeze(rb_utf8_str_new_cstr(&model->vocabulary[index]));
-      VALUE rb_score = DBL2NUM(word_list[i].score);
-      rb_hash_aset(rb_ret, rb_word, rb_score);
-    }
-  }
+  size_t word_count = word2vec_model_analogy(model, wordx1, wordy1, wordx2, word_list);
+  VALUE rb_ret = wordSimilarotyToHash(model, word_list, word_count);
   return rb_ret;
 }
+/*
+ * Evaluate the model against an analogy question file.
+ * Converts rb_file_name to a C string and calls word2vec_model_accuracy,
+ * which prints its report to standard output.
+ * @param [String] rb_file_name path of the question file
+ * @return [true]
+ * @raise [ArgumentError] if the file cannot be opened
+ */
+static VALUE model_accuracy(VALUE mod, VALUE rb_file_name) {
+  word2vec_model *model;
+  Data_Get_Struct(mod, word2vec_model, model);
+  char* filename = StringValueCStr(rb_file_name);
+  word2vec_model_accuracy(model, filename);
+  return Qtrue;
+}
 void Init_word2vec(void) {
   VALUE mWord2vec = rb_define_module("Word2vec");
   VALUE mWord2vecModel = rb_define_class_under(mWord2vec, "Model", rb_cObject);
@@ -90,4 +134,6 @@ void Init_word2vec(void) {
   rb_define_method(mWord2vecModel, "word_count", model_word_count, 0);
   rb_define_method(mWord2vecModel, "vector_dim", model_vector_dim, 0);
   rb_define_method(mWord2vecModel, "distance", model_distance, 1);
+  rb_define_method(mWord2vecModel, "analogy", model_analogy, 3);
+  rb_define_method(mWord2vecModel, "accuracy", model_accuracy, 1);
 }

data/lib/word2vec/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Word2vec
-    VERSION = "0.1.0"
+    VERSION = "0.2.0"
 end