word2vec-rb 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,91 @@
1
+ #include "common.h"
2
+
3
+ // Output the analog accuracy of the model
4
+ void word2vec_model_accuracy(word2vec_model* model, char* file_name) {
5
+ FILE *f;
6
+ char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size];
7
+ float dist, bestd[N], vec[max_size];
8
+ long long aa, b, c, d, b1, b2, b3;
9
+ unsigned long a;
10
+ int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0;
11
+ TCN = 0;
12
+
13
+ f = fopen(file_name, "r");
14
+ if (f == NULL) {
15
+ rb_raise(rb_eArgError, "Input file not found: %s", file_name);
16
+ return;
17
+ }
18
+ long long words = model->word_count;
19
+ long long size = model->vector_dim;
20
+
21
+ while (1) {
22
+ for (aa = 0; aa < N; aa++) bestd[aa] = 0;
23
+ for (aa = 0; aa < N; aa++) bestw[aa][0] = 0;
24
+ fscanf(f, "%s", st1);
25
+ for (a = 0; a < strlen(st1); a++) st1[a] = tolower(st1[a]);
26
+ if ((!strcmp(st1, ":")) || feof(f)) {
27
+ if (TCN == 0) TCN = 1;
28
+ if (QID != 0) {
29
+ printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN);
30
+ printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100);
31
+ }
32
+ QID++;
33
+ fscanf(f, "%s", st1);
34
+ if (feof(f)) break;
35
+ printf("%s:\n", st1);
36
+ TCN = 0;
37
+ CCN = 0;
38
+ continue;
39
+ }
40
+ fscanf(f, "%s", st2);
41
+ for (a = 0; a < strlen(st2); a++) st2[a] = tolower(st2[a]);
42
+ fscanf(f, "%s", st3);
43
+ for (a = 0; a<strlen(st3); a++) st3[a] = tolower(st3[a]);
44
+ fscanf(f, "%s", st4);
45
+ for (a = 0; a < strlen(st4); a++) st4[a] = tolower(st4[a]);
46
+ for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st1)) break;
47
+ b1 = b;
48
+ for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st2)) break;
49
+ b2 = b;
50
+ for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st3)) break;
51
+ b3 = b;
52
+ for (aa = 0; aa < N; aa++) bestd[aa] = 0;
53
+ for (aa = 0; aa < N; aa++) bestw[aa][0] = 0;
54
+ TQ++;
55
+ if (b1 == words) continue;
56
+ if (b2 == words) continue;
57
+ if (b3 == words) continue;
58
+ for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st4)) break;
59
+ if (b == words) continue;
60
+ for (aa = 0; aa < size; aa++) vec[aa] = (model->vectors[aa + b2 * size] - model->vectors[aa + b1 * size]) + model->vectors[aa + b3 * size];
61
+ TQS++;
62
+ for (c = 0; c < words; c++) {
63
+ if (c == b1) continue;
64
+ if (c == b2) continue;
65
+ if (c == b3) continue;
66
+ dist = 0;
67
+ for (aa = 0; aa < size; aa++) dist += vec[aa] * model->vectors[aa + c * size];
68
+ for (aa = 0; aa < N; aa++) {
69
+ if (dist > bestd[aa]) {
70
+ for (d = N - 1; d > aa; d--) {
71
+ bestd[d] = bestd[d - 1];
72
+ strcpy(bestw[d], bestw[d - 1]);
73
+ }
74
+ bestd[aa] = dist;
75
+ strcpy(bestw[aa], &model->vocabulary[c * max_w]);
76
+ break;
77
+ }
78
+ }
79
+ }
80
+ if (!strcmp(st4, bestw[0])) {
81
+ CCN++;
82
+ CACN++;
83
+ if (QID <= 5) SEAC++; else SYAC++;
84
+ }
85
+ if (QID <= 5) SECN++; else SYCN++;
86
+ TCN++;
87
+ TACN++;
88
+ }
89
+ printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, TQS/(float)TQ*100);
90
+ return;
91
+ }
@@ -0,0 +1,85 @@
1
+ #include "common.h"
2
+
3
+ // Find the analog word
4
+ size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]) {
5
+ if (strlen(wordx1) >= max_size) {
6
+ rb_raise(rb_eArgError, "First parameter word must be %lu character max size", max_size);
7
+ return 0;
8
+ }
9
+
10
+ if (strlen(wordy1) >= max_size) {
11
+ rb_raise(rb_eArgError, "Second parameter word must be %lu character max size", max_size);
12
+ return 0;
13
+ }
14
+
15
+ if (strlen(wordx2) >= max_size) {
16
+ rb_raise(rb_eArgError, "Third parameter word must be %lu character max size", max_size);
17
+ return 0;
18
+ }
19
+
20
+ float bestd[N];
21
+ char bestw[N][max_size];
22
+ size_t besti[N];
23
+ long long words = model->word_count;
24
+ long long size = model->vector_dim;
25
+ long a = 0;
26
+ long b = 0;
27
+ for (a = 0; a < N; a++) bestd[a] = 0;
28
+ for (a = 0; a < N; a++) bestw[a][0] = 0;
29
+
30
+ char st[100][max_size];
31
+ long long bi[100];
32
+ long cn = 3;
33
+ strcpy(st[0], wordx1);
34
+ strcpy(st[1], wordy1);
35
+ strcpy(st[2], wordx2);
36
+ for (a = 0; a < cn; a++) {
37
+ for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st[a])) break;
38
+ if (b == words) b = 0;
39
+ bi[a] = b;
40
+
41
+ if (b == 0) {
42
+ rb_raise(rb_eArgError, "%s: out of dictionary word!", st[a]);
43
+ break;
44
+ }
45
+ }
46
+ float vec[max_size];
47
+ for (a = 0; a < size; a++) vec[a] = model->vectors[a + bi[1] * size] - model->vectors[a + bi[0] * size] + model->vectors[a + bi[2] * size];
48
+ float len = 0;
49
+ for (a = 0; a < size; a++) len += vec[a] * vec[a];
50
+ len = sqrt(len);
51
+ long long c;
52
+ long long d;
53
+ float dist;
54
+ for (a = 0; a < size; a++) vec[a] /= len;
55
+ for (a = 0; a < N; a++) bestd[a] = 0;
56
+ for (a = 0; a < N; a++) bestw[a][0] = 0;
57
+ for (c = 0; c < words; c++) {
58
+ if (c == bi[0]) continue;
59
+ if (c == bi[1]) continue;
60
+ if (c == bi[2]) continue;
61
+ a = 0;
62
+ for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
63
+ if (a == 1) continue;
64
+ dist = 0;
65
+ for (a = 0; a < size; a++) dist += vec[a] * model->vectors[a + c * size];
66
+ for (a = 0; a < N; a++) {
67
+ if (dist > bestd[a]) {
68
+ for (d = N - 1; d > a; d--) {
69
+ bestd[d] = bestd[d - 1];
70
+ strcpy(bestw[d], bestw[d - 1]);
71
+ besti[d] = d - 1;
72
+ }
73
+ bestd[a] = dist;
74
+ strcpy(bestw[a], &model->vocabulary[c * max_w]);
75
+ besti[a] = c * max_w;
76
+ break;
77
+ }
78
+ }
79
+ }
80
+ for (a = 0; a < N; a++) {
81
+ word_list[a].index = besti[a];
82
+ word_list[a].score = bestd[a];
83
+ }
84
+ return N;
85
+ }
@@ -1,7 +1,7 @@
1
1
  #include "common.h"
2
2
 
3
3
  // max length of strings
4
- const long long max_size = 2000;
4
+ const unsigned long max_size = 2000;
5
5
  // number of closest words that will be shown
6
6
  const long long N = 40;
7
7
  // max length of vocabulary entries
@@ -44,74 +44,4 @@ void word2vec_model_load(word2vec_model* model, char* file_name) {
44
44
  for (a = 0; a < size; a++) model->vectors[a + b * size] /= len;
45
45
  }
46
46
  fclose(f);
47
- }
48
-
49
- // Find nearest words in the model
50
- size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]) {
51
- if (strlen(word) >= max_size) {
52
- rb_raise(rb_eArgError, "word must be %lld character max size", max_size);
53
- return 0;
54
- }
55
-
56
- long long size = model->vector_dim;
57
- long long a;
58
- char *bestw[N];
59
- float bestd[N];
60
- size_t besti[N];
61
- for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
62
- a = 0;
63
-
64
- long long b = 0;
65
- long long c = 0;
66
-
67
- long long words = model->word_count;
68
- for (b = 0; b < words; b++) {
69
- if (!strcmp(&model->vocabulary[b * max_w], word)) break;
70
- }
71
- if (b == words) b = -1;
72
- long long bi = b;
73
- if (b == -1) {
74
- rb_raise(rb_eArgError, "Out of dictionary word!");
75
- return 0;
76
- }
77
-
78
- float vec[max_size];
79
- float dist;
80
- long long d;
81
- for (a = 0; a < size; a++) vec[a] = 0;
82
- for (a = 0; a < size; a++) vec[a] += model->vectors[a + bi * size];
83
- float len = 0;
84
- for (a = 0; a < size; a++) len += vec[a] * vec[a];
85
- len = sqrt(len);
86
- for (a = 0; a < size; a++) vec[a] /= len;
87
- for (a = 0; a < N; a++) bestd[a] = -1;
88
- for (a = 0; a < N; a++) bestw[a][0] = 0;
89
- for (c = 0; c < words; c++) {
90
- a = 0;
91
- if (bi == c) continue;
92
- dist = 0;
93
- for (a = 0; a < size; a++) dist += vec[a] * model->vectors[a + c * size];
94
- for (a = 0; a < N; a++) {
95
- if (dist > bestd[a]) {
96
- for (d = N - 1; d > a; d--) {
97
- bestd[d] = bestd[d - 1];
98
- strcpy(bestw[d], bestw[d - 1]);
99
- besti[d] = d - 1;
100
- }
101
- bestd[a] = dist;
102
- strcpy(bestw[a], &model->vocabulary[c * max_w]);
103
- besti[a] = c * max_w;
104
- break;
105
- }
106
- }
107
- }
108
-
109
- for (a = 0; a < N; a++) {
110
- word_list[a].index = besti[a];
111
- word_list[a].score = bestd[a];
112
- }
113
-
114
- for (a = 0; a < N; a++) free(bestw[a]);
115
-
116
- return N;
117
47
  }
@@ -8,8 +8,14 @@
8
8
  #include <stdio.h>
9
9
  #include <stdlib.h>
10
10
  #include <sys/types.h>
11
+ #include <ctype.h>
11
12
 
12
- extern const long long N; // number of closest words that will be shown
13
+ // max length of strings
14
+ extern const unsigned long max_size;
15
+ // number of closest words that will be shown
16
+ extern const long long N;
17
+ // max length of vocabulary entries
18
+ extern const long long max_w;
13
19
 
14
20
  typedef struct word2vec_model_s {
15
21
  long long word_count;
@@ -25,5 +31,7 @@ typedef struct WordSimilarity_s {
25
31
 
26
32
  void word2vec_model_load(word2vec_model* model, char* file_name);
27
33
  size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]);
34
+ size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]);
35
+ void word2vec_model_accuracy(word2vec_model* model, char* file_name);
28
36
 
29
37
  #endif /* _WORD2VEC_COMMON_H */
@@ -0,0 +1,71 @@
1
+ #include "common.h"
2
+
3
+ // Find nearest words in the model
4
+ size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]) {
5
+ if (strlen(word) >= max_size) {
6
+ rb_raise(rb_eArgError, "word must be %lu character max size", max_size);
7
+ return 0;
8
+ }
9
+
10
+ long long size = model->vector_dim;
11
+ long long a;
12
+ char *bestw[N];
13
+ float bestd[N];
14
+ size_t besti[N];
15
+ for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
16
+ a = 0;
17
+
18
+ long long b = 0;
19
+ long long c = 0;
20
+
21
+ long long words = model->word_count;
22
+ for (b = 0; b < words; b++) {
23
+ if (!strcmp(&model->vocabulary[b * max_w], word)) break;
24
+ }
25
+ if (b == words) b = -1;
26
+ long long bi = b;
27
+ if (b == -1) {
28
+ rb_raise(rb_eArgError, "Out of dictionary word!");
29
+ return 0;
30
+ }
31
+
32
+ float vec[max_size];
33
+ float dist;
34
+ long long d;
35
+ for (a = 0; a < size; a++) vec[a] = 0;
36
+ for (a = 0; a < size; a++) vec[a] += model->vectors[a + bi * size];
37
+ float len = 0;
38
+ for (a = 0; a < size; a++) len += vec[a] * vec[a];
39
+ len = sqrt(len);
40
+ for (a = 0; a < size; a++) vec[a] /= len;
41
+ for (a = 0; a < N; a++) bestd[a] = -1;
42
+ for (a = 0; a < N; a++) bestw[a][0] = 0;
43
+ for (c = 0; c < words; c++) {
44
+ a = 0;
45
+ if (bi == c) continue;
46
+ dist = 0;
47
+ for (a = 0; a < size; a++) dist += vec[a] * model->vectors[a + c * size];
48
+ for (a = 0; a < N; a++) {
49
+ if (dist > bestd[a]) {
50
+ for (d = N - 1; d > a; d--) {
51
+ bestd[d] = bestd[d - 1];
52
+ strcpy(bestw[d], bestw[d - 1]);
53
+ besti[d] = d - 1;
54
+ }
55
+ bestd[a] = dist;
56
+ strcpy(bestw[a], &model->vocabulary[c * max_w]);
57
+ besti[a] = c * max_w;
58
+ break;
59
+ }
60
+ }
61
+ }
62
+
63
+ for (a = 0; a < N; a++) {
64
+ word_list[a].index = besti[a];
65
+ word_list[a].score = bestd[a];
66
+ }
67
+
68
+ for (a = 0; a < N; a++) free(bestw[a]);
69
+
70
+ return N;
71
+ }
@@ -16,6 +16,22 @@ static void model_deallocate(word2vec_model *model) {
16
16
  }
17
17
  }
18
18
 
19
+ /*
20
+ * Transform a WordSimilarity vector to a Ruby hash
21
+ */
22
+ VALUE wordSimilarotyToHash(word2vec_model *model, WordSimilarity word_list[], size_t word_count) {
23
+ VALUE rb_ret =rb_hash_new();
24
+ for (size_t i = 0 ; i < word_count ; i++) {
25
+ size_t index = word_list[i].index;
26
+ if (index >= 0) {
27
+ VALUE rb_word = rb_str_freeze(rb_utf8_str_new_cstr(&model->vocabulary[index]));
28
+ VALUE rb_score = DBL2NUM(word_list[i].score);
29
+ rb_hash_aset(rb_ret, rb_word, rb_score);
30
+ }
31
+ }
32
+ return rb_ret;
33
+ }
34
+
19
35
  /*
20
36
  * model_load
21
37
  * load the vectors.bin file from disc
@@ -69,20 +85,48 @@ static VALUE model_distance(VALUE mod, VALUE rb_word) {
69
85
 
70
86
  size_t word_count = word2vec_model_distance(model, word, word_list);
71
87
 
72
- VALUE rb_ret = rb_hash_new();
88
+ VALUE rb_ret = wordSimilarotyToHash(model, word_list, word_count);
89
+
90
+ return rb_ret;
91
+ }
92
+
93
+ /*
94
+ * model find the analog word to other three
95
+ * @param [String] rb_wordx1
96
+ * @param [String] rb_wordy1
97
+ * @param [String] rb_wordx2
98
+ * @return [Hash<String, Float>]
99
+ */
100
+ static VALUE model_analogy(VALUE mod, VALUE rb_wordx1, VALUE rb_wordy1, VALUE rb_wordx2) {
101
+ word2vec_model *model;
102
+ Data_Get_Struct(mod, word2vec_model, model);
103
+ char* wordx1 = StringValueCStr(rb_wordx1);
104
+ char* wordy1 = StringValueCStr(rb_wordy1);
105
+ char* wordx2 = StringValueCStr(rb_wordx2);
106
+
107
+ WordSimilarity word_list[N];
73
108
 
74
- for (size_t i = 0 ; i < word_count ; i++) {
75
- size_t index = word_list[i].index;
76
- if (index >= 0) {
77
- VALUE rb_word = rb_str_freeze(rb_utf8_str_new_cstr(&model->vocabulary[index]));
78
- VALUE rb_score = DBL2NUM(word_list[i].score);
79
- rb_hash_aset(rb_ret, rb_word, rb_score);
80
- }
81
- }
109
+ size_t word_count = word2vec_model_analogy(model, wordx1, wordy1, wordx2, word_list);
110
+
111
+ VALUE rb_ret = wordSimilarotyToHash(model, word_list, word_count);
82
112
 
83
113
  return rb_ret;
84
114
  }
85
115
 
116
+ /*
117
+ * model find the analog word to other three
118
+ * @param [String] rb_file_name
119
+ */
120
+ static VALUE model_accuracy(VALUE mod, VALUE rb_file_name) {
121
+ word2vec_model *model;
122
+ Data_Get_Struct(mod, word2vec_model, model);
123
+ char* filename = StringValueCStr(rb_file_name);
124
+
125
+ word2vec_model_accuracy(model, filename);
126
+
127
+ return Qtrue;
128
+ }
129
+
86
130
  void Init_word2vec(void) {
87
131
  VALUE mWord2vec = rb_define_module("Word2vec");
88
132
  VALUE mWord2vecModel = rb_define_class_under(mWord2vec, "Model", rb_cObject);
@@ -90,4 +134,6 @@ void Init_word2vec(void) {
90
134
  rb_define_method(mWord2vecModel, "word_count", model_word_count, 0);
91
135
  rb_define_method(mWord2vecModel, "vector_dim", model_vector_dim, 0);
92
136
  rb_define_method(mWord2vecModel, "distance", model_distance, 1);
137
+ rb_define_method(mWord2vecModel, "analogy", model_analogy, 3);
138
+ rb_define_method(mWord2vecModel, "accuracy", model_accuracy, 1);
93
139
  }
@@ -1,3 +1,3 @@
1
1
  module Word2vec
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end