word2vec-rb 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,91 @@
1
+ #include "common.h"
2
+
3
+ // Output the analog accuracy of the model
4
+ void word2vec_model_accuracy(word2vec_model* model, char* file_name) {
5
+ FILE *f;
6
+ char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size];
7
+ float dist, bestd[N], vec[max_size];
8
+ long long aa, b, c, d, b1, b2, b3;
9
+ unsigned long a;
10
+ int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0;
11
+ TCN = 0;
12
+
13
+ f = fopen(file_name, "r");
14
+ if (f == NULL) {
15
+ rb_raise(rb_eArgError, "Input file not found: %s", file_name);
16
+ return;
17
+ }
18
+ long long words = model->word_count;
19
+ long long size = model->vector_dim;
20
+
21
+ while (1) {
22
+ for (aa = 0; aa < N; aa++) bestd[aa] = 0;
23
+ for (aa = 0; aa < N; aa++) bestw[aa][0] = 0;
24
+ fscanf(f, "%s", st1);
25
+ for (a = 0; a < strlen(st1); a++) st1[a] = tolower(st1[a]);
26
+ if ((!strcmp(st1, ":")) || feof(f)) {
27
+ if (TCN == 0) TCN = 1;
28
+ if (QID != 0) {
29
+ printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN);
30
+ printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100);
31
+ }
32
+ QID++;
33
+ fscanf(f, "%s", st1);
34
+ if (feof(f)) break;
35
+ printf("%s:\n", st1);
36
+ TCN = 0;
37
+ CCN = 0;
38
+ continue;
39
+ }
40
+ fscanf(f, "%s", st2);
41
+ for (a = 0; a < strlen(st2); a++) st2[a] = tolower(st2[a]);
42
+ fscanf(f, "%s", st3);
43
+ for (a = 0; a<strlen(st3); a++) st3[a] = tolower(st3[a]);
44
+ fscanf(f, "%s", st4);
45
+ for (a = 0; a < strlen(st4); a++) st4[a] = tolower(st4[a]);
46
+ for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st1)) break;
47
+ b1 = b;
48
+ for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st2)) break;
49
+ b2 = b;
50
+ for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st3)) break;
51
+ b3 = b;
52
+ for (aa = 0; aa < N; aa++) bestd[aa] = 0;
53
+ for (aa = 0; aa < N; aa++) bestw[aa][0] = 0;
54
+ TQ++;
55
+ if (b1 == words) continue;
56
+ if (b2 == words) continue;
57
+ if (b3 == words) continue;
58
+ for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st4)) break;
59
+ if (b == words) continue;
60
+ for (aa = 0; aa < size; aa++) vec[aa] = (model->vectors[aa + b2 * size] - model->vectors[aa + b1 * size]) + model->vectors[aa + b3 * size];
61
+ TQS++;
62
+ for (c = 0; c < words; c++) {
63
+ if (c == b1) continue;
64
+ if (c == b2) continue;
65
+ if (c == b3) continue;
66
+ dist = 0;
67
+ for (aa = 0; aa < size; aa++) dist += vec[aa] * model->vectors[aa + c * size];
68
+ for (aa = 0; aa < N; aa++) {
69
+ if (dist > bestd[aa]) {
70
+ for (d = N - 1; d > aa; d--) {
71
+ bestd[d] = bestd[d - 1];
72
+ strcpy(bestw[d], bestw[d - 1]);
73
+ }
74
+ bestd[aa] = dist;
75
+ strcpy(bestw[aa], &model->vocabulary[c * max_w]);
76
+ break;
77
+ }
78
+ }
79
+ }
80
+ if (!strcmp(st4, bestw[0])) {
81
+ CCN++;
82
+ CACN++;
83
+ if (QID <= 5) SEAC++; else SYAC++;
84
+ }
85
+ if (QID <= 5) SECN++; else SYCN++;
86
+ TCN++;
87
+ TACN++;
88
+ }
89
+ printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, TQS/(float)TQ*100);
90
+ return;
91
+ }
@@ -0,0 +1,85 @@
1
+ #include "common.h"
2
+
3
+ // Find the analog word
4
+ size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]) {
5
+ if (strlen(wordx1) >= max_size) {
6
+ rb_raise(rb_eArgError, "First parameter word must be %lu character max size", max_size);
7
+ return 0;
8
+ }
9
+
10
+ if (strlen(wordy1) >= max_size) {
11
+ rb_raise(rb_eArgError, "Second parameter word must be %lu character max size", max_size);
12
+ return 0;
13
+ }
14
+
15
+ if (strlen(wordx2) >= max_size) {
16
+ rb_raise(rb_eArgError, "Third parameter word must be %lu character max size", max_size);
17
+ return 0;
18
+ }
19
+
20
+ float bestd[N];
21
+ char bestw[N][max_size];
22
+ size_t besti[N];
23
+ long long words = model->word_count;
24
+ long long size = model->vector_dim;
25
+ long a = 0;
26
+ long b = 0;
27
+ for (a = 0; a < N; a++) bestd[a] = 0;
28
+ for (a = 0; a < N; a++) bestw[a][0] = 0;
29
+
30
+ char st[100][max_size];
31
+ long long bi[100];
32
+ long cn = 3;
33
+ strcpy(st[0], wordx1);
34
+ strcpy(st[1], wordy1);
35
+ strcpy(st[2], wordx2);
36
+ for (a = 0; a < cn; a++) {
37
+ for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st[a])) break;
38
+ if (b == words) b = 0;
39
+ bi[a] = b;
40
+
41
+ if (b == 0) {
42
+ rb_raise(rb_eArgError, "%s: out of dictionary word!", st[a]);
43
+ break;
44
+ }
45
+ }
46
+ float vec[max_size];
47
+ for (a = 0; a < size; a++) vec[a] = model->vectors[a + bi[1] * size] - model->vectors[a + bi[0] * size] + model->vectors[a + bi[2] * size];
48
+ float len = 0;
49
+ for (a = 0; a < size; a++) len += vec[a] * vec[a];
50
+ len = sqrt(len);
51
+ long long c;
52
+ long long d;
53
+ float dist;
54
+ for (a = 0; a < size; a++) vec[a] /= len;
55
+ for (a = 0; a < N; a++) bestd[a] = 0;
56
+ for (a = 0; a < N; a++) bestw[a][0] = 0;
57
+ for (c = 0; c < words; c++) {
58
+ if (c == bi[0]) continue;
59
+ if (c == bi[1]) continue;
60
+ if (c == bi[2]) continue;
61
+ a = 0;
62
+ for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
63
+ if (a == 1) continue;
64
+ dist = 0;
65
+ for (a = 0; a < size; a++) dist += vec[a] * model->vectors[a + c * size];
66
+ for (a = 0; a < N; a++) {
67
+ if (dist > bestd[a]) {
68
+ for (d = N - 1; d > a; d--) {
69
+ bestd[d] = bestd[d - 1];
70
+ strcpy(bestw[d], bestw[d - 1]);
71
+ besti[d] = d - 1;
72
+ }
73
+ bestd[a] = dist;
74
+ strcpy(bestw[a], &model->vocabulary[c * max_w]);
75
+ besti[a] = c * max_w;
76
+ break;
77
+ }
78
+ }
79
+ }
80
+ for (a = 0; a < N; a++) {
81
+ word_list[a].index = besti[a];
82
+ word_list[a].score = bestd[a];
83
+ }
84
+ return N;
85
+ }
@@ -1,7 +1,7 @@
1
1
  #include "common.h"
2
2
 
3
3
  // max length of strings
4
- const long long max_size = 2000;
4
+ const unsigned long max_size = 2000;
5
5
  // number of closest words that will be shown
6
6
  const long long N = 40;
7
7
  // max length of vocabulary entries
@@ -44,74 +44,4 @@ void word2vec_model_load(word2vec_model* model, char* file_name) {
44
44
  for (a = 0; a < size; a++) model->vectors[a + b * size] /= len;
45
45
  }
46
46
  fclose(f);
47
- }
48
-
49
- // Find nearest words in the model
50
- size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]) {
51
- if (strlen(word) >= max_size) {
52
- rb_raise(rb_eArgError, "word must be %lld character max size", max_size);
53
- return 0;
54
- }
55
-
56
- long long size = model->vector_dim;
57
- long long a;
58
- char *bestw[N];
59
- float bestd[N];
60
- size_t besti[N];
61
- for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
62
- a = 0;
63
-
64
- long long b = 0;
65
- long long c = 0;
66
-
67
- long long words = model->word_count;
68
- for (b = 0; b < words; b++) {
69
- if (!strcmp(&model->vocabulary[b * max_w], word)) break;
70
- }
71
- if (b == words) b = -1;
72
- long long bi = b;
73
- if (b == -1) {
74
- rb_raise(rb_eArgError, "Out of dictionary word!");
75
- return 0;
76
- }
77
-
78
- float vec[max_size];
79
- float dist;
80
- long long d;
81
- for (a = 0; a < size; a++) vec[a] = 0;
82
- for (a = 0; a < size; a++) vec[a] += model->vectors[a + bi * size];
83
- float len = 0;
84
- for (a = 0; a < size; a++) len += vec[a] * vec[a];
85
- len = sqrt(len);
86
- for (a = 0; a < size; a++) vec[a] /= len;
87
- for (a = 0; a < N; a++) bestd[a] = -1;
88
- for (a = 0; a < N; a++) bestw[a][0] = 0;
89
- for (c = 0; c < words; c++) {
90
- a = 0;
91
- if (bi == c) continue;
92
- dist = 0;
93
- for (a = 0; a < size; a++) dist += vec[a] * model->vectors[a + c * size];
94
- for (a = 0; a < N; a++) {
95
- if (dist > bestd[a]) {
96
- for (d = N - 1; d > a; d--) {
97
- bestd[d] = bestd[d - 1];
98
- strcpy(bestw[d], bestw[d - 1]);
99
- besti[d] = d - 1;
100
- }
101
- bestd[a] = dist;
102
- strcpy(bestw[a], &model->vocabulary[c * max_w]);
103
- besti[a] = c * max_w;
104
- break;
105
- }
106
- }
107
- }
108
-
109
- for (a = 0; a < N; a++) {
110
- word_list[a].index = besti[a];
111
- word_list[a].score = bestd[a];
112
- }
113
-
114
- for (a = 0; a < N; a++) free(bestw[a]);
115
-
116
- return N;
117
47
  }
@@ -8,8 +8,14 @@
8
8
  #include <stdio.h>
9
9
  #include <stdlib.h>
10
10
  #include <sys/types.h>
11
+ #include <ctype.h>
11
12
 
12
- extern const long long N; // number of closest words that will be shown
13
+ // max length of strings
14
+ extern const unsigned long max_size;
15
+ // number of closest words that will be shown
16
+ extern const long long N;
17
+ // max length of vocabulary entries
18
+ extern const long long max_w;
13
19
 
14
20
  typedef struct word2vec_model_s {
15
21
  long long word_count;
@@ -25,5 +31,7 @@ typedef struct WordSimilarity_s {
25
31
 
26
32
  void word2vec_model_load(word2vec_model* model, char* file_name);
27
33
  size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]);
34
+ size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]);
35
+ void word2vec_model_accuracy(word2vec_model* model, char* file_name);
28
36
 
29
37
  #endif /* _WORD2VEC_COMMON_H */
@@ -0,0 +1,71 @@
1
+ #include "common.h"
2
+
3
+ // Find nearest words in the model
4
+ size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]) {
5
+ if (strlen(word) >= max_size) {
6
+ rb_raise(rb_eArgError, "word must be %lu character max size", max_size);
7
+ return 0;
8
+ }
9
+
10
+ long long size = model->vector_dim;
11
+ long long a;
12
+ char *bestw[N];
13
+ float bestd[N];
14
+ size_t besti[N];
15
+ for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
16
+ a = 0;
17
+
18
+ long long b = 0;
19
+ long long c = 0;
20
+
21
+ long long words = model->word_count;
22
+ for (b = 0; b < words; b++) {
23
+ if (!strcmp(&model->vocabulary[b * max_w], word)) break;
24
+ }
25
+ if (b == words) b = -1;
26
+ long long bi = b;
27
+ if (b == -1) {
28
+ rb_raise(rb_eArgError, "Out of dictionary word!");
29
+ return 0;
30
+ }
31
+
32
+ float vec[max_size];
33
+ float dist;
34
+ long long d;
35
+ for (a = 0; a < size; a++) vec[a] = 0;
36
+ for (a = 0; a < size; a++) vec[a] += model->vectors[a + bi * size];
37
+ float len = 0;
38
+ for (a = 0; a < size; a++) len += vec[a] * vec[a];
39
+ len = sqrt(len);
40
+ for (a = 0; a < size; a++) vec[a] /= len;
41
+ for (a = 0; a < N; a++) bestd[a] = -1;
42
+ for (a = 0; a < N; a++) bestw[a][0] = 0;
43
+ for (c = 0; c < words; c++) {
44
+ a = 0;
45
+ if (bi == c) continue;
46
+ dist = 0;
47
+ for (a = 0; a < size; a++) dist += vec[a] * model->vectors[a + c * size];
48
+ for (a = 0; a < N; a++) {
49
+ if (dist > bestd[a]) {
50
+ for (d = N - 1; d > a; d--) {
51
+ bestd[d] = bestd[d - 1];
52
+ strcpy(bestw[d], bestw[d - 1]);
53
+ besti[d] = d - 1;
54
+ }
55
+ bestd[a] = dist;
56
+ strcpy(bestw[a], &model->vocabulary[c * max_w]);
57
+ besti[a] = c * max_w;
58
+ break;
59
+ }
60
+ }
61
+ }
62
+
63
+ for (a = 0; a < N; a++) {
64
+ word_list[a].index = besti[a];
65
+ word_list[a].score = bestd[a];
66
+ }
67
+
68
+ for (a = 0; a < N; a++) free(bestw[a]);
69
+
70
+ return N;
71
+ }
@@ -16,6 +16,22 @@ static void model_deallocate(word2vec_model *model) {
16
16
  }
17
17
  }
18
18
 
19
+ /*
20
+ * Transform a WordSimilarity vector to a Ruby hash
21
+ */
22
+ VALUE wordSimilarotyToHash(word2vec_model *model, WordSimilarity word_list[], size_t word_count) {
23
+ VALUE rb_ret =rb_hash_new();
24
+ for (size_t i = 0 ; i < word_count ; i++) {
25
+ size_t index = word_list[i].index;
26
+ if (index >= 0) {
27
+ VALUE rb_word = rb_str_freeze(rb_utf8_str_new_cstr(&model->vocabulary[index]));
28
+ VALUE rb_score = DBL2NUM(word_list[i].score);
29
+ rb_hash_aset(rb_ret, rb_word, rb_score);
30
+ }
31
+ }
32
+ return rb_ret;
33
+ }
34
+
19
35
  /*
20
36
  * model_load
21
37
  * load the vectors.bin file from disc
@@ -69,20 +85,48 @@ static VALUE model_distance(VALUE mod, VALUE rb_word) {
69
85
 
70
86
  size_t word_count = word2vec_model_distance(model, word, word_list);
71
87
 
72
- VALUE rb_ret = rb_hash_new();
88
+ VALUE rb_ret = wordSimilarotyToHash(model, word_list, word_count);
89
+
90
+ return rb_ret;
91
+ }
92
+
93
+ /*
94
+ * model analogy: rank vocabulary words for the analogy wordx1 : wordy1 :: wordx2 : ?
95
+ * @param [String] rb_wordx1 first word of the known pair
96
+ * @param [String] rb_wordy1 second word of the known pair
97
+ * @param [String] rb_wordx2 word whose counterpart is looked for
98
+ * @return [Hash<String, Float>] word => score, built by wordSimilarotyToHash from the word2vec_model_analogy result
99
+ */
100
+ static VALUE model_analogy(VALUE mod, VALUE rb_wordx1, VALUE rb_wordy1, VALUE rb_wordx2) {
101
+ word2vec_model *model;
102
+ Data_Get_Struct(mod, word2vec_model, model);
103
+ char* wordx1 = StringValueCStr(rb_wordx1);
104
+ char* wordy1 = StringValueCStr(rb_wordy1);
105
+ char* wordx2 = StringValueCStr(rb_wordx2);
106
+
107
+ WordSimilarity word_list[N];
73
108
 
74
- for (size_t i = 0 ; i < word_count ; i++) {
75
- size_t index = word_list[i].index;
76
- if (index >= 0) {
77
- VALUE rb_word = rb_str_freeze(rb_utf8_str_new_cstr(&model->vocabulary[index]));
78
- VALUE rb_score = DBL2NUM(word_list[i].score);
79
- rb_hash_aset(rb_ret, rb_word, rb_score);
80
- }
81
- }
109
+ size_t word_count = word2vec_model_analogy(model, wordx1, wordy1, wordx2, word_list);
110
+
111
+ VALUE rb_ret = wordSimilarotyToHash(model, word_list, word_count);
82
112
 
83
113
  return rb_ret;
84
114
  }
85
115
 
116
+ /*
117
+ * model find the analog word to other three
118
+ * @param [String] rb_file_name
119
+ */
120
+ static VALUE model_accuracy(VALUE mod, VALUE rb_file_name) {
121
+ word2vec_model *model;
122
+ Data_Get_Struct(mod, word2vec_model, model);
123
+ char* filename = StringValueCStr(rb_file_name);
124
+
125
+ word2vec_model_accuracy(model, filename);
126
+
127
+ return Qtrue;
128
+ }
129
+
86
130
  void Init_word2vec(void) {
87
131
  VALUE mWord2vec = rb_define_module("Word2vec");
88
132
  VALUE mWord2vecModel = rb_define_class_under(mWord2vec, "Model", rb_cObject);
@@ -90,4 +134,6 @@ void Init_word2vec(void) {
90
134
  rb_define_method(mWord2vecModel, "word_count", model_word_count, 0);
91
135
  rb_define_method(mWord2vecModel, "vector_dim", model_vector_dim, 0);
92
136
  rb_define_method(mWord2vecModel, "distance", model_distance, 1);
137
+ rb_define_method(mWord2vecModel, "analogy", model_analogy, 3);
138
+ rb_define_method(mWord2vecModel, "accuracy", model_accuracy, 1);
93
139
  }
@@ -1,3 +1,3 @@
1
1
  module Word2vec
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end