word2vec-rb 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +8 -0
- data/Gemfile.lock +1 -1
- data/README.md +45 -9
- data/data/questions-words.txt +2285 -0
- data/ext/word2vec/accuracy.c +91 -0
- data/ext/word2vec/analogy.c +85 -0
- data/ext/word2vec/common.c +1 -71
- data/ext/word2vec/common.h +9 -1
- data/ext/word2vec/distance.c +71 -0
- data/ext/word2vec/word2vec.c +55 -9
- data/lib/word2vec/version.rb +1 -1
- metadata +5 -1
@@ -0,0 +1,91 @@
|
|
1
|
+
#include "common.h"
|
2
|
+
|
3
|
+
// Output the analog accuracy of the model
|
4
|
+
void word2vec_model_accuracy(word2vec_model* model, char* file_name) {
|
5
|
+
FILE *f;
|
6
|
+
char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size];
|
7
|
+
float dist, bestd[N], vec[max_size];
|
8
|
+
long long aa, b, c, d, b1, b2, b3;
|
9
|
+
unsigned long a;
|
10
|
+
int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0;
|
11
|
+
TCN = 0;
|
12
|
+
|
13
|
+
f = fopen(file_name, "r");
|
14
|
+
if (f == NULL) {
|
15
|
+
rb_raise(rb_eArgError, "Input file not found: %s", file_name);
|
16
|
+
return;
|
17
|
+
}
|
18
|
+
long long words = model->word_count;
|
19
|
+
long long size = model->vector_dim;
|
20
|
+
|
21
|
+
while (1) {
|
22
|
+
for (aa = 0; aa < N; aa++) bestd[aa] = 0;
|
23
|
+
for (aa = 0; aa < N; aa++) bestw[aa][0] = 0;
|
24
|
+
fscanf(f, "%s", st1);
|
25
|
+
for (a = 0; a < strlen(st1); a++) st1[a] = tolower(st1[a]);
|
26
|
+
if ((!strcmp(st1, ":")) || feof(f)) {
|
27
|
+
if (TCN == 0) TCN = 1;
|
28
|
+
if (QID != 0) {
|
29
|
+
printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN);
|
30
|
+
printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100);
|
31
|
+
}
|
32
|
+
QID++;
|
33
|
+
fscanf(f, "%s", st1);
|
34
|
+
if (feof(f)) break;
|
35
|
+
printf("%s:\n", st1);
|
36
|
+
TCN = 0;
|
37
|
+
CCN = 0;
|
38
|
+
continue;
|
39
|
+
}
|
40
|
+
fscanf(f, "%s", st2);
|
41
|
+
for (a = 0; a < strlen(st2); a++) st2[a] = tolower(st2[a]);
|
42
|
+
fscanf(f, "%s", st3);
|
43
|
+
for (a = 0; a<strlen(st3); a++) st3[a] = tolower(st3[a]);
|
44
|
+
fscanf(f, "%s", st4);
|
45
|
+
for (a = 0; a < strlen(st4); a++) st4[a] = tolower(st4[a]);
|
46
|
+
for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st1)) break;
|
47
|
+
b1 = b;
|
48
|
+
for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st2)) break;
|
49
|
+
b2 = b;
|
50
|
+
for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st3)) break;
|
51
|
+
b3 = b;
|
52
|
+
for (aa = 0; aa < N; aa++) bestd[aa] = 0;
|
53
|
+
for (aa = 0; aa < N; aa++) bestw[aa][0] = 0;
|
54
|
+
TQ++;
|
55
|
+
if (b1 == words) continue;
|
56
|
+
if (b2 == words) continue;
|
57
|
+
if (b3 == words) continue;
|
58
|
+
for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st4)) break;
|
59
|
+
if (b == words) continue;
|
60
|
+
for (aa = 0; aa < size; aa++) vec[aa] = (model->vectors[aa + b2 * size] - model->vectors[aa + b1 * size]) + model->vectors[aa + b3 * size];
|
61
|
+
TQS++;
|
62
|
+
for (c = 0; c < words; c++) {
|
63
|
+
if (c == b1) continue;
|
64
|
+
if (c == b2) continue;
|
65
|
+
if (c == b3) continue;
|
66
|
+
dist = 0;
|
67
|
+
for (aa = 0; aa < size; aa++) dist += vec[aa] * model->vectors[aa + c * size];
|
68
|
+
for (aa = 0; aa < N; aa++) {
|
69
|
+
if (dist > bestd[aa]) {
|
70
|
+
for (d = N - 1; d > aa; d--) {
|
71
|
+
bestd[d] = bestd[d - 1];
|
72
|
+
strcpy(bestw[d], bestw[d - 1]);
|
73
|
+
}
|
74
|
+
bestd[aa] = dist;
|
75
|
+
strcpy(bestw[aa], &model->vocabulary[c * max_w]);
|
76
|
+
break;
|
77
|
+
}
|
78
|
+
}
|
79
|
+
}
|
80
|
+
if (!strcmp(st4, bestw[0])) {
|
81
|
+
CCN++;
|
82
|
+
CACN++;
|
83
|
+
if (QID <= 5) SEAC++; else SYAC++;
|
84
|
+
}
|
85
|
+
if (QID <= 5) SECN++; else SYCN++;
|
86
|
+
TCN++;
|
87
|
+
TACN++;
|
88
|
+
}
|
89
|
+
printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, TQS/(float)TQ*100);
|
90
|
+
return;
|
91
|
+
}
|
@@ -0,0 +1,85 @@
|
|
1
|
+
#include "common.h"
|
2
|
+
|
3
|
+
// Find the analog word
|
4
|
+
size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]) {
|
5
|
+
if (strlen(wordx1) >= max_size) {
|
6
|
+
rb_raise(rb_eArgError, "First parameter word must be %lu character max size", max_size);
|
7
|
+
return 0;
|
8
|
+
}
|
9
|
+
|
10
|
+
if (strlen(wordy1) >= max_size) {
|
11
|
+
rb_raise(rb_eArgError, "Second parameter word must be %lu character max size", max_size);
|
12
|
+
return 0;
|
13
|
+
}
|
14
|
+
|
15
|
+
if (strlen(wordx2) >= max_size) {
|
16
|
+
rb_raise(rb_eArgError, "Third parameter word must be %lu character max size", max_size);
|
17
|
+
return 0;
|
18
|
+
}
|
19
|
+
|
20
|
+
float bestd[N];
|
21
|
+
char bestw[N][max_size];
|
22
|
+
size_t besti[N];
|
23
|
+
long long words = model->word_count;
|
24
|
+
long long size = model->vector_dim;
|
25
|
+
long a = 0;
|
26
|
+
long b = 0;
|
27
|
+
for (a = 0; a < N; a++) bestd[a] = 0;
|
28
|
+
for (a = 0; a < N; a++) bestw[a][0] = 0;
|
29
|
+
|
30
|
+
char st[100][max_size];
|
31
|
+
long long bi[100];
|
32
|
+
long cn = 3;
|
33
|
+
strcpy(st[0], wordx1);
|
34
|
+
strcpy(st[1], wordy1);
|
35
|
+
strcpy(st[2], wordx2);
|
36
|
+
for (a = 0; a < cn; a++) {
|
37
|
+
for (b = 0; b < words; b++) if (!strcmp(&model->vocabulary[b * max_w], st[a])) break;
|
38
|
+
if (b == words) b = 0;
|
39
|
+
bi[a] = b;
|
40
|
+
|
41
|
+
if (b == 0) {
|
42
|
+
rb_raise(rb_eArgError, "%s: out of dictionary word!", st[a]);
|
43
|
+
break;
|
44
|
+
}
|
45
|
+
}
|
46
|
+
float vec[max_size];
|
47
|
+
for (a = 0; a < size; a++) vec[a] = model->vectors[a + bi[1] * size] - model->vectors[a + bi[0] * size] + model->vectors[a + bi[2] * size];
|
48
|
+
float len = 0;
|
49
|
+
for (a = 0; a < size; a++) len += vec[a] * vec[a];
|
50
|
+
len = sqrt(len);
|
51
|
+
long long c;
|
52
|
+
long long d;
|
53
|
+
float dist;
|
54
|
+
for (a = 0; a < size; a++) vec[a] /= len;
|
55
|
+
for (a = 0; a < N; a++) bestd[a] = 0;
|
56
|
+
for (a = 0; a < N; a++) bestw[a][0] = 0;
|
57
|
+
for (c = 0; c < words; c++) {
|
58
|
+
if (c == bi[0]) continue;
|
59
|
+
if (c == bi[1]) continue;
|
60
|
+
if (c == bi[2]) continue;
|
61
|
+
a = 0;
|
62
|
+
for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
|
63
|
+
if (a == 1) continue;
|
64
|
+
dist = 0;
|
65
|
+
for (a = 0; a < size; a++) dist += vec[a] * model->vectors[a + c * size];
|
66
|
+
for (a = 0; a < N; a++) {
|
67
|
+
if (dist > bestd[a]) {
|
68
|
+
for (d = N - 1; d > a; d--) {
|
69
|
+
bestd[d] = bestd[d - 1];
|
70
|
+
strcpy(bestw[d], bestw[d - 1]);
|
71
|
+
besti[d] = d - 1;
|
72
|
+
}
|
73
|
+
bestd[a] = dist;
|
74
|
+
strcpy(bestw[a], &model->vocabulary[c * max_w]);
|
75
|
+
besti[a] = c * max_w;
|
76
|
+
break;
|
77
|
+
}
|
78
|
+
}
|
79
|
+
}
|
80
|
+
for (a = 0; a < N; a++) {
|
81
|
+
word_list[a].index = besti[a];
|
82
|
+
word_list[a].score = bestd[a];
|
83
|
+
}
|
84
|
+
return N;
|
85
|
+
}
|
data/ext/word2vec/common.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#include "common.h"
|
2
2
|
|
3
3
|
// max length of strings
|
4
|
-
const
|
4
|
+
const unsigned long max_size = 2000;
|
5
5
|
// number of closest words that will be shown
|
6
6
|
const long long N = 40;
|
7
7
|
// max length of vocabulary entries
|
@@ -44,74 +44,4 @@ void word2vec_model_load(word2vec_model* model, char* file_name) {
|
|
44
44
|
for (a = 0; a < size; a++) model->vectors[a + b * size] /= len;
|
45
45
|
}
|
46
46
|
fclose(f);
|
47
|
-
}
|
48
|
-
|
49
|
-
// Find nearest words in the model
|
50
|
-
size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]) {
|
51
|
-
if (strlen(word) >= max_size) {
|
52
|
-
rb_raise(rb_eArgError, "word must be %lld character max size", max_size);
|
53
|
-
return 0;
|
54
|
-
}
|
55
|
-
|
56
|
-
long long size = model->vector_dim;
|
57
|
-
long long a;
|
58
|
-
char *bestw[N];
|
59
|
-
float bestd[N];
|
60
|
-
size_t besti[N];
|
61
|
-
for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
|
62
|
-
a = 0;
|
63
|
-
|
64
|
-
long long b = 0;
|
65
|
-
long long c = 0;
|
66
|
-
|
67
|
-
long long words = model->word_count;
|
68
|
-
for (b = 0; b < words; b++) {
|
69
|
-
if (!strcmp(&model->vocabulary[b * max_w], word)) break;
|
70
|
-
}
|
71
|
-
if (b == words) b = -1;
|
72
|
-
long long bi = b;
|
73
|
-
if (b == -1) {
|
74
|
-
rb_raise(rb_eArgError, "Out of dictionary word!");
|
75
|
-
return 0;
|
76
|
-
}
|
77
|
-
|
78
|
-
float vec[max_size];
|
79
|
-
float dist;
|
80
|
-
long long d;
|
81
|
-
for (a = 0; a < size; a++) vec[a] = 0;
|
82
|
-
for (a = 0; a < size; a++) vec[a] += model->vectors[a + bi * size];
|
83
|
-
float len = 0;
|
84
|
-
for (a = 0; a < size; a++) len += vec[a] * vec[a];
|
85
|
-
len = sqrt(len);
|
86
|
-
for (a = 0; a < size; a++) vec[a] /= len;
|
87
|
-
for (a = 0; a < N; a++) bestd[a] = -1;
|
88
|
-
for (a = 0; a < N; a++) bestw[a][0] = 0;
|
89
|
-
for (c = 0; c < words; c++) {
|
90
|
-
a = 0;
|
91
|
-
if (bi == c) continue;
|
92
|
-
dist = 0;
|
93
|
-
for (a = 0; a < size; a++) dist += vec[a] * model->vectors[a + c * size];
|
94
|
-
for (a = 0; a < N; a++) {
|
95
|
-
if (dist > bestd[a]) {
|
96
|
-
for (d = N - 1; d > a; d--) {
|
97
|
-
bestd[d] = bestd[d - 1];
|
98
|
-
strcpy(bestw[d], bestw[d - 1]);
|
99
|
-
besti[d] = d - 1;
|
100
|
-
}
|
101
|
-
bestd[a] = dist;
|
102
|
-
strcpy(bestw[a], &model->vocabulary[c * max_w]);
|
103
|
-
besti[a] = c * max_w;
|
104
|
-
break;
|
105
|
-
}
|
106
|
-
}
|
107
|
-
}
|
108
|
-
|
109
|
-
for (a = 0; a < N; a++) {
|
110
|
-
word_list[a].index = besti[a];
|
111
|
-
word_list[a].score = bestd[a];
|
112
|
-
}
|
113
|
-
|
114
|
-
for (a = 0; a < N; a++) free(bestw[a]);
|
115
|
-
|
116
|
-
return N;
|
117
47
|
}
|
data/ext/word2vec/common.h
CHANGED
@@ -8,8 +8,14 @@
|
|
8
8
|
#include <stdio.h>
|
9
9
|
#include <stdlib.h>
|
10
10
|
#include <sys/types.h>
|
11
|
+
#include <ctype.h>
|
11
12
|
|
12
|
-
|
13
|
+
// max length of strings
|
14
|
+
extern const unsigned long max_size;
|
15
|
+
// number of closest words that will be shown
|
16
|
+
extern const long long N;
|
17
|
+
// max length of vocabulary entries
|
18
|
+
extern const long long max_w;
|
13
19
|
|
14
20
|
typedef struct word2vec_model_s {
|
15
21
|
long long word_count;
|
@@ -25,5 +31,7 @@ typedef struct WordSimilarity_s {
|
|
25
31
|
|
26
32
|
void word2vec_model_load(word2vec_model* model, char* file_name);
|
27
33
|
size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]);
|
34
|
+
size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]);
|
35
|
+
void word2vec_model_accuracy(word2vec_model* model, char* file_name);
|
28
36
|
|
29
37
|
#endif /* _WORD2VEC_COMMON_H */
|
@@ -0,0 +1,71 @@
|
|
1
|
+
#include "common.h"
|
2
|
+
|
3
|
+
// Find nearest words in the model
|
4
|
+
size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]) {
|
5
|
+
if (strlen(word) >= max_size) {
|
6
|
+
rb_raise(rb_eArgError, "word must be %lu character max size", max_size);
|
7
|
+
return 0;
|
8
|
+
}
|
9
|
+
|
10
|
+
long long size = model->vector_dim;
|
11
|
+
long long a;
|
12
|
+
char *bestw[N];
|
13
|
+
float bestd[N];
|
14
|
+
size_t besti[N];
|
15
|
+
for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
|
16
|
+
a = 0;
|
17
|
+
|
18
|
+
long long b = 0;
|
19
|
+
long long c = 0;
|
20
|
+
|
21
|
+
long long words = model->word_count;
|
22
|
+
for (b = 0; b < words; b++) {
|
23
|
+
if (!strcmp(&model->vocabulary[b * max_w], word)) break;
|
24
|
+
}
|
25
|
+
if (b == words) b = -1;
|
26
|
+
long long bi = b;
|
27
|
+
if (b == -1) {
|
28
|
+
rb_raise(rb_eArgError, "Out of dictionary word!");
|
29
|
+
return 0;
|
30
|
+
}
|
31
|
+
|
32
|
+
float vec[max_size];
|
33
|
+
float dist;
|
34
|
+
long long d;
|
35
|
+
for (a = 0; a < size; a++) vec[a] = 0;
|
36
|
+
for (a = 0; a < size; a++) vec[a] += model->vectors[a + bi * size];
|
37
|
+
float len = 0;
|
38
|
+
for (a = 0; a < size; a++) len += vec[a] * vec[a];
|
39
|
+
len = sqrt(len);
|
40
|
+
for (a = 0; a < size; a++) vec[a] /= len;
|
41
|
+
for (a = 0; a < N; a++) bestd[a] = -1;
|
42
|
+
for (a = 0; a < N; a++) bestw[a][0] = 0;
|
43
|
+
for (c = 0; c < words; c++) {
|
44
|
+
a = 0;
|
45
|
+
if (bi == c) continue;
|
46
|
+
dist = 0;
|
47
|
+
for (a = 0; a < size; a++) dist += vec[a] * model->vectors[a + c * size];
|
48
|
+
for (a = 0; a < N; a++) {
|
49
|
+
if (dist > bestd[a]) {
|
50
|
+
for (d = N - 1; d > a; d--) {
|
51
|
+
bestd[d] = bestd[d - 1];
|
52
|
+
strcpy(bestw[d], bestw[d - 1]);
|
53
|
+
besti[d] = d - 1;
|
54
|
+
}
|
55
|
+
bestd[a] = dist;
|
56
|
+
strcpy(bestw[a], &model->vocabulary[c * max_w]);
|
57
|
+
besti[a] = c * max_w;
|
58
|
+
break;
|
59
|
+
}
|
60
|
+
}
|
61
|
+
}
|
62
|
+
|
63
|
+
for (a = 0; a < N; a++) {
|
64
|
+
word_list[a].index = besti[a];
|
65
|
+
word_list[a].score = bestd[a];
|
66
|
+
}
|
67
|
+
|
68
|
+
for (a = 0; a < N; a++) free(bestw[a]);
|
69
|
+
|
70
|
+
return N;
|
71
|
+
}
|
data/ext/word2vec/word2vec.c
CHANGED
@@ -16,6 +16,22 @@ static void model_deallocate(word2vec_model *model) {
|
|
16
16
|
}
|
17
17
|
}
|
18
18
|
|
19
|
+
/*
|
20
|
+
* Transform a WordSimilarity vector to a Ruby hash
|
21
|
+
*/
|
22
|
+
VALUE wordSimilarotyToHash(word2vec_model *model, WordSimilarity word_list[], size_t word_count) {
|
23
|
+
VALUE rb_ret =rb_hash_new();
|
24
|
+
for (size_t i = 0 ; i < word_count ; i++) {
|
25
|
+
size_t index = word_list[i].index;
|
26
|
+
if (index >= 0) {
|
27
|
+
VALUE rb_word = rb_str_freeze(rb_utf8_str_new_cstr(&model->vocabulary[index]));
|
28
|
+
VALUE rb_score = DBL2NUM(word_list[i].score);
|
29
|
+
rb_hash_aset(rb_ret, rb_word, rb_score);
|
30
|
+
}
|
31
|
+
}
|
32
|
+
return rb_ret;
|
33
|
+
}
|
34
|
+
|
19
35
|
/*
|
20
36
|
* model_load
|
21
37
|
* load the vectors.bin file from disc
|
@@ -69,20 +85,48 @@ static VALUE model_distance(VALUE mod, VALUE rb_word) {
|
|
69
85
|
|
70
86
|
size_t word_count = word2vec_model_distance(model, word, word_list);
|
71
87
|
|
72
|
-
VALUE rb_ret =
|
88
|
+
VALUE rb_ret = wordSimilarotyToHash(model, word_list, word_count);
|
89
|
+
|
90
|
+
return rb_ret;
|
91
|
+
}
|
92
|
+
|
93
|
+
/*
|
94
|
+
* model find the analog word to other three
|
95
|
+
* @param [String] rb_wordx1
|
96
|
+
* @param [String] rb_wordy1
|
97
|
+
* @param [String] rb_wordx2
|
98
|
+
* @return [Hash<String, Float>]
|
99
|
+
*/
|
100
|
+
static VALUE model_analogy(VALUE mod, VALUE rb_wordx1, VALUE rb_wordy1, VALUE rb_wordx2) {
|
101
|
+
word2vec_model *model;
|
102
|
+
Data_Get_Struct(mod, word2vec_model, model);
|
103
|
+
char* wordx1 = StringValueCStr(rb_wordx1);
|
104
|
+
char* wordy1 = StringValueCStr(rb_wordy1);
|
105
|
+
char* wordx2 = StringValueCStr(rb_wordx2);
|
106
|
+
|
107
|
+
WordSimilarity word_list[N];
|
73
108
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
VALUE rb_word = rb_str_freeze(rb_utf8_str_new_cstr(&model->vocabulary[index]));
|
78
|
-
VALUE rb_score = DBL2NUM(word_list[i].score);
|
79
|
-
rb_hash_aset(rb_ret, rb_word, rb_score);
|
80
|
-
}
|
81
|
-
}
|
109
|
+
size_t word_count = word2vec_model_analogy(model, wordx1, wordy1, wordx2, word_list);
|
110
|
+
|
111
|
+
VALUE rb_ret = wordSimilarotyToHash(model, word_list, word_count);
|
82
112
|
|
83
113
|
return rb_ret;
|
84
114
|
}
|
85
115
|
|
116
|
+
/*
|
117
|
+
* model find the analog word to other three
|
118
|
+
* @param [String] rb_file_name
|
119
|
+
*/
|
120
|
+
static VALUE model_accuracy(VALUE mod, VALUE rb_file_name) {
|
121
|
+
word2vec_model *model;
|
122
|
+
Data_Get_Struct(mod, word2vec_model, model);
|
123
|
+
char* filename = StringValueCStr(rb_file_name);
|
124
|
+
|
125
|
+
word2vec_model_accuracy(model, filename);
|
126
|
+
|
127
|
+
return Qtrue;
|
128
|
+
}
|
129
|
+
|
86
130
|
void Init_word2vec(void) {
|
87
131
|
VALUE mWord2vec = rb_define_module("Word2vec");
|
88
132
|
VALUE mWord2vecModel = rb_define_class_under(mWord2vec, "Model", rb_cObject);
|
@@ -90,4 +134,6 @@ void Init_word2vec(void) {
|
|
90
134
|
rb_define_method(mWord2vecModel, "word_count", model_word_count, 0);
|
91
135
|
rb_define_method(mWord2vecModel, "vector_dim", model_vector_dim, 0);
|
92
136
|
rb_define_method(mWord2vecModel, "distance", model_distance, 1);
|
137
|
+
rb_define_method(mWord2vecModel, "analogy", model_analogy, 3);
|
138
|
+
rb_define_method(mWord2vecModel, "accuracy", model_accuracy, 1);
|
93
139
|
}
|
data/lib/word2vec/version.rb
CHANGED