word2vec 0.1.1
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +12 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/ext/word2vec/LICENSE +202 -0
- data/ext/word2vec/README.txt +21 -0
- data/ext/word2vec/compute-accuracy.c +143 -0
- data/ext/word2vec/demo-analogy.sh +11 -0
- data/ext/word2vec/demo-classes.sh +8 -0
- data/ext/word2vec/demo-phrase-accuracy.sh +11 -0
- data/ext/word2vec/demo-phrases.sh +11 -0
- data/ext/word2vec/demo-train-big-model-v1.sh +100 -0
- data/ext/word2vec/demo-word-accuracy.sh +8 -0
- data/ext/word2vec/demo-word.sh +7 -0
- data/ext/word2vec/distance.c +143 -0
- data/ext/word2vec/extconf.rb +0 -0
- data/ext/word2vec/makefile +22 -0
- data/ext/word2vec/questions-phrases.txt +3223 -0
- data/ext/word2vec/questions-words.txt +19558 -0
- data/ext/word2vec/word-analogy.c +145 -0
- data/ext/word2vec/word2phrase.c +292 -0
- data/ext/word2vec/word2vec.c +702 -0
- data/lib/word2vec.rb +6 -0
- data/lib/word2vec/io.rb +27 -0
- data/lib/word2vec/scripts_interface.rb +97 -0
- data/lib/word2vec/utils.rb +9 -0
- data/lib/word2vec/version.rb +3 -0
- data/lib/word2vec/word_clusters.rb +36 -0
- data/lib/word2vec/word_vectors.rb +182 -0
- data/word2vec.gemspec +29 -0
- metadata +151 -0
data/ext/word2vec/word-analogy.c
@@ -0,0 +1,145 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+/* #include <malloc.h> */
+
+const long long max_size = 2000;         // max length of strings
+const long long N = 40;                  // number of closest words that will be shown
+const long long max_w = 50;              // max length of vocabulary entries
+
+int main(int argc, char **argv) {
+  FILE *f;
+  char st1[max_size];
+  char bestw[N][max_size];
+  char file_name[max_size], st[100][max_size];
+  float dist, len, bestd[N], vec[max_size];
+  long long words, size, a, b, c, d, cn, bi[100];
+  char ch;
+  float *M;
+  char *vocab;
+  if (argc < 2) {
+    printf("Usage: ./word-analogy <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
+    return 0;
+  }
+  strcpy(file_name, argv[1]);
+  f = fopen(file_name, "rb");
+  if (f == NULL) {
+    printf("Input file not found\n");
+    return -1;
+  }
+  fscanf(f, "%lld", &words);
+  fscanf(f, "%lld", &size);
+  vocab = (char *)malloc((long long)words * max_w * sizeof(char));
+  M = (float *)malloc((long long)words * (long long)size * sizeof(float));
+  if (M == NULL) {
+    printf("Cannot allocate memory: %lld MB    %lld  %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
+    return -1;
+  }
+  for (b = 0; b < words; b++) {
+    a = 0;
+    while (1) {
+      vocab[b * max_w + a] = fgetc(f);
+      if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
+      if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
+    }
+    vocab[b * max_w + a] = 0;
+    for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
+    len = 0;
+    for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
+    len = sqrt(len);
+    for (a = 0; a < size; a++) M[a + b * size] /= len;
+  }
+  fclose(f);
+  while (1) {
+    for (a = 0; a < N; a++) bestd[a] = 0;
+    for (a = 0; a < N; a++) bestw[a][0] = 0;
+    printf("Enter three words (EXIT to break): ");
+    a = 0;
+    while (1) {
+      st1[a] = fgetc(stdin);
+      if ((st1[a] == '\n') || (a >= max_size - 1)) {
+        st1[a] = 0;
+        break;
+      }
+      a++;
+    }
+    if (!strcmp(st1, "EXIT")) break;
+    cn = 0;
+    b = 0;
+    c = 0;
+    while (1) {
+      st[cn][b] = st1[c];
+      b++;
+      c++;
+      st[cn][b] = 0;
+      if (st1[c] == 0) break;
+      if (st1[c] == ' ') {
+        cn++;
+        b = 0;
+        c++;
+      }
+    }
+    cn++;
+    if (cn < 3) {
+      printf("Only %lld words were entered.. three words are needed at the input to perform the calculation\n", cn);
+      continue;
+    }
+    for (a = 0; a < cn; a++) {
+      for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
+      if (b == words) b = 0;
+      bi[a] = b;
+      printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
+      if (b == 0) {
+        printf("Out of dictionary word!\n");
+        break;
+      }
+    }
+    if (b == 0) continue;
+    printf("\n                                              Word              Distance\n------------------------------------------------------------------------\n");
+    for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size];
+    len = 0;
+    for (a = 0; a < size; a++) len += vec[a] * vec[a];
+    len = sqrt(len);
+    for (a = 0; a < size; a++) vec[a] /= len;
+    for (a = 0; a < N; a++) bestd[a] = 0;
+    for (a = 0; a < N; a++) bestw[a][0] = 0;
+    for (c = 0; c < words; c++) {
+      if (c == bi[0]) continue;
+      if (c == bi[1]) continue;
+      if (c == bi[2]) continue;
+      a = 0;
+      for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
+      if (a == 1) continue;
+      dist = 0;
+      for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
+      for (a = 0; a < N; a++) {
+        if (dist > bestd[a]) {
+          for (d = N - 1; d > a; d--) {
+            bestd[d] = bestd[d - 1];
+            strcpy(bestw[d], bestw[d - 1]);
+          }
+          bestd[a] = dist;
+          strcpy(bestw[a], &vocab[c * max_w]);
+          break;
+        }
+      }
+    }
+    for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
+  }
+  return 0;
+}
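Note: the tool above unit-normalizes every embedding row at load time, forms the query vector y = v(word2) - v(word1) + v(word3), normalizes it, and ranks all remaining vocabulary entries by dot product (cosine similarity, since all vectors are unit length). The following standalone sketch, which is not part of the gem, demonstrates just that arithmetic on a hypothetical 3-dimensional toy vocabulary:

/* Minimal sketch of the analogy arithmetic in word-analogy.c:
   query y = v(b) - v(a) + v(c), candidates ranked by dot product.
   All vectors and names here are illustrative, not from the gem. */
#include <math.h>
#include <stdio.h>

#define DIM 3

static void normalize(float *v) {
  float len = 0;
  for (int i = 0; i < DIM; i++) len += v[i] * v[i];
  len = sqrtf(len);
  for (int i = 0; i < DIM; i++) v[i] /= len;
}

int main(void) {
  /* hypothetical tiny embedding table: a, b, c, and two candidates */
  float a[DIM] = {1, 0, 0}, b[DIM] = {1, 1, 0}, c[DIM] = {0, 0, 1};
  float cand1[DIM] = {0, 1, 1}, cand2[DIM] = {1, 0, 1};
  normalize(a); normalize(b); normalize(c);
  normalize(cand1); normalize(cand2);

  float y[DIM];
  for (int i = 0; i < DIM; i++) y[i] = b[i] - a[i] + c[i];
  normalize(y);

  float d1 = 0, d2 = 0;
  for (int i = 0; i < DIM; i++) { d1 += y[i] * cand1[i]; d2 += y[i] * cand2[i]; }
  printf("cand1: %f  cand2: %f\n", d1, d2);  /* larger = closer to b - a + c */
  return 0;
}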
data/ext/word2vec/word2phrase.c
@@ -0,0 +1,292 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <pthread.h>
+
+#define MAX_STRING 60
+
+const int vocab_hash_size = 500000000; // Maximum 500M entries in the vocabulary
+
+typedef float real;                    // Precision of float numbers
+
+struct vocab_word {
+  long long cn;
+  char *word;
+};
+
+char train_file[MAX_STRING], output_file[MAX_STRING];
+struct vocab_word *vocab;
+int debug_mode = 2, min_count = 5, *vocab_hash, min_reduce = 1;
+long long vocab_max_size = 10000, vocab_size = 0;
+long long train_words = 0;
+real threshold = 100;
+
+unsigned long long next_random = 1;
+
+// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
+void ReadWord(char *word, FILE *fin) {
+  int a = 0, ch;
+  while (!feof(fin)) {
+    ch = fgetc(fin);
+    if (ch == 13) continue;
+    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
+      if (a > 0) {
+        if (ch == '\n') ungetc(ch, fin);
+        break;
+      }
+      if (ch == '\n') {
+        strcpy(word, (char *)"</s>");
+        return;
+      } else continue;
+    }
+    word[a] = ch;
+    a++;
+    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
+  }
+  word[a] = 0;
+}
+
+// Returns hash value of a word
+int GetWordHash(char *word) {
+  unsigned long long a, hash = 1;
+  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
+  hash = hash % vocab_hash_size;
+  return hash;
+}
+
+// Returns position of a word in the vocabulary; if the word is not found, returns -1
+int SearchVocab(char *word) {
+  unsigned int hash = GetWordHash(word);
+  while (1) {
+    if (vocab_hash[hash] == -1) return -1;
+    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
+    hash = (hash + 1) % vocab_hash_size;
+  }
+  return -1;
+}
+
+// Reads a word and returns its index in the vocabulary
+int ReadWordIndex(FILE *fin) {
+  char word[MAX_STRING];
+  ReadWord(word, fin);
+  if (feof(fin)) return -1;
+  return SearchVocab(word);
+}
+
+// Adds a word to the vocabulary
+int AddWordToVocab(char *word) {
+  unsigned int hash, length = strlen(word) + 1;
+  if (length > MAX_STRING) length = MAX_STRING;
+  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
+  strcpy(vocab[vocab_size].word, word);
+  vocab[vocab_size].cn = 0;
+  vocab_size++;
+  // Reallocate memory if needed
+  if (vocab_size + 2 >= vocab_max_size) {
+    vocab_max_size += 10000;
+    vocab=(struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
+  }
+  hash = GetWordHash(word);
+  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+  vocab_hash[hash]=vocab_size - 1;
+  return vocab_size - 1;
+}
+
+// Used later for sorting by word counts
+int VocabCompare(const void *a, const void *b) {
+  return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
+}
+
+// Sorts the vocabulary by frequency using word counts
+void SortVocab() {
+  int a;
+  unsigned int hash;
+  // Sort the vocabulary and keep </s> at the first position
+  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  for (a = 0; a < vocab_size; a++) {
+    // Words occuring less than min_count times will be discarded from the vocab
+    if (vocab[a].cn < min_count) {
+      vocab_size--;
+      free(vocab[vocab_size].word);
+    } else {
+      // Hash will be re-computed, as after the sorting it is not actual
+      hash = GetWordHash(vocab[a].word);
+      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+      vocab_hash[hash] = a;
+    }
+  }
+  vocab = (struct vocab_word *)realloc(vocab, vocab_size * sizeof(struct vocab_word));
+}
+
+// Reduces the vocabulary by removing infrequent tokens
+void ReduceVocab() {
+  int a, b = 0;
+  unsigned int hash;
+  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
+    vocab[b].cn = vocab[a].cn;
+    vocab[b].word = vocab[a].word;
+    b++;
+  } else free(vocab[a].word);
+  vocab_size = b;
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  for (a = 0; a < vocab_size; a++) {
+    // Hash will be re-computed, as it is not actual
+    hash = GetWordHash(vocab[a].word);
+    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+    vocab_hash[hash] = a;
+  }
+  fflush(stdout);
+  min_reduce++;
+}
+
+void LearnVocabFromTrainFile() {
+  char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
+  FILE *fin;
+  long long a, i, start = 1;
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  fin = fopen(train_file, "rb");
+  if (fin == NULL) {
+    printf("ERROR: training data file not found!\n");
+    exit(1);
+  }
+  vocab_size = 0;
+  AddWordToVocab((char *)"</s>");
+  while (1) {
+    ReadWord(word, fin);
+    if (feof(fin)) break;
+    if (!strcmp(word, "</s>")) {
+      start = 1;
+      continue;
+    } else start = 0;
+    train_words++;
+    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
+      printf("Words processed: %lldK     Vocab size: %lldK  %c", train_words / 1000, vocab_size / 1000, 13);
+      fflush(stdout);
+    }
+    i = SearchVocab(word);
+    if (i == -1) {
+      a = AddWordToVocab(word);
+      vocab[a].cn = 1;
+    } else vocab[i].cn++;
+    if (start) continue;
+    sprintf(bigram_word, "%s_%s", last_word, word);
+    bigram_word[MAX_STRING - 1] = 0;
+    strcpy(last_word, word);
+    i = SearchVocab(bigram_word);
+    if (i == -1) {
+      a = AddWordToVocab(bigram_word);
+      vocab[a].cn = 1;
+    } else vocab[i].cn++;
+    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
+  }
+  SortVocab();
+  if (debug_mode > 0) {
+    printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size);
+    printf("Words in train file: %lld\n", train_words);
+  }
+  fclose(fin);
+}
+
+void TrainModel() {
+  long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0;
+  char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
+  real score;
+  FILE *fo, *fin;
+  printf("Starting training using file %s\n", train_file);
+  LearnVocabFromTrainFile();
+  fin = fopen(train_file, "rb");
+  fo = fopen(output_file, "wb");
+  word[0] = 0;
+  while (1) {
+    strcpy(last_word, word);
+    ReadWord(word, fin);
+    if (feof(fin)) break;
+    if (!strcmp(word, "</s>")) {
+      fprintf(fo, "\n");
+      continue;
+    }
+    cn++;
+    if ((debug_mode > 1) && (cn % 100000 == 0)) {
+      printf("Words written: %lldK%c", cn / 1000, 13);
+      fflush(stdout);
+    }
+    oov = 0;
+    i = SearchVocab(word);
+    if (i == -1) oov = 1; else pb = vocab[i].cn;
+    if (li == -1) oov = 1;
+    li = i;
+    sprintf(bigram_word, "%s_%s", last_word, word);
+    bigram_word[MAX_STRING - 1] = 0;
+    i = SearchVocab(bigram_word);
+    if (i == -1) oov = 1; else pab = vocab[i].cn;
+    if (pa < min_count) oov = 1;
+    if (pb < min_count) oov = 1;
+    if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words;
+    if (score > threshold) {
+      fprintf(fo, "_%s", word);
+      pb = 0;
+    } else fprintf(fo, " %s", word);
+    pa = pb;
+  }
+  fclose(fo);
+  fclose(fin);
+}
+
+int ArgPos(char *str, int argc, char **argv) {
+  int a;
+  for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
+    if (a == argc - 1) {
+      printf("Argument missing for %s\n", str);
+      exit(1);
+    }
+    return a;
+  }
+  return -1;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  if (argc == 1) {
+    printf("WORD2PHRASE tool v0.1a\n\n");
+    printf("Options:\n");
+    printf("Parameters for training:\n");
+    printf("\t-train <file>\n");
+    printf("\t\tUse text data from <file> to train the model\n");
+    printf("\t-output <file>\n");
+    printf("\t\tUse <file> to save the resulting word vectors / word clusters / phrases\n");
+    printf("\t-min-count <int>\n");
+    printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
+    printf("\t-threshold <float>\n");
+    printf("\t\t The <float> value represents threshold for forming the phrases (higher means less phrases); default 100\n");
+    printf("\t-debug <int>\n");
+    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
+    printf("\nExamples:\n");
+    printf("./word2phrase -train text.txt -output phrases.txt -threshold 100 -debug 2\n\n");
+    return 0;
+  }
+  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
+  if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
+  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) threshold = atof(argv[i + 1]);
+  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
+  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
+  TrainModel();
+  return 0;
+}
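Note: the heart of word2phrase is the scoring rule in TrainModel() above: a bigram "a b" is rewritten as the single token "a_b" when score = (count(ab) - min_count) / (count(a) * count(b)) * train_words exceeds -threshold. The following standalone sketch, not part of the gem and using hypothetical counts, shows the rule in isolation:

/* Sketch of the word2phrase bigram score; counts below are made up. */
#include <stdio.h>

static float phrase_score(long long pa, long long pb, long long pab,
                          long long min_count, long long train_words) {
  /* mirror the OOV / low-frequency handling: such pairs score 0 */
  if (pa < min_count || pb < min_count || pab == 0) return 0;
  return (pab - min_count) / (float)pa / (float)pb * (float)train_words;
}

int main(void) {
  /* hypothetical: "new" 2000x, "york" 500x, "new york" 400x in a 1M-word corpus */
  float s = phrase_score(2000, 500, 400, 5, 1000000);
  printf("score = %.1f -> %s\n", s, s > 100 ? "join as new_york" : "keep separate");
  return 0;
}

With these counts the score is 395, well above the default threshold of 100, so the pair would be joined; rare co-occurrences of frequent words score near zero and stay separate.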
data/ext/word2vec/word2vec.c
@@ -0,0 +1,702 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <pthread.h>
+
+#define MAX_STRING 100
+#define EXP_TABLE_SIZE 1000
+#define MAX_EXP 6
+#define MAX_SENTENCE_LENGTH 1000
+#define MAX_CODE_LENGTH 40
+
+const int vocab_hash_size = 30000000;  // Maximum 30 * 0.7 = 21M words in the vocabulary
+
+typedef float real;                    // Precision of float numbers
+
+struct vocab_word {
+  long long cn;
+  int *point;
+  char *word, *code, codelen;
+};
+
+char train_file[MAX_STRING], output_file[MAX_STRING];
+char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
+struct vocab_word *vocab;
+int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
+int *vocab_hash;
+long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
+long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
+real alpha = 0.025, starting_alpha, sample = 1e-3;
+real *syn0, *syn1, *syn1neg, *expTable;
+clock_t start;
+
+int hs = 0, negative = 5;
+const int table_size = 1e8;
+int *table;
+
+void InitUnigramTable() {
+  int a, i;
+  double train_words_pow = 0;
+  double d1, power = 0.75;
+  table = (int *)malloc(table_size * sizeof(int));
+  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
+  i = 0;
+  d1 = pow(vocab[i].cn, power) / train_words_pow;
+  for (a = 0; a < table_size; a++) {
+    table[a] = i;
+    if (a / (double)table_size > d1) {
+      i++;
+      d1 += pow(vocab[i].cn, power) / train_words_pow;
+    }
+    if (i >= vocab_size) i = vocab_size - 1;
+  }
+}
+
+// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
+void ReadWord(char *word, FILE *fin) {
+  int a = 0, ch;
+  while (!feof(fin)) {
+    ch = fgetc(fin);
+    if (ch == 13) continue;
+    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
+      if (a > 0) {
+        if (ch == '\n') ungetc(ch, fin);
+        break;
+      }
+      if (ch == '\n') {
+        strcpy(word, (char *)"</s>");
+        return;
+      } else continue;
+    }
+    word[a] = ch;
+    a++;
+    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
+  }
+  word[a] = 0;
+}
+
+// Returns hash value of a word
+int GetWordHash(char *word) {
+  unsigned long long a, hash = 0;
+  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
+  hash = hash % vocab_hash_size;
+  return hash;
+}
+
+// Returns position of a word in the vocabulary; if the word is not found, returns -1
+int SearchVocab(char *word) {
+  unsigned int hash = GetWordHash(word);
+  while (1) {
+    if (vocab_hash[hash] == -1) return -1;
+    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
+    hash = (hash + 1) % vocab_hash_size;
+  }
+  return -1;
+}
+
+// Reads a word and returns its index in the vocabulary
+int ReadWordIndex(FILE *fin) {
+  char word[MAX_STRING];
+  ReadWord(word, fin);
+  if (feof(fin)) return -1;
+  return SearchVocab(word);
+}
+
+// Adds a word to the vocabulary
+int AddWordToVocab(char *word) {
+  unsigned int hash, length = strlen(word) + 1;
+  if (length > MAX_STRING) length = MAX_STRING;
+  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
+  strcpy(vocab[vocab_size].word, word);
+  vocab[vocab_size].cn = 0;
+  vocab_size++;
+  // Reallocate memory if needed
+  if (vocab_size + 2 >= vocab_max_size) {
+    vocab_max_size += 1000;
+    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
+  }
+  hash = GetWordHash(word);
+  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+  vocab_hash[hash] = vocab_size - 1;
+  return vocab_size - 1;
+}
+
+// Used later for sorting by word counts
+int VocabCompare(const void *a, const void *b) {
+  return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
+}
+
+// Sorts the vocabulary by frequency using word counts
+void SortVocab() {
+  int a, size;
+  unsigned int hash;
+  // Sort the vocabulary and keep </s> at the first position
+  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  size = vocab_size;
+  train_words = 0;
+  for (a = 0; a < size; a++) {
+    // Words occuring less than min_count times will be discarded from the vocab
+    if ((vocab[a].cn < min_count) && (a != 0)) {
+      vocab_size--;
+      free(vocab[a].word);
+    } else {
+      // Hash will be re-computed, as after the sorting it is not actual
+      hash=GetWordHash(vocab[a].word);
+      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+      vocab_hash[hash] = a;
+      train_words += vocab[a].cn;
+    }
+  }
+  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
+  // Allocate memory for the binary tree construction
+  for (a = 0; a < vocab_size; a++) {
+    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
+    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
+  }
+}
+
+// Reduces the vocabulary by removing infrequent tokens
+void ReduceVocab() {
+  int a, b = 0;
+  unsigned int hash;
+  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
+    vocab[b].cn = vocab[a].cn;
+    vocab[b].word = vocab[a].word;
+    b++;
+  } else free(vocab[a].word);
+  vocab_size = b;
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  for (a = 0; a < vocab_size; a++) {
+    // Hash will be re-computed, as it is not actual
+    hash = GetWordHash(vocab[a].word);
+    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+    vocab_hash[hash] = a;
+  }
+  fflush(stdout);
+  min_reduce++;
+}
+
+// Create binary Huffman tree using the word counts
+// Frequent words will have short uniqe binary codes
+void CreateBinaryTree() {
+  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
+  char code[MAX_CODE_LENGTH];
+  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
+  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
+  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
+  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
+  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
+  pos1 = vocab_size - 1;
+  pos2 = vocab_size;
+  // Following algorithm constructs the Huffman tree by adding one node at a time
+  for (a = 0; a < vocab_size - 1; a++) {
+    // First, find two smallest nodes 'min1, min2'
+    if (pos1 >= 0) {
+      if (count[pos1] < count[pos2]) {
+        min1i = pos1;
+        pos1--;
+      } else {
+        min1i = pos2;
+        pos2++;
+      }
+    } else {
+      min1i = pos2;
+      pos2++;
+    }
+    if (pos1 >= 0) {
+      if (count[pos1] < count[pos2]) {
+        min2i = pos1;
+        pos1--;
+      } else {
+        min2i = pos2;
+        pos2++;
+      }
+    } else {
+      min2i = pos2;
+      pos2++;
+    }
+    count[vocab_size + a] = count[min1i] + count[min2i];
+    parent_node[min1i] = vocab_size + a;
+    parent_node[min2i] = vocab_size + a;
+    binary[min2i] = 1;
+  }
+  // Now assign binary code to each vocabulary word
+  for (a = 0; a < vocab_size; a++) {
+    b = a;
+    i = 0;
+    while (1) {
+      code[i] = binary[b];
+      point[i] = b;
+      i++;
+      b = parent_node[b];
+      if (b == vocab_size * 2 - 2) break;
+    }
+    vocab[a].codelen = i;
+    vocab[a].point[0] = vocab_size - 2;
+    for (b = 0; b < i; b++) {
+      vocab[a].code[i - b - 1] = code[b];
+      vocab[a].point[i - b] = point[b] - vocab_size;
+    }
+  }
+  free(count);
+  free(binary);
+  free(parent_node);
+}
+
+void LearnVocabFromTrainFile() {
+  char word[MAX_STRING];
+  FILE *fin;
+  long long a, i;
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  fin = fopen(train_file, "rb");
+  if (fin == NULL) {
+    printf("ERROR: training data file not found!\n");
+    exit(1);
+  }
+  vocab_size = 0;
+  AddWordToVocab((char *)"</s>");
+  while (1) {
+    ReadWord(word, fin);
+    if (feof(fin)) break;
+    train_words++;
+    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
+      printf("%lldK%c", train_words / 1000, 13);
+      fflush(stdout);
+    }
+    i = SearchVocab(word);
+    if (i == -1) {
+      a = AddWordToVocab(word);
+      vocab[a].cn = 1;
+    } else vocab[i].cn++;
+    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
+  }
+  SortVocab();
+  if (debug_mode > 0) {
+    printf("Vocab size: %lld\n", vocab_size);
+    printf("Words in train file: %lld\n", train_words);
+  }
+  file_size = ftell(fin);
+  fclose(fin);
+}
+
+void SaveVocab() {
+  long long i;
+  FILE *fo = fopen(save_vocab_file, "wb");
+  for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
+  fclose(fo);
+}
+
+void ReadVocab() {
+  long long a, i = 0;
+  char c;
+  char word[MAX_STRING];
+  FILE *fin = fopen(read_vocab_file, "rb");
+  if (fin == NULL) {
+    printf("Vocabulary file not found\n");
+    exit(1);
+  }
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  vocab_size = 0;
+  while (1) {
+    ReadWord(word, fin);
+    if (feof(fin)) break;
+    a = AddWordToVocab(word);
+    fscanf(fin, "%lld%c", &vocab[a].cn, &c);
+    i++;
+  }
+  SortVocab();
+  if (debug_mode > 0) {
+    printf("Vocab size: %lld\n", vocab_size);
+    printf("Words in train file: %lld\n", train_words);
+  }
+  fin = fopen(train_file, "rb");
+  if (fin == NULL) {
+    printf("ERROR: training data file not found!\n");
+    exit(1);
+  }
+  fseek(fin, 0, SEEK_END);
+  file_size = ftell(fin);
+  fclose(fin);
+}
+
+void InitNet() {
+  long long a, b;
+  unsigned long long next_random = 1;
+  a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
+  if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
+  if (hs) {
+    a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
+    if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
+    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
+     syn1[a * layer1_size + b] = 0;
+  }
+  if (negative>0) {
+    a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
+    if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
+    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
+     syn1neg[a * layer1_size + b] = 0;
+  }
+  for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
+    next_random = next_random * (unsigned long long)25214903917 + 11;
+    syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
+  }
+  CreateBinaryTree();
+}
+
+void *TrainModelThread(void *id) {
+  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
+  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
+  long long l1, l2, c, target, label, local_iter = iter;
+  unsigned long long next_random = (long long)id;
+  real f, g;
+  clock_t now;
+  real *neu1 = (real *)calloc(layer1_size, sizeof(real));
+  real *neu1e = (real *)calloc(layer1_size, sizeof(real));
+  FILE *fi = fopen(train_file, "rb");
+  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
+  while (1) {
+    if (word_count - last_word_count > 10000) {
+      word_count_actual += word_count - last_word_count;
+      last_word_count = word_count;
+      if ((debug_mode > 1)) {
+        now=clock();
+        printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
+         word_count_actual / (real)(iter * train_words + 1) * 100,
+         word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
+        fflush(stdout);
+      }
+      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
+      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
+    }
+    if (sentence_length == 0) {
+      while (1) {
+        word = ReadWordIndex(fi);
+        if (feof(fi)) break;
+        if (word == -1) continue;
+        word_count++;
+        if (word == 0) break;
+        // The subsampling randomly discards frequent words while keeping the ranking same
+        if (sample > 0) {
+          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
+          next_random = next_random * (unsigned long long)25214903917 + 11;
+          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
+        }
+        sen[sentence_length] = word;
+        sentence_length++;
+        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
+      }
+      sentence_position = 0;
+    }
+    if (feof(fi) || (word_count > train_words / num_threads)) {
+      word_count_actual += word_count - last_word_count;
+      local_iter--;
+      if (local_iter == 0) break;
+      word_count = 0;
+      last_word_count = 0;
+      sentence_length = 0;
+      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
+      continue;
+    }
+    word = sen[sentence_position];
+    if (word == -1) continue;
+    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
+    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
+    next_random = next_random * (unsigned long long)25214903917 + 11;
+    b = next_random % window;
+    if (cbow) {  //train the cbow architecture
+      // in -> hidden
+      cw = 0;
+      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
+        c = sentence_position - window + a;
+        if (c < 0) continue;
+        if (c >= sentence_length) continue;
+        last_word = sen[c];
+        if (last_word == -1) continue;
+        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
+        cw++;
+      }
+      if (cw) {
+        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
+        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
+          f = 0;
+          l2 = vocab[word].point[d] * layer1_size;
+          // Propagate hidden -> output
+          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
+          if (f <= -MAX_EXP) continue;
+          else if (f >= MAX_EXP) continue;
+          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
+          // 'g' is the gradient multiplied by the learning rate
+          g = (1 - vocab[word].code[d] - f) * alpha;
+          // Propagate errors output -> hidden
+          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
+          // Learn weights hidden -> output
+          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
+        }
+        // NEGATIVE SAMPLING
+        if (negative > 0) for (d = 0; d < negative + 1; d++) {
+          if (d == 0) {
+            target = word;
+            label = 1;
+          } else {
+            next_random = next_random * (unsigned long long)25214903917 + 11;
+            target = table[(next_random >> 16) % table_size];
+            if (target == 0) target = next_random % (vocab_size - 1) + 1;
+            if (target == word) continue;
+            label = 0;
+          }
+          l2 = target * layer1_size;
+          f = 0;
+          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
+          if (f > MAX_EXP) g = (label - 1) * alpha;
+          else if (f < -MAX_EXP) g = (label - 0) * alpha;
+          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
+          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
+          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
+        }
+        // hidden -> in
+        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
+          c = sentence_position - window + a;
+          if (c < 0) continue;
+          if (c >= sentence_length) continue;
+          last_word = sen[c];
+          if (last_word == -1) continue;
+          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
+        }
+      }
+    } else {  //train skip-gram
+      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
+        c = sentence_position - window + a;
+        if (c < 0) continue;
+        if (c >= sentence_length) continue;
+        last_word = sen[c];
+        if (last_word == -1) continue;
+        l1 = last_word * layer1_size;
+        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
+        // HIERARCHICAL SOFTMAX
+        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
+          f = 0;
+          l2 = vocab[word].point[d] * layer1_size;
+          // Propagate hidden -> output
+          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
+          if (f <= -MAX_EXP) continue;
+          else if (f >= MAX_EXP) continue;
+          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
+          // 'g' is the gradient multiplied by the learning rate
+          g = (1 - vocab[word].code[d] - f) * alpha;
+          // Propagate errors output -> hidden
+          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
+          // Learn weights hidden -> output
+          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
+        }
+        // NEGATIVE SAMPLING
+        if (negative > 0) for (d = 0; d < negative + 1; d++) {
+          if (d == 0) {
+            target = word;
+            label = 1;
+          } else {
+            next_random = next_random * (unsigned long long)25214903917 + 11;
+            target = table[(next_random >> 16) % table_size];
+            if (target == 0) target = next_random % (vocab_size - 1) + 1;
+            if (target == word) continue;
+            label = 0;
+          }
+          l2 = target * layer1_size;
+          f = 0;
+          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
+          if (f > MAX_EXP) g = (label - 1) * alpha;
+          else if (f < -MAX_EXP) g = (label - 0) * alpha;
+          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
+          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
+          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
+        }
+        // Learn weights input -> hidden
+        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
+      }
+    }
+    sentence_position++;
+    if (sentence_position >= sentence_length) {
+      sentence_length = 0;
+      continue;
+    }
+  }
+  fclose(fi);
+  free(neu1);
+  free(neu1e);
+  pthread_exit(NULL);
+}
+
+void TrainModel() {
+  long a, b, c, d;
+  FILE *fo;
+  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
+  printf("Starting training using file %s\n", train_file);
+  starting_alpha = alpha;
+  if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
+  if (save_vocab_file[0] != 0) SaveVocab();
+  if (output_file[0] == 0) return;
+  InitNet();
+  if (negative > 0) InitUnigramTable();
+  start = clock();
+  for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
+  for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
+  fo = fopen(output_file, "wb");
+  if (classes == 0) {
+    // Save the word vectors
+    fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
+    for (a = 0; a < vocab_size; a++) {
+      fprintf(fo, "%s ", vocab[a].word);
+      if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
+      else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
+      fprintf(fo, "\n");
+    }
+  } else {
+    // Run K-means on the word vectors
+    int clcn = classes, iter = 10, closeid;
+    int *centcn = (int *)malloc(classes * sizeof(int));
+    int *cl = (int *)calloc(vocab_size, sizeof(int));
+    real closev, x;
+    real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
+    for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
+    for (a = 0; a < iter; a++) {
+      for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
+      for (b = 0; b < clcn; b++) centcn[b] = 1;
+      for (c = 0; c < vocab_size; c++) {
+        for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
+        centcn[cl[c]]++;
+      }
+      for (b = 0; b < clcn; b++) {
+        closev = 0;
+        for (c = 0; c < layer1_size; c++) {
+          cent[layer1_size * b + c] /= centcn[b];
+          closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
+        }
+        closev = sqrt(closev);
+        for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
+      }
+      for (c = 0; c < vocab_size; c++) {
+        closev = -10;
+        closeid = 0;
+        for (d = 0; d < clcn; d++) {
+          x = 0;
+          for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
+          if (x > closev) {
+            closev = x;
+            closeid = d;
+          }
+        }
+        cl[c] = closeid;
+      }
+    }
+    // Save the K-means classes
+    for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
+    free(centcn);
+    free(cent);
+    free(cl);
+  }
+  fclose(fo);
+}
+
+int ArgPos(char *str, int argc, char **argv) {
+  int a;
+  for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
+    if (a == argc - 1) {
+      printf("Argument missing for %s\n", str);
+      exit(1);
+    }
+    return a;
+  }
+  return -1;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  if (argc == 1) {
+    printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
+    printf("Options:\n");
+    printf("Parameters for training:\n");
+    printf("\t-train <file>\n");
+    printf("\t\tUse text data from <file> to train the model\n");
+    printf("\t-output <file>\n");
+    printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
+    printf("\t-size <int>\n");
+    printf("\t\tSet size of word vectors; default is 100\n");
+    printf("\t-window <int>\n");
+    printf("\t\tSet max skip length between words; default is 5\n");
+    printf("\t-sample <float>\n");
+    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
+    printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
+    printf("\t-hs <int>\n");
+    printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
+    printf("\t-negative <int>\n");
+    printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
+    printf("\t-threads <int>\n");
+    printf("\t\tUse <int> threads (default 12)\n");
+    printf("\t-iter <int>\n");
+    printf("\t\tRun more training iterations (default 5)\n");
+    printf("\t-min-count <int>\n");
+    printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
+    printf("\t-alpha <float>\n");
+    printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
+    printf("\t-classes <int>\n");
+    printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
+    printf("\t-debug <int>\n");
+    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
+    printf("\t-binary <int>\n");
+    printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
+    printf("\t-save-vocab <file>\n");
+    printf("\t\tThe vocabulary will be saved to <file>\n");
+    printf("\t-read-vocab <file>\n");
+    printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
+    printf("\t-cbow <int>\n");
+    printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n");
+    printf("\nExamples:\n");
+    printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n");
+    return 0;
+  }
+  output_file[0] = 0;
+  save_vocab_file[0] = 0;
+  read_vocab_file[0] = 0;
+  if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
+  if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
+  if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
+  if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]);
+  if (cbow) alpha = 0.05;
+  if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
+  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
+  if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
+  if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
+  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
+  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
+  expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
+  for (i = 0; i < EXP_TABLE_SIZE; i++) {
+    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
+    expTable[i] = expTable[i] / (expTable[i] + 1);                   // Precompute f(x) = x / (x + 1)
+  }
+  TrainModel();
+  return 0;
+}
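Note: one subtle step in TrainModelThread() above is the frequent-word subsampling: a word with count cn survives a draw with probability ran = (sqrt(cn / (sample * train_words)) + 1) * (sample * train_words) / cn, tested against a 16-bit value from the same linear congruential generator used elsewhere in the file. The following standalone sketch, not part of the gem and using a hypothetical corpus size and counts, isolates that test:

/* Sketch of the word2vec subsampling test; all numbers are made up. */
#include <math.h>
#include <stdio.h>

int main(void) {
  double sample = 1e-3;            /* default -sample value */
  long long train_words = 1000000; /* hypothetical corpus size */
  unsigned long long next_random = 1;
  long long counts[] = {100000, 10000, 100};  /* hypothetical word counts */
  for (int i = 0; i < 3; i++) {
    double cn = (double)counts[i];
    double ran = (sqrt(cn / (sample * train_words)) + 1) * (sample * train_words) / cn;
    next_random = next_random * 25214903917ULL + 11;  /* same LCG as word2vec.c */
    int kept = ran >= (next_random & 0xFFFF) / (double)65536;
    printf("count=%lld  keep prob ~= %.3f  kept this draw: %d\n",
           counts[i], ran > 1 ? 1.0 : ran, kept);
  }
  return 0;
}

Very frequent words (the 100000-count entry keeps only ~11% of its occurrences here) are aggressively thinned, while rare words (ran > 1) are always kept, which matches the comment in the source: the ranking of frequencies is preserved while the skew is reduced.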