word2vec 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +12 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/ext/word2vec/LICENSE +202 -0
- data/ext/word2vec/README.txt +21 -0
- data/ext/word2vec/compute-accuracy.c +143 -0
- data/ext/word2vec/demo-analogy.sh +11 -0
- data/ext/word2vec/demo-classes.sh +8 -0
- data/ext/word2vec/demo-phrase-accuracy.sh +11 -0
- data/ext/word2vec/demo-phrases.sh +11 -0
- data/ext/word2vec/demo-train-big-model-v1.sh +100 -0
- data/ext/word2vec/demo-word-accuracy.sh +8 -0
- data/ext/word2vec/demo-word.sh +7 -0
- data/ext/word2vec/distance.c +143 -0
- data/ext/word2vec/extconf.rb +0 -0
- data/ext/word2vec/makefile +22 -0
- data/ext/word2vec/questions-phrases.txt +3223 -0
- data/ext/word2vec/questions-words.txt +19558 -0
- data/ext/word2vec/word-analogy.c +145 -0
- data/ext/word2vec/word2phrase.c +292 -0
- data/ext/word2vec/word2vec.c +702 -0
- data/lib/word2vec.rb +6 -0
- data/lib/word2vec/io.rb +27 -0
- data/lib/word2vec/scripts_interface.rb +97 -0
- data/lib/word2vec/utils.rb +9 -0
- data/lib/word2vec/version.rb +3 -0
- data/lib/word2vec/word_clusters.rb +36 -0
- data/lib/word2vec/word_vectors.rb +182 -0
- data/word2vec.gemspec +29 -0
- metadata +151 -0
data/ext/word2vec/word-analogy.c
@@ -0,0 +1,145 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+/* #include <malloc.h> */
+
+const long long max_size = 2000;         // max length of strings
+const long long N = 40;                  // number of closest words that will be shown
+const long long max_w = 50;              // max length of vocabulary entries
+
+int main(int argc, char **argv) {
+  FILE *f;
+  char st1[max_size];
+  char bestw[N][max_size];
+  char file_name[max_size], st[100][max_size];
+  float dist, len, bestd[N], vec[max_size];
+  long long words, size, a, b, c, d, cn, bi[100];
+  char ch;
+  float *M;
+  char *vocab;
+  if (argc < 2) {
+    printf("Usage: ./word-analogy <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
+    return 0;
+  }
+  strcpy(file_name, argv[1]);
+  f = fopen(file_name, "rb");
+  if (f == NULL) {
+    printf("Input file not found\n");
+    return -1;
+  }
+  fscanf(f, "%lld", &words);
+  fscanf(f, "%lld", &size);
+  vocab = (char *)malloc((long long)words * max_w * sizeof(char));
+  M = (float *)malloc((long long)words * (long long)size * sizeof(float));
+  if (M == NULL) {
+    printf("Cannot allocate memory: %lld MB    %lld  %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
+    return -1;
+  }
+  for (b = 0; b < words; b++) {
+    a = 0;
+    while (1) {
+      vocab[b * max_w + a] = fgetc(f);
+      if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
+      if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
+    }
+    vocab[b * max_w + a] = 0;
+    for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
+    len = 0;
+    for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
+    len = sqrt(len);
+    for (a = 0; a < size; a++) M[a + b * size] /= len;
+  }
+  fclose(f);
+  while (1) {
+    for (a = 0; a < N; a++) bestd[a] = 0;
+    for (a = 0; a < N; a++) bestw[a][0] = 0;
+    printf("Enter three words (EXIT to break): ");
+    a = 0;
+    while (1) {
+      st1[a] = fgetc(stdin);
+      if ((st1[a] == '\n') || (a >= max_size - 1)) {
+        st1[a] = 0;
+        break;
+      }
+      a++;
+    }
+    if (!strcmp(st1, "EXIT")) break;
+    cn = 0;
+    b = 0;
+    c = 0;
+    while (1) {
+      st[cn][b] = st1[c];
+      b++;
+      c++;
+      st[cn][b] = 0;
+      if (st1[c] == 0) break;
+      if (st1[c] == ' ') {
+        cn++;
+        b = 0;
+        c++;
+      }
+    }
+    cn++;
+    if (cn < 3) {
+      printf("Only %lld words were entered.. three words are needed at the input to perform the calculation\n", cn);
+      continue;
+    }
+    for (a = 0; a < cn; a++) {
+      for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
+      if (b == words) b = 0;
+      bi[a] = b;
+      printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
+      if (b == 0) {
+        printf("Out of dictionary word!\n");
+        break;
+      }
+    }
+    if (b == 0) continue;
+    printf("\n                                              Word              Distance\n------------------------------------------------------------------------\n");
+    for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size];
+    len = 0;
+    for (a = 0; a < size; a++) len += vec[a] * vec[a];
+    len = sqrt(len);
+    for (a = 0; a < size; a++) vec[a] /= len;
+    for (a = 0; a < N; a++) bestd[a] = 0;
+    for (a = 0; a < N; a++) bestw[a][0] = 0;
+    for (c = 0; c < words; c++) {
+      if (c == bi[0]) continue;
+      if (c == bi[1]) continue;
+      if (c == bi[2]) continue;
+      a = 0;
+      for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
+      if (a == 1) continue;
+      dist = 0;
+      for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
+      for (a = 0; a < N; a++) {
+        if (dist > bestd[a]) {
+          for (d = N - 1; d > a; d--) {
+            bestd[d] = bestd[d - 1];
+            strcpy(bestw[d], bestw[d - 1]);
+          }
+          bestd[a] = dist;
+          strcpy(bestw[a], &vocab[c * max_w]);
+          break;
+        }
+      }
+    }
+    for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
+  }
+  return 0;
+}
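A note for readers of the hunk above: word-analogy.c answers prompts of the form "a is to b as c is to ?". It loads the binary vector file, unit-normalizes every row, forms vec = v(b) - v(a) + v(c), and ranks the whole vocabulary by dot product, which equals cosine similarity on unit vectors. The following standalone sketch, which is not part of the gem and uses made-up toy vectors, shows just that arithmetic:

```c
/* Illustrative sketch only -- not part of the package. Toy pre-normalized
 * vectors demonstrate the analogy search used by word-analogy.c above:
 * vec = v(king) - v(man) + v(woman), then rank candidates by dot product. */
#include <stdio.h>
#include <math.h>

#define WORDS 4
#define DIM 3

int main(void) {
  const char *names[WORDS] = {"king", "man", "woman", "queen"};
  float M[WORDS][DIM] = {
    {0.8f, 0.6f, 0.0f},  /* king  */
    {1.0f, 0.0f, 0.0f},  /* man   */
    {0.0f, 0.0f, 1.0f},  /* woman */
    {0.0f, 0.6f, 0.8f},  /* queen */
  };
  float vec[DIM], len = 0, bestd = -1;
  int a, c, best = 0;
  /* vec = king - man + woman, normalized exactly as word-analogy.c does */
  for (a = 0; a < DIM; a++) vec[a] = M[0][a] - M[1][a] + M[2][a];
  for (a = 0; a < DIM; a++) len += vec[a] * vec[a];
  len = sqrtf(len);
  for (a = 0; a < DIM; a++) vec[a] /= len;
  for (c = 0; c < WORDS; c++) {
    float dist = 0;
    if (c <= 2) continue;  /* skip the three query words, as the tool does */
    for (a = 0; a < DIM; a++) dist += vec[a] * M[c][a];
    if (dist > bestd) { bestd = dist; best = c; }
  }
  printf("king - man + woman ~ %s (%f)\n", names[best], bestd);  /* queen */
  return 0;
}
```

Compiled with `cc sketch.c -lm` (any file name works), this prints "queen" with a cosine of about 0.98.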
data/ext/word2vec/word2phrase.c
@@ -0,0 +1,292 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <pthread.h>
+
+#define MAX_STRING 60
+
+const int vocab_hash_size = 500000000; // Maximum 500M entries in the vocabulary
+
+typedef float real;                    // Precision of float numbers
+
+struct vocab_word {
+  long long cn;
+  char *word;
+};
+
+char train_file[MAX_STRING], output_file[MAX_STRING];
+struct vocab_word *vocab;
+int debug_mode = 2, min_count = 5, *vocab_hash, min_reduce = 1;
+long long vocab_max_size = 10000, vocab_size = 0;
+long long train_words = 0;
+real threshold = 100;
+
+unsigned long long next_random = 1;
+
+// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
+void ReadWord(char *word, FILE *fin) {
+  int a = 0, ch;
+  while (!feof(fin)) {
+    ch = fgetc(fin);
+    if (ch == 13) continue;
+    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
+      if (a > 0) {
+        if (ch == '\n') ungetc(ch, fin);
+        break;
+      }
+      if (ch == '\n') {
+        strcpy(word, (char *)"</s>");
+        return;
+      } else continue;
+    }
+    word[a] = ch;
+    a++;
+    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
+  }
+  word[a] = 0;
+}
+
+// Returns hash value of a word
+int GetWordHash(char *word) {
+  unsigned long long a, hash = 1;
+  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
+  hash = hash % vocab_hash_size;
+  return hash;
+}
+
+// Returns position of a word in the vocabulary; if the word is not found, returns -1
+int SearchVocab(char *word) {
+  unsigned int hash = GetWordHash(word);
+  while (1) {
+    if (vocab_hash[hash] == -1) return -1;
+    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
+    hash = (hash + 1) % vocab_hash_size;
+  }
+  return -1;
+}
+
+// Reads a word and returns its index in the vocabulary
+int ReadWordIndex(FILE *fin) {
+  char word[MAX_STRING];
+  ReadWord(word, fin);
+  if (feof(fin)) return -1;
+  return SearchVocab(word);
+}
+
+// Adds a word to the vocabulary
+int AddWordToVocab(char *word) {
+  unsigned int hash, length = strlen(word) + 1;
+  if (length > MAX_STRING) length = MAX_STRING;
+  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
+  strcpy(vocab[vocab_size].word, word);
+  vocab[vocab_size].cn = 0;
+  vocab_size++;
+  // Reallocate memory if needed
+  if (vocab_size + 2 >= vocab_max_size) {
+    vocab_max_size += 10000;
+    vocab=(struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
+  }
+  hash = GetWordHash(word);
+  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+  vocab_hash[hash]=vocab_size - 1;
+  return vocab_size - 1;
+}
+
+// Used later for sorting by word counts
+int VocabCompare(const void *a, const void *b) {
+  return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
+}
+
+// Sorts the vocabulary by frequency using word counts
+void SortVocab() {
+  int a;
+  unsigned int hash;
+  // Sort the vocabulary and keep </s> at the first position
+  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  for (a = 0; a < vocab_size; a++) {
+    // Words occuring less than min_count times will be discarded from the vocab
+    if (vocab[a].cn < min_count) {
+      vocab_size--;
+      free(vocab[vocab_size].word);
+    } else {
+      // Hash will be re-computed, as after the sorting it is not actual
+      hash = GetWordHash(vocab[a].word);
+      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+      vocab_hash[hash] = a;
+    }
+  }
+  vocab = (struct vocab_word *)realloc(vocab, vocab_size * sizeof(struct vocab_word));
+}
+
+// Reduces the vocabulary by removing infrequent tokens
+void ReduceVocab() {
+  int a, b = 0;
+  unsigned int hash;
+  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
+    vocab[b].cn = vocab[a].cn;
+    vocab[b].word = vocab[a].word;
+    b++;
+  } else free(vocab[a].word);
+  vocab_size = b;
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  for (a = 0; a < vocab_size; a++) {
+    // Hash will be re-computed, as it is not actual
+    hash = GetWordHash(vocab[a].word);
+    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+    vocab_hash[hash] = a;
+  }
+  fflush(stdout);
+  min_reduce++;
+}
+
+void LearnVocabFromTrainFile() {
+  char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
+  FILE *fin;
+  long long a, i, start = 1;
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  fin = fopen(train_file, "rb");
+  if (fin == NULL) {
+    printf("ERROR: training data file not found!\n");
+    exit(1);
+  }
+  vocab_size = 0;
+  AddWordToVocab((char *)"</s>");
+  while (1) {
+    ReadWord(word, fin);
+    if (feof(fin)) break;
+    if (!strcmp(word, "</s>")) {
+      start = 1;
+      continue;
+    } else start = 0;
+    train_words++;
+    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
+      printf("Words processed: %lldK     Vocab size: %lldK  %c", train_words / 1000, vocab_size / 1000, 13);
+      fflush(stdout);
+    }
+    i = SearchVocab(word);
+    if (i == -1) {
+      a = AddWordToVocab(word);
+      vocab[a].cn = 1;
+    } else vocab[i].cn++;
+    if (start) continue;
+    sprintf(bigram_word, "%s_%s", last_word, word);
+    bigram_word[MAX_STRING - 1] = 0;
+    strcpy(last_word, word);
+    i = SearchVocab(bigram_word);
+    if (i == -1) {
+      a = AddWordToVocab(bigram_word);
+      vocab[a].cn = 1;
+    } else vocab[i].cn++;
+    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
+  }
+  SortVocab();
+  if (debug_mode > 0) {
+    printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size);
+    printf("Words in train file: %lld\n", train_words);
+  }
+  fclose(fin);
+}
+
+void TrainModel() {
+  long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0;
+  char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
+  real score;
+  FILE *fo, *fin;
+  printf("Starting training using file %s\n", train_file);
+  LearnVocabFromTrainFile();
+  fin = fopen(train_file, "rb");
+  fo = fopen(output_file, "wb");
+  word[0] = 0;
+  while (1) {
+    strcpy(last_word, word);
+    ReadWord(word, fin);
+    if (feof(fin)) break;
+    if (!strcmp(word, "</s>")) {
+      fprintf(fo, "\n");
+      continue;
+    }
+    cn++;
+    if ((debug_mode > 1) && (cn % 100000 == 0)) {
+      printf("Words written: %lldK%c", cn / 1000, 13);
+      fflush(stdout);
+    }
+    oov = 0;
+    i = SearchVocab(word);
+    if (i == -1) oov = 1; else pb = vocab[i].cn;
+    if (li == -1) oov = 1;
+    li = i;
+    sprintf(bigram_word, "%s_%s", last_word, word);
+    bigram_word[MAX_STRING - 1] = 0;
+    i = SearchVocab(bigram_word);
+    if (i == -1) oov = 1; else pab = vocab[i].cn;
+    if (pa < min_count) oov = 1;
+    if (pb < min_count) oov = 1;
+    if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words;
+    if (score > threshold) {
+      fprintf(fo, "_%s", word);
+      pb = 0;
+    } else fprintf(fo, " %s", word);
+    pa = pb;
+  }
+  fclose(fo);
+  fclose(fin);
+}
+
+int ArgPos(char *str, int argc, char **argv) {
+  int a;
+  for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
+    if (a == argc - 1) {
+      printf("Argument missing for %s\n", str);
+      exit(1);
+    }
+    return a;
+  }
+  return -1;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  if (argc == 1) {
+    printf("WORD2PHRASE tool v0.1a\n\n");
+    printf("Options:\n");
+    printf("Parameters for training:\n");
+    printf("\t-train <file>\n");
+    printf("\t\tUse text data from <file> to train the model\n");
+    printf("\t-output <file>\n");
+    printf("\t\tUse <file> to save the resulting word vectors / word clusters / phrases\n");
+    printf("\t-min-count <int>\n");
+    printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
+    printf("\t-threshold <float>\n");
+    printf("\t\t The <float> value represents threshold for forming the phrases (higher means less phrases); default 100\n");
+    printf("\t-debug <int>\n");
+    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
+    printf("\nExamples:\n");
+    printf("./word2phrase -train text.txt -output phrases.txt -threshold 100 -debug 2\n\n");
+    return 0;
+  }
+  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
+  if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
+  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) threshold = atof(argv[i + 1]);
+  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
+  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
+  TrainModel();
+  return 0;
+}
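For context on TrainModel() above: word2phrase makes one pass over the corpus counting unigrams and bigrams, then rewrites "a b" as "a_b" whenever

  score = (count(ab) - min_count) / (count(a) * count(b)) * train_words

exceeds the -threshold value (default 100). A toy computation of that score, with invented counts (not taken from the gem):

```c
/* Illustrative sketch only -- not part of the package. The counts below
 * are invented; they just show the phrase-scoring arithmetic used by
 * TrainModel() in word2phrase.c. */
#include <stdio.h>

int main(void) {
  long long pa = 1200;               /* count("new")      -- assumed */
  long long pb = 900;                /* count("york")     -- assumed */
  long long pab = 800;               /* count("new york") -- assumed */
  long long train_words = 17000000;  /* corpus size       -- assumed */
  long long min_count = 5;           /* tool default */
  float threshold = 100;             /* tool default */
  float score = (pab - min_count) / (float)pa / (float)pb * (float)train_words;
  printf("score = %.1f -> %s\n", score,
         score > threshold ? "emit new_york" : "keep words separate");
  return 0;
}
```

With these numbers the score is roughly 12500, far above the default threshold, so the pair would be joined into a single token.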
data/ext/word2vec/word2vec.c
@@ -0,0 +1,702 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <pthread.h>
+
+#define MAX_STRING 100
+#define EXP_TABLE_SIZE 1000
+#define MAX_EXP 6
+#define MAX_SENTENCE_LENGTH 1000
+#define MAX_CODE_LENGTH 40
+
+const int vocab_hash_size = 30000000;  // Maximum 30 * 0.7 = 21M words in the vocabulary
+
+typedef float real;                    // Precision of float numbers
+
+struct vocab_word {
+  long long cn;
+  int *point;
+  char *word, *code, codelen;
+};
+
+char train_file[MAX_STRING], output_file[MAX_STRING];
+char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
+struct vocab_word *vocab;
+int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
+int *vocab_hash;
+long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
+long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
+real alpha = 0.025, starting_alpha, sample = 1e-3;
+real *syn0, *syn1, *syn1neg, *expTable;
+clock_t start;
+
+int hs = 0, negative = 5;
+const int table_size = 1e8;
+int *table;
+
+void InitUnigramTable() {
+  int a, i;
+  double train_words_pow = 0;
+  double d1, power = 0.75;
+  table = (int *)malloc(table_size * sizeof(int));
+  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
+  i = 0;
+  d1 = pow(vocab[i].cn, power) / train_words_pow;
+  for (a = 0; a < table_size; a++) {
+    table[a] = i;
+    if (a / (double)table_size > d1) {
+      i++;
+      d1 += pow(vocab[i].cn, power) / train_words_pow;
+    }
+    if (i >= vocab_size) i = vocab_size - 1;
+  }
+}
+
+// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
+void ReadWord(char *word, FILE *fin) {
+  int a = 0, ch;
+  while (!feof(fin)) {
+    ch = fgetc(fin);
+    if (ch == 13) continue;
+    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
+      if (a > 0) {
+        if (ch == '\n') ungetc(ch, fin);
+        break;
+      }
+      if (ch == '\n') {
+        strcpy(word, (char *)"</s>");
+        return;
+      } else continue;
+    }
+    word[a] = ch;
+    a++;
+    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
+  }
+  word[a] = 0;
+}
+
+// Returns hash value of a word
+int GetWordHash(char *word) {
+  unsigned long long a, hash = 0;
+  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
+  hash = hash % vocab_hash_size;
+  return hash;
+}
+
+// Returns position of a word in the vocabulary; if the word is not found, returns -1
+int SearchVocab(char *word) {
+  unsigned int hash = GetWordHash(word);
+  while (1) {
+    if (vocab_hash[hash] == -1) return -1;
+    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
+    hash = (hash + 1) % vocab_hash_size;
+  }
+  return -1;
+}
+
+// Reads a word and returns its index in the vocabulary
+int ReadWordIndex(FILE *fin) {
+  char word[MAX_STRING];
+  ReadWord(word, fin);
+  if (feof(fin)) return -1;
+  return SearchVocab(word);
+}
+
+// Adds a word to the vocabulary
+int AddWordToVocab(char *word) {
+  unsigned int hash, length = strlen(word) + 1;
+  if (length > MAX_STRING) length = MAX_STRING;
+  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
+  strcpy(vocab[vocab_size].word, word);
+  vocab[vocab_size].cn = 0;
+  vocab_size++;
+  // Reallocate memory if needed
+  if (vocab_size + 2 >= vocab_max_size) {
+    vocab_max_size += 1000;
+    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
+  }
+  hash = GetWordHash(word);
+  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+  vocab_hash[hash] = vocab_size - 1;
+  return vocab_size - 1;
+}
+
+// Used later for sorting by word counts
+int VocabCompare(const void *a, const void *b) {
+  return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
+}
+
+// Sorts the vocabulary by frequency using word counts
+void SortVocab() {
+  int a, size;
+  unsigned int hash;
+  // Sort the vocabulary and keep </s> at the first position
+  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  size = vocab_size;
+  train_words = 0;
+  for (a = 0; a < size; a++) {
+    // Words occuring less than min_count times will be discarded from the vocab
+    if ((vocab[a].cn < min_count) && (a != 0)) {
+      vocab_size--;
+      free(vocab[a].word);
+    } else {
+      // Hash will be re-computed, as after the sorting it is not actual
+      hash=GetWordHash(vocab[a].word);
+      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+      vocab_hash[hash] = a;
+      train_words += vocab[a].cn;
+    }
+  }
+  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
+  // Allocate memory for the binary tree construction
+  for (a = 0; a < vocab_size; a++) {
+    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
+    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
+  }
+}
+
+// Reduces the vocabulary by removing infrequent tokens
+void ReduceVocab() {
+  int a, b = 0;
+  unsigned int hash;
+  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
+    vocab[b].cn = vocab[a].cn;
+    vocab[b].word = vocab[a].word;
+    b++;
+  } else free(vocab[a].word);
+  vocab_size = b;
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  for (a = 0; a < vocab_size; a++) {
+    // Hash will be re-computed, as it is not actual
+    hash = GetWordHash(vocab[a].word);
+    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+    vocab_hash[hash] = a;
+  }
+  fflush(stdout);
+  min_reduce++;
+}
+
+// Create binary Huffman tree using the word counts
+// Frequent words will have short uniqe binary codes
+void CreateBinaryTree() {
+  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
+  char code[MAX_CODE_LENGTH];
+  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
+  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
+  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
+  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
+  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
+  pos1 = vocab_size - 1;
+  pos2 = vocab_size;
+  // Following algorithm constructs the Huffman tree by adding one node at a time
+  for (a = 0; a < vocab_size - 1; a++) {
+    // First, find two smallest nodes 'min1, min2'
+    if (pos1 >= 0) {
+      if (count[pos1] < count[pos2]) {
+        min1i = pos1;
+        pos1--;
+      } else {
+        min1i = pos2;
+        pos2++;
+      }
+    } else {
+      min1i = pos2;
+      pos2++;
+    }
+    if (pos1 >= 0) {
+      if (count[pos1] < count[pos2]) {
+        min2i = pos1;
+        pos1--;
+      } else {
+        min2i = pos2;
+        pos2++;
+      }
+    } else {
+      min2i = pos2;
+      pos2++;
+    }
+    count[vocab_size + a] = count[min1i] + count[min2i];
+    parent_node[min1i] = vocab_size + a;
+    parent_node[min2i] = vocab_size + a;
+    binary[min2i] = 1;
+  }
+  // Now assign binary code to each vocabulary word
+  for (a = 0; a < vocab_size; a++) {
+    b = a;
+    i = 0;
+    while (1) {
+      code[i] = binary[b];
+      point[i] = b;
+      i++;
+      b = parent_node[b];
+      if (b == vocab_size * 2 - 2) break;
+    }
+    vocab[a].codelen = i;
+    vocab[a].point[0] = vocab_size - 2;
+    for (b = 0; b < i; b++) {
+      vocab[a].code[i - b - 1] = code[b];
+      vocab[a].point[i - b] = point[b] - vocab_size;
+    }
+  }
+  free(count);
+  free(binary);
+  free(parent_node);
+}
+
+void LearnVocabFromTrainFile() {
+  char word[MAX_STRING];
+  FILE *fin;
+  long long a, i;
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  fin = fopen(train_file, "rb");
+  if (fin == NULL) {
+    printf("ERROR: training data file not found!\n");
+    exit(1);
+  }
+  vocab_size = 0;
+  AddWordToVocab((char *)"</s>");
+  while (1) {
+    ReadWord(word, fin);
+    if (feof(fin)) break;
+    train_words++;
+    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
+      printf("%lldK%c", train_words / 1000, 13);
+      fflush(stdout);
+    }
+    i = SearchVocab(word);
+    if (i == -1) {
+      a = AddWordToVocab(word);
+      vocab[a].cn = 1;
+    } else vocab[i].cn++;
+    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
+  }
+  SortVocab();
+  if (debug_mode > 0) {
+    printf("Vocab size: %lld\n", vocab_size);
+    printf("Words in train file: %lld\n", train_words);
+  }
+  file_size = ftell(fin);
+  fclose(fin);
+}
+
+void SaveVocab() {
+  long long i;
+  FILE *fo = fopen(save_vocab_file, "wb");
+  for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
+  fclose(fo);
+}
+
+void ReadVocab() {
+  long long a, i = 0;
+  char c;
+  char word[MAX_STRING];
+  FILE *fin = fopen(read_vocab_file, "rb");
+  if (fin == NULL) {
+    printf("Vocabulary file not found\n");
+    exit(1);
+  }
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  vocab_size = 0;
+  while (1) {
+    ReadWord(word, fin);
+    if (feof(fin)) break;
+    a = AddWordToVocab(word);
+    fscanf(fin, "%lld%c", &vocab[a].cn, &c);
+    i++;
+  }
+  SortVocab();
+  if (debug_mode > 0) {
+    printf("Vocab size: %lld\n", vocab_size);
+    printf("Words in train file: %lld\n", train_words);
+  }
+  fin = fopen(train_file, "rb");
+  if (fin == NULL) {
+    printf("ERROR: training data file not found!\n");
+    exit(1);
+  }
+  fseek(fin, 0, SEEK_END);
+  file_size = ftell(fin);
+  fclose(fin);
+}
+
+void InitNet() {
+  long long a, b;
+  unsigned long long next_random = 1;
+  a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
+  if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
+  if (hs) {
+    a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
+    if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
+    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
+     syn1[a * layer1_size + b] = 0;
+  }
+  if (negative>0) {
+    a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
+    if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
+    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
+     syn1neg[a * layer1_size + b] = 0;
+  }
+  for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
+    next_random = next_random * (unsigned long long)25214903917 + 11;
+    syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
+  }
+  CreateBinaryTree();
+}
+
+void *TrainModelThread(void *id) {
+  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
+  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
+  long long l1, l2, c, target, label, local_iter = iter;
+  unsigned long long next_random = (long long)id;
+  real f, g;
+  clock_t now;
+  real *neu1 = (real *)calloc(layer1_size, sizeof(real));
+  real *neu1e = (real *)calloc(layer1_size, sizeof(real));
+  FILE *fi = fopen(train_file, "rb");
+  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
+  while (1) {
+    if (word_count - last_word_count > 10000) {
+      word_count_actual += word_count - last_word_count;
+      last_word_count = word_count;
+      if ((debug_mode > 1)) {
+        now=clock();
+        printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
+         word_count_actual / (real)(iter * train_words + 1) * 100,
+         word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
+        fflush(stdout);
+      }
+      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
+      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
+    }
+    if (sentence_length == 0) {
+      while (1) {
+        word = ReadWordIndex(fi);
+        if (feof(fi)) break;
+        if (word == -1) continue;
+        word_count++;
+        if (word == 0) break;
+        // The subsampling randomly discards frequent words while keeping the ranking same
+        if (sample > 0) {
+          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
+          next_random = next_random * (unsigned long long)25214903917 + 11;
+          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
+        }
+        sen[sentence_length] = word;
+        sentence_length++;
+        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
+      }
+      sentence_position = 0;
+    }
+    if (feof(fi) || (word_count > train_words / num_threads)) {
+      word_count_actual += word_count - last_word_count;
+      local_iter--;
+      if (local_iter == 0) break;
+      word_count = 0;
+      last_word_count = 0;
+      sentence_length = 0;
+      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
+      continue;
+    }
+    word = sen[sentence_position];
+    if (word == -1) continue;
+    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
+    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
+    next_random = next_random * (unsigned long long)25214903917 + 11;
+    b = next_random % window;
+    if (cbow) {  //train the cbow architecture
+      // in -> hidden
+      cw = 0;
+      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
+        c = sentence_position - window + a;
+        if (c < 0) continue;
+        if (c >= sentence_length) continue;
+        last_word = sen[c];
+        if (last_word == -1) continue;
+        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
+        cw++;
+      }
+      if (cw) {
+        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
+        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
+          f = 0;
+          l2 = vocab[word].point[d] * layer1_size;
+          // Propagate hidden -> output
+          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
+          if (f <= -MAX_EXP) continue;
+          else if (f >= MAX_EXP) continue;
+          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
+          // 'g' is the gradient multiplied by the learning rate
+          g = (1 - vocab[word].code[d] - f) * alpha;
+          // Propagate errors output -> hidden
+          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
+          // Learn weights hidden -> output
+          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
+        }
+        // NEGATIVE SAMPLING
+        if (negative > 0) for (d = 0; d < negative + 1; d++) {
+          if (d == 0) {
+            target = word;
+            label = 1;
+          } else {
+            next_random = next_random * (unsigned long long)25214903917 + 11;
+            target = table[(next_random >> 16) % table_size];
+            if (target == 0) target = next_random % (vocab_size - 1) + 1;
+            if (target == word) continue;
+            label = 0;
+          }
+          l2 = target * layer1_size;
+          f = 0;
+          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
+          if (f > MAX_EXP) g = (label - 1) * alpha;
+          else if (f < -MAX_EXP) g = (label - 0) * alpha;
+          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
+          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
+          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
+        }
+        // hidden -> in
+        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
+          c = sentence_position - window + a;
+          if (c < 0) continue;
+          if (c >= sentence_length) continue;
+          last_word = sen[c];
+          if (last_word == -1) continue;
+          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
+        }
+      }
+    } else {  //train skip-gram
+      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
+        c = sentence_position - window + a;
+        if (c < 0) continue;
+        if (c >= sentence_length) continue;
+        last_word = sen[c];
+        if (last_word == -1) continue;
+        l1 = last_word * layer1_size;
+        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
+        // HIERARCHICAL SOFTMAX
+        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
+          f = 0;
+          l2 = vocab[word].point[d] * layer1_size;
+          // Propagate hidden -> output
+          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
+          if (f <= -MAX_EXP) continue;
+          else if (f >= MAX_EXP) continue;
+          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
+          // 'g' is the gradient multiplied by the learning rate
+          g = (1 - vocab[word].code[d] - f) * alpha;
+          // Propagate errors output -> hidden
+          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
+          // Learn weights hidden -> output
+          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
+        }
+        // NEGATIVE SAMPLING
+        if (negative > 0) for (d = 0; d < negative + 1; d++) {
+          if (d == 0) {
+            target = word;
+            label = 1;
+          } else {
+            next_random = next_random * (unsigned long long)25214903917 + 11;
+            target = table[(next_random >> 16) % table_size];
+            if (target == 0) target = next_random % (vocab_size - 1) + 1;
+            if (target == word) continue;
+            label = 0;
+          }
+          l2 = target * layer1_size;
+          f = 0;
+          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
+          if (f > MAX_EXP) g = (label - 1) * alpha;
+          else if (f < -MAX_EXP) g = (label - 0) * alpha;
+          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
+          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
+          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
+        }
+        // Learn weights input -> hidden
+        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
+      }
+    }
+    sentence_position++;
+    if (sentence_position >= sentence_length) {
+      sentence_length = 0;
+      continue;
+    }
+  }
+  fclose(fi);
+  free(neu1);
+  free(neu1e);
+  pthread_exit(NULL);
+}
+
+void TrainModel() {
+  long a, b, c, d;
+  FILE *fo;
+  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
+  printf("Starting training using file %s\n", train_file);
+  starting_alpha = alpha;
+  if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
+  if (save_vocab_file[0] != 0) SaveVocab();
+  if (output_file[0] == 0) return;
+  InitNet();
+  if (negative > 0) InitUnigramTable();
+  start = clock();
+  for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
+  for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
+  fo = fopen(output_file, "wb");
+  if (classes == 0) {
+    // Save the word vectors
+    fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
+    for (a = 0; a < vocab_size; a++) {
+      fprintf(fo, "%s ", vocab[a].word);
+      if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
+      else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
+      fprintf(fo, "\n");
+    }
+  } else {
+    // Run K-means on the word vectors
+    int clcn = classes, iter = 10, closeid;
+    int *centcn = (int *)malloc(classes * sizeof(int));
+    int *cl = (int *)calloc(vocab_size, sizeof(int));
+    real closev, x;
+    real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
+    for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
+    for (a = 0; a < iter; a++) {
+      for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
+      for (b = 0; b < clcn; b++) centcn[b] = 1;
+      for (c = 0; c < vocab_size; c++) {
+        for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
+        centcn[cl[c]]++;
+      }
+      for (b = 0; b < clcn; b++) {
+        closev = 0;
+        for (c = 0; c < layer1_size; c++) {
+          cent[layer1_size * b + c] /= centcn[b];
+          closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
+        }
+        closev = sqrt(closev);
+        for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
+      }
+      for (c = 0; c < vocab_size; c++) {
+        closev = -10;
+        closeid = 0;
+        for (d = 0; d < clcn; d++) {
+          x = 0;
+          for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
+          if (x > closev) {
+            closev = x;
+            closeid = d;
+          }
+        }
+        cl[c] = closeid;
+      }
+    }
+    // Save the K-means classes
+    for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
+    free(centcn);
+    free(cent);
+    free(cl);
+  }
+  fclose(fo);
+}
+
+int ArgPos(char *str, int argc, char **argv) {
+  int a;
+  for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
+    if (a == argc - 1) {
+      printf("Argument missing for %s\n", str);
+      exit(1);
+    }
+    return a;
+  }
+  return -1;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  if (argc == 1) {
+    printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
+    printf("Options:\n");
+    printf("Parameters for training:\n");
+    printf("\t-train <file>\n");
+    printf("\t\tUse text data from <file> to train the model\n");
+    printf("\t-output <file>\n");
+    printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
+    printf("\t-size <int>\n");
+    printf("\t\tSet size of word vectors; default is 100\n");
+    printf("\t-window <int>\n");
+    printf("\t\tSet max skip length between words; default is 5\n");
+    printf("\t-sample <float>\n");
+    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
+    printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
+    printf("\t-hs <int>\n");
+    printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
+    printf("\t-negative <int>\n");
+    printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
+    printf("\t-threads <int>\n");
+    printf("\t\tUse <int> threads (default 12)\n");
+    printf("\t-iter <int>\n");
+    printf("\t\tRun more training iterations (default 5)\n");
+    printf("\t-min-count <int>\n");
+    printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
+    printf("\t-alpha <float>\n");
+    printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
+    printf("\t-classes <int>\n");
+    printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
+    printf("\t-debug <int>\n");
+    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
+    printf("\t-binary <int>\n");
+    printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
+    printf("\t-save-vocab <file>\n");
+    printf("\t\tThe vocabulary will be saved to <file>\n");
+    printf("\t-read-vocab <file>\n");
+    printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
+    printf("\t-cbow <int>\n");
+    printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n");
+    printf("\nExamples:\n");
+    printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n");
+    return 0;
+  }
+  output_file[0] = 0;
+  save_vocab_file[0] = 0;
+  read_vocab_file[0] = 0;
+  if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
+  if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
+  if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
+  if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]);
+  if (cbow) alpha = 0.05;
+  if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
+  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
+  if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
+  if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
+  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
+  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
+  expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
+  for (i = 0; i < EXP_TABLE_SIZE; i++) {
+    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
+    expTable[i] = expTable[i] / (expTable[i] + 1);                   // Precompute f(x) = x / (x + 1)
+  }
+  TrainModel();
+  return 0;
+}
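One implementation detail worth calling out in word2vec.c above: the training loops never call exp(). main() precomputes sigmoid values for inputs in [-MAX_EXP, MAX_EXP] into expTable, and TrainModelThread() turns a dot product f into a table index. A standalone sketch of just that lookup trick (illustrative only, not part of the package):

```c
/* Illustrative sketch only -- not part of the package. Rebuilds the
 * sigmoid lookup table the way main() in word2vec.c does, then applies
 * the same index formula TrainModelThread() uses for a dot product f
 * in (-MAX_EXP, MAX_EXP). */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6

int main(void) {
  float *expTable = (float *)malloc((EXP_TABLE_SIZE + 1) * sizeof(float));
  int i;
  for (i = 0; i < EXP_TABLE_SIZE; i++) {
    /* e^x for x sweeping [-MAX_EXP, MAX_EXP), then sigmoid = e^x / (e^x + 1) */
    expTable[i] = exp((i / (float)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);
    expTable[i] = expTable[i] / (expTable[i] + 1);
  }
  float f = 1.5f;  /* stand-in for a hidden->output dot product */
  float sig = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
  printf("table sigmoid(%.2f) ~ %f (exact: %f)\n", f, sig, 1.0 / (1.0 + exp(-f)));
  free(expTable);
  return 0;
}
```

The table costs about a thousand floats once and replaces an exp() call in every gradient step; the precision lost to quantizing x into 1000 buckets is negligible for training.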